<a href="https://colab.research.google.com/github/ucaokylong/LLM_learning/blob/main/Fine_tune_gpt_multitask_autoregressive_and_classification_with_dualLoss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the training split from Google Drive (saved as train_nor_811.xlsx).
!gdown 16X2D5KrBsP0nTojfrYu9zrBMVxcC7JsX

Downloading...
From: https://drive.google.com/uc?id=16X2D5KrBsP0nTojfrYu9zrBMVxcC7JsX
To: /content/train_nor_811.xlsx
  0% 0.00/259k [00:00<?, ?B/s]100% 259k/259k [00:00<00:00, 87.5MB/s]


In [2]:
# Fetch the test split from Google Drive (saved as test_nor_811.xlsx).
!gdown 1DOkLy8-aO1_KlwvkuxVIA9jvvGycDRzp

Downloading...
From: https://drive.google.com/uc?id=1DOkLy8-aO1_KlwvkuxVIA9jvvGycDRzp
To: /content/test_nor_811.xlsx
  0% 0.00/37.6k [00:00<?, ?B/s]100% 37.6k/37.6k [00:00<00:00, 62.8MB/s]


In [3]:
# Fetch the validation split from Google Drive (saved as valid_nor_811.xlsx).
!gdown 1OOvkJmkQE5O3nQDrHnbu2tjuDkMM1PAU

Downloading...
From: https://drive.google.com/uc?id=1OOvkJmkQE5O3nQDrHnbu2tjuDkMM1PAU
To: /content/valid_nor_811.xlsx
  0% 0.00/38.0k [00:00<?, ?B/s]100% 38.0k/38.0k [00:00<00:00, 63.8MB/s]


In [4]:
import pandas as pd
def get_data(path, sheet_name='Sheet1'):
    """
    Load one split of the emotion dataset from an Excel workbook.

    Args:
        path: Location of the .xlsx file.
        sheet_name: Worksheet to read. Defaults to 'Sheet1', the sheet this
            loader has always selected.

    Returns:
        A new DataFrame with exactly two columns, 'Emotion' and 'Sentence'.
        The leading index column stored in the sheet is discarded.

    Raises:
        ValueError: If the sheet does not contain exactly three columns
            (index, emotion, sentence).
    """
    # Request only the sheet we need; sheet_name=None would parse every
    # worksheet in the workbook into a dict just to pick one entry.
    df = pd.read_excel(path, sheet_name=sheet_name)

    expected_columns = ['index', 'Emotion', 'Sentence']
    if df.shape[1] != len(expected_columns):
        raise ValueError(
            f"Expected {len(expected_columns)} columns in sheet {sheet_name!r} of {path!r}, "
            f"found {df.shape[1]}"
        )

    # Columns are renamed by position; whatever header the sheet carries is ignored.
    df.columns = expected_columns
    return df.drop(columns=['index'])

# Load the train / validation / test workbooks; the files share the
# "<split>_nor_811.xlsx" naming scheme.
train_df, valid_df, test_df = (
    get_data(f'./{split}_nor_811.xlsx') for split in ('train', 'valid', 'test')
)


In [5]:
# Preview the first few rows of the training split to sanity-check the load.
train_df.head()

Unnamed: 0,Emotion,Sentence
0,Other,cho mình xin bài nhạc tên là gì với ạ
1,Disgust,cho đáng đời con quỷ . về nhà lôi con nhà mày ...
2,Disgust,lo học đi . yêu đương lol gì hay lại thích học...
3,Enjoyment,uớc gì sau này về già vẫn có thể như cụ này :))
4,Enjoyment,mỗi lần có video của con là cứ coi đi coi lại ...


In [6]:
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

In [7]:
import pandas as pd
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """
    Dataset for dual-loss training (language modeling and classification).

    Each item carries the tokenized sentence, its attention mask, language
    modeling labels with padded positions masked out, and the integer
    emotion label.
    """

    # Index 6 is both the explicit 'Other' class and the fallback for labels
    # that are not listed here.
    LABEL_MAP = {
        'Enjoyment': 0,
        'Disgust': 1,
        'Sadness': 2,
        'Anger': 3,
        'Surprise': 4,
        'Fear': 5,
        'Other': 6,
    }
    UNKNOWN_LABEL = 6

    # Label value written at padded positions so the LM loss can skip them.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_len=120):
        """
        Args:
            df: DataFrame with 'Sentence' and 'Emotion' columns.
            tokenizer: Tokenizer exposing `encode_plus`.
            max_len: Length every sample is padded or truncated to.
        """
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """
        Returns:
            {
                'text': Preprocessed text,
                'input_ids': Tokenized input IDs for the model,
                'attention_masks': Attention mask (0 at padded positions),
                'lm_labels': Copy of input_ids with padded positions set to -100
                             (not shifted here),
                'classification_labels': Target label for classification loss
            }
        """
        row = self.df.iloc[index]
        text, classification_label = self.get_input_data(row)

        # Tokenize the input for language modeling and classification
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Mask padding through the attention mask rather than by comparing with
        # pad_token_id: when the pad token is an alias of another token (e.g.
        # pad_token = eos_token), an id comparison would also drop genuine
        # occurrences of that token from the loss.
        lm_labels = input_ids.clone()
        lm_labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'text': text,
            'input_ids': input_ids,
            'attention_masks': attention_mask,
            'lm_labels': lm_labels,
            'classification_labels': torch.tensor(classification_label, dtype=torch.long),
        }

    def labelencoder(self, text):
        """
        Maps an emotion label to its integer index.

        Labels missing from LABEL_MAP fall back to index 6.
        """
        return self.LABEL_MAP.get(text, self.UNKNOWN_LABEL)

    def get_input_data(self, row):
        """
        Preprocesses the input text and retrieves the classification label.

        The text is lowercased and runs of whitespace are collapsed to a single
        space. Non-string cells are coerced: missing values become an empty
        string, anything else is converted with str().

        Returns:
            (text, label) where label is the integer class index.
        """
        text = row['Sentence']
        if not isinstance(text, str):
            # Excel cells can come back as numbers or NaN; keep the row usable
            # instead of failing on .lower().
            text = '' if pd.isna(text) else str(text)
        text = ' '.join(text.lower().split())
        label = self.labelencoder(row['Emotion'])
        return text, label

In [8]:
from transformers import GPT2Tokenizer
import pandas as pd
from torch.utils.data import DataLoader


# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' in SentimentDataset has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is defined
# Initialize datasets



In [9]:
# Wrap each split in a SentimentDataset; every sample is padded or truncated
# to max_len=32 tokens.
# NOTE(review): the sample printed in the next cell fills all 32 positions
# with an all-ones attention mask, so sentences may be getting truncated at
# this length — confirm 32 is large enough for this tokenizer and corpus.
train_dataset = SentimentDataset(train_df, tokenizer, max_len=32)
validation_dataset = SentimentDataset(valid_df, tokenizer, max_len=32)
test_dataset = SentimentDataset(test_df, tokenizer, max_len=32)

In [10]:
# Inspect one encoded sample (text, ids, mask, LM labels, class label).
train_dataset[0]

{'text': 'cho mình xin bài nhạc tên là gì với ạ',
 'input_ids': tensor([ 6679,   285,   127,   105,    77,    71,  2124,   259,   275, 24247,
            72,   299,    71,   157,   118,    94,    66,   256, 25792,    77,
           300, 24247,   308,   127,   105,   410,   157,   119,   249,    72,
         28053,   118]),
 'attention_masks': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]),
 'lm_labels': tensor([ 6679,   285,   127,   105,    77,    71,  2124,   259,   275, 24247,
            72,   299,    71,   157,   118,    94,    66,   256, 25792,    77,
           300, 24247,   308,   127,   105,   410,   157,   119,   249,    72,
         28053,   118]),
 'classification_labels': tensor(6)}

In [11]:
# Batch the splits: only the training loader shuffles, the two evaluation
# loaders iterate in dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
validation_loader, test_loader = (
    DataLoader(dataset=eval_split, batch_size=32)
    for eval_split in (validation_dataset, test_dataset)
)

In [12]:
import torch
import torch.nn as nn



In [13]:
from transformers import GPT2Tokenizer

In [15]:
# Pin the release this notebook was executed with so a fresh runtime resolves
# the same version; %pip installs into the running kernel's environment.
%pip install transformers==4.47.1

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
Successfully installed transformers-4.47.1


In [14]:
from transformers import GPT2Model

In [15]:
class TransformerWithDualLoss(nn.Module):
    """
    GPT-2 backbone with two heads trained jointly: a next-token language
    modeling head and a sentence-level classification head.
    """

    def __init__(self, transformer_model_name, num_classes):
        """
        Transformer model with dual loss: language modeling loss and classification loss.

        Args:
            transformer_model_name: Name or path handed to GPT2Model.from_pretrained.
            num_classes: Number of output classes of the classification head.
        """
        super().__init__()
        self.transformer = GPT2Model.from_pretrained(transformer_model_name)
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)
        # NOTE(review): this LM head is a freshly initialised linear layer, not
        # tied to the backbone's token embeddings — confirm that is intended.
        self.lm_head = nn.Linear(hidden_size, self.transformer.config.vocab_size)

    def forward(self, input_ids, attention_mask=None, lm_labels=None, classification_labels=None):
        """
        Forward pass for both language modeling and classification tasks.

        Args:
            input_ids: Input token IDs (batch_size, seq_length).
            attention_mask: Attention mask (batch_size, seq_length); positions
                with 0 are excluded from the pooled sentence representation.
            lm_labels: Labels for the language modeling task (batch_size, seq_length).
                Pass them unshifted; the shift by one position happens here.
                Positions equal to -100 are ignored by the loss.
            classification_labels: Labels for the classification task (batch_size).

        Returns:
            Dictionary containing losses and logits:
                - lm_loss: Language modeling loss (None without lm_labels).
                - classification_loss: Classification loss (None without classification_labels).
                - lm_logits: Language modeling logits (None without lm_labels).
                - classification_logits: Classification logits.
        """
        # Forward pass through the transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

        # Language modeling logits and loss
        lm_loss = None
        lm_logits = None
        if lm_labels is not None:
            lm_logits = self.lm_head(last_hidden_state)
            # Position t predicts token t + 1: drop the last logit and the first label.
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = lm_labels[:, 1:].contiguous()
            lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            lm_loss = lm_loss_fct(shift_logits.view(-1, lm_logits.size(-1)), shift_labels.view(-1))

        # Sentence representation: average over real tokens only. A plain
        # mean(dim=1) would mix in the hidden states of padded positions, so the
        # result would change with the amount of padding in the batch.
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(dtype=last_hidden_state.dtype)
            token_sum = (last_hidden_state * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row
            token_count = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = token_sum / token_count
        else:
            pooled_output = last_hidden_state.mean(dim=1)

        # Classification logits and loss
        classification_logits = self.classifier(pooled_output)
        classification_loss = None
        if classification_labels is not None:
            classification_loss_fct = nn.CrossEntropyLoss()
            classification_loss = classification_loss_fct(classification_logits, classification_labels)

        # Return both losses and logits
        return {
            "lm_loss": lm_loss,
            "classification_loss": classification_loss,
            "lm_logits": lm_logits,
            "classification_logits": classification_logits
        }


In [16]:
# Seven outputs: the six named emotions plus the catch-all index 6 that
# SentimentDataset.labelencoder assigns to everything else.
model = TransformerWithDualLoss(transformer_model_name="gpt2", num_classes=7)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [17]:
from torch.nn.utils import clip_grad_norm_

In [24]:
from torch.nn.utils import clip_grad_norm_

def _combine_losses(lm_loss, classification_loss, alpha):
    """Blend the two task losses, falling back to whichever one is present."""
    if lm_loss is None and classification_loss is None:
        raise ValueError("Model returned neither 'lm_loss' nor 'classification_loss'.")
    if lm_loss is None:
        return classification_loss
    if classification_loss is None:
        return lm_loss
    return alpha * lm_loss + (1 - alpha) * classification_loss


def _batch_loss(model, batch, device, alpha):
    """Move one batch to `device`, run the model and return the combined loss tensor."""
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_masks'].to(device),
        lm_labels=batch['lm_labels'].to(device),
        classification_labels=batch['classification_labels'].to(device),
    )
    return _combine_losses(outputs["lm_loss"], outputs["classification_loss"], alpha)


def _safe_mean(total, count):
    """Average `total` over `count` batches; NaN when there were no batches."""
    return total / count if count else float('nan')


def train(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    epochs=3,
    alpha=0.5,
    clip_grad=1.0,
    log_every=1
):
    """
    Train the TransformerWithDualLoss model with validation.

    Args:
        model: The model instance. Its forward must return a dict with the
            keys 'lm_loss' and 'classification_loss'.
        train_loader: DataLoader for the training dataset.
        val_loader: DataLoader for the validation dataset.
        optimizer: Optimizer instance.
        device: Device to train on (CPU/GPU).
        epochs: Number of training epochs.
        alpha: Weight for combining losses (0.5 = equal weight for both losses).
        clip_grad: Maximum gradient norm passed to clip_grad_norm_.
        log_every: Print the batch loss every `log_every` training batches
            (1 = every batch, 0 or None = never).

    Returns:
        Dict with 'train_loss' and 'val_loss', each a list holding one average
        loss per epoch. An empty loader yields NaN for that epoch. In a
        notebook, assign the result to avoid echoing it as cell output.

    Raises:
        ValueError: If the model returns neither loss for a batch.
    """
    model.to(device)
    history = {"train_loss": [], "val_loss": []}
    num_train_batches = len(train_loader)
    num_val_batches = len(val_loader)

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_total_loss = 0.0
        print(f"Epoch {epoch + 1}/{epochs} - Training Phase")
        for batch_idx, batch in enumerate(train_loader):
            total_batch_loss = _batch_loss(model, batch, device, alpha)

            # Backward pass
            optimizer.zero_grad()
            total_batch_loss.backward()
            clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

            # .item() once: every call forces a device-to-host sync
            batch_loss_value = total_batch_loss.item()
            train_total_loss += batch_loss_value

            if log_every and (batch_idx + 1) % log_every == 0:
                print(f"  Batch {batch_idx + 1}/{num_train_batches} - Loss: {batch_loss_value:.4f}")

        avg_train_loss = _safe_mean(train_total_loss, num_train_batches)
        history["train_loss"].append(avg_train_loss)
        print(f"Epoch {epoch + 1} - Average Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_total_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                val_total_loss += _batch_loss(model, batch, device, alpha).item()

        avg_val_loss = _safe_mean(val_total_loss, num_val_batches)
        history["val_loss"].append(avg_val_loss)
        print(f"Epoch {epoch + 1} - Average Validation Loss: {avg_val_loss:.4f}")

    return history


In [25]:
# A single AdamW instance updates every parameter of the model (backbone and
# both heads) with the same learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [26]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Train the model: three epochs, equal weight on the LM and classification losses.
train(
    model=model,
    train_loader=train_loader,
    val_loader=validation_loader,
    optimizer=optimizer,
    device=device,
    epochs=3,
    alpha=0.5,
)

Epoch 1/3 - Training Phase
  Batch 1/174 - Loss: 11.4017
  Batch 2/174 - Loss: 10.0754
  Batch 3/174 - Loss: 8.5706
  Batch 4/174 - Loss: 7.2906
  Batch 5/174 - Loss: 6.8574
  Batch 6/174 - Loss: 6.6988
  Batch 7/174 - Loss: 6.4868
  Batch 8/174 - Loss: 6.4314
  Batch 9/174 - Loss: 6.2490
  Batch 10/174 - Loss: 6.3019
  Batch 11/174 - Loss: 6.2384
  Batch 12/174 - Loss: 6.2138
  Batch 13/174 - Loss: 6.1347
  Batch 14/174 - Loss: 6.1212
  Batch 15/174 - Loss: 6.1461
  Batch 16/174 - Loss: 6.1262
  Batch 17/174 - Loss: 6.0127
  Batch 18/174 - Loss: 6.1062
  Batch 19/174 - Loss: 5.9157
  Batch 20/174 - Loss: 6.0139
  Batch 21/174 - Loss: 5.9201
  Batch 22/174 - Loss: 5.8319
  Batch 23/174 - Loss: 5.9330
  Batch 24/174 - Loss: 5.8356
  Batch 25/174 - Loss: 5.7651
  Batch 26/174 - Loss: 5.6375
  Batch 27/174 - Loss: 5.5288
  Batch 28/174 - Loss: 5.5433
  Batch 29/174 - Loss: 5.4497
  Batch 30/174 - Loss: 5.4031
  Batch 31/174 - Loss: 5.3996
  Batch 32/174 - Loss: 5.3769
  Batch 33/174 - Los