### Introduction

I would like to explore emotion prediction for Chinese-language text.

In [None]:
#!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio
!pip install transformers
!pip install tqdm scikit-learn datasets

In [None]:
#@title Import Package
import torch
print(torch.__version__)
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
import multiprocessing as mp
from importlib import reload
from collections import Counter
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from transformers import BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Access data
test_split = pd.read_csv("/content/drive/MyDrive/NLP/Project4/Model/cn-data-revised/test_split.csv")

train_split = pd.read_csv("/content/drive/MyDrive/NLP/Project4/Model/cn-data-revised/train_split.csv")
valid_split = pd.read_csv("/content/drive/MyDrive/NLP/Project4/Model/cn-data-revised/valid_split.csv")

In [None]:
train_split.head(5) # 'Emotion', 'Utterance'

In [None]:
def joinTextBySpeakerAndEmotion(df):
    """Merge consecutive utterances that share a speaker and an emotion.

    Rows are scanned in their existing order. Every maximal run of adjacent
    rows with identical ``Speaker`` and ``Emotion`` is collapsed into a single
    row: the run's ``Utterance`` strings are joined with ``', '`` and closed
    with ``'.'``, and every other column takes the value from the run's first
    row.

    Run detection is positional, so the frame may carry any index, and the
    input frame is left unmodified (no helper column is added to it).

    NOTE(review): runs are not split at ``Dialogue_ID`` boundaries, so the
    last utterance of one dialogue can merge with the first utterance of the
    next when speaker and emotion match - confirm that this is intended.

    Args:
        df: DataFrame with the columns ``Utterance``, ``TV_ID``,
            ``Dialogue_ID``, ``Utterance_ID``, ``Speaker`` and ``Emotion``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the columns
        ``Utterance``, ``TV_ID``, ``Dialogue_ID``, ``Utterance_ID``,
        ``Speaker`` and ``Emotion``, one row per run.
    """
    output_columns = ['Utterance', 'TV_ID', 'Dialogue_ID', 'Utterance_ID',
                      'Speaker', 'Emotion']

    # Nothing to merge; return an empty frame with the expected columns.
    if df.empty:
        return df[output_columns].reset_index(drop=True)

    speaker = df['Speaker']
    emotion = df['Emotion']

    # A row opens a new run when it differs from the previous row in either
    # speaker or emotion. The first row always opens a run.
    starts_new_run = (speaker != speaker.shift()) | (emotion != emotion.shift())
    starts_new_run.iloc[0] = True

    # Cumulative count of run starts gives a 1-based run id per row.
    run_ids = starts_new_run.cumsum().to_numpy()

    grouped = df.groupby(run_ids).agg({
        'Utterance': lambda y: ', '.join(y) + '.',
        'TV_ID': 'first',
        'Dialogue_ID': 'first',
        'Utterance_ID': 'first',
        'Speaker': 'first',
        'Emotion': 'first',
    }).reset_index(drop=True)

    return grouped

In [None]:
train_split = joinTextBySpeakerAndEmotion(train_split)
valid_split = joinTextBySpeakerAndEmotion(valid_split)
test_split  = joinTextBySpeakerAndEmotion(test_split)

In [None]:
print('number of train data:', len(train_split))
print('number of validation data:', len(valid_split))
print('number of test data:', len(test_split))

In [None]:
# @title Convert pandas dataframes to Hugging Face Dataset
# Keep only the text ('Utterance') and target ('Emotion') columns in every split
train_split = train_split[['Utterance', 'Emotion']]
valid_split = valid_split[['Utterance', 'Emotion']]
test_split  = test_split[['Utterance', 'Emotion']]

# Fixed seed so the training subsample is reproducible
random_seed = 42
# Keep a random 3/4 of the training rows; validation and test sets are used in full
train_split = train_split.sample(frac=3/4, random_state=random_seed)
# (validation subsampling is disabled)
# valid_split = valid_split.sample(frac=3/4, random_state=random_seed)

# Report split sizes after column selection and training subsampling
print('number of train data:', len(train_split))
print('number of validation data:', len(valid_split))
print('number of test data:', len(test_split))

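# Map dataset-specific emotions (e.g. astonished -> surprise) onto GoEmotions labels.
# The full original label set is kept below for reference.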
'''
id2label = {
    0: 'admiration',
    1: 'amusement',
    2: 'anger',
    3: 'annoyance',
    4: 'approval',
    5: 'astonished',  # not in GoEmotion
    6: 'caring',
    7: 'confusion',
    8: 'curiosity',
    9: 'depress',     # not in GoEmotion
    10: 'desire',
    11: 'disappointment',
    12: 'disapproval',
    13: 'disgust',
    14: 'embarrassment',
    15: 'excitement',
    16: 'fear',
    17: 'grateful',   # not in GoEmotion
    18: 'gratitude',
    19: 'grief',
    20: 'happy',      # not in GoEmotion
    21: 'joy',
    22: 'love',
    23: 'nervousness',
    24: 'neutral',
    25: 'optimism',
    26: 'pride',
    27: 'realization',
    28: 'relaxed',    # not in GoEmotion
    29: 'relief',
    30: 'remorse',
    31: 'sadness',
    32: 'surprise',
    33: 'worried'   # not in GoEmotion
  }
'''
# Define the mapping of old values to new values
emotion_replacements = {
    'happy': 'joy',
    'grateful': 'gratitude',
    'relaxed': 'relief',
    'depress': 'grief',
    'astonished': 'surprise',
    'worried': 'nervousness'
}

# Apply the replacements to the 'Emotion' column of each DataFrame
train_split['Emotion'] = train_split['Emotion'].replace(emotion_replacements)
valid_split['Emotion'] = valid_split['Emotion'].replace(emotion_replacements)
test_split['Emotion'] = test_split['Emotion'].replace(emotion_replacements)

# rename columns
train_split = train_split.rename(columns={"Utterance": "text", "Emotion": "labels"})
valid_split = valid_split.rename(columns={"Utterance": "text", "Emotion": "labels"})
test_split = test_split.rename(columns={"Utterance": "text", "Emotion": "labels"})

# convert to numberical labels
# Create label2id and id2label dictionaries
# label2id = {label: [idx] for idx, label in enumerate(train_split['labels'].unique())}
# id2label = {map(int,idx): label for label, idx in label2id.items()}

id2label = {0:"admiration",
            1:"amusement",
            2:"anger",
            3:"annoyance",
            4:"approval",
            5:"caring",
            6:"confusion",
            7:"curiosity",
            8:"desire",
            9:"disappointment",
            10:"disapproval",
            11:"disgust",
            12:"embarrassment",
            13:"excitement",
            14:"fear",
            15:"gratitude",
            16:"grief",
            17:"joy",
            18:"love",
            19:"nervousness",
            20:"optimism",
            21:"pride",
            22:"realization",
            23:"relief",
            24:"remorse",
            25:"sadness",
            26:"surprise",
            27:"neutral"}
# Reverse lookup from label name to class id. Each id is wrapped in a
# one-element list; emotionDataset reads it back with example['labels'][0].
label2id = {label: [idx] for idx, label in id2label.items()}
# print(label2id)

# Convert the string labels in each dataframe to their (list-wrapped) numeric ids.
# NOTE(review): Series.map leaves NaN for any label missing from label2id -
# confirm every emotion left after the replacements above appears in id2label.
train_split['labels'] = train_split['labels'].map(label2id)
valid_split['labels'] = valid_split['labels'].map(label2id)
test_split['labels'] = test_split['labels'].map(label2id)

from datasets import Dataset
# create data set
train_dataset = Dataset.from_pandas(train_split)
valid_dataset = Dataset.from_pandas(valid_split)
test_dataset = Dataset.from_pandas(test_split)

# Combine into a DatasetDict
dataset = {"train": train_dataset, "validation": valid_dataset, "test" : test_dataset}

In [None]:
# goEmotion data
# goEmotion = load_dataset("go_emotions", "simplified")
# print(Counter([x['labels'].__len__() for x in goEmotion['train']]))
# print(goEmotion.keys())
# print(goEmotion['train'][0])
# print("------")
# current Chinese data, compared with goEmotion data
# @title look into dataset
# print(Counter([x['labels'].__len__() for x in dataset['train']]))
print(dataset.keys())
print(dataset['train'][0])

#### Baseline Model

I would like to reuse the RoBERTa model from Project 3. First, I will translate the Chinese text into English. Then I will be able to use the RoBERTa model directly.

#### Advanced Model

Use a model that is trained directly on Chinese text: https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md

In [None]:
from transformers import BertTokenizer, BertModel

# Load Chinese tokenizer
MODEL_NAME = "hfl/chinese-roberta-wwm-ext-large"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

num_labels = len(label2id)
# Load the base BERT model
model = BertModel.from_pretrained(MODEL_NAME)
#model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
from torch.utils.data import Dataset
class emotionDataset(Dataset):
    """Map-style dataset that turns emotion examples into fixed-length tensors.

    Each example must provide a ``'text'`` string and a ``'labels'`` entry
    holding the class id, either wrapped in a one-element list (the form
    produced by ``label2id`` in this notebook) or as a bare integer.

    Args:
        list_data: Indexable collection of examples supporting ``len()``.
        tokenizer: Tokenizer exposing ``encode_plus``. Its ``pad_token_id`` is
            used for padding when available; otherwise id ``1`` is used.
        max_length: Number of token ids every returned sequence is
            truncated or padded to.
    """

    def __init__(self, list_data,
                 tokenizer, max_length):

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = list_data
        # Pad with the tokenizer's own pad id instead of a hard-coded value:
        # the correct id depends on the vocabulary in use.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        self.pad_token = 1 if pad_id is None else pad_id

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the tensors for example ``i``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (both LongTensors of
            length ``max_length``) and ``labels`` (LongTensor of length 1).
        """
        example = self.data[i]
        inputs = self.tokenizer.encode_plus(example['text'],
                                            add_special_tokens=True,
                                            truncation=True,
                                            max_length=self.max_length)

        # Defensive slice in case the tokenizer returned more than max_length ids.
        input_ids = inputs["input_ids"][:self.max_length]
        attention_mask = [1] * len(input_ids)

        # Right-pad ids and mask; padded positions get attention 0.
        padding_length = self.max_length - len(input_ids)
        input_ids = input_ids + ([self.pad_token] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        assert len(input_ids) == self.max_length, "Error with input length {} vs {}".format(len(input_ids), self.max_length)

        # Labels are stored list-wrapped in this notebook; accept a bare id too.
        label = example['labels']
        if isinstance(label, (list, tuple)):
            label = label[0]

        return {'input_ids': torch.LongTensor(input_ids),
                'attention_mask': torch.LongTensor(attention_mask),
                'labels': torch.LongTensor([label])}

In [None]:
# Build one emotionDataset + DataLoader per split. Every split uses the same
# tokenizer, a fixed sequence length of 100 token ids, batches of 32, and one
# loader worker per CPU core reported by multiprocessing.

# Train dataset (shuffled every epoch)
train_dataset = emotionDataset(list_data=dataset['train'],
                               tokenizer=tokenizer,
                               max_length=100)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          num_workers=mp.cpu_count())

# Validation dataset (kept in order)
val_dataset = emotionDataset(list_data=dataset['validation'],
                             tokenizer=tokenizer,
                             max_length=100)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                        num_workers=mp.cpu_count())

# Test dataset (kept in order)
test_dataset = emotionDataset(list_data=dataset['test'],
                               tokenizer=tokenizer,
                               max_length=100)
test_loader = DataLoader(test_dataset, batch_size=32,
                         shuffle=False, num_workers=mp.cpu_count())

# Bare expression: shown as the cell output in a notebook
(train_loader, val_loader)

In [None]:
example_batch = next(iter(train_loader))
example_batch['input_ids'].shape, example_batch['attention_mask'].shape, example_batch['labels'].shape

In [None]:
# @title tried different learning rate 1e-5, 2e-5
'''
args = {'weight_decay':0.0,
        'learning_rate':2e-5,
        'epochs':5,
        'gradient_accumulation_steps':1,
        'adam_epsilon':2e-8}
args['t_total'] = len(train_loader) // args['gradient_accumulation_steps'] * args['epochs']
args['warmup_steps'] = int(0.10*args['t_total'])

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=args['t_total'])
'''
# Training hyperparameters for the current run.
args = {
    'learning_rate': 2e-5,  # Learning rate
    'epochs': 5,            # Number of epochs
    'weight_decay': 0.01,   # Weight decay for regularization
    'adam_epsilon': 1e-8,   # Epsilon value for Adam optimizer
    'warmup_steps': 1000,      # Number of warmup steps
    'max_grad_norm': 1.0,   # Max gradient norm for gradient clipping
    'dropout_rate': 0.1     # Dropout rate
}

# Total number of optimizer steps: one per training batch, for every epoch
total_steps = len(train_loader) * args['epochs']

# Parameters whose name contains one of these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
# NOTE(review): parameters are collected from `model` as it is bound when this
# cell runs. In this notebook that is the bare BertModel loaded earlier; the
# classifier head (CustomBertClassifier) is only created in a later cell, so
# confirm that the head's parameters actually reach the optimizer.
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])

# Linear warmup for 'warmup_steps' steps, then linear decay until total_steps.
# NOTE(review): warmup_steps is a fixed 1000 - check it against total_steps for
# this dataset size.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=total_steps)

In [None]:
torch.cuda.empty_cache()

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Per-class frequencies used to derive loss weights, one entry per class id.
# NOTE(review): the values look like per-class percentages in the order of
# id2label (28 entries) - confirm where they come from.
class_counts = [0.2, 0.2, 12, 13, 0.2, 0.2, 0.8, 1, 0.2, 0.8, 1.2, 6.2, 0.8, 0.2, 1.7, 0.4, 9.4, 3.3, 0.2, 6.2, 0.2, 0.2, 0.2, 6.8, 0.8, 1.9, 3.1, 29]
total_count = sum(class_counts)
# Inverse-frequency weights: rarer classes get larger weights.
class_weights = [total_count / class_count for class_count in class_counts]

# Scale the weights so that the largest one (rarest class) is 1.0
max_weight = max(class_weights)
normalized_weights = [w / max_weight for w in class_weights]

class_weights_tensor = torch.tensor(normalized_weights)


class CustomBertClassifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The head applies dropout to the encoder's pooled output, projects it to
    768 units with an ELU activation, and maps that to ``num_labels`` logits.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and
            return the pooled output as element 1 of its forward result.
        num_labels: Number of target classes.
        class_weights: Optional 1-D tensor of per-class weights for the
            cross-entropy loss; pass ``None`` for an unweighted loss.
    """

    def __init__(self, bert_model, num_labels, class_weights=class_weights_tensor):
        super().__init__()
        self.bert = bert_model
        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(self.bert.config.hidden_size, 768)
        self.elu = nn.ELU()
        self.out_proj = nn.Linear(768, num_labels)

        # If class weights are provided, use them to initialize CrossEntropyLoss
        if class_weights is not None:
            self.loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head.

        Returns:
            ``(loss, logits)``. ``loss`` is ``None`` when ``labels`` is not
            given; ``logits`` has one row per input sequence and
            ``num_labels`` columns.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        x = self.dropout(pooled_output)
        x = self.dense(x)
        x = self.elu(x)
        logits = self.out_proj(x)

        loss = None
        if labels is not None:
            # Flatten so labels of shape (batch, 1) line up with the logits rows.
            loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return loss, logits

    def set_dropout_rate(self, dropout_rate):
        """Set one dropout probability for the head and the encoder.

        The encoder config is updated for bookkeeping, but changing the config
        alone does not affect dropout layers that were already constructed, so
        every ``nn.Dropout`` inside the encoder is updated as well.
        """
        self.dropout.p = dropout_rate
        self.bert.config.hidden_dropout_prob = dropout_rate
        self.bert.config.attention_probs_dropout_prob = dropout_rate
        for module in self.bert.modules():
            if isinstance(module, nn.Dropout):
                module.p = dropout_rate


# Initialize the custom model with the base BERT model
custom_model = CustomBertClassifier(model, num_labels)

# Set the dropout rate
dropout_rate = args['dropout_rate']
custom_model.set_dropout_rate(dropout_rate)

# Move the model to GPU and apply DataParallel
model = nn.DataParallel(custom_model).to(device)

# Rebuild the optimizer and scheduler over the FULL classifier. The optimizer
# created earlier in the notebook was built from the bare BERT encoder, before
# the classification head existed, so the head's dense/out_proj parameters
# would otherwise never be updated during training.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in custom_model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': args['weight_decay']},
    {'params': [p for n, p in custom_model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=len(train_loader) * args['epochs'])

In [None]:
# model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints/'  # model with config from proj3 that has little training improvement
# model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints2/'  # model fine tuned with two layers (128 ALU) added with lr = 1e-5
# model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints3' # model fine tuned with two layers (521 ELU) added with lr = 1e-5
# model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints4' # model fine tuned with two layers (768 ELU) added with lr = 1e-5 now drop out rate 01->0.6, and new loss func
# model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints5' # model fine tuned with two layers (768 ELU) added with lr = 2e-5 now drop out rate 0.2, and new loss func and new data
model_save_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints6'
os.makedirs(model_save_path, exist_ok=True)

1. Checkpoints2  

Epoch 1/5 - Train Loss: 2.3372, Val Loss: 2.4042

Epoch 2/5 - Train Loss: 2.1814, Val Loss: 2.3931

Epoch 3/5 - Train Loss: 2.0031, Val Loss: 2.4502

Epoch 4/5 - Train Loss: 1.7970, Val Loss: 2.5422

Epoch 5/5 - Train Loss: 1.6418, Val Loss: 2.6374

2. Checkpoints3  

Epoch 1/5 - Train Loss: 2.3058, Val Loss: 2.3740

Epoch 2/5 - Train Loss: 2.1385, Val Loss: 2.3510

Epoch 3/5 - Train Loss: 1.9602, Val Loss: 2.4255

Epoch 4/5 - Train Loss: 1.7630, Val Loss: 2.5286

Next change the loss function to take into account the imbalanced data in the dataset.  

3. Checkpoints4  

Epoch 1/8 - Train Loss: 3.2579, Val Loss: 3.2368

Epoch 2/8 - Train Loss: 3.0837, Val Loss: 3.2269

Epoch 3/8 - Train Loss: 2.7848, Val Loss: 3.3330

Epoch 4/8 - Train Loss: 2.4037, Val Loss: 3.4085

Epoch 5/8 - Train Loss: 2.0744, Val Loss: 3.4990

Epoch 6/8 - Train Loss: 1.8241, Val Loss: 3.5701

Epoch 7/8 - Train Loss: 1.6487, Val Loss: 3.6001

Epoch 8/8 - Train Loss: 1.5383, Val Loss: 3.6185

In [None]:
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import torch

# Assuming args, model, train_loader, val_loader, optimizer, scheduler, device, and model_save_path are defined

# Mean loss per epoch, appended once per completed epoch
train_losses = []
val_losses = []

# Gradient-clipping threshold declared in args; None disables clipping
max_grad_norm = args.get('max_grad_norm')

for each_epoch in range(args['epochs']):
    # Training
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        model.zero_grad()
        loss, logits = model(**batch)
        # DataParallel returns one loss per replica when several GPUs are
        # used; reduce to a scalar so backward() works (no-op for a scalar).
        loss = loss.mean()
        loss.backward()
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, logits = model(**batch)
            val_loss += loss.mean().item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {each_epoch + 1}/{args['epochs']} - "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}")

    # Save model (one checkpoint per epoch; keys carry DataParallel's 'module.' prefix)
    epoch_save_path = os.path.join(model_save_path, f'model_epoch_{each_epoch+1}.pt')
    torch.save(model.state_dict(), epoch_save_path)
    print(f"Saved model checkpoint to {epoch_save_path}")

In [None]:
# Plotting the training and validation losses.
# The x-axis is derived from the recorded losses rather than args['epochs'],
# so the plot still works if training stopped early or args was edited since.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss', marker='o')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss', marker='o')

plt.title('Training and Validation Loss Per Epoch')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from transformers import BertModel

# Resolve the device first so this cell does not depend on an earlier cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the same architecture that was used for training
base_bert_model = BertModel.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
custom_model = CustomBertClassifier(base_bert_model, num_labels)
# Wrap with nn.DataParallel (the checkpoint keys carry its 'module.' prefix)
# and move to the device
model = nn.DataParallel(custom_model).to(device)

# Load the saved model weights; map_location lets a checkpoint written on a
# GPU be restored on a CPU-only runtime as well
model_path = '/content/drive/MyDrive/NLP/Project4/Checkpoints5/model_epoch_2.pt'
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
def evaluate_model(model, test_loader, device, num_samples=10):
    """
    Evaluate the model on a test dataset.

    Args:
        model: The PyTorch model to be evaluated. Its forward call must return
            a pair whose second element holds the logits.
        test_loader: DataLoader yielding dicts with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: The device (CPU/GPU) to run the evaluation on.
        num_samples: Number of leading batches whose predictions and true
            labels are printed for inspection.

    Returns:
        A string containing the classification report, with one row per class
        in ``id2label`` (classes absent from the data are reported with zero
        support).
    """
    model.eval()  # Set the model to evaluation mode

    all_preds = []
    all_labels = []
    sample_outputs = []

    with torch.no_grad():  # No need to track the gradients
        for i, batch in enumerate(tqdm(test_loader, desc="Evaluating", leave=False)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass without labels: only the logits are needed
            outputs = model(input_ids, attention_mask)
            logits = outputs[1]

            # Convert logits to predicted class ids on the CPU
            preds = logits.argmax(dim=1).cpu().numpy().flatten()
            # Labels are only compared on the CPU, so they never need the device
            labels = batch['labels'].cpu().numpy().flatten()

            all_preds.extend(preds)
            all_labels.extend(labels)

            # Save the first few batches for inspection
            if i < num_samples:
                sample_outputs.append((preds, labels))

    # Print sample predictions
    for j, (pred, label) in enumerate(sample_outputs):
        print(f"Sample {j+1}:")
        print(f"Predictions: {pred}")
        print(f"True Labels: {label}\n")

    # classification_report expects (y_true, y_pred) in that order; swapping
    # them swaps precision and recall. Passing `labels` pins the rows to the
    # full class list so target_names always lines up, even when some classes
    # never occur in the test data.
    return classification_report(all_labels, all_preds,
                                 labels=list(id2label.keys()),
                                 target_names=list(id2label.values()),
                                 digits=4,
                                 zero_division=0)

# Evaluate the model
report = evaluate_model(model, test_loader, device)
print(report)