# Transformer

## Imports

In [None]:
!pip install datasets
!pip install scikit-learn

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
import random

## Dataset

In [None]:
def split_dataset(dataset, train_ratio: float = 0.8,
                  test_ratio: float = 0.1,
                  val_ratio: float = 0.1):
    """Shuffle ``dataset["train"]`` and cut it into train/test/validation parts.

    Args:
        dataset: Mapping with a ``"train"`` entry that supports ``len()``,
            ``shuffle()`` and ``select(indices)``.
        train_ratio: Fraction of rows assigned to the training part.
        test_ratio: Fraction of rows assigned to the test part.
        val_ratio: Fraction of rows assigned to the validation part. When the
            three ratios add up to 1 the validation part receives every
            remaining row, so no row is lost to float rounding.

    Returns:
        ``(train_set, test_set, val_set)`` - note the order: test comes
        before validation.

    Raises:
        ValueError: If a ratio is negative or the ratios sum to more than 1.
    """
    ratios = (train_ratio, test_ratio, val_ratio)
    total_ratio = sum(ratios)
    tolerance = 1e-9
    if min(ratios) < 0 or total_ratio > 1 + tolerance:
        raise ValueError("split ratios must be non-negative and sum to at most 1")

    # Shuffle into a local so the caller's dataset mapping is left untouched.
    shuffled = dataset["train"].shuffle()
    length = len(shuffled)

    train_end = min(length, int(length * train_ratio))
    test_end = min(length, int(length * (train_ratio + test_ratio)))
    if total_ratio >= 1 - tolerance:
        val_end = length
    else:
        val_end = min(length, int(length * total_ratio))

    train_set = shuffled.select(range(train_end))
    test_set = shuffled.select(range(train_end, test_end))
    val_set = shuffled.select(range(test_end, val_end))

    return train_set, test_set, val_set

def load_twitter_dataset(split = "split"):
    """Load the ``dair-ai/emotion`` corpus as train/test/validation sets.

    Args:
        split: Dataset configuration to load, ``"split"`` or ``"unsplit"``.
            The ``"unsplit"`` configuration only ships a training set, so it
            is partitioned locally with ``split_dataset``.

    Returns:
        ``(train_set, test_set, val_set, emotions)`` where ``emotions`` lists
        the emotion names and every row carries ``"text"`` and ``"label"``.

    Raises:
        ValueError: If ``split`` is not one of the two supported names.
    """
    if split not in ("split", "unsplit"):
        raise ValueError("split must be either 'split' or 'unsplit'")

    emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]

    def keep_text_and_label(row):
        # Re-emit the two columns used downstream; mapping also brings the
        # rows into the expected format.
        return {"text": row["text"], "label": row["label"]}

    ds = load_dataset("dair-ai/emotion", split)
    if split == "unsplit":
        train_set, test_set, val_set = split_dataset(ds)
    else:
        train_set = ds["train"]
        test_set = ds["test"]
        val_set = ds["validation"]

    train_set, test_set, val_set = (
        part.map(keep_text_and_label)
        for part in (train_set, test_set, val_set)
    )

    return train_set, test_set, val_set, emotions

def load_goemotions_dataset(split = "simplified"):
    """Load GoEmotions and reduce every row to a single integer label.

    Args:
        split: Dataset configuration to load, ``"simplified"`` or ``"raw"``.
            ``"simplified"`` rows carry a ``"labels"`` list, from which one
            entry is drawn at random. ``"raw"`` rows carry one 0/1 column per
            emotion; the first emotion (in ``emotions`` order) set to 1 is
            used, and the single training set is partitioned with
            ``split_dataset``.

    Returns:
        ``(train_set, test_set, val_set, emotions)``. Rows for which no label
        could be derived are removed from all three sets.

    Raises:
        ValueError: If ``split`` is not one of the two supported names.
    """
    if split not in ["simplified", "raw"]:
        raise ValueError("split must be either 'simplified' or 'raw'")
    emotions = [
        "admiration",
        "amusement",
        "anger",
        "annoyance",
        "approval",
        "caring",
        "confusion",
        "curiosity",
        "desire",
        "disappointment",
        "disapproval",
        "disgust",
        "embarrassment",
        "excitement",
        "fear",
        "gratitude",
        "grief",
        "joy",
        "love",
        "nervousness",
        "optimism",
        "pride",
        "realization",
        "relief",
        "remorse",
        "sadness",
        "surprise",
        "neutral",
    ]

    def pick_random_label(row):
        # An empty label list yields None (dropped by the filter below)
        # instead of crashing inside the random draw.
        labels = row["labels"]
        label = random.choice(labels) if labels else None
        return {"text": row["text"], "label": label}

    def first_active_label(row):
        # The raw configuration stores one 0/1 column per emotion rather than
        # a list of labels; take the first column that is set.
        label = next(
            (index for index, emotion in enumerate(emotions) if row[emotion] == 1),
            None,
        )
        return {"text": row["text"], "label": label}

    ds = load_dataset("google-research-datasets/go_emotions", split)

    if split == "simplified":
        parts = (ds["train"], ds["test"], ds["validation"])
        to_single_label = pick_random_label
    else:
        parts = split_dataset(ds)
        to_single_label = first_active_label

    # Map first, then remove rows that ended up without a label.
    train_set, test_set, val_set = (
        part.map(to_single_label).filter(lambda row: row["label"] is not None)
        for part in parts
    )

    return train_set, test_set, val_set, emotions


# Active corpus: GoEmotions, "simplified" configuration. To use the Twitter
# emotion corpus instead, swap in the commented line below.
# train_set, test_set, val_set, __emotions = load_twitter_dataset("split")
(google_train_set,
 google_test_set,
 google_val_set,
 google__emotions) = load_goemotions_dataset("simplified")
train_set = google_train_set
test_set = google_test_set
val_set = google_val_set
__emotions = google__emotions

# Emotion name -> class index, following the order of the emotion list.
EMOTION_TO_INDEX = dict(zip(__emotions, range(len(__emotions))))

# Class index -> emotion name (inverse of the mapping above).
INDEX_TO_EMOTION = {
    position: name for name, position in EMOTION_TO_INDEX.items()
}

## Vocabulary and tokenization functions

In [29]:
import spacy
from collections import Counter
from multiprocessing import Pool

# Shared spaCy English pipeline; the tokenization helpers in this cell run
# their input through it.
nlp = spacy.load("en_core_web_sm")

def tokenize(text: str) -> list[str]:
    """Run ``text`` through the spaCy pipeline and return its normalized lemmas.

    Each token contributes its lemma, lower-cased and stripped of surrounding
    whitespace; tokens are kept in document order.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_.lower().strip())
    return lemmas

def tokenize_doc(doc):
    """Return the normalized lemmas of an already-processed document.

    ``doc`` is iterated token by token; each token's ``lemma_`` is lower-cased
    and stripped of surrounding whitespace.
    """
    lemmas = []
    for token in doc:
        lemma = token.lemma_
        lemmas.append(lemma.lower().strip())
    return lemmas

def build_vocab(texts, max_vocab_size):
    """Build a token -> id vocabulary from ``texts``.

    The texts are streamed through ``nlp.pipe`` and lemmatized by a pool of
    worker processes. Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``;
    the ``max_vocab_size - 2`` most frequent tokens receive ids from 2
    upwards in descending frequency order.

    Returns:
        ``(vocab, preprocessed_texts)`` where ``preprocessed_texts`` holds the
        token list of every input text.
    """
    token_counts = Counter()
    with Pool() as workers:
        docs = nlp.pipe(texts, batch_size=1024)
        preprocessed_texts = workers.map(tokenize_doc, docs)
        for doc_tokens in preprocessed_texts:
            token_counts.update(doc_tokens)

    vocab = {}
    ranked = token_counts.most_common(max_vocab_size - 2)
    for token_id, (word, _count) in enumerate(ranked, start=2):  # 0 and 1 are reserved
        vocab[word] = token_id
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    return vocab, preprocessed_texts

def encode(text, vocab, max_len):
    """Convert ``text`` into exactly ``max_len`` vocabulary ids.

    Tokens missing from ``vocab`` map to the ``<UNK>`` id. Sequences longer
    than ``max_len`` are truncated; shorter ones are right-padded with the
    ``<PAD>`` id.
    """
    token_ids = []
    for token in tokenize(text):
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(token_ids)
    if missing < 0:
        return token_ids[:max_len]
    return token_ids + [vocab["<PAD>"]] * missing

## Build Vocabulary

In [30]:
# Vocabulary settings
VOCAB_SIZE = 15000  # upper bound on vocabulary entries, including <PAD> and <UNK>
MAX_LEN = 128       # every encoded text is truncated or padded to this many ids

# The vocabulary is built from the training texts only.
all_texts = train_set["text"]
vocab, pre_processed_train_texts = build_vocab(all_texts, VOCAB_SIZE)

## PyTorch Dataset Class

In [None]:
def tensor_encode(text: str, vocab: dict[str, int], max_len: int) -> torch.Tensor:
    """Encode ``text`` with ``encode`` and wrap the ids in a ``torch.long`` tensor."""
    token_ids = encode(text, vocab, max_len)
    return torch.tensor(token_ids, dtype=torch.long)

from tqdm.notebook import tqdm

class EmotionDataset(Dataset):
    """Map-style dataset of pre-encoded texts and their integer labels.

    All texts are tokenized and encoded once in ``__init__``, so indexing
    only returns tensors that already exist.

    Args:
        texts: Iterable of raw strings; streamed through ``nlp.pipe``.
        labels: Sequence of integer class labels, one per text.
        vocab: Token -> id mapping containing ``<PAD>`` and ``<UNK>``.
        max_len: Length every encoded text is truncated or padded to.
    """

    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

        # nlp.pipe already yields fully processed documents, so the lemmas are
        # read straight from them. Sending each document through a text-level
        # encoder would run the spaCy pipeline a second time.
        self.processed_texts = [
            self._encode_tokens(tokenize_doc(doc), vocab, max_len)
            for doc in tqdm(nlp.pipe(texts, batch_size=1024))
        ]
        self.processed_labels = torch.tensor(labels, dtype=torch.long)

    @staticmethod
    def _encode_tokens(tokens, vocab, max_len):
        """Turn a token list into a ``torch.long`` tensor of exactly ``max_len`` ids."""
        unk_id = vocab["<UNK>"]
        ids = [vocab.get(token, unk_id) for token in tokens][:max_len]
        ids += [vocab["<PAD>"]] * (max_len - len(ids))
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        return {
            "input_ids": self.processed_texts[idx],
            "labels": self.processed_labels[idx],
        }

# Prepare datasets: encode each split once, all with the training vocabulary
# and the same sequence length.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(part["text"], part["label"], vocab, MAX_LEN)
    for part in (train_set, val_set, test_set)
)

## PyTorch Data Preparation

In [43]:

# DataLoaders: only the training split is reshuffled each epoch.
BATCH_SIZE = 128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Size the classifier head from the full emotion list rather than from the
# labels that happen to occur in the training split. Counting observed labels
# undercounts whenever an emotion is missing from that split, and then any
# label index >= NUM_CLASSES is out of range for the loss.
NUM_CLASSES = len(__emotions)

## Transformer model

In [44]:

class TransformerModel(nn.Module):
    """Transformer-encoder text classifier with mean pooling.

    Token ids are embedded, offset by a positional table, passed through a
    stack of ``nn.TransformerEncoderLayer`` blocks (``batch_first=True``),
    averaged over the sequence dimension and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Size of the output logit vector.
        max_len: Longest sequence the positional table covers.
        dropout: Dropout rate used inside the encoder layers.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes, max_len, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # The sinusoidal table is wrapped in nn.Parameter, so it is registered
        # with the module's parameters and starts from the sinusoidal values.
        self.positional_encoding = nn.Parameter(self._get_positional_encoding(max_len, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, num_classes)

    def _get_positional_encoding(self, max_len, embed_dim):
        """Return a ``(1, max_len, embed_dim)`` sinusoidal position table.

        Even columns hold sines and odd columns hold cosines. For an odd
        ``embed_dim`` there is one more sine column than cosine columns.
        """
        import math
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        angles = position * div_term
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # Only embed_dim // 2 odd columns exist; trimming the angles keeps an
        # odd embed_dim from raising a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        return pe.unsqueeze(0)

    def forward(self, x, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch first; its second dimension
                must not exceed ``max_len``.
            attention_mask: Optional tensor aligned with ``x`` whose zero
                entries mark padding. It is converted to the boolean
                key-padding mask the encoder expects.

        Returns:
            Logits with one row per sequence and ``num_classes`` columns.
            The pooling averages over every position, padded ones included.
        """
        x = self.embedding(x) + self.positional_encoding[:, : x.size(1), :]
        if attention_mask is not None:
            # Convert attention_mask to the expected format for `nn.TransformerEncoder`
            attention_mask = attention_mask == 0  # Mask padded tokens (True for padding)
        x = self.transformer_encoder(x, src_key_padding_mask=attention_mask)
        x = x.mean(dim=1)  # Global pooling
        logits = self.fc(x)
        return logits

## Hyperparameters

In [None]:
# Architecture
EMBED_DIM = 64   # embedding / model width
NUM_HEADS = 4    # attention heads per encoder layer
NUM_LAYERS = 2   # stacked encoder layers
DROPOUT = 0.1

# Training
EPOCHS = 20

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("Using device:", DEVICE)

## Model, Loss and Optimizer

In [50]:
# Model, loss, and optimizer
EPOCHS = 10  # number of training epochs used by the training cell

# Build the classifier from the hyperparameters, then move it to the device.
model = TransformerModel(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES,
    max_len=MAX_LEN,
    dropout=DROPOUT,
)
model = model.to(DEVICE)

# Multi-class cross-entropy on the logits; AdamW over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()

## Training function

In [47]:
from tqdm.notebook import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, verbose=True, scheduler=None):
    """Train ``model`` and validate it after every epoch.

    Args:
        model: Module called as ``model(input_ids)`` and returning class logits.
        train_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"labels"`` tensors.
        val_loader: Iterable of batches in the same format, used for validation.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``train_loader``.
        verbose: When true, show a progress bar and phase messages. The
            per-epoch summary line is printed either way.
        scheduler: Optional learning-rate scheduler stepped once per epoch.
            ``ReduceLROnPlateau`` is stepped with the validation loss.

    Returns:
        One dict per epoch with ``"epoch"``, ``"train_loss"``, ``"val_loss"``
        and ``"val_acc"``. Losses are averaged over batches so that loaders
        with different batch counts give comparable numbers.
    """
    # Follow the model's own device so batches always land where the weights are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    if verbose:
        print("Training the model")

    history = []
    for epoch in range(epochs):
        model.train()
        train_loss_sum = 0.0
        train_batches = 0
        for batch in tqdm(train_loader) if verbose else train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_batches += 1

        # Validation
        model.eval()
        val_loss_sum = 0.0
        val_batches = 0
        correct = 0
        seen = 0
        if verbose:
            print("Validating the model")
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids)
                val_loss_sum += criterion(outputs, labels).item()
                val_batches += 1

                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                seen += labels.numel()

        # Guard the divisions so an empty loader reports zeros instead of crashing.
        train_loss = train_loss_sum / train_batches if train_batches else 0.0
        val_loss = val_loss_sum / val_batches if val_batches else 0.0
        val_acc = correct / seen if seen else 0.0

        if scheduler is not None:
            # ReduceLROnPlateau needs the monitored metric; other schedulers take none.
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")
        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_acc": val_acc,
        })

    return history


In [None]:

# Train the model: EPOCHS passes over train_loader, validating on val_loader
# after each one.
train_model(model, train_loader, val_loader, criterion, optimizer, EPOCHS)

# save the model (uncomment to write the trained weights to disk)
# torch.save(model.state_dict(), "model_split.pth")

In [52]:
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

def get_model_performance(model, data_loader, index_to_emotion, device):
    """Run ``model`` over ``data_loader`` and compute per-emotion metrics.

    Args:
        model: Module called as ``model(input_ids)`` and returning class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"labels"`` tensors.
        index_to_emotion: Mapping from class index to emotion name. Its
            entries, in ascending index order, define which emotions are
            reported and in which order.
        device: Device the input ids are moved to before the forward pass.

    Returns:
        ``(metrics, total_accuracy, true_labels, predicted_labels)``.
        ``metrics`` maps each emotion to a dict with ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1_score"``. ``total_accuracy``
        is a fraction in ``[0, 1]``. The two label lists hold emotion names
        in evaluation order. Any ratio whose denominator is zero is reported
        as 0.
    """
    model.eval()
    emotion_correct = defaultdict(int)
    emotion_total = defaultdict(int)
    false_positives = defaultdict(int)

    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)

            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()

            # Update emotion-specific counts
            for true_label, pred_label in zip(labels, preds):
                true_emotion = index_to_emotion[true_label]
                predicted_emotion = index_to_emotion[pred_label]
                true_labels.append(true_emotion)
                predicted_labels.append(predicted_emotion)
                emotion_total[true_emotion] += 1
                if true_emotion == predicted_emotion:
                    emotion_correct[true_emotion] += 1
                else:
                    false_positives[predicted_emotion] += 1

    # Report emotions from the mapping that was passed in, in class-index
    # order, instead of depending on a module-level emotion list.
    emotions = [index_to_emotion[index] for index in sorted(index_to_emotion)]

    # Calculate accuracy, recall, precision and F1 score for each emotion
    metrics = {}
    for label in emotions:
        tp = emotion_correct[label]
        fp = false_positives[label]
        support = emotion_total[label]
        # Every sample of this emotion that was not predicted correctly is a
        # false negative.
        fn = support - tp

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        metrics[label] = {
            # An emotion with no samples in this loader reports 0 rather than
            # raising ZeroDivisionError.
            "accuracy": tp / support if support > 0 else 0,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score
        }

    # Calculate total accuracy (0 for an empty loader)
    total_seen = len(true_labels)
    total_accuracy = sum(emotion_correct.values()) / total_seen if total_seen > 0 else 0

    return metrics, total_accuracy, true_labels, predicted_labels

def evaluate_model(model, data_loader, index_to_emotion, device, title="Model Performance"):
    """Evaluate ``model`` on ``data_loader`` and report the results.

    Produces three figures and a printed summary:

    * a per-emotion accuracy bar chart, saved as ``accuracy_per_emotion.png``;
    * a confusion matrix normalized over the true labels, saved as
      ``confusion_matrix.png``;
    * a table of accuracy / precision / recall / F1 per emotion, titled
      ``title``.

    Args:
        model: Model to evaluate.
        data_loader: Batches to evaluate on.
        index_to_emotion: Mapping from class index to emotion name.
        device: Device used for the forward passes.
        title: Title of the metrics table figure.

    Returns:
        The overall accuracy as a fraction in ``[0, 1]``.
    """
    metrics, total_accuracy, true_labels, predicted_labels = get_model_performance(
        model, data_loader, index_to_emotion, device
    )
    # The metrics dict is keyed by emotion name in reporting order; reuse that
    # order for the confusion-matrix axes.
    emotion_names = list(metrics)

    # Plot accuracy per emotion
    plt.figure(figsize=(10, 6))
    plt.bar(emotion_names, [metric["accuracy"] * 100 for metric in metrics.values()])
    plt.title("Accuracy per Emotion", fontsize=16)
    plt.xlabel("Emotion", fontsize=12)
    plt.ylabel("Accuracy (%)", fontsize=12)
    plt.xticks(rotation=90)
    plt.ylim(0, 100)
    plt.tight_layout()
    plt.savefig("accuracy_per_emotion.png", dpi=300, transparent=True, bbox_inches="tight")
    plt.show()

    disp = ConfusionMatrixDisplay.from_predictions(true_labels, predicted_labels, labels=emotion_names, normalize="true", cmap="Blues")
    fig = disp.figure_
    fig.savefig("confusion_matrix.png", dpi=300, transparent=True, bbox_inches="tight")
    # uncomment for a larger plot
    # fig = disp.figure_
    # fig.set_figwidth(40)
    # fig.set_figheight(40)
    # # disp.plot(cmap="Blues")
    # disp.im_.set_clim(0, 1)
    # plt.title(title)
    # plt.xticks(rotation=60)
    # plt.show()

    # Create a table of metrics
    metrics_table = [
        [
            emotion,
            f"{metric['accuracy']:.2f}",
            f"{metric['precision']:.2f}",
            f"{metric['recall']:.2f}",
            f"{metric['f1_score']:.2f}",
        ]
        for emotion, metric in metrics.items()
    ]
    metrics_table.append(["Total", f"{total_accuracy:.2f}", "", "", ""])
    plt.figure(figsize=(15, 10))
    plt.axis('off')
    plt.table(cellText=metrics_table, colLabels=["Emotion", "Accuracy", "Precision", "Recall", "F1 Score"], cellLoc="center", loc="center")
    plt.title(title, fontsize=15)
    plt.show()

    # Print the results. total_accuracy is a fraction, so scale it before
    # attaching the percent sign.
    print(f"Overall Accuracy: {total_accuracy * 100:.2f}%")
    for emotion, metric in metrics.items():
        print(f"{emotion.capitalize()}:")
        print(f"  - Accuracy: {metric['accuracy']:.2f}")
        print(f"  - Precision: {metric['precision']:.2f}")
        print(f"  - Recall: {metric['recall']:.2f}")
        print(f"  - F1 Score: {metric['f1_score']:.2f}")
        print()

    return total_accuracy

In [None]:
# Report the trained model's performance on the test loader.
evaluate_model(
    model=model,
    data_loader=test_loader,
    index_to_emotion=INDEX_TO_EMOTION,
    device=DEVICE
)

## Grid Search for Hyperparameters

In [None]:
import time

def grid_search():
    """Exhaustively search a small hyperparameter grid.

    For every combination of dropout rate, embedding width, weight decay,
    head count and layer count, a fresh ``TransformerModel`` is trained for
    one epoch and scored on the validation split. The test split is not used
    here, so it stays untouched for the final evaluation.

    Returns:
        ``(best_params, best_accuracy)`` where ``best_params`` is
        ``(embed_dim, num_head, num_layer, dropout_rate, wd)`` (``None`` if no
        run scored above 0) and ``best_accuracy`` is a fraction in ``[0, 1]``.
    """
    embed_dims = [32, 64, 128]
    num_heads = [2, 4, 8]
    num_layers = [1, 2, 3]
    dropout_rates = [0.1, 0.2, 0.3]
    w_decay = [0, 0.01, 0.1]

    params = [(dropout_rate, embed_dim, wd, num_head, num_layer)
              for dropout_rate in dropout_rates
              for embed_dim in embed_dims
              for wd in w_decay
              for num_head in num_heads
              for num_layer in num_layers
    ]

    # The loaders do not depend on the hyperparameters, so build them once.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2000, shuffle=False)

    print(f"Total number of combinations: {len(params)}")
    best_accuracy = 0
    best_params = None
    total_time = 0.0
    for num_runs, (dropout_rate, embed_dim, wd, num_head, num_layer) in enumerate(params, start=1):
        start = time.time()
        model = TransformerModel(
            vocab_size=len(vocab),
            embed_dim=embed_dim,
            num_heads=num_head,
            num_layers=num_layer,
            num_classes=NUM_CLASSES,
            max_len=MAX_LEN,
            dropout=dropout_rate,
        ).to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=wd)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

        train_model(model, train_loader, val_loader, criterion, optimizer, 1, verbose=False)
        # Rank configurations on validation data; selecting on the test split
        # would leak it into model selection.
        _, accuracy, _, _ = get_model_performance(model, val_loader, INDEX_TO_EMOTION, DEVICE)
        # accuracy is a fraction, so scale it before attaching the percent sign.
        print(f"Accuracy: {accuracy * 100:.2f}% for params: {embed_dim=} {num_head=} {num_layer=} {dropout_rate=} {wd=}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = (embed_dim, num_head, num_layer, dropout_rate, wd)

        time_taken = time.time() - start
        total_time += time_taken
        avg_time_taken = total_time / num_runs
        print(f"Time taken: {time_taken:.2f}s, estimated time left: {avg_time_taken * (len(params) - num_runs) / 60:.2f}min")

    print(f"Best accuracy: {best_accuracy * 100:.2f}%")
    print(f"Best parameters: {best_params}")
    return best_params, best_accuracy

# Runs the full sweep: 3**5 = 243 configurations, one training epoch each.
grid_search()
