In [1]:
# ==== Libraries ====
import os
import math
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
)

# ==== Check device ====
# Use the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

Device: cpu


In [2]:
# ==== Load 7-class (6 + neutral) grouped dataset ====
# Each split is read from its own CSV; edit the two paths below if the
# files live somewhere else.
train_path = "../GoemotionsDataset/goemotions_train_grouped.csv"
train_df = pd.read_csv(train_path)

val_path = "../GoemotionsDataset/goemotions_val_grouped.csv"
val_df = pd.read_csv(val_path)

# Report (rows, columns) for each split, then preview the training frame.
for split_name, split_df in (("Training", train_df), ("Validation", val_df)):
    print(f"{split_name} shape:", split_df.shape)
train_df.head()

Training shape: (48836, 2)
Validation shape: (5427, 2)


Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,neutral
1,"Now if he does off himself, everyone will thin...",neutral
2,WHY THE FUCK IS BAYLESS ISOING,anger
3,To make her feel threatened,fear
4,Dirty Southern Wankers,anger


In [3]:
# ==== Check label distribution & unique labels ====
# Per-class row counts for each split, ordered by label value.
train_label_col = train_df["labels"]
print("Train label counts:")
print(train_label_col.value_counts().sort_index())

val_label_col = val_df["labels"]
print("\nValidation label counts:")
print(val_label_col.value_counts().sort_index())

# Distinct label values present in each split.
print("\nUnique labels in train:", sorted(train_label_col.unique()))
print("Unique labels in val:  ", sorted(val_label_col.unique()))

Train label counts:
labels
anger        5674
disgust       627
fear          670
joy         18437
neutral     15985
sadness      2774
surprise     4669
Name: count, dtype: int64

Validation label counts:
labels
anger        663
disgust       83
fear          85
joy         1980
neutral     1787
sadness      302
surprise     527
Name: count, dtype: int64

Unique labels in train: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
Unique labels in val:   ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']


In [4]:
# ==== Tokenizer ====
# Checkpoint identifier used here for the tokenizer and reused by later cells
# when loading the config and the classification model. The checkpoint was
# trained on the 28-label GoEmotions task.
ROBERTA_MODEL_NAME = "SamLowe/roberta-base-go_emotions"

# Load the tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL_NAME)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [5]:
# ==== Encode labels into numeric IDs ====
# The label <-> id mapping is built from the training split only (sorted, so
# the ids are stable across runs). If the CSV already holds numeric labels
# 0-6 they are simply remapped onto themselves.
# NOTE: the "labels" column of train_df / val_df is replaced by the encoded
# ids, so re-running this cell without reloading the CSVs rebuilds the mapping
# from integers and loses the emotion names.
# Missing labels are excluded first: NaN mixed with strings cannot be sorted.
label_names = sorted(train_df["labels"].dropna().unique())
label_to_id = {label: idx for idx, label in enumerate(label_names)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label mapping:", label_to_id)

# Apply mapping. Any label absent from the mapping (missing value, or a class
# that only occurs in the validation split) becomes NaN here.
train_mapped = train_df["labels"].map(label_to_id)
val_mapped   = val_df["labels"].map(label_to_id)

# Make the clean-up visible instead of silently shrinking the splits.
n_unmapped_train = int(train_mapped.isna().sum())
n_unmapped_val   = int(val_mapped.isna().sum())
if n_unmapped_train or n_unmapped_val:
    print(
        "WARNING: dropping rows with unmapped labels "
        f"(train={n_unmapped_train}, val={n_unmapped_val})"
    )

# Kept for later cells that re-validate the label range.
valid_labels = set(range(len(label_to_id)))

# Build new frames (no in-place mutation), drop the unmapped rows and cast the
# ids back to integers: a NaN would otherwise have turned the column to float.
train_df = (
    train_df.assign(labels=train_mapped)
    .loc[train_mapped.notna()]
    .astype({"labels": "int64"})
)
val_df = (
    val_df.assign(labels=val_mapped)
    .loc[val_mapped.notna()]
    .astype({"labels": "int64"})
)

print("\nAfter mapping & clean-up:")
print("Train labels:", sorted(train_df["labels"].unique()))
print("Val labels:  ", sorted(val_df["labels"].unique()))
print("Number of classes:", len(label_to_id))


# ==== PyTorch Dataset class ====
class GoEmotionsEkmanDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        df: Frame with a ``text`` column and a ``labels`` column.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` and returning a mapping of tensors.
        max_len: Length every example is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Texts are coerced to str; both columns are stored as plain lists.
        self.texts = df["text"].astype(str).tolist()
        self.labels = df["labels"].tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer's tensors for example ``idx`` plus its label."""
        text, label = self.texts[idx], self.labels[idx]

        tokenised = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch axis of size 1; remove
        # it so each field is a single example.
        sample = {}
        for field, tensor in tokenised.items():
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(label, dtype=torch.long)
        return sample


# ==== Create dataset objects ====
# Wrap both splits with the shared tokenizer, using the class's default
# max_len.
train_dataset, val_dataset = (
    GoEmotionsEkmanDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

print("Datasets created successfully!")
print("Train samples:", len(train_dataset))
print("Val samples:  ", len(val_dataset))

Label mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}

After mapping & clean-up:
Train labels: [0, 1, 2, 3, 4, 5, 6]
Val labels:   [0, 1, 2, 3, 4, 5, 6]
Number of classes: 7
Datasets created successfully!
Train samples: 48836
Val samples:   5427


In [6]:
# ==== Metrics ====
def compute_metrics(pred):
    """Compute accuracy and support-weighted F1 / precision / recall.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example, or a tuple whose first element holds those scores) and
            ``label_ids`` (the integer class id of each example).

    Returns:
        dict: ``{"accuracy", "f1", "precision", "recall"}``. The last three
        use ``average="weighted"``.
    """
    labels = pred.label_ids

    scores = pred.predictions
    # Models that return extra outputs hand back a tuple; the class scores
    # come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=1)

    acc = accuracy_score(labels, preds)
    # zero_division=0 on all three: a class that is never predicted scores 0
    # without raising a warning on every evaluation.
    f1   = f1_score(labels, preds, average="weighted", zero_division=0)
    prec = precision_score(labels, preds, average="weighted", zero_division=0)
    rec  = recall_score(labels, preds, average="weighted", zero_division=0)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": prec,
        "recall": rec,
    }

In [8]:
# ==== Model training setup for 7-class classification (RoBERTa) ====
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
import math
import torch
import os

# ---- Hyperparameters (you can adjust these or overwrite from the grid-search cell later) ----
# NOTE: running this cell resets best_lr / best_wd to the values below, so any
# values chosen by the grid-search cell must be re-applied after it.
best_lr  = 2e-5          # starting guess for RoBERTa
best_wd  = 0.01
epochs   = 10
per_device_bs = 16
SEED     = 42            # seeds the initialisation of the new classifier head

# Checkpoints, logs and the final model are written under this directory.
output_dir = "../Roberta/roberta_fine_tuned_6class_lr_2e-5_wd_0.01"
os.makedirs(output_dir, exist_ok=True)

# ==== Config ====
# Start from the checkpoint's config and re-point it at this notebook's label
# set so the saved model carries the label names.
config = AutoConfig.from_pretrained(ROBERTA_MODEL_NAME)
config.num_labels = len(label_to_id)
config.problem_type = "single_label_classification"
config.id2label = id_to_label
config.label2id = label_to_id

# ==== Model ====
# The classification head is re-created with random weights (see below), and
# that happens before the Trainer exists to seed anything. Seed torch here so
# the head starts from the same weights on every run.
torch.manual_seed(SEED)

# IMPORTANT: ignore_mismatched_sizes=True so the old 28-label head is discarded
model_roberta = AutoModelForSequenceClassification.from_pretrained(
    ROBERTA_MODEL_NAME,
    config=config,
    ignore_mismatched_sizes=True,   # <-- fixes the size mismatch error
).to(device)

print("RoBERTa model loaded with", config.num_labels, "labels.")


# ==== TrainingArguments builder with version compatibility ====
def build_args():
    """Build the ``TrainingArguments`` for the main fine-tuning run.

    The evaluation-schedule keyword differs between Transformers releases
    (older: ``evaluation_strategy``; newer: ``eval_strategy``). The accepted
    keyword names are read from the constructor signature instead of being
    probed by catching ``TypeError``, so:

    * every option (``report_to``, ``load_best_model_at_end``, ...) is applied
      on every release that supports it, not only on the first one tried;
    * an unrelated ``TypeError`` raised by the constructor is not swallowed.

    Reads the notebook-level ``output_dir``, ``best_lr``, ``best_wd``,
    ``per_device_bs``, ``epochs`` and ``train_dataset``.

    Returns:
        TrainingArguments: per-epoch evaluation and checkpointing, best model
        (by ``f1``) reloaded at the end of training, one checkpoint kept.
    """
    import inspect  # only needed for the signature lookup below

    supported = set(inspect.signature(TrainingArguments.__init__).parameters)

    # Prefer the current keyword; fall back to the older spelling.
    eval_key = "eval_strategy" if "eval_strategy" in supported else "evaluation_strategy"

    wanted = {
        "output_dir": output_dir,
        "learning_rate": best_lr,
        "weight_decay": best_wd,
        "per_device_train_batch_size": per_device_bs,
        "per_device_eval_batch_size": per_device_bs,
        "num_train_epochs": epochs,
        eval_key: "epoch",
        "save_strategy": "epoch",
        # Both are required by EarlyStoppingCallback.
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
        "save_total_limit": 1,
        "logging_dir": f"{output_dir}/logs",
        "logging_strategy": "steps",
        "logging_steps": 100,
        "report_to": "none",
    }

    if "save_strategy" not in supported:
        # Legacy releases have no epoch-based checkpointing: evaluate and save
        # every `steps_per_epoch` optimizer steps instead.
        steps_per_epoch = math.ceil(len(train_dataset) / per_device_bs)
        wanted[eval_key] = "steps"
        wanted["eval_steps"] = steps_per_epoch
        wanted["save_steps"] = steps_per_epoch

    # Say which options this install cannot take rather than dropping them
    # silently.
    skipped = sorted(name for name in wanted if name not in supported)
    if skipped:
        print("TrainingArguments: skipping unsupported options:", skipped)

    return TrainingArguments(
        **{name: value for name, value in wanted.items() if name in supported}
    )

import inspect  # used once, to pick the Trainer keyword below

args_roberta = build_args()

# ==== Early stopping (patience = 3 as requested) ====
# Training stops after 3 consecutive evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# ==== Trainer ====
# Newer Transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; use whichever this install accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer_roberta = Trainer(
    model=model_roberta,
    args=args_roberta,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    **{tokenizer_kwarg: tokenizer},
)

print("Trainer initialised successfully!")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_roberta = Trainer(


RoBERTa model loaded with 7 labels.
Trainer initialised successfully!


In [9]:
# ==== Train the model ====
print("Starting RoBERTa fine-tuning for 7-class (6 + neutral) dataset...\n")
train_result = trainer_roberta.train()

# ==== Save the best model ====
# Write the trainer's current model, and the tokenizer next to it, into
# <output_dir>/best_model.
save_path = os.path.join(output_dir, "best_model")
trainer_roberta.save_model(save_path)
tokenizer.save_pretrained(save_path)  # (handy for later inference)

print(f"\nModel saved successfully to: {save_path}")

# ==== Final evaluation on validation set ====
print("\nEvaluating best model on validation data...\n")
final_metrics = trainer_roberta.evaluate()
print("Final validation metrics:")
for metric_name, metric_value in final_metrics.items():
    # Floats are shown with six decimals; anything else is printed as-is.
    shown = f"{metric_value:.6f}" if isinstance(metric_value, float) else f"{metric_value}"
    print(f"{metric_name}: {shown}")


Starting RoBERTa fine-tuning for 7-class (6 + neutral) dataset...





Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# ==== Sanity check for label encoding ====
# Show the label ids present in each split next to the expected class count.
n_expected = len(label_to_id)
print("Unique labels in training set:", sorted(train_df["labels"].unique()))
print("Unique labels in validation set:", sorted(val_df["labels"].unique()))
print("Expected number of classes:", n_expected)

# Keep only rows whose label id lies in 0 .. n_expected - 1.
# This cell filters the frames only; it does not rebuild train_dataset /
# val_dataset.
valid_labels = set(range(n_expected))
train_keep = train_df["labels"].isin(valid_labels)
train_df = train_df.loc[train_keep]
val_keep = val_df["labels"].isin(valid_labels)
val_df = val_df.loc[val_keep]

print("\nAfter cleanup:")
print("Train labels:", sorted(train_df["labels"].unique()))
print("Val labels:  ", sorted(val_df["labels"].unique()))


In [14]:
# ==== Optional: Hyperparameter grid search for RoBERTa (short runs, version-safe) ====

from transformers import TrainingArguments, Trainer
import torch, os

import inspect  # used once, to pick the Trainer keyword below

# Learning-rate / weight-decay combinations to try.
param_grid = [
    {"lr": 1e-5, "wd": 0.01},
    {"lr": 2e-5, "wd": 0.01},
    {"lr": 3e-5, "wd": 0.01},
    {"lr": 5e-5, "wd": 0.01},
    {"lr": 2e-5, "wd": 0.10},
]

search_epochs  = 4      # short runs
search_bs      = 16
search_seed    = 42     # identical classifier-head initialisation for every config
search_results = []

# Trainer.evaluate() prefixes every compute_metrics key with "eval_", so the
# weighted F1 is reported as "eval_f1" (a lookup of plain "f1" finds nothing).
search_metric = "eval_f1"

# Newer Transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; use whichever this install accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

print("Starting RoBERTa LR/WD grid search...\n")

for cfg in param_grid:
    lr = cfg["lr"]
    wd = cfg["wd"]
    run_name = f"lr_{lr}_wd_{wd}"

    tmp_output_dir = os.path.join("../Roberta", f"roberta_search_{run_name}")
    os.makedirs(tmp_output_dir, exist_ok=True)

    # ---- Fresh config for each run ----
    config_tmp = AutoConfig.from_pretrained(ROBERTA_MODEL_NAME)
    config_tmp.num_labels   = len(label_to_id)
    config_tmp.problem_type = "single_label_classification"
    config_tmp.id2label     = id_to_label
    config_tmp.label2id     = label_to_id

    # ---- Fresh model (7-label head; ignore old 28-label head) ----
    # Re-seed first so every config starts from the same random head and the
    # runs differ only in lr / wd.
    torch.manual_seed(search_seed)
    model_tmp = AutoModelForSequenceClassification.from_pretrained(
        ROBERTA_MODEL_NAME,
        config=config_tmp,
        ignore_mismatched_sizes=True,
    ).to(device)

    # ---- Minimal TrainingArguments: no evaluation_strategy, no load_best_model_at_end ----
    # save_strategy="no": these are throw-away runs; the default would write a
    # full checkpoint every 500 steps for each of the configs.
    # report_to="none": same logging behaviour as the main training run.
    args_tmp = TrainingArguments(
        output_dir=tmp_output_dir,
        learning_rate=lr,
        weight_decay=wd,
        per_device_train_batch_size=search_bs,
        per_device_eval_batch_size=search_bs,
        num_train_epochs=search_epochs,
        logging_dir=f"{tmp_output_dir}/logs",
        logging_steps=200,
        save_strategy="no",
        report_to="none",
    )

    trainer_tmp = Trainer(
        model=model_tmp,
        args=args_tmp,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    print(f"\n>>> Running config: {run_name}")
    trainer_tmp.train()

    # Evaluate once at the end of training
    eval_metrics = trainer_tmp.evaluate()
    print("Validation F1:", eval_metrics.get(search_metric, None))

    search_results.append({"config": cfg, "metrics": eval_metrics})

    # Clean up GPU memory between runs
    del trainer_tmp, model_tmp
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# ==== Pick best config by validation F1 ====
if not search_results:
    raise ValueError("Grid search produced no results: param_grid is empty.")

# A run that somehow lacks the metric ranks last instead of tying with the rest.
best_run = max(
    search_results,
    key=lambda run: run["metrics"].get(search_metric, float("-inf")),
)
best_lr = best_run["config"]["lr"]
best_wd = best_run["config"]["wd"]

print("\nBest config based on validation F1:")
print("  learning_rate:", best_lr)
print("  weight_decay :", best_wd)
print("  metrics      :", best_run["metrics"])

Starting RoBERTa LR/WD grid search...



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_tmp = Trainer(


KeyboardInterrupt: 