In [1]:
# !pip install datasets transformers torch scikit-learn --quiet

In [2]:
# !pip install --upgrade accelerate transformers torch --quiet

In [1]:
!pip install -U datasets transformers accelerate torch torchvision torchaudio scikit-learn tqdm pandas numpy matplotlib --quiet

In [2]:
import torch, transformers, datasets, sklearn

print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)
print("scikit-learn:", sklearn.__version__)
print("CUDA available:", torch.cuda.is_available())
print("MPS available (Apple GPU):", torch.backends.mps.is_available())

PyTorch: 2.9.0
Transformers: 4.57.1
Datasets: 4.2.0
scikit-learn: 1.7.2
CUDA available: False
MPS available (Apple GPU): True


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

In [4]:
# Load the dataset directly from Hugging Face
goemotions = load_dataset("go_emotions")

# Inspect available splits
print(goemotions)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})


In [5]:
from datasets import concatenate_datasets

goemotions_combined = concatenate_datasets([goemotions["train"], goemotions["validation"]])
print(f"Combined train+validation samples: {len(goemotions_combined)}")
print(f"Test samples: {len(goemotions['test'])}")

Combined train+validation samples: 48836
Test samples: 5427


In [6]:
df = goemotions_combined.to_pandas()

# Extract label names from the original GoEmotions schema
labels = goemotions["train"].features["labels"].feature.names
print("Total labels:", len(labels))
print("Example labels:", labels[:10])

# Display sample
df.head()

Total labels: 28
Example labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment']


Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj


In [7]:
# Simplify each sample to its first label for single-label classification
df["main_label"] = df["labels"].apply(lambda x: x[0] if len(x) > 0 else None)
df = df.dropna(subset=["main_label"])
df = df.rename(columns={"text": "content"})

# View distribution
df["main_label"].value_counts().head()

main_label
27    14415
0      4618
4      2951
1      2541
15     2393
Name: count, dtype: int64

In [8]:
# train_df, val_df = train_test_split(
#     df,
#     test_size=0.2,
#     random_state=42,
#     stratify=df["main_label"]
# )

# print(f"Train: {len(train_df)}, Validation: {len(val_df)}")

In [9]:
train_df = df.copy()                      
val_df = goemotions["test"].to_pandas()   

print(f" Train samples: {len(train_df)}, Validation/Test samples: {len(val_df)}")

 Train samples: 48836, Validation/Test samples: 5427


In [10]:
MODEL_NAME = "SamLowe/roberta-base-go_emotions"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"Running on device: {device}")

Running on device: cpu


In [12]:
if "text" in val_df.columns and "content" not in val_df.columns:
    val_df = val_df.rename(columns={"text": "content"})

In [13]:
# Convert validation texts to list
val_texts = val_df["content"].fillna("").astype(str).tolist()

batch_size = 32
all_pred_ids = []

model.eval()
with torch.no_grad():
    for i in range(0, len(val_texts), batch_size):
        batch_texts = val_texts[i:i+batch_size]
        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        logits = model(**enc).logits
        preds = torch.argmax(logits, dim=-1)
        all_pred_ids.extend(preds.cpu().tolist())

# Map IDs to label names
id2label = model.config.id2label
predicted_labels = [id2label[int(i)] for i in all_pred_ids]

# Add predictions to DataFrame
val_df["predicted_id"] = all_pred_ids
val_df["predicted_label"] = predicted_labels

val_df.head()

Unnamed: 0,content,labels,id,predicted_id,predicted_label
0,I‚Äôm really sorry about your situation :( Altho...,[25],eecwqtt,24,remorse
1,It's wonderful because it's awful. At not with.,[0],ed5f85d,0,admiration
2,"Kings fan here, good luck to you guys! Will be...",[13],een27c3,20,optimism
3,"I didn't know that, thank you for teaching me ...",[15],eelgwd1,15,gratitude
4,They got bored from haunting earth for thousan...,[27],eem5uti,27,neutral


In [16]:
if "main_label" not in val_df.columns:
    # Each test sample's 'labels' field is a list ‚Üí extract first label like we did for training
    val_df["main_label"] = val_df["labels"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    val_df = val_df.dropna(subset=["main_label"])

In [17]:
y_true = val_df["main_label"].astype(int).to_numpy()
y_pred = val_df["predicted_id"].astype(int).to_numpy()

print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="weighted", zero_division=0))
print("Recall   :", recall_score(y_true, y_pred, average="weighted", zero_division=0))
print("F1 Score :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nDetailed Classification Report:\n")

ordered_labels = [id2label[i] for i in sorted(id2label.keys(), key=int)]
print(classification_report(
    y_true,
    y_pred,
    target_names=ordered_labels,
    zero_division=0
))

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Accuracy : nan
Precision: nan
Recall   : nan
F1 Score : nan

Detailed Classification Report:



ValueError: Number of classes, 0, does not match size of target_names, 28. Try specifying the labels parameter

In [15]:
# ‚úÖ Rebuild main_label safely
if "main_label" not in val_df.columns and "labels" in val_df.columns:
    val_df["main_label"] = val_df["labels"].apply(
        lambda x: int(x[0]) if isinstance(x, list) and len(x) > 0 else None
    )

# Check columns exist before proceeding
cols = val_df.columns.tolist()
print("Available columns:", cols)

# Keep only rows with valid ground truth + prediction
val_df = val_df.dropna(subset=["main_label", "predicted_id"], how="any")
print(f"‚úÖ Remaining samples after cleanup: {len(val_df)}")

if len(val_df) == 0:
    print("‚ö†Ô∏è No samples available for evaluation ‚Äî check earlier prediction step.")
else:
    y_true = val_df["main_label"].astype(int).to_numpy()
    y_pred = val_df["predicted_id"].astype(int).to_numpy()

    print(f"üßÆ Evaluating on {len(y_true)} samples...")

    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="weighted", zero_division=0))
    print("Recall   :", recall_score(y_true, y_pred, average="weighted", zero_division=0))
    print("F1 Score :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

    print("\nDetailed Classification Report:\n")

    id2label = model.config.id2label
    available_classes = sorted(set(y_true) & set(y_pred))
    ordered_labels = [id2label[i] for i in available_classes]

    print(
        classification_report(
            y_true,
            y_pred,
            labels=available_classes,
            target_names=ordered_labels,
            zero_division=0
        )
    )

Available columns: ['content', 'labels', 'id', 'predicted_id', 'predicted_label', 'main_label']
‚úÖ Remaining samples after cleanup: 0
‚ö†Ô∏è No samples available for evaluation ‚Äî check earlier prediction step.


The following cells above had the use of pre-trained model of Roberta on the GoEmotions dataset which had 28 classes. The model when tested using a held-out validation dataset provided a good accuracy metrics overall. 

But, I will try and fine-tune this model now to see if I can improve the accuracy of this model any better or not. 

In [22]:
from datasets import Dataset

# Convert pandas ‚Üí Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df[["content", "main_label"]])
val_dataset   = Dataset.from_pandas(val_df[["content", "main_label"]])

# Tokenisation function
def tokenize_function(batch):
    return tokenizer(
        batch["content"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Apply tokenisation
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)

# Rename columns for Trainer compatibility
train_dataset = train_dataset.rename_column("main_label", "labels")
val_dataset   = val_dataset.rename_column("main_label", "labels")

# Keep only required columns
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/34728 [00:00<?, ? examples/s]

Map:   0%|          | 0/8682 [00:00<?, ? examples/s]

In [23]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids
    preds  = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [32]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW 
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(f"Using device: {device}")

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Optimiser
optim = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

Using device: mps


In [34]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(f"Using device: {device}")

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16, shuffle=False)

optim = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()   # üëà single-label loss

best_f1 = 0.73
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optim.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items()}
        labels = inputs.pop("labels")             # remove labels from inputs
        outputs = model(**inputs)
        logits = outputs.logits                   # [batch, 28]
        loss = criterion(logits, labels)          # ‚úÖ works with single int labels
        loss.backward()
        optim.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} training loss: {avg_train_loss:.4f}")

    # ---- Validation ----
    model.eval()
    preds_all, labels_all = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            labels = labels.cpu().numpy()
            preds_all.extend(preds)
            labels_all.extend(labels)

    acc = accuracy_score(labels_all, preds_all)
    f1  = f1_score(labels_all, preds_all, average="weighted")
    print(f"Validation Accuracy: {acc:.4f} | F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        model.save_pretrained("./roberta_goemotions_best_manual")
        tokenizer.save_pretrained("./roberta_goemotions_best_manual")
        print(f"üíæ Model improved ‚Üí saved (F1={best_f1:.4f})")
    else:
        print("‚ö†Ô∏è No improvement this epoch.")

Using device: mps


Epoch 1/3:   0%|          | 0/4341 [00:00<?, ?it/s]


Epoch 1 training loss: 1.0543
Validation Accuracy: 0.6730 | F1: 0.6754
‚ö†Ô∏è No improvement this epoch.


Epoch 2/3:   0%|          | 0/4341 [00:00<?, ?it/s]


Epoch 2 training loss: 0.8723
Validation Accuracy: 0.6835 | F1: 0.6796
‚ö†Ô∏è No improvement this epoch.


Epoch 3/3:   0%|          | 0/4341 [00:00<?, ?it/s]


Epoch 3 training loss: 0.7002
Validation Accuracy: 0.6697 | F1: 0.6664
‚ö†Ô∏è No improvement this epoch.
