In [None]:
# install dependencies
!pip install torch torchvision torchaudio
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# import libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
import evaluate
import numpy as np
import gc
from collections import Counter
from torch import nn
from transformers import Trainer

In [None]:
# Download GoEmotions ("simplified" config) and keep the names of its original labels;
# label_names[i] is the name belonging to original label index i.
dataset_original = load_dataset("go_emotions", "simplified")
label_names = dataset_original["train"].features["labels"].feature.names

# Sanity check: the split layout followed by the first training record.
print(dataset_original, dataset_original["train"][0], sep="\n")

# Simplified label mapping for GoEmotions.
# Each fine-grained label name is folded into one of eight coarse class ids.
# Groups are listed in class-id order so the resulting dict keeps the same key order.
_coarse_groups = (
    (0, ("anger", "annoyance", "confusion", "disapproval")),        # anger
    (1, ("disgust",)),                                               # disgust
    (2, ("fear", "nervousness", "embarrassment")),                   # fear
    (3, ("joy", "amusement", "approval", "excitement",
         "gratitude", "optimism", "relief", "pride", "realization")),  # joy
    (4, ("sadness", "disappointment", "grief", "remorse")),          # sadness
    (5, ("surprise", "curiosity")),                                  # surprise
    (6, ("neutral", "desire", "caring", "admiration")),              # neutral
    (7, ("love",)),                                                  # love
)

# Flatten the groups into the name -> class id lookup used when remapping labels.
simplified_map = {
    fine_name: class_id
    for class_id, fine_names in _coarse_groups
    for fine_name in fine_names
}

def map_to_simplified(example):
    """Replace an example's label list with its single coarse class id.

    Only the first entry of ``example['labels']`` is consulted: it is used as an
    index into ``label_names`` and the resulting name is translated through
    ``simplified_map``. Any further labels on the example are dropped.

    The example is modified in place and also returned, with ``'labels'`` set
    to a one-element list holding the coarse class id.
    """
    first_label_name = label_names[example['labels'][0]]
    example['labels'] = [simplified_map[first_label_name]]
    return example


# Remap every split, then preview the first ten remapped training labels.
dataset_simplified = dataset_original.map(map_to_simplified)
print(dataset_simplified['train']['labels'][:10])

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

[[6], [6], [0], [2], [0], [5], [3], [6], [6], [6]]


In [None]:
# Count how often each coarse class id occurs in the training split.
label_counts = Counter(label[0] for label in dataset_simplified['train']['labels'])
print("\nLabel distribution after mapping:", label_counts)

# Per-class breakdown in class-id order; list position is the class id.
labels_list = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral", "love"]
print("\nDetailed distribution:")
for class_id, class_name in enumerate(labels_list):
    # A Counter reports 0 for ids that never occurred.
    print(f"  {class_name}: {label_counts[class_id]} examples")


Label distribution after mapping: Counter({6: 18462, 3: 10474, 0: 6604, 5: 2523, 4: 2371, 7: 1533, 2: 863, 1: 580})

Detailed distribution:
  anger: 6604 examples
  disgust: 580 examples
  fear: 863 examples
  joy: 10474 examples
  sadness: 2371 examples
  surprise: 2523 examples
  neutral: 18462 examples
  love: 1533 examples


In [None]:
# Derive "balanced" per-class loss weights from the training label frequencies,
# so that rare classes contribute more to the loss than frequent ones.
train_labels = [label[0] for label in dataset_simplified['train']['labels']]
present_classes = np.unique(train_labels)
balanced_weights = compute_class_weight(
    'balanced', classes=present_classes, y=train_labels
)

# Keep the weights as a float tensor; entry k is the weight of the k-th class in present_classes.
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([1.4725, 9.3556, 6.2877, 0.5551, 2.2886, 7.2254, 0.2275, 3.5396])


In [None]:
# Tokenize text

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example, max_length=128):
    """Tokenize a batch of examples to a fixed sequence length.

    Args:
        example: Batch dict from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.
        max_length: Length every sequence is padded or truncated to.
            Without an explicit value, ``padding="max_length"`` pads to the
            model's maximum input length, which makes every batch far larger
            than short comments need.
            NOTE(review): 128 assumes the comments are short; longer texts are
            truncated, so raise it if that does not hold for your data.

    Returns:
        The tokenizer output for the batch, which ``map`` merges into the dataset.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

tokenized_dataset = dataset_simplified.map(tokenize_function, batched=True)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
# Prepare the tokenized splits for PyTorch: rename the target column to "label"
# and expose only the model inputs and the target as torch tensors.
tokenized_dataset = tokenized_dataset.rename_column("labels", "label")
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Splits used for training and evaluation further down.
train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

In [None]:
# Load the pretrained checkpoint with a classification head sized for our classes.

# Emotion names; list position is the class id, so the order must match the ids in simplified_map.
labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral", "love"]
num_labels = len(labels)

# Id <-> name lookup tables, stored in the model config.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Quick check that the mapping ended up in the config.
print(model.config.id2label)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{0: 'anger', 1: 'disgust', 2: 'fear', 3: 'joy', 4: 'sadness', 5: 'surprise', 6: 'neutral', 7: 'love'}


In [None]:
# define metrics

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` (macro average over classes).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Labels are stored as one-element lists, so they can arrive as an (N, 1)
    # column; flatten them so they line up element-wise with `predictions`.
    references = np.asarray(labels).reshape(-1)
    acc = accuracy.compute(predictions=predictions, references=references)
    f1_score = f1.compute(predictions=predictions, references=references, average="macro")
    return {"accuracy": acc["accuracy"], "f1": f1_score["f1"]}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from torch import nn
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted for signature compatibility; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model does not also compute its own
        # unweighted loss; `inputs` itself is left untouched for the caller.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Take the device from the logits: a wrapped model (e.g. DataParallel)
        # does not expose `.device`.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Labels come in as one-element rows, hence the flatten on both sides.
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [None]:
# Training setup

# Checkpoints and logs are written under Google Drive, so Drive has to be mounted
# before the Trainer touches output_dir. Mounting again later only prints a notice.
from google.colab import drive
drive.mount("/content/drive")

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/emotion_model",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps", # Explicitly set save strategy
    save_steps=1000, # Make save_steps a multiple of eval_steps
    save_total_limit=20,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/emotion_model/logs",
    logging_steps=200,  # Log more frequently
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",  # Select the best checkpoint by macro-F1
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Checkpoint selection must not see the test split, otherwise the final
    # test score is optimistically biased; use the validation split instead.
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = WeightedTrainer(


In [None]:
# Clear GPU memory before starting.
# Collect garbage first: tensors kept alive only by unreachable Python objects
# must be released before empty_cache() can hand their cached blocks back.
gc.collect()
torch.cuda.empty_cache()

print("Memory cleared - starting training...")

Memory cleared - starting training...


In [None]:
# Train the model: run the fine-tuning loop on `trainer`. As the cell's last
# expression, the value returned by train() is shown as the cell output.

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
1000,1.1574,1.11787,0.456053,0.463986
2000,1.0324,1.090194,0.526073,0.524266
3000,0.8602,1.034842,0.550949,0.53779
4000,0.9158,1.073248,0.524415,0.521093
5000,0.8961,1.060105,0.552423,0.536014
6000,0.6765,1.12398,0.566796,0.537936
7000,0.65,1.211892,0.558504,0.535886
8000,0.698,1.195037,0.571402,0.550757
9000,0.4364,1.543091,0.574719,0.555632
10000,0.4692,1.576421,0.595909,0.567294


In [None]:
# Final evaluation on the held-out test split. The dataset is passed explicitly
# so the reported numbers do not depend on which split the Trainer evaluated on
# during training; the "test" prefix labels the metrics accordingly.
results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print(results)

# Save model and tokenizer to a local directory
trainer.save_model("emotion_model")
tokenizer.save_pretrained("emotion_model")

{'eval_loss': 1.835105061531067, 'eval_accuracy': 0.6130458817025981, 'eval_f1': 0.5702699359921618, 'eval_runtime': 78.8023, 'eval_samples_per_second': 68.869, 'eval_steps_per_second': 4.315, 'epoch': 5.0}


('emotion_model/tokenizer_config.json',
 'emotion_model/special_tokens_map.json',
 'emotion_model/vocab.txt',
 'emotion_model/added_tokens.json',
 'emotion_model/tokenizer.json')

In [None]:
# Mount Google Drive at /content/drive so files can be written under MyDrive.
# Anything that writes below /content/drive needs this mount to be in place first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the trained model and its tokenizer side by side in Google Drive.
save_path = "/content/drive/MyDrive/emotion_model"

# Model first, tokenizer second — both go into the same directory.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(save_path)

print(f"✅ Model saved in Google Drive at {save_path}")

✅ Model saved in Google Drive at /content/drive/MyDrive/emotion_model


In [None]:
# List the contents of the Drive output folder as a quick check that the save worked.
import os
print(os.listdir("/content/drive/MyDrive/emotion_model"))

['config.json', 'model.safetensors', 'training_args.bin', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json', 'checkpoint-2714', 'checkpoint-5428', 'checkpoint-8142', 'checkpoint-10856', 'checkpoint-13570']
