In [7]:
# === CLEAN TRAINING CELL FOR MELD (text-only, 7 emotions) ===
import os, pandas as pd, numpy as np, torch
from collections import Counter
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch.nn as nn

os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("Torch CUDA available?", torch.cuda.is_available())

# ---- Paths & labels ----
DATA_DIR = "meld_data"
TRAIN_CSV = os.path.join(DATA_DIR, "train_sent_emo.csv")
DEV_CSV = os.path.join(DATA_DIR, "dev_sent_emo.csv")
TEST_CSV = os.path.join(DATA_DIR, "test_sent_emo.csv")

CLASSES = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
label2id = {c: i for i, c in enumerate(CLASSES)}
id2label = {i: c for c, i in label2id.items()}

# ---- Load data ----
use_cols = ["Utterance", "Emotion"]
train_df = pd.read_csv(TRAIN_CSV)[use_cols].dropna()
dev_df = pd.read_csv(DEV_CSV)[use_cols].dropna()
test_df = pd.read_csv(TEST_CSV)[use_cols].dropna()

for df in (train_df, dev_df, test_df):
    df["label"] = df["Emotion"].map(label2id)
    df.rename(columns={"Utterance": "text"}, inplace=True)

ds = DatasetDict(
    {
        "train": Dataset.from_pandas(
            train_df[["text", "label"]].reset_index(drop=True)
        ),
        "validation": Dataset.from_pandas(
            dev_df[["text", "label"]].reset_index(drop=True)
        ),
        "test": Dataset.from_pandas(test_df[["text", "label"]].reset_index(drop=True)),
    }
)

Torch CUDA available? False


In [8]:
# ---- Model & tokenizer ----
model_name = "roberta-large"  # or "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


def tokenize(batch):
    return tokenizer(
        batch["text"], truncation=True, max_length=256, padding="max_length"
    )


ds_tok = ds.map(tokenize, batched=True, remove_columns=["text"])

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(CLASSES), id2label=id2label, label2id=label2id
)
# NOTE: "Some weights ... newly initialized" is expected before fine-tuning.

# ---- Class weights (imbalanced data) ----
counts = Counter(train_df["label"].tolist())
total = sum(counts.values())
class_weights = torch.tensor(
    [total / (len(CLASSES) * counts[i]) for i in range(len(CLASSES))], dtype=torch.float
)


# ---- Metrics ----
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


# ---- Weighted-loss Trainer ----
class WeightedCELossTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.loss_fct = nn.CrossEntropyLoss(
            weight=(
                self.class_weights.to(self.model.device)
                if self.class_weights is not None
                else None
            )
        )

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels", inputs.get("label"))
        model_inputs = {k: v for k, v in inputs.items() if k not in ("labels", "label")}
        outputs = model(**model_inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# ---- TrainingArguments (simple & correct for 4.55.4) ----
bs = 16 if torch.cuda.is_available() else 8

args = TrainingArguments(
    output_dir="./runs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="precision_macro",
    greater_is_better=True,
    num_train_epochs=30,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.05,
)


trainer = WeightedCELossTrainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],  # ✅ tokenized dataset
    eval_dataset=ds_tok["validation"],  # ✅ tokenized dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# class_weights=class_weights,


# ---- Train & Evaluate ----
# train_result = trainer.train()
# print(train_result.metrics)
# print("\nBest checkpoint:", trainer.state.best_model_checkpoint)

# print("\nDEV metrics:", trainer.evaluate(ds_tok["validation"]))
test_metrics = trainer.evaluate(ds_tok["test"])
print("\nTEST metrics:", test_metrics)

preds = trainer.predict(ds_tok["test"]).predictions.argmax(axis=-1)
print("\nClassification report (TEST):")
print(classification_report(test_df["label"], preds, target_names=CLASSES, digits=4))

early stopping required metric_for_best_model, but did not find eval_precision_macro so early stopping is disabled



TEST metrics: {'eval_loss': 1.9737850427627563, 'eval_model_preparation_time': 0.009, 'eval_accuracy': 0.06206896551724138, 'eval_f1': 0.03081856534937973, 'eval_runtime': 3610.6385, 'eval_samples_per_second': 0.723, 'eval_steps_per_second': 0.091}

Classification report (TEST):
              precision    recall  f1-score   support

       anger     0.1012    0.2029    0.1350       345
     disgust     0.0000    0.0000    0.0000        68
        fear     0.0000    0.0000    0.0000        50
         joy     0.0000    0.0000    0.0000       402
     neutral     0.0000    0.0000    0.0000      1256
     sadness     0.0557    0.3221    0.0950       208
    surprise     0.0350    0.0890    0.0502       281

    accuracy                         0.0621      2610
   macro avg     0.0274    0.0877    0.0400      2610
weighted avg     0.0216    0.0621    0.0308      2610



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
trainer.evaluate(ds_tok["test"])