In [1]:
!pip -q install -U transformers datasets scikit-learn pandas accelerate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

In [12]:
MODEL = "bert-base-uncased"

TEXT_COL  = "Utterance"
LABEL_COL = "Emotion"


LABELS = ["anger","disgust","fear","joy","neutral","sadness","surprise"]

MELD_TRAIN, MELD_DEV, MELD_TEST = "/content/iemocap_ekman7_emoberta_train.csv","/content/iemocap_ekman7_emoberta_val.csv","/content/iemocap_ekman7_emoberta_test.csv"
IEMO_TRAIN, IEMO_DEV, IEMO_TEST = "/content/train_sent_emo_cleaned_processed.csv","/content/dev_sent_emo_cleaned_processed.csv","/content/test_sent_emo_cleaned_processed.csv"

OUT_DIR = "bert_joint_meld_iemocap"

MAX_LEN = 256
EPOCHS  = 4
LR      = 2e-5
BATCH   = 16
SEED    = 42

UPSAMPLE_IEMOCAP = 2       # repeats IEMOCAP train 3x
USE_CLASS_WEIGHTS = True   # combats class imbalance


In [13]:
import pandas as pd
from datasets import Dataset, DatasetDict

TEXT_COL  = "Utterance"
LABEL_COL = "Emotion"

def read_ds(path, dataset_name):
    df = pd.read_csv(path).dropna(subset=[TEXT_COL, LABEL_COL]).copy()


    df = df[[TEXT_COL, LABEL_COL]].copy()

    # normalize text + labels
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip().str.lower()

    df["dataset"] = dataset_name
    return Dataset.from_pandas(df, preserve_index=False)

meld = DatasetDict(
    train=read_ds(MELD_TRAIN, "meld"),
    validation=read_ds(MELD_DEV, "meld"),
    test=read_ds(MELD_TEST, "meld"),
)

iem = DatasetDict(
    train=read_ds(IEMO_TRAIN, "iemocap"),
    validation=read_ds(IEMO_DEV, "iemocap"),
    test=read_ds(IEMO_TEST, "iemocap"),
)

print(meld["train"].features)
print(iem["train"].features)



{'Utterance': Value('string'), 'Emotion': Value('string'), 'dataset': Value('string')}
{'Utterance': Value('string'), 'Emotion': Value('string'), 'dataset': Value('string')}


In [14]:
import numpy as np
from datasets import concatenate_datasets
from transformers import set_seed

set_seed(SEED)

iem_train = iem["train"]
if UPSAMPLE_IEMOCAP > 1:
    iem_train = concatenate_datasets([iem_train] * UPSAMPLE_IEMOCAP)

combined = DatasetDict(
    train=concatenate_datasets([meld["train"], iem_train]).shuffle(seed=SEED),
    validation=concatenate_datasets([meld["validation"], iem["validation"]]).shuffle(seed=SEED),
)

print("Combined train:", len(combined["train"]))


Combined train: 24855


In [15]:
from transformers import AutoTokenizer

label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

tok = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

def encode(batch):
    enc = tok(batch[TEXT_COL], truncation=True, max_length=MAX_LEN)
    enc["labels"] = [label2id[x] for x in batch[LABEL_COL]]
    return enc

def tokenize(ds):
    keep = {TEXT_COL, LABEL_COL, "dataset"}
    remove_cols = [c for c in ds["train"].column_names if c not in keep]
    return ds.map(encode, batched=True, remove_columns=remove_cols)

combined_tok = tokenize(combined)
meld_tok = tokenize(meld)
iem_tok  = tokenize(iem)


Map:   0%|          | 0/24855 [00:00<?, ? examples/s]

Map:   0%|          | 0/2108 [00:00<?, ? examples/s]

Map:   0%|          | 0/4879 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1650 [00:00<?, ? examples/s]

Map:   0%|          | 0/9988 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

In [16]:
from sklearn.metrics import f1_score, accuracy_score

def metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "acc": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
        "weighted_f1": f1_score(y_true, y_pred, average="weighted"),
    }


In [19]:
import torch
from transformers import Trainer

import numpy as np, torch

def compute_class_weights(label_ids, n):
    counts = np.bincount(label_ids, minlength=n)
    w = np.sqrt(counts.sum() / np.maximum(counts, 1))  # milder than inverse-freq
    w = np.clip(w, 0.5, 3.0)                           # cap extremes
    w = w / w.mean()
    return torch.tensor(w, dtype=torch.float)


class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs["logits"]
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

class_weights = None
if USE_CLASS_WEIGHTS:
    class_weights = compute_class_weights(combined_tok["train"]["labels"], len(LABELS))
    print("Class weights:", class_weights.tolist())


Class weights: [0.9416760802268982, 1.144299030303955, 1.144299030303955, 0.8966344594955444, 0.5844935178756714, 1.144299030303955, 1.144299030303955]


In [18]:
import numpy as np
import pandas as pd

y = np.array(combined_tok["train"]["labels"])
counts = np.bincount(y, minlength=len(LABELS))
pct = counts / counts.sum() * 100
for i,l in enumerate(LABELS):
    print(f"{l:8s} count={counts[i]:6d}  pct={pct[i]:6.2f}%  weight={class_weights[i].item():.3f}")


anger    count=  4078  pct= 16.41%  weight=0.339
disgust  count=   544  pct=  2.19%  weight=2.540
fear     count=   559  pct=  2.25%  weight=2.472
joy      count=  4498  pct= 18.10%  weight=0.307
neutral  count= 10585  pct= 42.59%  weight=0.131
sadness  count=  2105  pct=  8.47%  weight=0.656
surprise count=  2486  pct= 10.00%  weight=0.556


In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(LABELS),
    label2id=label2id,
    id2label=id2label
)

args = TrainingArguments(
    output_dir=OUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",

    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    weight_decay=0.01,
    warmup_ratio=0.06,

    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED,
)

if class_weights is None and not USE_CLASS_WEIGHTS:
    class_weights = torch.ones(len(LABELS))

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=combined_tok["train"],
    eval_dataset=combined_tok["validation"],
    tokenizer=tok,
    compute_metrics=metrics,
    class_weights=class_weights,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Macro F1,Weighted F1
1,1.0654,1.173547,0.590607,0.457089,0.584688
2,0.7341,1.236521,0.599146,0.50109,0.599824
3,0.3662,1.44736,0.608159,0.505062,0.609664
4,0.2665,1.588827,0.605787,0.497775,0.605778


TrainOutput(global_step=6216, training_loss=0.6961199418381527, metrics={'train_runtime': 711.5498, 'train_samples_per_second': 139.723, 'train_steps_per_second': 8.736, 'total_flos': 1866088444568250.0, 'train_loss': 0.6961199418381527, 'epoch': 4.0})

In [23]:
print("\n=== MELD test ===")
print(trainer.evaluate(meld_tok["test"]))

print("\n=== IEMOCAP test ===")
print(trainer.evaluate(iem_tok["test"]))



=== MELD test ===


{'eval_loss': 1.3370929956436157, 'eval_acc': 0.6042424242424242, 'eval_macro_f1': 0.4230544611741843, 'eval_weighted_f1': 0.6182595747127336, 'eval_runtime': 1.7214, 'eval_samples_per_second': 958.537, 'eval_steps_per_second': 60.417, 'epoch': 4.0}

=== IEMOCAP test ===
{'eval_loss': 1.6696290969848633, 'eval_acc': 0.6080459770114942, 'eval_macro_f1': 0.4463408697648325, 'eval_weighted_f1': 0.6052260841779984, 'eval_runtime': 3.1257, 'eval_samples_per_second': 835.008, 'eval_steps_per_second': 52.468, 'epoch': 4.0}


In [25]:
trainer.save_model("BEST_CHECKPOINT")
tok.save_pretrained("BEST_CHECKPOINT")


('BEST_CHECKPOINT/tokenizer_config.json',
 'BEST_CHECKPOINT/special_tokens_map.json',
 'BEST_CHECKPOINT/vocab.txt',
 'BEST_CHECKPOINT/added_tokens.json',
 'BEST_CHECKPOINT/tokenizer.json')