In [None]:
# ==========================
# DistilBERT / BERT / RoBERTa (MELD) fine-tuning
# Cell 1: Install/upgrade dependencies (Transformers, Datasets, Optuna, etc.)
# Tip: change MODEL_BASE to switch backbone/checkpoint
# ==========================

!pip -q install -U transformers datasets accelerate scikit-learn pandas optuna

import os, random, shutil
import numpy as np
import pandas as pd
import torch
import optuna

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, set_seed, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m151.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 3.0.0 which is incompatible.
grad

In [None]:
# ==========================
# DistilBERT / BERT / RoBERTa (MELD) fine-tuning
# Cell 2: Setup / utilities
# Tip: change MODEL_BASE to switch backbone/checkpoint
# ==========================

# --- MELD split files (absolute paths under /content) ---
TRAIN_CSV = "/content/train_sent_emo.csv"
VAL_CSV = "/content/dev_sent_emo.csv"
TEST_CSV = "/content/test_sent_emo.csv"

# --- Column names used when reading the CSVs ---
DIALOG_COL = "Dialogue_ID"
UTTID_COL = "Utterance_ID"
SPEAKER_COL = "Speaker"
TEXT_COL = "Utterance"
LABEL_COL = "Emotion"

# --- MELD Ekman-7 label set; the list order defines the integer class ids ---
LABELS = ["neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"]
label2id = {name: idx for idx, name in enumerate(LABELS)}
id2label = dict(enumerate(LABELS))

# --- Backbone checkpoint ---
MODEL_BASE = "roberta-base"

# --- Paper constants ---
WEIGHT_DECAY = 0.01   # L2 regularization rate (lambda)
EPOCHS = 7            # number of training epochs
WARMUP_RATIO = 0.20   # fraction of training steps used for LR warm-up
LR_SCHED = "linear"   # learning-rate scheduler type

# --- Optuna search settings ---
N_TRIALS = 5
LR_LOW = 1e-6
LR_HIGH = 1e-4

# --- Training defaults ---
MAX_LEN = 512
BATCH_TRAIN = 8
BATCH_EVAL = 16
GRAD_ACCUM = 1

SEED = 42

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)


DEVICE: cuda


In [None]:
# ==========================
#  Load MELD CSVs into pandas DataFrames
# ==========================

# Read the three MELD splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

print("Rows:", len(train_df), len(val_df), len(test_df))


Rows: 9989 1109 2610


In [None]:
# ==========================
# 
#  Load the tokenizer and padding collator, and define the evaluation metrics
#
# ==========================

# Fast tokenizer for the configured backbone; add_prefix_space=True is passed
# through to the tokenizer unchanged.
tok = AutoTokenizer.from_pretrained(
    MODEL_BASE,
    use_fast=True,
    add_prefix_space=True,
)
# Collator used by the Trainer to pad each batch with this tokenizer.
collator = DataCollatorWithPadding(tokenizer=tok)

def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and macro F1 from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with keys ``"acc"``, ``"weighted_f1"`` and ``"macro_f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {"acc": accuracy_score(gold, predicted)}
    for key, averaging in (("weighted_f1", "weighted"), ("macro_f1", "macro")):
        metrics[key] = f1_score(gold, predicted, average=averaging)
    return metrics


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# ==========================
#  Build context-augmented dataset (speaker tags + target-aware formatting)
# ==========================

import pandas as pd
import numpy as np
from datasets import Dataset

def build_context_dataset_with_text_target_has_speaker(
    df, tokenizer, max_length=512, speaker_caps=True, debug_n=3
):
    """Build one pre-tokenized, context-augmented example per utterance.

    Each utterance is rendered as ``"SPEAKER: text"`` and encoded on its own
    without special tokens. For every target utterance the token sequence is::

        [cls] <earlier turns> [sep] <target> [sep] <later turns> [sep]

    Context comes from the same dialogue only. It is grown by alternately
    prepending the closest unused earlier utterance and appending the closest
    unused later utterance. A side is closed permanently the first time its
    next utterance does not fit into the remaining token budget. Utterances
    are never cut in half, except the target itself when it alone exceeds the
    budget.

    Args:
        df: DataFrame holding the columns named by ``DIALOG_COL``,
            ``UTTID_COL``, ``SPEAKER_COL``, ``TEXT_COL`` and ``LABEL_COL``.
            It is copied, never modified. Rows whose utterance id is not
            numeric or whose (stripped, lower-cased) label is not in
            ``LABELS`` are dropped.
        tokenizer: Object exposing ``cls_token_id``, ``sep_token_id``,
            ``encode(text, add_special_tokens=False)`` and ``decode``.
        max_length: Maximum number of token ids per example, including the
            outer cls/sep pair. Must be at least 4.
        speaker_caps: Upper-case speaker names when True.
        debug_n: Number of examples to print for visual inspection.

    Returns:
        ``datasets.Dataset`` with columns ``dialogue_id``, ``utterance_id``,
        ``context_text_raw``, ``input_ids``, ``attention_mask`` and ``labels``.

    Raises:
        ValueError: If ``max_length`` is smaller than 4 (outer cls/sep plus
            the two target-boundary separators).
    """
    if max_length < 4:
        raise ValueError(
            f"max_length must be >= 4 to hold the special tokens, got {max_length}"
        )

    df = df.copy()

    # normalize
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    df[SPEAKER_COL] = df[SPEAKER_COL].astype(str)
    df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip().str.lower()

    # ordering: utterance ids must be integers so turns sort numerically
    df[UTTID_COL] = pd.to_numeric(df[UTTID_COL], errors="coerce")
    df = df.dropna(subset=[UTTID_COL]).copy()
    df[UTTID_COL] = df[UTTID_COL].astype(int)

    df = df[df[LABEL_COL].isin(LABELS)].copy()
    df = df.sort_values([DIALOG_COL, UTTID_COL]).reset_index(drop=True)

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    # Budget for everything between the outer cls ... sep pair.
    max_tokens = max_length - 2
    # Budget for the target itself (two boundary separators are reserved).
    target_budget = max_tokens - 2

    # Token(s) inserted between separately-encoded utterances so that their
    # BPE pieces are not glued together. May legitimately be an empty list.
    space_ids = tokenizer.encode(" ", add_special_tokens=False)
    space_len = len(space_ids)

    all_input_ids, all_attn, all_labels = [], [], []
    all_texts, all_dialog, all_turn = [], [], []

    dbg_printed = 0
    lengths = []

    for d_id, g in df.groupby(DIALOG_COL, sort=False):
        speakers = g[SPEAKER_COL].tolist()
        utts     = g[TEXT_COL].tolist()
        labs     = g[LABEL_COL].tolist()
        turns    = g[UTTID_COL].tolist()

        if speaker_caps:
            speakers = [s.upper() for s in speakers]

        # segment text WITH speaker for all turns
        seg_text = [f"{s}: {u}" for s, u in zip(speakers, utts)]

        # IMPORTANT: encode each segment WITHOUT special tokens
        seg_ids = [tokenizer.encode(x, add_special_tokens=False) for x in seg_text]
        n = len(seg_ids)

        for t in range(n):
            # Target ids (with speaker), truncated only if the target alone
            # would overflow; slicing also yields a fresh list.
            target_ids = seg_ids[t][:target_budget]

            seq_ids = [sep_id] + target_ids + [sep_id]
            # spaced separators for the human-readable text
            seq_text = " </s> " + seg_text[t] + " </s> "

            left, right = t - 1, t + 1
            blocked_left = blocked_right = False

            while True:
                changed = False

                # ---- prepend the next earlier utterance ----
                if left >= 0 and not blocked_left:
                    cand = seg_ids[left]
                    if len(seq_ids) + len(cand) + space_len <= max_tokens:
                        seq_ids = cand + space_ids + seq_ids
                        seq_text = seg_text[left] + " " + seq_text
                        left -= 1
                        changed = True
                    else:
                        blocked_left = True

                # ---- append the next later utterance (no per-utterance sep) ----
                if right < n and not blocked_right:
                    cand = seg_ids[right]
                    if len(seq_ids) + len(cand) + space_len <= max_tokens:
                        seq_ids = seq_ids + space_ids + cand
                        seq_text = seq_text + " " + seg_text[right]
                        right += 1
                        changed = True
                    else:
                        blocked_right = True

                if not changed:
                    break

            # outer wrapper: cls ... sep
            input_ids = [cls_id] + seq_ids + [sep_id]
            # Defensive only: the budget checks above already keep the
            # sequence within max_length.
            input_ids = input_ids[:max_length]

            all_input_ids.append(input_ids)
            all_attn.append([1] * len(input_ids))
            all_labels.append(label2id[labs[t]])

            # raw text is stored WITH the outer "<s> ... </s>" markers
            all_texts.append("<s> " + seq_text.strip() + " </s>")
            all_dialog.append(d_id)
            all_turn.append(turns[t])
            lengths.append(len(input_ids))

            if dbg_printed < debug_n:
                print("="*80)
                print(f"DEBUG {dbg_printed+1} | dialog={d_id} | uttid={turns[t]} | label={labs[t]}")
                print("RAW strict (repr so you see </s>):")
                print(repr(all_texts[-1][:1200]))
                print("\nDECODED (first 120 tokens):")
                print(tokenizer.decode(input_ids[:120], skip_special_tokens=False))
                dbg_printed += 1

    if lengths:
        print("\nToken length stats:",
              f"min={int(np.min(lengths))}, mean={float(np.mean(lengths)):.1f}, max={int(np.max(lengths))}, n={len(lengths)}")
    else:
        # np.min / np.max raise on an empty sequence; report instead of crashing.
        print("\nToken length stats: n=0 (no rows left after filtering)")

    return Dataset.from_dict({
        "dialogue_id": all_dialog,
        "utterance_id": all_turn,
        "context_text_raw": all_texts,
        "input_ids": all_input_ids,
        "attention_mask": all_attn,
        "labels": all_labels
    })


def save_constructed_csv(ds, out_csv, id2label=None):
    """Write the human-readable columns of a constructed dataset to a CSV file.

    Args:
        ds: Object whose ``to_dict()`` returns column lists for
            ``dialogue_id``, ``utterance_id``, ``labels`` and
            ``context_text_raw``.
        out_csv: Destination path of the CSV file (written without index).
        id2label: Optional dict mapping integer label ids to names. Ids not
            in the dict, or every id when this is not a dict, are written as
            ``str(id)``.
    """
    columns = ds.to_dict()

    if isinstance(id2label, dict):
        def label_name(label_id):
            return id2label.get(int(label_id), str(label_id))
    else:
        label_name = str

    label_ids = columns["labels"]
    df_out = pd.DataFrame({
        "dialogue_id": columns["dialogue_id"],
        "utterance_id": columns["utterance_id"],
        "label_id": label_ids,
        "label": list(map(label_name, label_ids)),
        "context_text_raw": columns["context_text_raw"],
    })
    df_out.to_csv(out_csv, index=False)
    print("✅ Saved:", out_csv, "| rows:", len(df_out))


# ----------- BUILD (prints debug examples) -----------
train_ds_full = build_context_dataset_with_text_target_has_speaker(train_df, tok, max_length=MAX_LEN, speaker_caps=True, debug_n=3)
val_ds_full   = build_context_dataset_with_text_target_has_speaker(val_df,   tok, max_length=MAX_LEN, speaker_caps=True, debug_n=1)
test_ds_full  = build_context_dataset_with_text_target_has_speaker(test_df,  tok, max_length=MAX_LEN, speaker_caps=True, debug_n=1)

print("Sizes:", len(train_ds_full), len(val_ds_full), len(test_ds_full))

# ----------- SAVE CSV locally  -----------
save_constructed_csv(train_ds_full, "/content/train_constructed_context_targetSpeaker_FIXED.csv", id2label=id2label)
save_constructed_csv(val_ds_full,   "/content/val_constructed_context_targetSpeaker_FIXED.csv",   id2label=id2label)
save_constructed_csv(test_ds_full,  "/content/test_constructed_context_targetSpeaker_FIXED.csv",  id2label=id2label)

!ls -lh /content/*_targetSpeaker_FIXED.csv


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

In [None]:
# ==========================
#  Quick sanity check: inspect one processed example
# ==========================

# Take the first built training example and show its first 120 tokens decoded
# with special tokens kept, followed by the lengths of ids and attention mask.
ex = train_ds_full[0]
ids = ex["input_ids"]

print(tok.decode(ids[:120], skip_special_tokens=False))

for _caption, _column in (("len(input_ids):", "input_ids"), ("len(attn):", "attention_mask")):
    print(_caption, len(ex[_column]))


<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>  THE INTERVIEWER: You must’ve had your hands full.  CHANDLER: That I did. That I did.  THE INTERVIEWER: So let’s talk a little bit about your duties.  CHANDLER: My duties?  All right.  THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. 
len(input_ids): 251
len(attn): 251


In [None]:
# ==========================
#  Dataset sanity checks: fingerprints, decode sample, label distribution
# ==========================

import hashlib
import numpy as np

def ds_fingerprint(ds, n=50):
    """Return an MD5 hex digest over the first ``n`` examples of ``ds``.

    For each of the first ``min(n, len(ds))`` rows the digest is fed the
    comma-joined ``input_ids`` followed by ``str(labels)``. Equal digests
    therefore indicate identical leading examples.

    Args:
        ds: Indexable collection of rows with ``"input_ids"`` and ``"labels"``.
        n: Maximum number of leading rows to hash.

    Returns:
        The hexadecimal digest string.
    """
    digest = hashlib.md5()
    row_count = min(n, len(ds))
    for idx in range(row_count):
        row = ds[idx]
        ids_blob = ",".join(str(token_id) for token_id in row["input_ids"])
        digest.update(ids_blob.encode())
        digest.update(str(row["labels"]).encode())
    return digest.hexdigest()

print("train size:", len(train_ds_full), "val size:", len(val_ds_full))
print("fingerprints:")
for _caption, _split in ((" train:", train_ds_full), (" val  :", val_ds_full)):
    print(_caption, ds_fingerprint(_split))

# Decode the start of the first training example, keeping special tokens.
print("\nDECODE sample 0 (first 200 tokens):")
_first_ids = train_ds_full[0]["input_ids"]
print(tok.decode(_first_ids[:200], skip_special_tokens=False))

# Label distribution over at most the first 5000 training examples.
_n_sample = min(len(train_ds_full), 5000)
y = [train_ds_full[i]["labels"] for i in range(_n_sample)]
_label_ids, _label_counts = np.unique(y, return_counts=True)
print("\nLabel id dist (sample):", dict(zip(_label_ids, _label_counts)))


train size: 9989 val size: 1109
fingerprints:
 train: 4b66420845d37c79b9976e672c085ea8
 val  : 5efe931aa1e609b7e07f746750e30cdb

DECODE sample 0 (first 200 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>  THE INTERVIEWER: You must’ve had your hands full.  CHANDLER: That I did. That I did.  THE INTERVIEWER: So let’s talk a little bit about your duties.  CHANDLER: My duties?  All right.  THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.  CHANDLER: I see.  THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.  CHANDLER: Good to know.  THE INTERVIEWER: We can go into detail  CHANDLER: No don’t I beg of you!  THE INTERVIEWER: All right then, we

Label id dist (sample): {np.int64(0): np.int64(2380), np.int64(1): np.int64(829), np.int64(2): np.int64(330), np.int64(3): np.int64(549), np.int64(4): np.int64(626), np.int64(5): np.int64(146), n

In [None]:
# ==========================
# train/evaluate
# ==========================

def objective(trial):
    """Optuna objective: fine-tune ``MODEL_BASE`` once and return validation loss.

    A learning rate is sampled log-uniformly from ``[LR_LOW, LR_HIGH]``. A
    fresh classifier is trained on the context-augmented training split and
    evaluated on the validation split after the last epoch.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the learning rate; its
            number is also used to name the output directory.

    Returns:
        The ``eval_loss`` reported by ``Trainer.evaluate`` on the validation
        split after the final epoch (lower is better).
    """
    import gc  # local import: only needed for the per-trial cleanup below

    set_seed(SEED)

    lr = trial.suggest_float("lr", LR_LOW, LR_HIGH, log=True)

    # Per-trial budget. Kept at 5 (not EPOCHS) so the search cost is unchanged.
    tuning_epochs = 5

    # debug_n=0: do not re-print the long example dumps on every trial.
    train_ds = build_context_dataset_with_text_target_has_speaker(
        train_df, tok, max_length=MAX_LEN, speaker_caps=True, debug_n=0
    )
    val_ds = build_context_dataset_with_text_target_has_speaker(
        val_df, tok, max_length=MAX_LEN, speaker_caps=True, debug_n=0
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    args = TrainingArguments(
        output_dir=f"optuna_lr_trial_{trial.number}",
        eval_strategy="epoch",
        save_strategy="no",

        learning_rate=lr,
        num_train_epochs=tuning_epochs,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,

        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHED,

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
        processing_class=tok,
        compute_metrics=compute_metrics,
    )

    try:
        trainer.train()
        out = trainer.evaluate(val_ds)
    finally:
        # Release this trial's model so GPU memory does not pile up across trials.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # minimize cross-entropy loss on validation
    return out["eval_loss"]


In [None]:
# ==========================
# Run Optuna study and select best hyperparameters
# ==========================

# Seed the sampler explicitly: set_seed() inside the objective does not seed
# Optuna's own sampler, so without this the sampled learning rates (and the
# selected "best lr") differ on every run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

print("Best lr:", study.best_params["lr"])
print("Best val loss:", study.best_value)


[I 2026-01-23 09:12:29,744] A new study created in memory with name: no-name-87a306b8-29b0-44f4-983c-c6052c50bd63


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5547,1.640659,0.423805,0.252297,0.085045
2,1.4693,1.535491,0.431921,0.314046,0.143602
3,1.4095,1.461454,0.464382,0.353277,0.174149
4,1.2458,1.278754,0.576195,0.524128,0.321245
5,1.1784,1.251752,0.586114,0.540996,0.340493


[I 2026-01-23 09:17:40,542] Trial 0 finished with value: 1.2517520189285278 and parameters: {'lr': 1.5312186697729886e-06}. Best is trial 0 with value: 1.2517520189285278.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.3196,1.236257,0.607755,0.563275,0.363794
2,1.0504,1.106281,0.624887,0.586076,0.38727
3,0.8699,1.114065,0.63661,0.613577,0.481096
4,0.6833,1.190071,0.635708,0.621456,0.461181
5,0.5089,1.263965,0.639315,0.625062,0.486822


[I 2026-01-23 09:22:48,749] Trial 1 finished with value: 1.263965129852295 and parameters: {'lr': 1.3788900683869114e-05}. Best is trial 0 with value: 1.2517520189285278.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.3125,1.379438,0.532011,0.462909,0.270739
2,1.5476,1.630486,0.423805,0.252297,0.085045
3,1.5578,1.66102,0.423805,0.252297,0.085045
4,1.5264,1.65382,0.423805,0.252297,0.085045
5,1.5359,1.64141,0.423805,0.252297,0.085045


[I 2026-01-23 09:27:58,076] Trial 2 finished with value: 1.6414096355438232 and parameters: {'lr': 6.457999908802166e-05}. Best is trial 0 with value: 1.2517520189285278.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.508,1.514349,0.431019,0.281988,0.112236
2,1.1165,1.179013,0.593327,0.555245,0.357777
3,1.0014,1.134028,0.623986,0.586823,0.393217
4,0.894,1.150208,0.633003,0.602186,0.406559
5,0.8115,1.140484,0.629396,0.598562,0.40415


[I 2026-01-23 09:33:06,681] Trial 3 finished with value: 1.140484094619751 and parameters: {'lr': 5.482717813019306e-06}. Best is trial 3 with value: 1.140484094619751.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2243,1.289075,0.590622,0.543027,0.34575
2,1.1467,1.203003,0.606853,0.5656,0.371433
3,0.9605,1.243413,0.616772,0.583316,0.452064
4,0.7234,1.410203,0.593327,0.575008,0.414874
5,0.4776,1.599689,0.619477,0.602965,0.444816


[I 2026-01-23 09:38:14,768] Trial 4 finished with value: 1.5996887683868408 and parameters: {'lr': 4.9607717817852516e-05}. Best is trial 3 with value: 1.140484094619751.


Best lr: 5.482717813019306e-06
Best val loss: 1.140484094619751


In [None]:

# ==========================
#   Save model+tokenizer per EPOCH and per SEED
# - Saves: /content/epoch_checkpoints_seed{seed}/epoch_01, epoch_02, ...
# - Also keeps the Trainer's "best checkpoint" and copies it to *_BEST
# ==========================

import os, shutil
import pandas as pd
from transformers import TrainerCallback, TrainingArguments, Trainer

# Learning rate selected by the Optuna study.
best_lr = study.best_params["lr"]

# One full training run is performed per seed.
SEEDS = [42, 43, 44, 45, 46]
MAX_LEN = 512

# Build datasets ONCE (same for all seeds).
# debug_n=0 keeps the long per-example debug dumps out of this cell's output.
train_ds = build_context_dataset_with_text_target_has_speaker(train_df, tok, max_length=MAX_LEN, speaker_caps=True, debug_n=0)
val_ds   = build_context_dataset_with_text_target_has_speaker(val_df,   tok, max_length=MAX_LEN, speaker_caps=True, debug_n=0)
test_ds  = build_context_dataset_with_text_target_has_speaker(test_df,  tok, max_length=MAX_LEN, speaker_caps=True, debug_n=0)

# Accumulator list, created empty here.
rows = []

# ---------- callback: save at end of each epoch ----------
class SaveByEpochCallback(TrainerCallback):
    """Trainer callback that snapshots the model and tokenizer after every epoch.

    Each snapshot is written with ``save_pretrained`` into
    ``<out_root>/epoch_NN`` where ``NN`` is the zero-padded epoch number.

    Args:
        out_root: Directory under which the per-epoch folders are created
            (created on construction if it does not exist).
        tokenizer: Tokenizer saved next to the model in every epoch folder.
    """

    def __init__(self, out_root, tokenizer):
        self.out_root = out_root
        self.tokenizer = tokenizer
        os.makedirs(out_root, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save the current model + tokenizer to ``<out_root>/epoch_NN``.

        Returns:
            The unmodified ``control`` object.
        """
        # Only the main process writes; otherwise every rank of a multi-process
        # run would write the same folder at the same time.
        if not getattr(state, "is_world_process_zero", True):
            return control

        model = kwargs["model"]

        # state.epoch is a float; round it to get the index of the epoch that
        # just finished. Falls back to 0 when the epoch is not set.
        ep = state.epoch
        ep_i = int(round(ep)) if ep is not None else 0

        save_dir = os.path.join(self.out_root, f"epoch_{ep_i:02d}")
        os.makedirs(save_dir, exist_ok=True)

        model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        print(f"✅ Saved epoch checkpoint to: {save_dir}")
        return control


# Fine-tune one model per seed with the tuned learning rate, keep the best
# checkpoint (by validation weighted F1), and evaluate it once on the test set.
for seed in SEEDS:
    print("\n" + "="*20, "SEED", seed, "="*20)
    # Seed before the model is built so the newly initialised classifier head
    # is reproducible for this seed.
    set_seed(seed)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    out_dir = f"roberta_meld_final_seed{seed}"

    # Where we save epoch checkpoints for this seed; start from a clean folder
    # so stale epochs from a previous run cannot be mixed in.
    epoch_root = f"/content/epoch_checkpoints_seed{seed}"
    if os.path.exists(epoch_root):
        shutil.rmtree(epoch_root)
    os.makedirs(epoch_root, exist_ok=True)

    epoch_saver = SaveByEpochCallback(epoch_root, tok)

    args = TrainingArguments(
        output_dir=out_dir,

        # `eval_strategy` is the current argument name (it replaced the
        # deprecated `evaluation_strategy`). It must match `save_strategy`
        # for `load_best_model_at_end` to work.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,  # keeps only 2 trainer checkpoints (we keep all epochs separately)

        # Model selection: reload the checkpoint with the highest validation weighted F1.
        load_best_model_at_end=True,
        metric_for_best_model="weighted_f1",
        greater_is_better=True,

        learning_rate=best_lr,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,

        weight_decay=0.01,
        warmup_ratio=0.20,
        lr_scheduler_type="linear",

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=seed,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        processing_class=tok,      # replaces the deprecated `tokenizer=` argument
        compute_metrics=compute_metrics,
        callbacks=[epoch_saver],   # save model+tokenizer per epoch
    )

    trainer.train()

    best_ckpt = trainer.state.best_model_checkpoint
    print("Best checkpoint (trainer):", best_ckpt)

    # ===== Save BEST model folder =====
    best_dir = f"{out_dir}_BEST"
    if os.path.exists(best_dir):
        shutil.rmtree(best_dir)
    if best_ckpt is not None and os.path.isdir(best_ckpt):
        shutil.copytree(best_ckpt, best_dir)
    else:
        # No best checkpoint was recorded (or it is gone from disk): fall back
        # to the model currently held by the trainer instead of crashing in copytree.
        print("⚠️ No best checkpoint available; saving the trainer's current model instead.")
        trainer.save_model(best_dir)
    tok.save_pretrained(best_dir)
    print("✅ Saved BEST folder to:", best_dir)

    # Show (up to 10) epoch folders saved for this seed.
    print("✅ Epoch checkpoints saved in:", epoch_root)
    for entry in sorted(os.listdir(epoch_root))[:10]:
        print(entry)

    # ===== Test (only after training) =====
    # evaluate() is called with its default metric prefix, so the test metrics
    # come back under "eval_*" keys even though they are computed on test_ds.
    test_metrics = trainer.evaluate(test_ds)
    print("TEST:", test_metrics)

    rows.append({
        "seed": seed,
        "best_ckpt": best_ckpt,
        "best_dir": best_dir,
        "epoch_root": epoch_root,
        "test_acc": float(test_metrics["eval_acc"]),
        "test_weighted_f1": float(test_metrics["eval_weighted_f1"]),
        "test_macro_f1": float(test_metrics["eval_macro_f1"]),
    })

# One row per seed: checkpoint locations plus test metrics.
df = pd.DataFrame(rows)
display(df)

# Aggregate the test metrics across seeds (std uses the pandas default).
metric_view = df[["test_acc","test_weighted_f1","test_macro_f1"]]
for stat_name in ("mean", "std"):
    print(f"\n{stat_name.upper()}:")
    display(getattr(metric_view, stat_name)().to_frame(stat_name))



DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict (repr so you see </s>):
'<s> </s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s>  THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. CHANDLER: Really?! THE INTERVIEWER: Absolutely.  You can relax </s>'

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition fro

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5309,1.558906,0.423805,0.252297,0.085045
2,1.1347,1.180762,0.611362,0.573903,0.374167
3,1.016,1.129711,0.626691,0.589365,0.397288
4,0.8882,1.14863,0.627592,0.601054,0.427182
5,0.7843,1.191525,0.628494,0.607594,0.461296
6,0.758,1.157616,0.637511,0.61561,0.459843
7,0.6997,1.213728,0.630298,0.610259,0.464973


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed42/checkpoint-7494
✅ Saved BEST folder to: roberta_meld_final_seed42_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed42
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1344406604766846, 'eval_acc': 0.650191570881226, 'eval_weighted_f1': 0.6369315875903244, 'eval_macro_f1': 0.4462754578316832, 'eval_runtime': 3.3865, 'eval_samples_per_second': 770.703, 'eval_steps_per_second': 48.427, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4917,1.580039,0.4211,0.288358,0.116297
2,1.121,1.192025,0.602344,0.555015,0.362344
3,1.0126,1.124837,0.631199,0.597008,0.402997
4,0.9321,1.124644,0.632101,0.603581,0.413351
5,0.8428,1.148727,0.635708,0.605402,0.443386
6,0.7781,1.170817,0.640216,0.613942,0.464247
7,0.6855,1.140224,0.650135,0.629942,0.487833


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed43/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed43_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed43
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1489425897598267, 'eval_acc': 0.6524904214559387, 'eval_weighted_f1': 0.6410403495776756, 'eval_macro_f1': 0.4607877071500737, 'eval_runtime': 3.2984, 'eval_samples_per_second': 791.291, 'eval_steps_per_second': 49.721, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4562,1.408934,0.51037,0.403659,0.206536
2,1.0931,1.123991,0.637511,0.611225,0.416285
3,0.9562,1.144925,0.619477,0.592064,0.393581
4,0.8531,1.127971,0.634806,0.608026,0.425155
5,0.7758,1.14521,0.639315,0.620216,0.469129
6,0.6848,1.195663,0.631199,0.613186,0.465355
7,0.6346,1.198571,0.637511,0.623483,0.485906


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed44/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed44_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed44
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1867961883544922, 'eval_acc': 0.6532567049808429, 'eval_weighted_f1': 0.6459490997242914, 'eval_macro_f1': 0.4719797828680748, 'eval_runtime': 3.3111, 'eval_samples_per_second': 788.259, 'eval_steps_per_second': 49.53, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4745,1.552359,0.441839,0.299886,0.130189
2,1.0755,1.137187,0.624887,0.592779,0.391527
3,0.9645,1.083005,0.640216,0.607662,0.409522
4,0.8713,1.124163,0.641118,0.607491,0.428182
5,0.7809,1.143821,0.64202,0.619297,0.469652
6,0.7074,1.17429,0.64202,0.624946,0.482548
7,0.6413,1.183119,0.639315,0.622962,0.480685


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed45/checkpoint-7494
✅ Saved BEST folder to: roberta_meld_final_seed45_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed45
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1661546230316162, 'eval_acc': 0.6429118773946361, 'eval_weighted_f1': 0.6343424462993786, 'eval_macro_f1': 0.44388329029297224, 'eval_runtime': 3.3677, 'eval_samples_per_second': 775.006, 'eval_steps_per_second': 48.698, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5166,1.584179,0.427412,0.262209,0.092835
2,1.1422,1.139344,0.623986,0.589345,0.39468
3,1.0137,1.118995,0.63661,0.603078,0.406094
4,0.8801,1.141139,0.64202,0.609267,0.443433
5,0.8097,1.095982,0.648332,0.631,0.481581
6,0.7729,1.124524,0.653742,0.635893,0.504608


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_06


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5166,1.584179,0.427412,0.262209,0.092835
2,1.1422,1.139344,0.623986,0.589345,0.39468
3,1.0137,1.118995,0.63661,0.603078,0.406094
4,0.8801,1.141139,0.64202,0.609267,0.443433
5,0.8097,1.095982,0.648332,0.631,0.481581
6,0.7729,1.124524,0.653742,0.635893,0.504608
7,0.6831,1.141993,0.651037,0.6332,0.502614


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed46/checkpoint-7494
✅ Saved BEST folder to: roberta_meld_final_seed46_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed46
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1374412775039673, 'eval_acc': 0.6467432950191571, 'eval_weighted_f1': 0.6369860282134358, 'eval_macro_f1': 0.458966379841883, 'eval_runtime': 3.2727, 'eval_samples_per_second': 797.504, 'eval_steps_per_second': 50.111, 'epoch': 7.0}


Unnamed: 0,seed,best_ckpt,best_dir,epoch_root,test_acc,test_weighted_f1,test_macro_f1
0,42,roberta_meld_final_seed42/checkpoint-7494,roberta_meld_final_seed42_BEST,/content/epoch_checkpoints_seed42,0.650192,0.636932,0.446275
1,43,roberta_meld_final_seed43/checkpoint-8743,roberta_meld_final_seed43_BEST,/content/epoch_checkpoints_seed43,0.65249,0.64104,0.460788
2,44,roberta_meld_final_seed44/checkpoint-8743,roberta_meld_final_seed44_BEST,/content/epoch_checkpoints_seed44,0.653257,0.645949,0.47198
3,45,roberta_meld_final_seed45/checkpoint-7494,roberta_meld_final_seed45_BEST,/content/epoch_checkpoints_seed45,0.642912,0.634342,0.443883
4,46,roberta_meld_final_seed46/checkpoint-7494,roberta_meld_final_seed46_BEST,/content/epoch_checkpoints_seed46,0.646743,0.636986,0.458966



MEAN:


Unnamed: 0,mean
test_acc,0.649119
test_weighted_f1,0.63905
test_macro_f1,0.456379



STD:


Unnamed: 0,std
test_acc,0.004296
test_weighted_f1,0.004541
test_macro_f1,0.011486
