In [None]:
!pip -q install -U transformers datasets accelerate scikit-learn pandas optuna

import os, random, shutil
import numpy as np
import pandas as pd
import torch
import optuna

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, set_seed, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m140.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m124.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolve

In [None]:
# --- Locations of the three MELD CSV splits ---
TRAIN_CSV, VAL_CSV, TEST_CSV = (
    "/content/train_sent_emo.csv",
    "/content/dev_sent_emo.csv",
    "/content/test_sent_emo.csv",
)

# --- Names of the CSV columns this notebook reads ---
DIALOG_COL, UTTID_COL = "Dialogue_ID", "Utterance_ID"
SPEAKER_COL, TEXT_COL, LABEL_COL = "Speaker", "Utterance", "Emotion"

# --- Label space: seven emotion classes; a label's list position is its class id ---
LABELS = ["neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"]
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}

# --- Pretrained checkpoint that gets fine-tuned ---
MODEL_BASE = "roberta-base"

# --- Optimisation constants (labelled "paper constants" by the notebook author) ---
WEIGHT_DECAY = 0.01   # L2 regularisation rate
EPOCHS = 7            # epochs for the final runs
WARMUP_RATIO = 0.20   # fraction of steps used for warmup before the decay phase
LR_SCHED = "linear"

# --- Optuna learning-rate search ---
N_TRIALS = 5
LR_LOW, LR_HIGH = 1e-6, 1e-4

# --- Sequence length and batch sizes ---
MAX_LEN = 512
BATCH_TRAIN, BATCH_EVAL, GRAD_ACCUM = 8, 16, 1

SEED = 42

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)


DEVICE: cuda


In [None]:
# Read the train / validation / test CSVs into DataFrames, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

# Row count of each split, as a quick check that every file loaded.
print("Rows:", *(len(frame) for frame in (train_df, val_df, test_df)))


Rows: 9989 1109 2610


In [None]:
# Fast tokenizer for the base checkpoint; add_prefix_space=True is forwarded to it.
tok = AutoTokenizer.from_pretrained(
    MODEL_BASE,
    use_fast=True,
    add_prefix_space=True,
)

# Collator handed to every Trainer below; it is built around the same tokenizer.
collator = DataCollatorWithPadding(tokenizer=tok)

def compute_metrics(eval_pred):
    """Compute accuracy, weighted-F1 and macro-F1 from a Trainer evaluation result.

    Args:
        eval_pred: a ``(logits, labels)`` pair as passed by ``Trainer``.
            ``logits`` is a 2-D array of per-class scores; if it arrives as a
            tuple of model outputs, its first element is used.

    Returns:
        dict with the keys ``"acc"``, ``"weighted_f1"`` and ``"macro_f1"``.
    """
    logits, y_true = eval_pred

    # Some model configurations return several outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    y_pred = np.argmax(logits, axis=1)

    # zero_division=0 yields the same score as the default (0 for a class that
    # is never predicted) but without an UndefinedMetricWarning per evaluation.
    return {
        "acc": accuracy_score(y_true, y_pred),
        "weighted_f1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset

def build_context_dataset_with_text_target_has_speaker(
    df, tokenizer, max_length=512, speaker_caps=True, debug_n=3
):
    """Turn an utterance table into one context-window example per utterance.

    Every utterance is rendered as ``"SPEAKER: text"`` and tokenized without
    special tokens. For a target utterance the token sequence is laid out as
    ``<cls> [left context] <sep> target <sep> [right context]``. Neighbouring
    utterances of the same dialogue are added alternately on the left and on
    the right (left first) while the total stays within ``max_length`` tokens;
    a side stops growing the first time its next utterance does not fit.

    Rows whose utterance id is not numeric, or whose label (lower-cased and
    stripped) is not in ``LABELS``, are dropped before windows are built.

    Args:
        df: DataFrame holding the columns named by ``DIALOG_COL``,
            ``UTTID_COL``, ``SPEAKER_COL``, ``TEXT_COL`` and ``LABEL_COL``.
        tokenizer: object providing ``encode``, ``decode``, ``cls_token_id``
            and ``sep_token_id``.
        max_length: maximum number of token ids per example, CLS included.
        speaker_caps: upper-case speaker names when True.
        debug_n: how many examples to print for visual inspection.

    Returns:
        ``datasets.Dataset`` with the columns ``dialogue_id``,
        ``utterance_id``, ``context_text_raw``, ``input_ids``,
        ``attention_mask`` and ``labels``. ``context_text_raw`` is the plain
        string concatenation of the pieces (no separators between utterances)
        and is meant for inspection only.
    """
    df = df.copy()

    # Normalise text, speaker and label columns to strings.
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    df[SPEAKER_COL] = df[SPEAKER_COL].astype(str)
    df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip().str.lower()

    # Utterance ids drive the ordering, so they must be integers; rows whose
    # id cannot be parsed are dropped.
    df[UTTID_COL] = pd.to_numeric(df[UTTID_COL], errors="coerce")
    df = df.dropna(subset=[UTTID_COL]).copy()
    df[UTTID_COL] = df[UTTID_COL].astype(int)

    # Keep only known labels, then order utterances within each dialogue.
    df = df[df[LABEL_COL].isin(LABELS)].copy()
    df = df.sort_values([DIALOG_COL, UTTID_COL]).reset_index(drop=True)

    cls_id = tokenizer.cls_token_id  # <s>
    sep_id = tokenizer.sep_token_id  # </s>

    max_tokens = max_length - 1  # one slot is reserved for the leading CLS

    all_input_ids, all_attn, all_labels = [], [], []
    all_texts, all_dialog, all_turn = [], [], []

    dbg_printed = 0
    lengths = []

    for d_id, g in df.groupby(DIALOG_COL, sort=False):
        speakers = g[SPEAKER_COL].tolist()
        utts     = g[TEXT_COL].tolist()
        labs     = g[LABEL_COL].tolist()
        turns    = g[UTTID_COL].tolist()

        if speaker_caps:
            speakers = [s.upper() for s in speakers]

        # Every utterance (target and context alike) carries its speaker name.
        seg_text = [f"{s}: {u}" for s, u in zip(speakers, utts)]
        seg_ids  = [tokenizer.encode(x, add_special_tokens=False) for x in seg_text]
        n = len(seg_ids)

        for t in range(n):
            # Target segment: [SEP] + tokens("SPEAKER: utterance") + [SEP].
            target_ids = seg_ids[t][:]

            # Truncate the target so that both SEP tokens still fit.
            if len(target_ids) + 2 > max_tokens:
                target_ids = target_ids[: max(0, max_tokens - 2)]

            seq_ids  = [sep_id] + target_ids + [sep_id]
            seq_text = "</s>" + seg_text[t] + "</s>"

            left, right = t - 1, t + 1
            blocked_left = blocked_right = False

            # Grow the window one utterance at a time, alternating sides,
            # until neither side can accept its next utterance.
            while True:
                changed = False

                # Prepend the next left utterance (no SEP between utterances).
                if left >= 0 and not blocked_left:
                    cand = seg_ids[left]
                    if len(seq_ids) + len(cand) <= max_tokens:
                        seq_ids  = cand + seq_ids
                        seq_text = seg_text[left] + seq_text
                        left -= 1
                        changed = True
                    else:
                        blocked_left = True

                # Append the next right utterance (no SEP between utterances).
                if right < n and not blocked_right:
                    cand = seg_ids[right]
                    if len(seq_ids) + len(cand) <= max_tokens:
                        seq_ids  = seq_ids + cand
                        seq_text = seq_text + seg_text[right]
                        right += 1
                        changed = True
                    else:
                        blocked_right = True

                if not changed:
                    break

            input_ids = [cls_id] + seq_ids
            # Safety net for very small max_length values.
            input_ids = input_ids[:max_length]

            all_input_ids.append(input_ids)
            all_attn.append([1]*len(input_ids))
            all_labels.append(label2id[labs[t]])

            # Human-readable rendering of the window; starts with <s>.
            all_texts.append("<s>" + seq_text)
            all_dialog.append(d_id)
            all_turn.append(turns[t])
            lengths.append(len(input_ids))

            if dbg_printed < debug_n:
                print("="*80)
                print(f"DEBUG {dbg_printed+1} | dialog={d_id} | uttid={turns[t]} | label={labs[t]}")
                print("RAW strict:")
                print(all_texts[-1][:1200])
                print("\nDECODED (first 120 tokens):")
                print(tokenizer.decode(input_ids[:120], skip_special_tokens=False))
                dbg_printed += 1

    # np.min / np.max raise on an empty sequence, so only report statistics
    # when at least one example was built.
    if lengths:
        print("\nToken length stats:",
              f"min={int(np.min(lengths))}, mean={float(np.mean(lengths)):.1f}, max={int(np.max(lengths))}, n={len(lengths)}")
    else:
        print("\nToken length stats: no examples were built (n=0)")

    return Dataset.from_dict({
        "dialogue_id": all_dialog,
        "utterance_id": all_turn,
        "context_text_raw": all_texts,
        "input_ids": all_input_ids,
        "attention_mask": all_attn,
        "labels": all_labels
    })



def save_constructed_csv(ds, out_csv, id2label=None):
    """Write the inspection columns of a constructed dataset to a CSV file.

    Args:
        ds: object whose ``to_dict()`` returns the columns ``dialogue_id``,
            ``utterance_id``, ``labels`` and ``context_text_raw``.
        out_csv: destination path of the CSV file.
        id2label: optional dict mapping label ids to names. Ids missing from
            the dict, or every id when no dict is given, are written as their
            string form.
    """
    columns = ds.to_dict()
    label_ids = columns["labels"]

    # Resolve the human-readable label once, outside the DataFrame literal.
    if isinstance(id2label, dict):
        label_names = [id2label.get(int(label_id), str(label_id)) for label_id in label_ids]
    else:
        label_names = [str(label_id) for label_id in label_ids]

    table = pd.DataFrame({
        "dialogue_id": columns["dialogue_id"],
        "utterance_id": columns["utterance_id"],
        "label_id": label_ids,
        "label": label_names,
        "context_text_raw": columns["context_text_raw"],
    })
    table.to_csv(out_csv, index=False)
    print("✅ Saved:", out_csv, "| rows:", len(table))


# ----------- BUILD (prints debug examples) -----------
# Same builder settings for every split; only the number of debug dumps differs
# (3 for train, 1 each for validation and test).
train_ds_full, val_ds_full, test_ds_full = (
    build_context_dataset_with_text_target_has_speaker(
        split_df, tok, max_length=MAX_LEN, speaker_caps=True, debug_n=n_debug
    )
    for split_df, n_debug in ((train_df, 3), (val_df, 1), (test_df, 1))
)

print("Sizes:", len(train_ds_full), len(val_ds_full), len(test_ds_full))

# ----------- SAVE CSV locally in Colab -----------
for split_ds, csv_path in (
    (train_ds_full, "/content/train_constructed_context_targetSpeaker.csv"),
    (val_ds_full,   "/content/val_constructed_context_targetSpeaker.csv"),
    (test_ds_full,  "/content/test_constructed_context_targetSpeaker.csv"),
):
    save_constructed_csv(split_ds, csv_path, id2label=id2label)

!ls -lh /content/*constructed_context_targetSpeaker.csv


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

In [None]:
# Take the first constructed training example and look at its token ids.
ex = train_ds_full[0]
ids = ex["input_ids"]

# Decode the first 120 tokens with special tokens left visible.
decoded_head = tok.decode(ids[:120], skip_special_tokens=False)
print(decoded_head)

# input_ids and attention_mask should report the same length.
for tag, field in (("len(input_ids):", "input_ids"), ("len(attn):", "attention_mask")):
    print(tag, len(ex[field]))


<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see
len(input_ids): 237
len(attn): 237


In [None]:
import hashlib
import numpy as np

def ds_fingerprint(ds, n=50):
    """Return an MD5 hex digest over the first ``n`` rows of ``ds``.

    For each row the comma-joined ``input_ids`` and then the string form of
    ``labels`` are fed into the hash, so two datasets with identical leading
    rows produce the same fingerprint.

    Args:
        ds: indexable collection of rows with ``"input_ids"`` and ``"labels"``.
        n: maximum number of leading rows to include.

    Returns:
        Hexadecimal digest string.
    """
    digest = hashlib.md5()
    row_count = min(n, len(ds))
    for idx in range(row_count):
        row = ds[idx]
        joined_ids = ",".join(str(token_id) for token_id in row["input_ids"])
        digest.update(joined_ids.encode())
        digest.update(str(row["labels"]).encode())
    return digest.hexdigest()

n_train, n_val = len(train_ds_full), len(val_ds_full)
print("train size:", n_train, "val size:", n_val)

# Fingerprints make it easy to see whether a rebuild produced the same data.
print("fingerprints:")
for tag, split in ((" train:", train_ds_full), (" val  :", val_ds_full)):
    print(tag, ds_fingerprint(split))

# Decode the start of the first training example, special tokens included.
first_ids = train_ds_full[0]["input_ids"]
print("\nDECODE sample 0 (first 200 tokens):")
print(tok.decode(first_ids[:200], skip_special_tokens=False))

# Label-id histogram over at most the first 5000 training examples.
sample_size = min(n_train, 5000)
y = [train_ds_full[i]["labels"] for i in range(sample_size)]
label_ids, label_counts = np.unique(y, return_counts=True)
print("\nLabel id dist (sample):", dict(zip(label_ids, label_counts)))


train size: 9989 val size: 1109
fingerprints:
 train: cccd9727df86c0707d29da994cee67e0
 val  : 9e59501324961135316b2ae32a6d4dde

DECODE sample 0 (first 200 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER: You must’ve had your hands full. CHANDLER: That I did. That I did. THE INTERVIEWER: So let’s talk a little bit about your duties. CHANDLER: My duties?  All right. THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties. CHANDLER: I see. THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them. CHANDLER: Good to know. THE INTERVIEWER: We can go into detail CHANDLER: No don’t I beg of you! THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday

Label id dist (sample): {np.int64(0): np.int64(2380), np.int64(1): np.int64(829), np.int64(2): np.int64(330), np.int64(3): np.int64(549), np.int64(4): np.int64(62

In [None]:
# String view of the raw label column, shared by the three checks below.
raw_labels = train_df[LABEL_COL].astype(str)

print("Raw label counts (train_df):")
print(raw_labels.value_counts().head(20))

# Distinct label spellings after lower-casing.
u = raw_labels.str.lower().unique()
print("\nUnique labels (raw, lower):")
print(sorted(u)[:50])

# Anything listed here would be dropped by the dataset builder's label filter.
tmp = raw_labels.str.lower().str.strip()
bad = tmp.loc[~tmp.isin(LABELS)]
print("\nLabels NOT in LABELS after lower+strip:")
print(bad.value_counts().head(20))


Raw label counts (train_df):
Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: count, dtype: int64

Unique labels (raw, lower):
['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

Labels NOT in LABELS after lower+strip:
Series([], Name: count, dtype: int64)


In [None]:
# Tokenized (train, val) datasets shared by all Optuna trials, keyed by max_len.
# Re-running this cell clears the cache; do so after changing train_df / val_df.
_OBJECTIVE_DATASETS = {}


def _objective_datasets(max_len):
    """Return the (train, val) datasets for tuning, building them on first use only."""
    if max_len not in _OBJECTIVE_DATASETS:
        _OBJECTIVE_DATASETS[max_len] = tuple(
            build_context_dataset_with_text_target_has_speaker(
                split_df, tok, max_length=max_len, speaker_caps=True, debug_n=0
            )
            for split_df in (train_df, val_df)
        )
    return _OBJECTIVE_DATASETS[max_len]


def objective(trial):
    """Optuna objective: fine-tune with a sampled learning rate, return validation loss.

    Only the learning rate is searched (log-uniform in ``[LR_LOW, LR_HIGH]``);
    sequence length, batch sizes, weight decay, warmup ratio and scheduler come
    from the notebook's configuration constants.

    Args:
        trial: the ``optuna`` trial supplying the learning-rate suggestion.

    Returns:
        Cross-entropy loss on the validation set after the last epoch
        (the study minimises this value).
    """
    import gc

    set_seed(SEED)

    lr = trial.suggest_float("lr", LR_LOW, LR_HIGH, log=True)

    # The datasets do not depend on the trial, so they are tokenized once and
    # reused instead of being rebuilt (and debug-printed) for every trial.
    train_ds, val_ds = _objective_datasets(MAX_LEN)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    args = TrainingArguments(
        output_dir=f"optuna_lr_trial_{trial.number}",
        eval_strategy="epoch",
        save_strategy="no",

        learning_rate=lr,
        # NOTE(review): trials run 5 epochs while the final runs use EPOCHS;
        # with a ratio-based warmup the LR schedules differ - confirm intended.
        num_train_epochs=5,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,

        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHED,

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
        processing_class=tok,
        compute_metrics=compute_metrics,
    )

    try:
        trainer.train()
        out = trainer.evaluate(val_ds)

        # Minimise the cross-entropy loss on the validation set.
        return out["eval_loss"]
    finally:
        # Release this trial's model before the next trial allocates a new one.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [None]:
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# learning rates (and therefore the selected best_lr) repeatable across runs.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

print("Best lr:", study.best_params["lr"])
print("Best val loss:", study.best_value)


[I 2026-01-20 14:33:55,396] A new study created in memory with name: no-name-5f089beb-3f3a-460d-bbc5-ad0f4b8b77ec


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2593,1.236041,0.592426,0.54385,0.346467
2,1.0445,1.115081,0.622182,0.589867,0.395237
3,0.9019,1.136385,0.62128,0.596289,0.447983
4,0.7333,1.171695,0.63661,0.616525,0.474985
5,0.6115,1.217077,0.635708,0.620274,0.489341


[I 2026-01-20 14:39:16,852] Trial 0 finished with value: 1.2170774936676025 and parameters: {'lr': 1.0728028327811167e-05}. Best is trial 0 with value: 1.2170774936676025.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4906,1.496027,0.470694,0.356205,0.178064
2,1.1099,1.132333,0.624887,0.594199,0.397413
3,1.0106,1.105267,0.633003,0.60145,0.406914
4,0.9113,1.123612,0.638413,0.608366,0.410598
5,0.8341,1.10756,0.635708,0.60847,0.411426


[I 2026-01-20 14:44:34,949] Trial 1 finished with value: 1.1075599193572998 and parameters: {'lr': 4.972898684151652e-06}. Best is trial 1 with value: 1.1075599193572998.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2338,1.26095,0.580703,0.545669,0.350347
2,1.1407,1.238746,0.588819,0.54059,0.368985
3,0.9232,1.250932,0.599639,0.568515,0.408924
4,0.661,1.44074,0.593327,0.573242,0.424739
5,0.4392,1.765734,0.600541,0.58401,0.426419


[I 2026-01-20 14:49:57,760] Trial 2 finished with value: 1.7657341957092285 and parameters: {'lr': 4.474481578107121e-05}. Best is trial 1 with value: 1.1075599193572998.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4252,1.359338,0.537421,0.481045,0.284131
2,1.0767,1.110809,0.629396,0.599398,0.39928
3,0.9409,1.09557,0.632101,0.601334,0.414676
4,0.8144,1.120881,0.643823,0.620017,0.46262
5,0.7247,1.143676,0.633003,0.614268,0.478437


[I 2026-01-20 14:55:20,581] Trial 3 finished with value: 1.1436761617660522 and parameters: {'lr': 7.008157893248042e-06}. Best is trial 1 with value: 1.1075599193572998.


DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2022,1.33685,0.554554,0.504805,0.310457
2,1.0505,1.184602,0.613165,0.570677,0.427152
3,0.8307,1.156722,0.628494,0.610264,0.483506
4,0.6064,1.298187,0.612263,0.596834,0.45768
5,0.3897,1.534635,0.611362,0.597861,0.46628


[I 2026-01-20 15:00:54,646] Trial 4 finished with value: 1.5346354246139526 and parameters: {'lr': 2.4641669275356723e-05}. Best is trial 1 with value: 1.1075599193572998.


Best lr: 4.972898684151652e-06
Best val loss: 1.1075599193572998


In [None]:
# ==========================
# ✅ Save model+tokenizer per EPOCH and per SEED
# - Saves: /content/epoch_checkpoints_seed{seed}/epoch_01, epoch_02, ...
# - Also keeps the Trainer's "best checkpoint" and copies it to *_BEST
# ==========================

import os, shutil
import pandas as pd
from transformers import TrainerCallback, TrainingArguments, Trainer

# Learning rate selected by the Optuna study.
best_lr = study.best_params["lr"]

# One final training run per seed.
SEEDS = list(range(42, 47))
MAX_LEN = 512

# The datasets are identical for every seed, so they are built a single time
# here (train, then validation, then test).
train_ds, val_ds, test_ds = (
    build_context_dataset_with_text_target_has_speaker(
        split_df, tok, max_length=MAX_LEN, speaker_caps=True
    )
    for split_df in (train_df, val_df, test_df)
)

# Collects one result record per seed.
rows = list()

# ---------- callback: save at end of each epoch ----------
class SaveByEpochCallback(TrainerCallback):
    """Trainer callback that stores model and tokenizer after every epoch.

    Each epoch is written to its own sub-folder ``<out_root>/epoch_XX``, where
    ``XX`` is the rounded epoch number, zero-padded to two digits.
    """

    def __init__(self, out_root, tokenizer):
        """Remember the destination root and tokenizer; create the root folder."""
        self.out_root, self.tokenizer = out_root, tokenizer
        os.makedirs(self.out_root, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save ``kwargs["model"]`` and the tokenizer for the epoch that just ended."""
        model = kwargs["model"]

        # state.epoch is a float (or None); fall back to 0 when it is unset.
        if state.epoch is None:
            epoch_number = 0
        else:
            epoch_number = int(round(state.epoch))

        save_dir = os.path.join(self.out_root, f"epoch_{epoch_number:02d}")
        os.makedirs(save_dir, exist_ok=True)

        for artifact in (model, self.tokenizer):
            artifact.save_pretrained(save_dir)

        print(f"✅ Saved epoch checkpoint to: {save_dir}")
        return control


# Train one model per seed with the tuned learning rate, keep the checkpoint
# with the best validation weighted-F1, then evaluate that model on the test set.
for seed in SEEDS:
    print("\n" + "="*20, "SEED", seed, "="*20)
    set_seed(seed)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    out_dir = f"roberta_meld_final_seed{seed}"

    # Per-seed folder for the epoch checkpoints; it is wiped first so that
    # folders from an earlier run cannot be mistaken for this run's output.
    epoch_root = f"/content/epoch_checkpoints_seed{seed}"
    if os.path.exists(epoch_root):
        shutil.rmtree(epoch_root)
    os.makedirs(epoch_root, exist_ok=True)

    epoch_saver = SaveByEpochCallback(epoch_root, tok)

    args = TrainingArguments(
        output_dir=out_dir,

        # Evaluate and checkpoint once per epoch so the best epoch can be restored.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,  # keeps only 2 trainer checkpoints (we keep all epochs separately)

        load_best_model_at_end=True,
        metric_for_best_model="weighted_f1",
        greater_is_better=True,

        learning_rate=best_lr,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,

        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHED,

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=seed,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
        processing_class=tok,
        compute_metrics=compute_metrics,
        callbacks=[epoch_saver],   # save model+tokenizer per epoch
    )

    trainer.train()

    best_ckpt = trainer.state.best_model_checkpoint
    print("Best checkpoint (trainer):", best_ckpt)

    # ===== Copy the best trainer checkpoint to a stable, seed-named folder =====
    best_dir = f"{out_dir}_BEST"
    if os.path.exists(best_dir):
        shutil.rmtree(best_dir)
    shutil.copytree(best_ckpt, best_dir)
    tok.save_pretrained(best_dir)
    print("✅ Saved BEST folder to:", best_dir)

    # Show (up to ten of) the epoch folders saved for this seed.
    print("✅ Epoch checkpoints saved in:", epoch_root)
    for entry in sorted(os.listdir(epoch_root))[:10]:
        print(entry)

    # ===== Test (only after training; the best model is loaded at this point) =====
    test_metrics = trainer.evaluate(test_ds)
    print("TEST:", test_metrics)

    rows.append({
        "seed": seed,
        "best_ckpt": best_ckpt,
        "best_dir": best_dir,
        "epoch_root": epoch_root,
        "test_acc": float(test_metrics["eval_acc"]),
        "test_weighted_f1": float(test_metrics["eval_weighted_f1"]),
        "test_macro_f1": float(test_metrics["eval_macro_f1"]),
    })

# One row per seed with checkpoint locations and test metrics.
df = pd.DataFrame(rows)
display(df)

# Mean and standard deviation of the test metrics across seeds.
metric_cols = ["test_acc", "test_weighted_f1", "test_macro_f1"]
for heading, stat in (("\nMEAN:", "mean"), ("\nSTD:", "std")):
    print(heading)
    display(getattr(df[metric_cols], stat)().to_frame(stat))



DEBUG 1 | dialog=0 | uttid=0 | label=neutral
RAW strict:
<s></s>CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s>THE INTERVIEWER: You must’ve had your hands full.CHANDLER: That I did. That I did.THE INTERVIEWER: So let’s talk a little bit about your duties.CHANDLER: My duties?  All right.THE INTERVIEWER: Now you’ll be heading a whole division, so you’ll have a lot of duties.CHANDLER: I see.THE INTERVIEWER: But there’ll be perhaps 30 people under you so you can dump a certain amount on them.CHANDLER: Good to know.THE INTERVIEWER: We can go into detailCHANDLER: No don’t I beg of you!THE INTERVIEWER: All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.CHANDLER: Really?!THE INTERVIEWER: Absolutely.  You can relax

DECODED (first 120 tokens):
<s></s> CHANDLER: also I was the point person on my company’s transition from the KL-5 to GR-6 system.</s> THE INTERVIEWER:

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5086,1.531017,0.427412,0.2602,0.092444
2,1.1343,1.168596,0.61587,0.581789,0.386592
3,1.0276,1.121961,0.61587,0.578337,0.386971
4,0.9171,1.120456,0.634806,0.605474,0.409377
5,0.8247,1.137018,0.623084,0.605542,0.450338
6,0.8086,1.114167,0.641118,0.622255,0.47706
7,0.7455,1.151201,0.633904,0.618066,0.479456


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed42/checkpoint-7494
✅ Saved BEST folder to: roberta_meld_final_seed42_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed42
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.078357219696045, 'eval_acc': 0.6616858237547892, 'eval_weighted_f1': 0.6464172520548721, 'eval_macro_f1': 0.4550462351046301, 'eval_runtime': 3.5993, 'eval_samples_per_second': 725.135, 'eval_steps_per_second': 45.564, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5099,1.568082,0.423805,0.252297,0.085045
2,1.1174,1.181856,0.60505,0.561359,0.36842
3,1.0295,1.102624,0.631199,0.602614,0.408188
4,0.9286,1.146169,0.632101,0.599963,0.40098
5,0.8357,1.118927,0.641118,0.608906,0.41128
6,0.7888,1.162418,0.637511,0.609911,0.436243
7,0.6933,1.14234,0.645627,0.625377,0.469063


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed43/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed43_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed43
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1253620386123657, 'eval_acc': 0.6513409961685823, 'eval_weighted_f1': 0.6381059026358126, 'eval_macro_f1': 0.4355106072710531, 'eval_runtime': 3.2512, 'eval_samples_per_second': 802.79, 'eval_steps_per_second': 50.444, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4734,1.524739,0.442741,0.297155,0.124017
2,1.0973,1.127389,0.627592,0.597296,0.401364
3,0.9881,1.124941,0.620379,0.594669,0.398809
4,0.8986,1.067699,0.638413,0.610986,0.417707
5,0.8219,1.123457,0.640216,0.62039,0.479419
6,0.7242,1.148867,0.644725,0.625086,0.487236
7,0.702,1.135028,0.642922,0.628329,0.495127


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed44/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed44_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed44
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1278702020645142, 'eval_acc': 0.6505747126436782, 'eval_weighted_f1': 0.6421233388095827, 'eval_macro_f1': 0.4666039923669872, 'eval_runtime': 3.3211, 'eval_samples_per_second': 785.878, 'eval_steps_per_second': 49.381, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.4663,1.564532,0.441839,0.300286,0.130455
2,1.0919,1.16519,0.612263,0.582491,0.385305
3,0.9771,1.085408,0.651037,0.622169,0.422691
4,0.9111,1.167386,0.62128,0.580879,0.391667
5,0.8103,1.158566,0.63661,0.611006,0.459189
6,0.7419,1.166516,0.641118,0.625272,0.484931
7,0.7081,1.178739,0.645627,0.626372,0.487915


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed45/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed45_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed45
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.1546725034713745, 'eval_acc': 0.6509578544061303, 'eval_weighted_f1': 0.6399867978994701, 'eval_macro_f1': 0.44646654385752427, 'eval_runtime': 3.3294, 'eval_samples_per_second': 783.919, 'eval_steps_per_second': 49.258, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.5123,1.577675,0.422002,0.287453,0.116397
2,1.1945,1.146279,0.614067,0.578761,0.381823
3,1.044,1.125682,0.623986,0.586422,0.387322
4,0.9124,1.093232,0.645627,0.612143,0.415769
5,0.8637,1.072092,0.633003,0.619451,0.470335
6,0.7961,1.096621,0.64202,0.624895,0.490392
7,0.7275,1.106538,0.641118,0.627026,0.494962


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_07
Best checkpoint (trainer): roberta_meld_final_seed46/checkpoint-8743
✅ Saved BEST folder to: roberta_meld_final_seed46_BEST
✅ Epoch checkpoints saved in: /content/epoch_checkpoints_seed46
epoch_01
epoch_02
epoch_03
epoch_04
epoch_05
epoch_06
epoch_07


TEST: {'eval_loss': 1.0994726419448853, 'eval_acc': 0.6505747126436782, 'eval_weighted_f1': 0.641174287288045, 'eval_macro_f1': 0.4551888625015904, 'eval_runtime': 3.4118, 'eval_samples_per_second': 764.995, 'eval_steps_per_second': 48.069, 'epoch': 7.0}


Unnamed: 0,seed,best_ckpt,best_dir,epoch_root,test_acc,test_weighted_f1,test_macro_f1
0,42,roberta_meld_final_seed42/checkpoint-7494,roberta_meld_final_seed42_BEST,/content/epoch_checkpoints_seed42,0.661686,0.646417,0.455046
1,43,roberta_meld_final_seed43/checkpoint-8743,roberta_meld_final_seed43_BEST,/content/epoch_checkpoints_seed43,0.651341,0.638106,0.435511
2,44,roberta_meld_final_seed44/checkpoint-8743,roberta_meld_final_seed44_BEST,/content/epoch_checkpoints_seed44,0.650575,0.642123,0.466604
3,45,roberta_meld_final_seed45/checkpoint-8743,roberta_meld_final_seed45_BEST,/content/epoch_checkpoints_seed45,0.650958,0.639987,0.446467
4,46,roberta_meld_final_seed46/checkpoint-8743,roberta_meld_final_seed46_BEST,/content/epoch_checkpoints_seed46,0.650575,0.641174,0.455189



MEAN:


Unnamed: 0,mean
test_acc,0.653027
test_weighted_f1,0.641562
test_macro_f1,0.451763



STD:


Unnamed: 0,std
test_acc,0.004851
test_weighted_f1,0.003101
test_macro_f1,0.011565


In [None]:
from google.colab import drive
drive.mount('/content/drive')

SRC = "/content/epoch_checkpoints_seed42"
DST = "/content/drive/MyDrive/epoch_checkpoints_seed42_emoberta_roberta/"

!rsync -ah --progress "$SRC" "$DST"
!ls -lh "/content/drive/MyDrive/epoch_checkpoints_seed42_emoberta_roberta/"

Mounted at /content/drive
sending incremental file list
created directory /content/drive/MyDrive/epoch_checkpoints_seed42_emoberta_roberta
epoch_checkpoints_seed42/
epoch_checkpoints_seed42/epoch_01/
epoch_checkpoints_seed42/epoch_01/config.json
            984 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=48/57)
epoch_checkpoints_seed42/epoch_01/merges.txt
        456.32K 100%   21.76MB/s    0:00:00 (xfr#2, to-chk=47/57)
epoch_checkpoints_seed42/epoch_01/model.safetensors
        498.63M 100%  154.44MB/s    0:00:03 (xfr#3, to-chk=46/57)
epoch_checkpoints_seed42/epoch_01/special_tokens_map.json
            280 100%    3.33kB/s    0:00:00 (xfr#4, to-chk=45/57)
epoch_checkpoints_seed42/epoch_01/tokenizer.json
          3.56M 100%   22.33MB/s    0:00:00 (xfr#5, to-chk=44/57)
epoch_checkpoints_seed42/epoch_01/tokenizer_config.json
          1.25K 100%    8.00kB/s    0:00:00 (xfr#6, to-chk=43/57)
epoch_checkpoints_seed42/epoch_01/vocab.json
        798.29K 100%    4.05MB/s    0:00:00 (xfr#7, t