In [None]:
!pip -q install -U transformers datasets accelerate scikit-learn pandas optuna

import os, random, shutil
import numpy as np
import pandas as pd
import torch
import optuna

from collections import Counter
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, set_seed, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score
from transformers import TrainerCallback, TrainingArguments, Trainer


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m121.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m156.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolv

In [None]:
# =====================
# CONFIG (IEMOCAP 6-way)
# =====================

# Input CSVs, one per split.
TRAIN_CSV, VAL_CSV, TEST_CSV = (
    "/content/iemocap_emoberta_train.csv",
    "/content/iemocap_emoberta_val.csv",
    "/content/iemocap_emoberta_test.csv",
)

# Names of the columns read from the CSVs.
DIALOG_COL, UTTID_COL = "Dialogue_ID", "Utterance_ID"
SPEAKER_COL = "Speaker"     # "F" / "M"
TEXT_COL, LABEL_COL = "Utterance", "Emotion"

# IEMOCAP 6 emotions; the position in LABELS is the integer class id.
LABELS = ["neutral", "frustration", "sadness", "anger", "excited", "happiness"]
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained checkpoint that gets fine-tuned.
MODEL_BASE = "roberta-base"

# Paper-like constants (held fixed during the search).
EPOCHS = 5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.20
LR_SCHED = "linear"

# Optuna tunes ONLY the peak learning rate, inside [LR_LOW, LR_HIGH].
N_TRIALS = 5
LR_LOW, LR_HIGH = 1e-6, 1e-4

# Sequence length and batching defaults.
MAX_LEN = 512
BATCH_TRAIN, BATCH_EVAL = 8, 16
GRAD_ACCUM = 1

# Reproducibility / reporting: one seed for the search, five for the final runs.
SEED = 42
SEEDS_FINAL = list(range(42, 47))

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)
print("LABELS:", LABELS)

# ==========================================
# EmoBERTa-style speaker names (IEMOCAP)
# "Actor id" = SesXX + (F/M) -> name, i.e. one name per session and sex:
#   Ses01F..Ses05F -> MARY, PATRICIA, JENNIFER, LINDA, ELIZABETH
#   Ses01M..Ses05M -> JAMES, JOHN, ROBERT, MICHAEL, WILLIAM
# Your test split is Ses05 only, so this guarantees
# ELIZABETH/WILLIAM appear in test (like the paper notes).
# ==========================================
NAME_MAP = {
    f"Ses{session:02d}{sex}": name
    for sex, names in (
        ("F", ("MARY", "PATRICIA", "JENNIFER", "LINDA", "ELIZABETH")),
        ("M", ("JAMES", "JOHN", "ROBERT", "MICHAEL", "WILLIAM")),
    )
    for session, name in enumerate(names, start=1)
}



DEVICE: cuda
LABELS: ['neutral', 'frustration', 'sadness', 'anger', 'excited', 'happiness']


In [None]:
# ==========================
# Load + filter to IEMOCAP-6
# ==========================

IEMO6 = LABELS

# Map common label variants -> canonical names.
# Written as (canonical, variants) groups; every variant, including the
# canonical spelling itself, becomes a key that points at the canonical name.
LABEL_MAP = {
    variant: canonical
    for canonical, variants in (
        ("neutral",     ("neu", "neutral")),
        ("frustration", ("fru", "frustrated", "frustration")),
        ("sadness",     ("sad", "sadness")),
        ("anger",       ("ang", "anger")),
        ("excited",     ("exc", "excited")),
        ("happiness",   ("hap", "happy", "happiness")),
    )
    for variant in variants
}

def load_and_filter_iemocap6(path):
    """Read one split CSV, normalise its columns and keep the six target emotions.

    The text, speaker, utterance-id, dialogue-id and label columns are cast to
    ``str``; speakers are stripped and upper-cased; labels are stripped,
    lower-cased and mapped through ``LABEL_MAP``. Rows whose label is not in
    ``IEMO6`` are dropped. Shapes, columns and label counts are printed.

    Args:
        path: Location of the CSV file to read.

    Returns:
        pandas.DataFrame: A filtered copy (original row index is kept).
    """
    print(f"--- Processing {path} ---")
    raw = pd.read_csv(path)
    print("Original shape:", raw.shape)
    print("Columns:", raw.columns.tolist())

    # Cast every column we rely on to str, then apply the column-specific clean-up.
    as_text = {
        col: raw[col].astype(str)
        for col in (TEXT_COL, SPEAKER_COL, UTTID_COL, DIALOG_COL, LABEL_COL)
    }
    as_text[SPEAKER_COL] = as_text[SPEAKER_COL].str.strip().str.upper()
    as_text[LABEL_COL] = as_text[LABEL_COL].str.strip().str.lower().replace(LABEL_MAP)
    normalized = raw.assign(**as_text)

    # Keep only rows labelled with one of the 6 classes.
    wanted = normalized[LABEL_COL].isin(IEMO6)
    kept = normalized[wanted].copy()
    print("After label filtering:", kept.shape)
    print("Label counts:\n", kept[LABEL_COL].value_counts())

    return kept

# Load the three splits in train / val / test order.
train_df, val_df, test_df = (
    load_and_filter_iemocap6(csv_path)
    for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

print("Rows:", len(train_df), len(val_df), len(test_df))


--- Processing /content/iemocap_emoberta_train.csv ---
Original shape: (6468, 6)
Columns: ['Split', 'Dialogue_ID', 'Utterance_ID', 'Speaker', 'Utterance', 'Emotion']
After label filtering: (4778, 6)
Label counts:
 Emotion
neutral        1167
frustration    1149
sadness         739
anger           711
excited         620
happiness       392
Name: count, dtype: int64
--- Processing /content/iemocap_emoberta_val.csv ---
Original shape: (1401, 6)
Columns: ['Split', 'Dialogue_ID', 'Utterance_ID', 'Speaker', 'Utterance', 'Emotion']
After label filtering: (980, 6)
Label counts:
 Emotion
frustration    319
anger          222
neutral        157
excited        122
sadness        100
happiness       60
Name: count, dtype: int64
--- Processing /content/iemocap_emoberta_test.csv ---
Original shape: (2170, 6)
Columns: ['Split', 'Dialogue_ID', 'Utterance_ID', 'Speaker', 'Utterance', 'Emotion']
After label filtering: (1622, 6)
Label counts:
 Emotion
neutral        384
frustration    381
excited       

In [None]:


def reorder_iemocap_csv(df, dialog_col="Dialogue_ID", uttid_col="Utterance_ID", spk_col="Speaker"):
    """Return a copy of ``df`` sorted into per-dialogue turn order.

    Rows are ordered by dialogue id, then by the numeric suffix of the
    utterance id (``..._F003`` / ``..._M011`` -> 3 / 11), and, for equal
    suffixes, with the dialogue's "starter" speaker first. The starter is the
    F/M letter that follows ``SesNN`` in the dialogue id (``Ses01F_impro01``
    -> ``F``); dialogue ids without that pattern default to ``F``.

    Args:
        df: Input frame; it is not modified.
        dialog_col: Name of the dialogue-id column.
        uttid_col: Name of the utterance-id column.
        spk_col: Name of the speaker column (values are stripped/upper-cased).

    Returns:
        pandas.DataFrame: Sorted frame with a fresh 0..n-1 index; the three id
        columns are returned as normalised strings.

    Raises:
        ValueError: If an utterance id has no ``_F<digits>`` / ``_M<digits>``
            suffix (the integer cast of the missing match fails).
    """
    normalized = df.assign(**{
        dialog_col: df[dialog_col].astype(str),
        uttid_col: df[uttid_col].astype(str),
        spk_col: df[spk_col].astype(str).str.strip().str.upper(),
    })

    # Sort keys: turn number, and whether the row's speaker is NOT the starter.
    turn_number = normalized[uttid_col].str.extract(r"_[FM](\d+)$")[0].astype(int)
    first_speaker = (
        normalized[dialog_col].str.extract(r"^Ses\d{2}([FM])")[0].fillna("F").str.upper()
    )
    speaks_second = (normalized[spk_col] != first_speaker).astype(int)  # 0 = starter, 1 = other

    helper_cols = ["_idx", "_starter", "_prio"]
    ordered = (
        normalized
        .assign(_idx=turn_number, _starter=first_speaker, _prio=speaks_second)
        .sort_values([dialog_col, "_idx", "_prio"])
        .reset_index(drop=True)
    )
    return ordered.drop(columns=helper_cols)


In [None]:
# Put every split into per-dialogue turn order (train, then val, then test).
train_df, val_df, test_df = (
    reorder_iemocap_csv(split_df) for split_df in (train_df, val_df, test_df)
)


In [None]:
def max_run(speakers):
    """Return the length of the longest run of consecutive equal items.

    Args:
        speakers: Speaker labels in turn order. Any iterable is accepted
            (list, tuple, iterator, ...), not only sliceable sequences.

    Returns:
        int: Length of the longest streak of identical neighbouring labels;
        0 when ``speakers`` is empty.
    """
    items = iter(speakers)
    try:
        previous = next(items)
    except StopIteration:
        # No turns at all -> there is no run to report.
        return 0

    longest = current = 1
    for speaker in items:
        # Extend the streak on a repeat, otherwise start a new one.
        current = current + 1 if speaker == previous else 1
        if current > longest:
            longest = current
        previous = speaker
    return longest

def debug_order(df, name):
    """Print speaker-run statistics and the head of the first dialogue.

    For every ``Dialogue_ID`` the longest streak of consecutive turns by the
    same ``Speaker`` is computed with ``max_run``; the ``describe()`` summary
    of those streaks is printed, followed by the first 20 rows
    (``Utterance_ID`` and ``Speaker``) of the dialogue found in the first row.

    Args:
        df: Frame with ``Dialogue_ID``, ``Utterance_ID`` and ``Speaker`` columns.
        name: Label printed in front of the summary (e.g. the split name).
    """
    speakers_by_dialogue = df.groupby("Dialogue_ID")["Speaker"]
    runs = speakers_by_dialogue.apply(lambda turns: max_run(turns.tolist()))
    print(name, "max-run summary:", runs.describe())

    print("Example dialogue after reorder:")
    first_dialogue = df["Dialogue_ID"].iloc[0]
    in_first_dialogue = df["Dialogue_ID"] == first_dialogue
    preview = df[in_first_dialogue][["Utterance_ID", "Speaker"]]
    print(preview.head(20))

# Print speaker-run statistics and a sample dialogue for the training split.
debug_order(train_df, "TRAIN")


TRAIN max-run summary: count    100.000000
mean       5.610000
std        3.837337
min        1.000000
25%        3.000000
50%        4.000000
75%        6.000000
max       19.000000
Name: Speaker, dtype: float64
Example dialogue after reorder:
           Utterance_ID Speaker
0   Ses01F_impro01_F000       F
1   Ses01F_impro01_M000       M
2   Ses01F_impro01_F001       F
3   Ses01F_impro01_M001       M
4   Ses01F_impro01_F002       F
5   Ses01F_impro01_M002       M
6   Ses01F_impro01_M003       M
7   Ses01F_impro01_M004       M
8   Ses01F_impro01_F005       F
9   Ses01F_impro01_M005       M
10  Ses01F_impro01_F006       F
11  Ses01F_impro01_M006       M
12  Ses01F_impro01_F007       F
13  Ses01F_impro01_M007       M
14  Ses01F_impro01_F008       F
15  Ses01F_impro01_M008       M
16  Ses01F_impro01_F009       F
17  Ses01F_impro01_M009       M
18  Ses01F_impro01_M010       M
19  Ses01F_impro01_F011       F


In [None]:
# Tokenizer for MODEL_BASE (fast implementation requested), plus the collator
# that pads the examples of each batch; both are shared by every Trainer below.
tok = AutoTokenizer.from_pretrained(MODEL_BASE, use_fast=True)
collator = DataCollatorWithPadding(tokenizer=tok)

def compute_metrics(eval_pred):
    """Turn (logits, labels) into accuracy, weighted F1 and macro F1.

    Args:
        eval_pred: Pair ``(logits, y_true)``; the predicted class of each row
            is the argmax of its logits along axis 1.

    Returns:
        dict: ``{"acc", "weighted_f1", "macro_f1"}`` mapped to their scores.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    def f1_with(averaging):
        # Same gold/predicted pair, different class-averaging scheme.
        return f1_score(gold, predicted, average=averaging)

    return dict(
        acc=accuracy_score(gold, predicted),
        weighted_f1=f1_with("weighted"),
        macro_f1=f1_with("macro"),
    )

# Show the tokenizer's CLS/SEP tokens and their ids (printed as <s> / </s> below).
print("CLS:", tok.cls_token, tok.cls_token_id, "SEP:", tok.sep_token, tok.sep_token_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

CLS: <s> 0 SEP: </s> 2


In [None]:
# ==========================
# Cell: TARGET-SEP-ONLY context builder (+DEBUG +CSV save)
#   - ONLY two </s>: before and after TARGET
#   - NO </s> between past/future utterances
#   - target HAS speaker name too
# ==========================

def build_context_dataset_target_sep_only(
    df,
    tokenizer,
    max_length=512,
    speaker_caps=True,
    debug_n=3,
    insert_space_between_utts=True,   # keeps readability WITHOUT adding </s>
    include_raw_text=True
):
    """Build one context-augmented, pre-tokenized example per utterance.

    Every example is laid out as ``<s> PAST </s> TARGET </s> FUTURE`` where
    each utterance is rendered as ``"NAME: text"``. Only the two ``</s>``
    around the target are inserted; neighbouring utterances are separated by
    a space at most.

    Context selection: past and future utterances are added alternately,
    nearest first, while the sequence fits into ``max_length`` tokens
    (including ``<s>``). Each side stays contiguous: as soon as an utterance
    does not fit (or the dialogue boundary is reached) that side is closed,
    so the context never skips over a turn. The token budget is tracked with
    the exact encodings that are emitted, so the closing ``</s>`` of the
    target is never cut off by the final length clamp.

    Args:
        df: Frame with the columns named by ``DIALOG_COL``, ``UTTID_COL``,
            ``SPEAKER_COL``, ``TEXT_COL`` and ``LABEL_COL``. Not modified.
        tokenizer: Tokenizer providing ``encode``, ``decode``,
            ``cls_token_id`` and ``sep_token_id``.
        max_length: Maximum number of token ids per example, ``<s>`` included.
        speaker_caps: Upper-case the speaker names when True.
        debug_n: Number of examples to print for inspection.
        insert_space_between_utts: Encode every non-first utterance of the
            past/future block with a leading space.
        include_raw_text: Also build a readable ``context_text_raw`` string.

    Returns:
        datasets.Dataset: Columns ``dialogue_id``, ``utterance_id``,
        ``input_ids``, ``attention_mask``, ``labels`` and, when
        ``include_raw_text`` is True, ``context_text_raw``.
    """
    df = df.copy()

    # -------- normalize --------
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    df[SPEAKER_COL] = df[SPEAKER_COL].astype(str).str.strip().str.upper()
    df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip().str.lower().replace(LABEL_MAP)

    # keep only wanted labels
    df = df[df[LABEL_COL].isin(LABELS)].copy()

    # -------- ordering (IEMOCAP-style if possible, else numeric) --------
    # IEMOCAP Utterance_ID like: Ses01F_impro01_F003
    turn_ex = df[UTTID_COL].astype(str).str.extract(r"_[FM](\d+)$")[0]
    if turn_ex.notna().all():
        df["_turn"] = turn_ex.astype(int)
        df["_starter"] = df[DIALOG_COL].astype(str).str.extract(r"^Ses\d{2}([FM])")[0].fillna("F").str.upper()
        df["_prio"] = (df[SPEAKER_COL] != df["_starter"]).astype(int)
        df = df.sort_values([DIALOG_COL, "_turn", "_prio"]).reset_index(drop=True)
    else:
        # fallback: numeric order
        df[UTTID_COL] = pd.to_numeric(df[UTTID_COL], errors="coerce")
        df = df.dropna(subset=[DIALOG_COL, UTTID_COL]).copy()
        df[UTTID_COL] = df[UTTID_COL].astype(int)
        df = df.sort_values([DIALOG_COL, UTTID_COL]).reset_index(drop=True)

    # -------- speaker names (EmoBERTa NAME_MAP if available; else use SPEAKER) --------
    # If NAME_MAP doesn't exist, fallback to speaker labels
    has_name_map = "NAME_MAP" in globals()

    if has_name_map:
        # actor id = SesXX + Speaker(F/M) when Utterance_ID starts with SesXX
        df["_session"] = df[UTTID_COL].astype(str).str.extract(r"^(Ses\d{2})")[0]
        df["_actor"] = (df["_session"].fillna("UNK") + df[SPEAKER_COL])
        df["_name"] = df["_actor"].map(NAME_MAP)
        df["_name"] = df["_name"].fillna(df[SPEAKER_COL])  # fallback if unmapped
    else:
        df["_name"] = df[SPEAKER_COL]

    if speaker_caps:
        df["_name"] = df["_name"].astype(str).str.upper()

    cls_id = tokenizer.cls_token_id  # <s>
    sep_id = tokenizer.sep_token_id  # </s>

    # reserve one position for CLS; everything else must fit in max_tokens
    max_tokens = max_length - 1

    all_input_ids, all_attn, all_labels = [], [], []
    all_texts, all_dialog, all_turn = [], [], []

    dbg_printed = 0
    lengths = []
    sep_counts = []

    # encodings with/without leading space (so utterances don't glue)
    def enc_no_space(x): return tokenizer.encode(x, add_special_tokens=False)
    def enc_with_space(x): return tokenizer.encode(" " + x, add_special_tokens=False)

    for d_id, g in df.groupby(DIALOG_COL, sort=False):
        names = g["_name"].tolist()
        utts  = g[TEXT_COL].tolist()
        labs  = g[LABEL_COL].tolist()
        turns = g[UTTID_COL].tolist()

        seg_text = [f"{nm}: {u}" for nm, u in zip(names, utts)]

        seg_ids0 = [enc_no_space(x) for x in seg_text]                     # first in a run
        seg_ids1 = [enc_with_space(x) for x in seg_text] if insert_space_between_utts else seg_ids0  # subsequent

        n = len(seg_text)

        for t in range(n):
            target_text = seg_text[t]
            target_ids  = seg_ids0[t][:]

            # MUST fit: [SEP] + target + [SEP]
            base = 2 + len(target_ids)
            if base > max_tokens:
                # truncate target to fit
                keep = max(0, max_tokens - 2)
                target_ids = target_ids[:keep]
                base = 2 + len(target_ids)

            # LEFT and RIGHT are lists of utterance indices, both in dialogue order
            left_idxs, right_idxs = [], []
            left_len = 0        # exact token count of the LEFT block as it will be emitted
            left_tail_len = 0   # LEFT utterances counted in their with-space ("subsequent") encoding
            right_len = 0       # exact token count of the RIGHT block as it will be emitted

            # A side is closed for good once it runs out of utterances or one
            # does not fit; this keeps both context windows contiguous.
            left_open, right_open = True, True

            i = 0
            while left_open or right_open:
                i += 1

                # try add one past utterance
                if left_open:
                    li = t - i
                    if li < 0:
                        left_open = False
                    else:
                        # Prepending li makes it the FIRST left utterance (no-space
                        # encoding); everything already in LEFT is then emitted with
                        # the with-space encoding, which is what left_tail_len holds.
                        new_left_len = len(seg_ids0[li]) + left_tail_len
                        if base + new_left_len + right_len <= max_tokens:
                            left_idxs.insert(0, li)
                            left_len = new_left_len
                            left_tail_len += len(seg_ids1[li])
                        else:
                            left_open = False

                # try add one future utterance
                if right_open:
                    ri = t + i
                    if ri >= n:
                        right_open = False
                    else:
                        add_len = len(seg_ids0[ri]) if len(right_idxs) == 0 else len(seg_ids1[ri])
                        if base + left_len + right_len + add_len <= max_tokens:
                            right_idxs.append(ri)
                            right_len += add_len
                        else:
                            right_open = False

            # build LEFT ids (no sep between utts)
            left_ids = []
            for k, idx in enumerate(left_idxs):
                left_ids += (seg_ids0[idx] if k == 0 else seg_ids1[idx])

            # build RIGHT ids (no sep between utts)
            right_ids = []
            for k, idx in enumerate(right_idxs):
                right_ids += (seg_ids0[idx] if k == 0 else seg_ids1[idx])

            # final seq: LEFT + [SEP] + TARGET + [SEP] + RIGHT
            seq_ids = left_ids + [sep_id] + target_ids + [sep_id] + right_ids
            # safety net only: the budget above is exact, so this should not cut anything
            seq_ids = seq_ids[:max_tokens]

            input_ids = [cls_id] + seq_ids
            input_ids = input_ids[:max_length]

            all_input_ids.append(input_ids)
            all_attn.append([1]*len(input_ids))
            all_labels.append(label2id[labs[t]])
            all_dialog.append(d_id)
            all_turn.append(turns[t])

            # RAW text (for saving/debug): ONLY 2 </s> around target
            if include_raw_text:
                left_raw  = (" ".join([seg_text[i] for i in left_idxs]).strip() + (" " if left_idxs else "")) if insert_space_between_utts else "".join([seg_text[i] for i in left_idxs])
                right_raw = ((" " if right_idxs and insert_space_between_utts else "") + " ".join([seg_text[i] for i in right_idxs]).strip()) if insert_space_between_utts else "".join([seg_text[i] for i in right_idxs])

                raw = f"<s>{left_raw}</s>{target_text}</s>{right_raw}"
                all_texts.append(raw)

            lengths.append(len(input_ids))
            sep_counts.append(int(np.sum(np.array(input_ids) == sep_id)))

            # DEBUG prints
            if dbg_printed < debug_n:
                print("="*90)
                print(f"DEBUG {dbg_printed+1} | dialog={d_id} | target_turn={turns[t]} | label={labs[t]}")
                print(f"Left utts: {len(left_idxs)} | Right utts: {len(right_idxs)} | SEP count in input_ids: {sep_counts[-1]}")
                if include_raw_text:
                    # show PAST/CUR/FUT by splitting at </s>
                    parts = all_texts[-1].split("</s>")
                    # parts: ["<s>PAST", "TARGET", "FUTURE"]
                    print("\nRAW (constructed) split:")
                    print("PAST   :", parts[0].replace("<s>", "").strip()[:220])
                    print("CURRENT:", (parts[1].strip() if len(parts) > 1 else "")[:220])
                    print("FUTURE :", (parts[2].strip() if len(parts) > 2 else "")[:220])
                    print("\nRAW (first 900 chars):")
                    print(all_texts[-1][:900])

                print("\nDECODED (first 140 tokens):")
                print(tokenizer.decode(input_ids[:140], skip_special_tokens=False))
                dbg_printed += 1

    if lengths:
        print("\nToken length stats:",
              f"min={int(np.min(lengths))}, mean={float(np.mean(lengths)):.1f}, max={int(np.max(lengths))}, n={len(lengths)}")
        print("SEP counts stats:",
              f"min={int(np.min(sep_counts))}, mean={float(np.mean(sep_counts)):.2f}, max={int(np.max(sep_counts))}")
    else:
        # np.min / np.max raise on empty input; report the empty result instead
        print("\nToken length stats: no examples were built (n=0)")

    data = {
        "dialogue_id": all_dialog,
        "utterance_id": all_turn,
        "input_ids": all_input_ids,
        "attention_mask": all_attn,
        "labels": all_labels,
    }
    if include_raw_text:
        data["context_text_raw"] = all_texts

    return Dataset.from_dict(data)


def save_constructed_csv(ds, out_csv, id2label=None):
    """Write the readable part of a built dataset to a CSV file.

    The file has the columns ``dialogue_id``, ``utterance_id``, ``label_id``,
    ``label`` and ``context_text_raw`` (empty strings when the dataset has no
    such column). A confirmation line with the row count is printed.

    Args:
        ds: Object whose ``to_dict()`` returns column-name -> list of values.
        out_csv: Destination path of the CSV file.
        id2label: Optional dict from integer label id to label name. Ids that
            are missing from it, or every id when this is not a dict, are
            written as their string form.
    """
    columns = ds.to_dict()
    label_ids = columns["labels"]

    if isinstance(id2label, dict):
        label_names = [id2label.get(int(label_id), str(label_id)) for label_id in label_ids]
    else:
        label_names = [str(label_id) for label_id in label_ids]

    blank_texts = [""] * len(label_ids)
    table = pd.DataFrame({
        "dialogue_id": columns["dialogue_id"],
        "utterance_id": columns["utterance_id"],
        "label_id": label_ids,
        "label": label_names,
        "context_text_raw": columns.get("context_text_raw", blank_texts),
    })

    table.to_csv(out_csv, index=False)
    print("✅ Saved:", out_csv, "| rows:", len(table))


# ----------- BUILD (prints debug examples) -----------
# ----------- BUILD (prints debug examples) -----------
# Same settings for every split; only the number of printed debug examples
# differs (3 for train, 1 for val and test).
train_ds_full, val_ds_full, test_ds_full = (
    build_context_dataset_target_sep_only(
        split_df, tok,
        max_length=MAX_LEN, speaker_caps=True, debug_n=n_debug,
        insert_space_between_utts=True, include_raw_text=True,
    )
    for split_df, n_debug in ((train_df, 3), (val_df, 1), (test_df, 1))
)

print("Sizes:", len(train_ds_full), len(val_ds_full), len(test_ds_full))

# ----------- SAVE CSV locally in Colab -----------
for built_ds, csv_path in (
    (train_ds_full, "/content/train_constructed_targetSEPonly.csv"),
    (val_ds_full,   "/content/val_constructed_targetSEPonly.csv"),
    (test_ds_full,  "/content/test_constructed_targetSEPonly.csv"),
):
    save_constructed_csv(built_ds, csv_path, id2label=id2label)

!ls -lh /content/*constructed_targetSEPonly.csv



DEBUG 1 | dialog=Ses01F_impro01 | target_turn=Ses01F_impro01_F000 | label=neutral
Left utts: 0 | Right utts: 25 | SEP count in input_ids: 2

RAW (constructed) split:
PAST   : 
CURRENT: MARY: Excuse me.
FUTURE : JAMES: Do you have your forms? MARY: Yeah. JAMES: Let me see them. MARY: Is there a problem? JAMES: Who told you to get in this line? JAMES: Okay. But I didn't tell you to get in this line if you are filling out this par

RAW (first 900 chars):
<s></s>MARY: Excuse me.</s> JAMES: Do you have your forms? MARY: Yeah. JAMES: Let me see them. MARY: Is there a problem? JAMES: Who told you to get in this line? JAMES: Okay. But I didn't tell you to get in this line if you are filling out this particular form. JAMES: This form is a Z.X.four. MARY: Well what's the problem?  Let me change it. JAMES: You can't--  This is not the line for Z.X.four.  If you're going to fill out the Z.X.four, you need to have a different form of ID. MARY: What?  I'm getting an ID.  This is why I'm here.  My wa

In [None]:
import random

def get_one_context_example(ds, tok, id2label, idx=None, max_chars=3000):
    """Print one dataset example: its index, label and decoded model input.

    Args:
        ds: Indexable collection of examples with ``labels`` and ``input_ids``.
        tok: Tokenizer used to decode ``input_ids`` back to text.
        id2label: Dict from label id to label name; when the id is missing (or
            this is not a dict) the id itself is printed as the label.
        idx: Position of the example to show; a random one is drawn when None.
        max_chars: Maximum number of decoded characters to print.
    """
    chosen = random.randrange(len(ds)) if idx is None else idx

    example = ds[chosen]
    label_id = int(example["labels"])
    if isinstance(id2label, dict) and label_id in id2label:
        label_name = id2label[label_id]
    else:
        label_name = str(label_id)

    # Decode exactly what is fed to the model (special tokens included).
    decoded = tok.decode(example["input_ids"], skip_special_tokens=False)

    print("Index:", chosen)
    print("Label id:", label_id)
    print("Label:", label_name)
    print("\n--- Context input (decoded) ---\n")
    print(decoded[:max_chars])


# Inspect a fixed test example (index 100) rather than a randomly drawn one.
get_one_context_example(test_ds_full, tok, id2label, idx=100)



Index: 100
Label id: 4
Label: excited

--- Context input (decoded) ---

<s>ELIZABETH: Okay, so big news. WILLIAM: What? ELIZABETH: I'm getting married. WILLIAM: Come on, what? ELIZABETH: Yeah. ELIZABETH: Yeah. WILLIAM: No way.</s>ELIZABETH: Uh-huh.</s>ELIZABETH: Just a couple days ago. WILLIAM: No way, when? when,when, When did it happen? WILLIAM: Oh my gosh. ELIZABETH: I know me neither. WILLIAM: And you actually said yes.  I can't believe it.  I never thought you would get married. ELIZABETH: Right, I thought I would be way older too. I thought I would be I be at least thirty something. WILLIAM: Oh my gosh. ELIZABETH: Yeah. WILLIAM: Oh absolutely. WILLIAM: Well, we always said we'd be in our thirtieth when we got married. And we were going to get married to each other if we didn't by then ELIZABETH: Sorry. WILLIAM: Thirty nine, yeah.  I can't believe it.  So you broke our pact. Thanks a lot. WILLIAM: Thanks a lot.  Well, okay, you've got to tell me the details.  What did he say?  Whe

In [None]:
def objective(trial):
    """Optuna objective: fine-tune once with a sampled peak LR, return val loss.

    Only the learning rate is searched (log-uniform in ``[LR_LOW, LR_HIGH]``);
    every other hyper-parameter comes from the config constants. A fresh
    ``MODEL_BASE`` classifier is trained for ``EPOCHS`` epochs on
    ``train_ds_full`` and evaluated on ``val_ds_full`` after each epoch. No
    checkpoints are saved.

    Args:
        trial: The ``optuna`` trial that supplies the learning rate and whose
            number names the output directory.

    Returns:
        float: Validation cross-entropy loss after the last epoch (the study
        minimizes this value).
    """
    # Same seed for every trial, so trials differ only in the learning rate.
    set_seed(SEED)

    lr = trial.suggest_float("lr", LR_LOW, LR_HIGH, log=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    args = TrainingArguments(
        output_dir=f"optuna_lr_trial_{trial.number}",
        eval_strategy="epoch",
        save_strategy="no",

        learning_rate=lr,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,

        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHED,

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds_full,
        eval_dataset=val_ds_full,
        data_collator=collator,
        # `tokenizer=` is deprecated for Trainer (it emits a FutureWarning and
        # is scheduled for removal in Transformers 5.0); `processing_class=`
        # is its replacement.
        processing_class=tok,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    out = trainer.evaluate(val_ds_full)
    return out["eval_loss"]   # minimize cross entropy loss


In [None]:
# Seed the sampler so the sequence of suggested learning rates is the same on
# every run. set_seed() inside objective() does not control Optuna's sampler,
# which keeps its own random generator; TPESampler is Optuna's default sampler,
# so only the seeding changes here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

# Learning rate of the trial with the lowest validation loss.
best_lr = study.best_params["lr"]
print("Best lr:", best_lr)
print("Best val loss:", study.best_value)


[I 2026-01-21 10:04:06,137] A new study created in memory with name: no-name-55a33a5c-4bb7-4b50-8cad-4f6f405068b0


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2191,1.265707,0.432653,0.41585,0.387159
2,1.0326,1.185186,0.440816,0.43333,0.421062
3,0.8909,1.025729,0.568367,0.568778,0.553519
4,0.7427,1.090955,0.591837,0.578596,0.570806
5,0.5765,1.089786,0.617347,0.617871,0.598753


[I 2026-01-21 10:06:56,854] Trial 0 finished with value: 1.0897856950759888 and parameters: {'lr': 2.586343852736302e-05}. Best is trial 0 with value: 1.0897856950759888.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.755,1.361508,0.378571,0.357464,0.354127
2,1.1036,1.139011,0.536735,0.543367,0.494253
3,0.9995,1.102619,0.565306,0.556661,0.497217
4,0.9788,1.097143,0.565306,0.539204,0.471328
5,0.9597,1.09324,0.564286,0.548804,0.48337


[I 2026-01-21 10:09:40,629] Trial 1 finished with value: 1.0932395458221436 and parameters: {'lr': 2.6522535643416085e-06}. Best is trial 0 with value: 1.0897856950759888.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2531,1.656994,0.259184,0.162067,0.222814
2,1.1915,1.383577,0.37551,0.36924,0.360979
3,1.0703,1.192008,0.495918,0.482866,0.455282
4,1.0001,1.079814,0.554082,0.538379,0.487998
5,0.9101,1.140266,0.547959,0.538968,0.492699


[I 2026-01-21 10:12:24,588] Trial 2 finished with value: 1.1402655839920044 and parameters: {'lr': 9.592296981142845e-05}. Best is trial 0 with value: 1.0897856950759888.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2178,1.248782,0.437755,0.41031,0.436767
2,1.0247,1.17043,0.491837,0.479885,0.481427
3,0.8839,1.016796,0.566327,0.568026,0.549612
4,0.6982,1.085444,0.583673,0.578192,0.573192
5,0.5292,1.091195,0.603061,0.605452,0.587846


[I 2026-01-21 10:15:08,977] Trial 3 finished with value: 1.091194987297058 and parameters: {'lr': 2.7485933477214487e-05}. Best is trial 0 with value: 1.0897856950759888.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.7526,1.358523,0.382653,0.363028,0.357046
2,1.1,1.143017,0.533673,0.542832,0.496457
3,0.9979,1.100629,0.563265,0.553191,0.491413
4,0.9759,1.097785,0.568367,0.541657,0.474225
5,0.9564,1.092036,0.567347,0.550448,0.48455


[I 2026-01-21 10:17:53,596] Trial 4 finished with value: 1.0920357704162598 and parameters: {'lr': 2.7262122688355267e-06}. Best is trial 0 with value: 1.0897856950759888.


Best lr: 2.586343852736302e-05
Best val loss: 1.0897856950759888


In [None]:

# NOTE(review): presumably collects one result record per seed from the training
# loop below; where it is filled is not visible in this part of the notebook — confirm.
rows = []

# ---------- callback: save at end of each epoch ----------
class SaveByEpochCallback(TrainerCallback):
    def __init__(self, out_root, tokenizer):
        self.out_root = out_root
        self.tokenizer = tokenizer
        os.makedirs(out_root, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        model = kwargs["model"]
        ep = state.epoch
        ep_i = int(round(ep)) if ep is not None else 0

        save_dir = os.path.join(self.out_root, f"epoch_{ep_i:02d}")
        os.makedirs(save_dir, exist_ok=True)

        model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        print(f"✅ Saved epoch checkpoint to: {save_dir}")
        return control

# Final training: one full fine-tuning run per seed with the tuned learning rate,
# followed by a test-set evaluation and a mean/std summary across seeds.
for seed in SEEDS_FINAL:
    print("\n" + "="*20, "SEED", seed, "="*20)
    set_seed(seed)

    # Fresh model per seed so runs do not share weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_BASE,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    out_dir = f"roberta_iemocap_final_seed{seed}"

    # Start from an empty per-epoch snapshot folder so stale epochs from an
    # earlier run of this cell cannot be mistaken for the current run's.
    epoch_root = f"/content/epoch_checkpoints_seed{seed}"
    if os.path.exists(epoch_root):
        shutil.rmtree(epoch_root)
    os.makedirs(epoch_root, exist_ok=True)

    epoch_saver = SaveByEpochCallback(epoch_root, tok)

    args = TrainingArguments(
        output_dir=out_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,

        # Model selection is done on validation weighted F1 (higher is better).
        load_best_model_at_end=True,
        metric_for_best_model="weighted_f1",
        greater_is_better=True,

        learning_rate=best_lr,
        num_train_epochs=7,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        gradient_accumulation_steps=GRAD_ACCUM,

        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type=LR_SCHED,

        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=seed,
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds_full,
        eval_dataset=val_ds_full,
        data_collator=collator,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
        processing_class=tok,
        compute_metrics=compute_metrics,
        callbacks=[epoch_saver],
    )

    trainer.train()

    best_ckpt = trainer.state.best_model_checkpoint
    print("Best checkpoint:", best_ckpt)

    # Save clean BEST folder
    best_dir = f"{out_dir}_BEST"
    if os.path.exists(best_dir):
        shutil.rmtree(best_dir)
    if best_ckpt is not None:
        shutil.copytree(best_ckpt, best_dir)
    else:
        # No best checkpoint was tracked: persist the weights the trainer
        # currently holds instead of crashing inside copytree(None, ...).
        trainer.save_model(best_dir)
    tok.save_pretrained(best_dir)
    print("Saved BEST folder:", best_dir)

    # Evaluate on the held-out test split with the model the trainer holds after training.
    test_metrics = trainer.evaluate(test_ds_full)
    print("TEST:", test_metrics)

    rows.append({
        "seed": seed,
        "best_dir": best_dir,
        "test_acc": float(test_metrics["eval_acc"]),
        "test_weighted_f1": float(test_metrics["eval_weighted_f1"]),
        "test_macro_f1": float(test_metrics["eval_macro_f1"]),
    })

# One row per seed, then mean / standard deviation of the test metrics across seeds.
df = pd.DataFrame(rows)
display(df)

metric_cols = ["test_acc", "test_weighted_f1", "test_macro_f1"]

print("\nMEAN:")
display(df[metric_cols].mean().to_frame("mean"))

print("\nSTD:")
display(df[metric_cols].std().to_frame("std"))





Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2563,1.251161,0.384694,0.327899,0.353761
2,1.0709,1.185963,0.478571,0.469015,0.464839
3,0.9305,0.999772,0.57449,0.563773,0.534984
4,0.7918,1.197585,0.562245,0.553721,0.549684
5,0.5909,1.058768,0.618367,0.617703,0.600234
6,0.4352,1.213009,0.626531,0.627484,0.612663
7,0.2942,1.367649,0.614286,0.617596,0.602544


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed42/epoch_07
Best checkpoint: roberta_iemocap_final_seed42/checkpoint-3588
Saved BEST folder: roberta_iemocap_final_seed42_BEST


TEST: {'eval_loss': 1.1718772649765015, 'eval_acc': 0.6454993834771886, 'eval_weighted_f1': 0.6427046084874507, 'eval_macro_f1': 0.6299404648198165, 'eval_runtime': 3.0052, 'eval_samples_per_second': 539.737, 'eval_steps_per_second': 33.942, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.224,1.339984,0.362245,0.326596,0.337434
2,1.06,1.091467,0.54898,0.535364,0.502464
3,0.9462,1.23937,0.49898,0.49075,0.470539
4,0.7656,1.017392,0.588776,0.587842,0.567116
5,0.5723,1.015207,0.641837,0.637219,0.611307
6,0.3806,1.198361,0.628571,0.631924,0.615001
7,0.2664,1.421959,0.617347,0.621198,0.607527


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed43/epoch_07
Best checkpoint: roberta_iemocap_final_seed43/checkpoint-2990
Saved BEST folder: roberta_iemocap_final_seed43_BEST


TEST: {'eval_loss': 1.0495119094848633, 'eval_acc': 0.6504315659679408, 'eval_weighted_f1': 0.6431163709745321, 'eval_macro_f1': 0.6243667545152575, 'eval_runtime': 2.9954, 'eval_samples_per_second': 541.491, 'eval_steps_per_second': 34.052, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2339,1.140525,0.559184,0.52533,0.460333
2,1.0872,1.088836,0.572449,0.549784,0.524068
3,0.9624,1.01313,0.560204,0.538464,0.515349
4,0.7795,0.986651,0.59898,0.59856,0.57508
5,0.5826,1.073924,0.631633,0.63248,0.614786
6,0.3911,1.404115,0.607143,0.61716,0.606339
7,0.2822,1.4816,0.626531,0.633912,0.620315


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed44/epoch_07
Best checkpoint: roberta_iemocap_final_seed44/checkpoint-4186
Saved BEST folder: roberta_iemocap_final_seed44_BEST


TEST: {'eval_loss': 1.3581148386001587, 'eval_acc': 0.627003699136868, 'eval_weighted_f1': 0.625685062310035, 'eval_macro_f1': 0.6139348688507779, 'eval_runtime': 2.9697, 'eval_samples_per_second': 546.175, 'eval_steps_per_second': 34.346, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2918,1.162431,0.569388,0.551656,0.504657
2,1.1154,1.120174,0.505102,0.475953,0.420397
3,0.9643,1.096402,0.543878,0.521309,0.498609
4,0.8266,0.98414,0.59898,0.587765,0.572051
5,0.6025,1.0608,0.609184,0.616176,0.610387
6,0.4348,1.180939,0.605102,0.614451,0.611528
7,0.3135,1.226885,0.632653,0.636355,0.627343


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed45/epoch_07
Best checkpoint: roberta_iemocap_final_seed45/checkpoint-4186
Saved BEST folder: roberta_iemocap_final_seed45_BEST


TEST: {'eval_loss': 1.1959228515625, 'eval_acc': 0.6479654747225647, 'eval_weighted_f1': 0.6442308798791231, 'eval_macro_f1': 0.6248491244843811, 'eval_runtime': 3.0493, 'eval_samples_per_second': 531.932, 'eval_steps_per_second': 33.451, 'epoch': 7.0}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc,Weighted F1,Macro F1
1,1.2838,1.075985,0.559184,0.536051,0.471444
2,1.0795,1.295517,0.507143,0.490267,0.428847
3,0.9475,1.009839,0.573469,0.559699,0.524871
4,0.7206,0.996048,0.611224,0.612342,0.599157
5,0.5447,1.221761,0.57449,0.587918,0.578481
6,0.3796,1.297473,0.611224,0.61675,0.609807
7,0.2957,1.414811,0.621429,0.628318,0.615558


✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_01
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_02
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_03
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_04
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_05
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_06
✅ Saved epoch checkpoint to: /content/epoch_checkpoints_seed46/epoch_07
Best checkpoint: roberta_iemocap_final_seed46/checkpoint-4186
Saved BEST folder: roberta_iemocap_final_seed46_BEST


TEST: {'eval_loss': 1.23854398727417, 'eval_acc': 0.6491985203452528, 'eval_weighted_f1': 0.6493854579151817, 'eval_macro_f1': 0.6391471083433394, 'eval_runtime': 2.9828, 'eval_samples_per_second': 543.789, 'eval_steps_per_second': 34.196, 'epoch': 7.0}


Unnamed: 0,seed,best_dir,test_acc,test_weighted_f1,test_macro_f1
0,42,roberta_iemocap_final_seed42_BEST,0.645499,0.642705,0.62994
1,43,roberta_iemocap_final_seed43_BEST,0.650432,0.643116,0.624367
2,44,roberta_iemocap_final_seed44_BEST,0.627004,0.625685,0.613935
3,45,roberta_iemocap_final_seed45_BEST,0.647965,0.644231,0.624849
4,46,roberta_iemocap_final_seed46_BEST,0.649199,0.649385,0.639147



MEAN:


Unnamed: 0,mean
test_acc,0.64402
test_weighted_f1,0.641024
test_macro_f1,0.626448



STD:


Unnamed: 0,std
test_acc,0.009685
test_weighted_f1,0.008982
test_macro_f1,0.009178


In [None]:
from google.colab import drive
drive.mount('/content/drive')

SRC = "/content/epoch_checkpoints_seed46"
DST = "/content/drive/MyDrive/epoch_checkpoints_seed46_emoberta_roberta_iemocap/"

!rsync -ah --progress "$SRC" "$DST"
!ls -lh "/content/drive/MyDrive/epoch_checkpoints_seed46_emoberta_roberta_iemocap/"

Mounted at /content/drive
sending incremental file list
created directory /content/drive/MyDrive/epoch_checkpoints_seed46_emoberta_roberta_iemocap
epoch_checkpoints_seed46/
epoch_checkpoints_seed46/epoch_01/
epoch_checkpoints_seed46/epoch_01/config.json
            970 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=48/57)
epoch_checkpoints_seed46/epoch_01/merges.txt
        456.32K 100%   39.56MB/s    0:00:00 (xfr#2, to-chk=47/57)
epoch_checkpoints_seed46/epoch_01/model.safetensors
        498.63M 100%  411.35MB/s    0:00:01 (xfr#3, to-chk=46/57)
epoch_checkpoints_seed46/epoch_01/special_tokens_map.json
            280 100%    1.74kB/s    0:00:00 (xfr#4, to-chk=45/57)
epoch_checkpoints_seed46/epoch_01/tokenizer.json
          3.56M 100%   15.50MB/s    0:00:00 (xfr#5, to-chk=44/57)
epoch_checkpoints_seed46/epoch_01/tokenizer_config.json
          1.25K 100%    5.56kB/s    0:00:00 (xfr#6, to-chk=43/57)
epoch_checkpoints_seed46/epoch_01/vocab.json
        798.29K 100%    3.00MB/s    0:00:00 (