In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# Full fixed pipeline in one Colab cell (train on train, validate on dev, no test)
# Uses a custom collate_fn to work with TensorDataset
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries
!pip install transformers datasets evaluate

# 2) Imports & mount Drive
import os
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate
from google.colab import drive

drive.mount('/content/drive')

# 3) Configuration
base_dir       = "/content/drive/MyDrive/MELD"
train_csv      = os.path.join(base_dir, "train_for_student_ss.csv")
dev_csv        = os.path.join(base_dir, "dev_for_student_ss.csv")
OUTPUT_DIR     = os.path.join(base_dir, "student_roberta_base_cased_erc_weighted_f1")

MODEL_CHECKPOINT = "roberta-base"
EMOTIONS        = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
label2id        = {emo: i for i, emo in enumerate(EMOTIONS)}
NUM_LABELS      = len(EMOTIONS)

MAX_LEN     = 128
BATCH_SIZE  = 16
EPOCHS      = 4
LR          = 2e-5

# 4) Load CSVs into pandas and convert emotion string → integer
df_train = pd.read_csv(train_csv)
df_dev   = pd.read_csv(dev_csv)

df_train["label"] = df_train["Emotion"].map(label2id)
df_dev["label"]   = df_dev["Emotion"].map(label2id)

# 5) Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 6) Tokenize each split, returning PyTorch tensors (fixed length MAX_LEN)
train_encodings = tokenizer(
    list(df_train["student_input"]),
    padding="max_length",
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt"
)

dev_encodings = tokenizer(
    list(df_dev["student_input"]),
    padding="max_length",
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt"
)

# 7) Extract input_ids, attention_mask, and labels for each split
train_input_ids      = train_encodings["input_ids"]      # [N_train, MAX_LEN]
train_attention_mask = train_encodings["attention_mask"] # [N_train, MAX_LEN]
train_labels         = torch.tensor(df_train["label"].values, dtype=torch.long)

dev_input_ids        = dev_encodings["input_ids"]      # [N_dev, MAX_LEN]
dev_attention_mask   = dev_encodings["attention_mask"] # [N_dev, MAX_LEN]
dev_labels           = torch.tensor(df_dev["label"].values, dtype=torch.long)

# 8) Wrap into TensorDataset objects
train_torch = TensorDataset(train_input_ids, train_attention_mask, train_labels)
dev_torch   = TensorDataset(dev_input_ids,   dev_attention_mask,   dev_labels)

# 9) Load student model & define metrics (accuracy + weighted F1)
student_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)

accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    acc   = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    f1w   = f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    return {"accuracy": acc, "f1_weighted": f1w}

# 10) Custom collate function for TensorDataset batches
def collate_fn(batch):
    """
    Batch is a list of tuples: (input_ids, attention_mask, label)
    We need to stack them into tensors and return a dict with keys
    'input_ids', 'attention_mask', 'labels'.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = torch.stack([item[2] for item in batch])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# 11) Set up TrainingArguments (validate on dev_torch each epoch)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)

# 12) Initialize Trainer & start fine‐tuning
trainer = Trainer(
    model=student_model,
    args=training_args,
    train_dataset=train_torch,
    eval_dataset=dev_torch,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student fine‐tuning complete. Model saved to:", OUTPUT_DIR)




KeyboardInterrupt: 

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# FULL STUDENT PLM TRAINING (CLASSIFYING ON Fcls WITH MANUAL TENSOR DATASETS)
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries (run once)
!pip install transformers datasets evaluate

# 2) Imports & Mount Drive
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    RobertaConfig,
    PreTrainedModel,
    RobertaModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')

# 3) Configuration / file paths / hyperparameters
BASE_DIR         = "/content/drive/MyDrive/MELD"
TRAIN_CSV        = os.path.join(BASE_DIR, "train_for_student_ss.csv")
DEV_CSV          = os.path.join(BASE_DIR, "dev_for_student_ss.csv")
OUTPUT_DIR       = os.path.join(BASE_DIR, "student_roberta_base_Fcls")
MODEL_CHECKPOINT = "roberta-base"
EMOTIONS         = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NUM_LABELS       = len(EMOTIONS)
MAX_LEN          = 128
BATCH_SIZE       = 8
EPOCHS           = 4
LR               = 9e-5
WEIGHT_DECAY     = 0.01
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4) Define Roberta+Fcls→MLP model (unchanged)
class RobertaForERCWithFcls(PreTrainedModel):
    config_class = RobertaConfig

    def __init__(self, config: RobertaConfig):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        hidden_size = config.hidden_size        # 768
        num_labels  = config.num_labels         # 7

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(3 * hidden_size, hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.Linear(hidden_size, num_labels)
        )
        self.post_init()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                spans=None,   # Tensor shape [B,2]
                labels=None,
                **kwargs
               ):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hs = outputs.last_hidden_state  # [B, S, H]
        B, S, H = hs.size()

        # Build Fp, Fq, Ff
        Fp = torch.zeros((B, H), device=hs.device)
        Fq = torch.zeros((B, H), device=hs.device)
        Ff = torch.zeros((B, H), device=hs.device)

        for i, (a, b) in enumerate(spans.tolist()):
            if a > 0:
                Fp[i] = hs[i, :a, :].mean(dim=0)
            if (b >= a) and (a >= 0):
                Fq[i] = hs[i, a : (b + 1), :].mean(dim=0)
            if (b + 1) < S and (b >= 0):
                Ff[i] = hs[i, (b + 1) : S, :].mean(dim=0)

        Fcls = torch.cat([Fp, Fq, Ff], dim=1)  # [B, 3H]
        logits = self.classifier(Fcls)         # [B, num_labels]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

# 5) Load CSVs into pandas
df_train = pd.read_csv(TRAIN_CSV)   # columns: "student_input", "Emotion"
df_dev   = pd.read_csv(DEV_CSV)

# 6) Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 7) Helper to find span indices in a tokenized list
def find_span_indices(text):
    enc_no_special = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LEN
    )
    ids = enc_no_special["input_ids"]
    try:
        a = ids.index(tokenizer.convert_tokens_to_ids("<s>"))
        b = ids.index(tokenizer.convert_tokens_to_ids("</s>"), a + 1)
    except ValueError:
        a, b = -1, -1
    return a, b

# 8) Manually build tensors for train split
train_input_ids_list, train_attn_mask_list, train_spans_list, train_labels_list = [], [], [], []
for _, row in df_train.iterrows():
    text = row["student_input"]
    a, b = find_span_indices(text)
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    train_input_ids_list.append(enc["input_ids"].squeeze(0))
    train_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    train_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    train_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

train_input_ids      = torch.stack(train_input_ids_list)      # [N_train, MAX_LEN]
train_attention_mask = torch.stack(train_attn_mask_list)       # [N_train, MAX_LEN]
train_spans          = torch.stack(train_spans_list)           # [N_train, 2]
train_labels         = torch.stack(train_labels_list)          # [N_train]

# 9) Same for dev split
dev_input_ids_list, dev_attn_mask_list, dev_spans_list, dev_labels_list = [], [], [], []
for _, row in df_dev.iterrows():
    text = row["student_input"]
    a, b = find_span_indices(text)
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    dev_input_ids_list.append(enc["input_ids"].squeeze(0))
    dev_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    dev_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    dev_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

dev_input_ids      = torch.stack(dev_input_ids_list)      # [N_dev, MAX_LEN]
dev_attention_mask = torch.stack(dev_attn_mask_list)       # [N_dev, MAX_LEN]
dev_spans          = torch.stack(dev_spans_list)           # [N_dev, 2]
dev_labels         = torch.stack(dev_labels_list)          # [N_dev]

# 10) Build TensorDatasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_spans,
    train_labels
)
dev_dataset = TensorDataset(
    dev_input_ids,
    dev_attention_mask,
    dev_spans,
    dev_labels
)

# 11) Custom collate_fn for DataLoader / Trainer
def collate_fn(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    spans          = torch.stack([item[2] for item in batch])
    labels         = torch.stack([item[3] for item in batch])
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "spans":          spans,
        "labels":         labels
    }

# 12) Instantiate model
config = RobertaConfig.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)
model = RobertaForERCWithFcls.from_pretrained(
    MODEL_CHECKPOINT,
    config=config
).to(DEVICE)

# 13) Evaluation metrics (accuracy + weighted F1)
accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to NumPy array for metric computation
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    # Labels are already passed as a NumPy array by the Trainer,
    # so no need to call .numpy() again.
    # labels = labels.numpy() # Remove this line
    return {
        "accuracy":     accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted":  f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

# 14) TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)
# ───────────────────────────────────────────────────────────────────────────────
# (A) Sanity‐check forward pass on 4 examples — run this *after* you do:
#     model = RobertaForERCWithFcls.from_pretrained(...).to(DEVICE)
# but *before* you call trainer.train().
# ───────────────────────────────────────────────────────────────────────────────
model.eval()
with torch.no_grad():
    # Grab the first 4 examples from your train_dataset (TensorDataset)
    # train_dataset was constructed as:
    #   train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_spans, train_labels)
    input_ids_batch      = train_input_ids[:4].to(DEVICE)      # [4, MAX_LEN]
    attention_mask_batch = train_attention_mask[:4].to(DEVICE) # [4, MAX_LEN]
    spans_batch          = train_spans[:4].to(DEVICE)          # [4, 2]
    labels_batch         = train_labels[:4].to(DEVICE)         # [4]

    out = model(
        input_ids=input_ids_batch,
        attention_mask=attention_mask_batch,
        spans=spans_batch,
        labels=labels_batch
    )
    logits = out["logits"]  # shape should be [4, 7]
    loss   = out["loss"]    # should be a finite scalar ≈ something > 0

    print("→ Forward‐pass test (4 examples):")
    print("   logits.shape:", logits.shape)
    print("   sample logits (row 0):", logits[0].cpu().numpy())
    print("   sample loss:", loss.item())
# ───────────────────────────────────────────────────────────────────────────────

# 15) Initialize Trainer & fine-tune
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student PLM (with Fcls MLP) fine‐tuning complete. Model saved to:", OUTPUT_DIR)

# 16) (Optional) Final evaluation on validation set
metrics = trainer.evaluate(eval_dataset=dev_dataset)
print("Validation results:", metrics)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForERCWithFcls were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


→ Forward‐pass test (4 examples):
   logits.shape: torch.Size([4, 7])
   sample logits (row 0): [nan nan nan nan nan nan nan]
   sample loss: nan


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# FULL STUDENT PLM TRAINING (FREEZING FIRST 8 ROBERTA-LARGE LAYERS, CLASSIFYING ON Fcls)
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries (run once)
!pip install transformers datasets evaluate

# 2) Imports & Mount Drive
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    RobertaConfig,
    PreTrainedModel,
    RobertaModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')

# 3) Configuration / file paths / hyperparameters
BASE_DIR         = "/content/drive/MyDrive/MELD"
TRAIN_CSV        = os.path.join(BASE_DIR, "train_for_student_ss.csv")
DEV_CSV          = os.path.join(BASE_DIR, "dev_for_student_ss.csv")
OUTPUT_DIR       = os.path.join(BASE_DIR, "student_roberta_large_Fcls")
MODEL_CHECKPOINT = "roberta-large"
EMOTIONS         = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NUM_LABELS       = len(EMOTIONS)
MAX_LEN          = 128
BATCH_SIZE       = 8
EPOCHS           = 4
LR               = 9e-5
WEIGHT_DECAY     = 0.01
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4) Define Roberta-large + Fcls→MLP model
class RobertaForERCWithFcls(PreTrainedModel):
    """
    RoBERTa‐based model that:
    - Receives (input_ids, attention_mask, spans) each forward
    - Runs RoBERTa
    - Mean‐pools over [0:a), [a:b+1], [b+1:end] to form Fp, Fq, Ff
    - Concatenates [Fp; Fq; Ff] ∈ ℝ^{3H}
    - Feeds through a two‐layer MLP head (3H→H→num_labels)
    """
    config_class = RobertaConfig

    def __init__(self, config: RobertaConfig):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        hidden_size = config.hidden_size        # 1024 for roberta-large
        num_labels  = config.num_labels         # 7

        # Two-layer MLP: (3H → H) → Tanh → Dropout → (H → num_labels)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(3 * hidden_size, hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.Linear(hidden_size, num_labels)
        )
        self.post_init()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                spans=None,   # Tensor shape [B,2]
                labels=None,
                **kwargs
               ):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hs = outputs.last_hidden_state  # [B, S, H]
        B, S, H = hs.size()

        # Build Fp, Fq, Ff
        Fp = torch.zeros((B, H), device=hs.device)
        Fq = torch.zeros((B, H), device=hs.device)
        Ff = torch.zeros((B, H), device=hs.device)

        for i, (a, b) in enumerate(spans.tolist()):
            if a > 0:
                Fp[i] = hs[i, :a, :].mean(dim=0)
            if (b >= a) and (a >= 0):
                Fq[i] = hs[i, a : (b + 1), :].mean(dim=0)
            if (b + 1) < S and (b >= 0):
                Ff[i] = hs[i, (b + 1) : S, :].mean(dim=0)

        Fcls = torch.cat([Fp, Fq, Ff], dim=1)  # [B, 3H]
        logits = self.classifier(Fcls)         # [B, num_labels]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

# 5) Load CSVs into pandas
df_train = pd.read_csv(TRAIN_CSV)   # columns: "student_input", "Emotion"
df_dev   = pd.read_csv(DEV_CSV)

# 6) Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 7) Helper to find span indices in a tokenized list
def find_span_indices(text):
    enc_no_special = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LEN
    )
    ids = enc_no_special["input_ids"]
    try:
        a = ids.index(tokenizer.convert_tokens_to_ids("<s>"))
        b = ids.index(tokenizer.convert_tokens_to_ids("</s>"), a + 1)
    except ValueError:
        a, b = -1, -1
    return a, b

# 8) Manually build tensors for train split
train_input_ids_list, train_attn_mask_list, train_spans_list, train_labels_list = [], [], [], []
for _, row in df_train.iterrows():
    text = row["student_input"]
    a, b = find_span_indices(text)
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    train_input_ids_list.append(enc["input_ids"].squeeze(0))
    train_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    train_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    train_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

train_input_ids      = torch.stack(train_input_ids_list)      # [N_train, MAX_LEN]
train_attention_mask = torch.stack(train_attn_mask_list)       # [N_train, MAX_LEN]
train_spans          = torch.stack(train_spans_list)           # [N_train, 2]
train_labels         = torch.stack(train_labels_list)          # [N_train]

# 9) Same for dev split
dev_input_ids_list, dev_attn_mask_list, dev_spans_list, dev_labels_list = [], [], [], []
for _, row in df_dev.iterrows():
    text = row["student_input"]
    a, b = find_span_indices(text)
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    dev_input_ids_list.append(enc["input_ids"].squeeze(0))
    dev_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    dev_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    dev_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

dev_input_ids      = torch.stack(dev_input_ids_list)      # [N_dev, MAX_LEN]
dev_attention_mask = torch.stack(dev_attn_mask_list)       # [N_dev, MAX_LEN]
dev_spans          = torch.stack(dev_spans_list)           # [N_dev, 2]
dev_labels         = torch.stack(dev_labels_list)          # [N_dev]

# 10) Build TensorDatasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_spans,
    train_labels
)
dev_dataset   = TensorDataset(
    dev_input_ids,
    dev_attention_mask,
    dev_spans,
    dev_labels
)

# 11) Custom collate_fn for DataLoader / Trainer
def collate_fn(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    spans          = torch.stack([item[2] for item in batch])
    labels         = torch.stack([item[3] for item in batch])
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "spans":          spans,
        "labels":         labels
    }

# 12) Instantiate model
config = RobertaConfig.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)
model = RobertaForERCWithFcls.from_pretrained(
    MODEL_CHECKPOINT,
    config=config
).to(DEVICE)

# ─── Freeze first 8 Transformer layers of RoBERTa‐large ─────────────────────────
for idx, layer in enumerate(model.roberta.encoder.layer):
    if idx < 8:
        for param in layer.parameters():
            param.requires_grad = False
# Now only layers 8–23 and the MLP head will be trained.

# 13) Prepare evaluation metrics (accuracy + weighted F1)
accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    # labels might already be a NumPy array
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    return {
        "accuracy":     accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted":  f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

# 14) Define TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)

# 15) Initialize Trainer & fine-tune end-to-end
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student PLM (with Fcls MLP & first‐8 layers frozen) fine‐tuning complete. "
      "Checkpoint saved at:", OUTPUT_DIR)

# 16) (Optional) Final evaluation on validation set
metrics = trainer.evaluate(eval_dataset=dev_dataset)
print("Validation results:", metrics)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForERCWithFcls were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.0,,0.138087,0.033509


KeyboardInterrupt: 

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# SANITY CHECKS: spans, labels, forward‐pass
# ───────────────────────────────────────────────────────────────────────────────

# 1a) Print the first 10 spans from your training set:
print("First 10 'spans' for train:")
for i in range(min(10, len(train_spans_list))):
    print(f"  Example {i}: student_input =\n    {df_train['student_input'].iloc[i]}\n"
          f"    spans (a,b) = {train_spans_list[i].tolist()}")

# If you see many or all “[-1, -1]”, that means your `<s>`/`</s>` markers
# are not being found properly. You need to confirm that *every* student_input
# actually contains the literal tokens "<s>" and "</s>" after tokenization.

# 1b) Check label distribution in train_labels (should contain 7 distinct ints from 0..6)
unique_labels = torch.unique(train_labels).tolist()
print("Unique label IDs in train_labels:", unique_labels)
print("Label counts in train split:")
for lab in sorted(unique_labels):
    print("  ", EMOTIONS[lab], "→", (train_labels == lab).sum().item())

# If unique_labels is just [0] or [ some single int ], you haven’t mapped your 'Emotion' column correctly.

# 1c) Build a small minibatch (first 4 examples) and do a forward pass to inspect logits/loss
model.eval()
with torch.no_grad():
    # Take the first 4 examples from train_dataset
    input_ids_batch      = train_input_ids[:4].unsqueeze(0)     # already [4, MAX_LEN]
    attention_mask_batch = train_attention_mask[:4].unsqueeze(0)
    spans_batch          = train_spans[:4].unsqueeze(0)
    labels_batch         = train_labels[:4].unsqueeze(0)

    # Actually pass them through the model:
    out = model(
        input_ids=input_ids_batch.to(DEVICE),
        attention_mask=attention_mask_batch.to(DEVICE),
        spans=spans_batch.to(DEVICE),
        labels=labels_batch.to(DEVICE)
    )
    print("  logits.shape:", out["logits"].shape)   # should be [4, 7]
    print("  sample logits:", out["logits"].cpu().numpy())
    print("  sample loss:", out["loss"].item())

# If `out["loss"]` is NaN or extremely large, something is mis‐shaped or your spans are invalid.
# If out["logits"] is nearly constant across all 4 examples,
# your MLP has no signal (likely because Fcls is constant).


First 10 'spans' for train:
  Example 0: student_input =
    <s> Chandler <mask> says: also I was the point person on my company’s transition from the KL-5 to GR-6 system. </s> Chandler neutrally says: That I did. That I did.
    spans (a,b) = [0, 31]
  Example 1: student_input =
    <s> The Interviewer <mask> says: You must’ve had your hands full. </s> The Interviewer neutrally says: So let’s talk a little bit about your duties.
    spans (a,b) = [0, 19]
  Example 2: student_input =
    Chandler neutrally says: also I was the point person on my company’s transition from the KL-5 to GR-6 system. <s> Chandler <mask> says: That I did. That I did. </s> Chandler surprisingly says: My duties?  All right.
    spans (a,b) = [0, 48]
  Example 3: student_input =
    The Interviewer neutrally says: You must’ve had your hands full. <s> The Interviewer <mask> says: So let’s talk a little bit about your duties. </s> The Interviewer neutrally says: Now you’ll be heading a whole division, so you’ll h

ValueError: too many values to unpack (expected 2)

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# FULL STUDENT PLM TRAINING (FREEZING FIRST 8 ROBERTA-LARGE LAYERS,
#   LOCATING spans ON the SAME tokenization that goes into RoBERTa)
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries (run once)
!pip install transformers datasets evaluate

# 2) Imports & Mount Drive
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    RobertaConfig,
    PreTrainedModel,
    RobertaModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')

# 3) Configuration / file paths / hyperparameters
BASE_DIR         = "/content/drive/MyDrive/MELD"
TRAIN_CSV        = os.path.join(BASE_DIR, "train_for_student_ss.csv")
DEV_CSV          = os.path.join(BASE_DIR, "dev_for_student_ss.csv")
OUTPUT_DIR       = os.path.join(BASE_DIR, "student_roberta_large_Fcls")
MODEL_CHECKPOINT = "roberta-large"
EMOTIONS         = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NUM_LABELS       = len(EMOTIONS)
MAX_LEN          = 128
BATCH_SIZE       = 16
EPOCHS           = 4
LR               = 2e-5
WEIGHT_DECAY     = 0.01
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4) Define Roberta-large + Fcls→MLP model (unchanged)
class RobertaForERCWithFcls(PreTrainedModel):
    """
    RoBERTa‐based model that:
    - Receives (input_ids, attention_mask, spans) each forward
    - Runs RoBERTa
    - Mean‐pools over [0:a), [a:b+1], [b+1:end] to form Fp, Fq, Ff
    - Concatenates [Fp; Fq; Ff] ∈ ℝ^{3H}
    - Feeds through a two‐layer MLP head (3H→H→num_labels)
    """
    config_class = RobertaConfig

    def __init__(self, config: RobertaConfig):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        hidden_size = config.hidden_size        # 1024 for roberta-large
        num_labels  = config.num_labels         # 7

        # Two-layer MLP: (3H → H) → Tanh → Dropout → (H → num_labels)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(3 * hidden_size, hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.Linear(hidden_size, num_labels)
        )
        self.post_init()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                spans=None,   # Tensor shape [B,2]
                labels=None,
                **kwargs
               ):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hs = outputs.last_hidden_state  # [B, S, H]
        B, S, H = hs.size()

        # Build Fp, Fq, Ff by mean‐pooling up to spans
        Fp = torch.zeros((B, H), device=hs.device)
        Fq = torch.zeros((B, H), device=hs.device)
        Ff = torch.zeros((B, H), device=hs.device)

        for i, (a, b) in enumerate(spans.tolist()):
            if a > 0:
                Fp[i] = hs[i, :a].mean(dim=0)
            if b >= a:
                Fq[i] = hs[i, a : b+1].mean(dim=0)
            if b+1 < S and b >= 0:
                Ff[i] = hs[i, b+1 :].mean(dim=0)

        Fcls = torch.cat([Fp, Fq, Ff], dim=1)  # [B, 3H]
        logits = self.classifier(Fcls)         # [B, num_labels]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

# 5) Load CSVs into pandas
df_train = pd.read_csv(TRAIN_CSV)   # columns: "student_input", "Emotion"
df_dev   = pd.read_csv(DEV_CSV)

# 6) Prepare tokenizer (We rely on roberta-large’s vocab, which already knows "<s>" and "</s>")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 7) Manually build tensors for the train split, locating spans in the SAME tokenization
train_input_ids_list, train_attn_mask_list, train_spans_list, train_labels_list = [], [], [], []
for _, row in df_train.iterrows():
    text = row["student_input"]

    # (a) Tokenize ONCE with special tokens:
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids = enc["input_ids"][0].tolist()        # e.g. [0, 782, 1234, ..., 2, 1]
    attention = enc["attention_mask"][0]       # [MAX_LEN]

    # (b) Find "<s>" and "</s>" tokens in exactly that same ids-list:
    s_id  = tokenizer.convert_tokens_to_ids("<s>")
    es_id = tokenizer.convert_tokens_to_ids("</s>")
    try:
        a = ids.index(s_id)
        b = ids.index(es_id, a + 1)
    except ValueError:
        a, b = -1, -1  # sanity check—should not happen if every student_input has both markers

    # (c) Append to lists:
    train_input_ids_list.append(enc["input_ids"].squeeze(0))       # Tensor([MAX_LEN])
    train_attn_mask_list.append(enc["attention_mask"].squeeze(0))  # Tensor([MAX_LEN])
    train_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    train_labels_list.append(
        torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long)
    )

# (d) Stack them into big tensors:
train_input_ids      = torch.stack(train_input_ids_list)      # [N_train, MAX_LEN]
train_attention_mask = torch.stack(train_attn_mask_list)       # [N_train, MAX_LEN]
train_spans          = torch.stack(train_spans_list)           # [N_train, 2]
train_labels         = torch.stack(train_labels_list)          # [N_train]

# 8) Same for dev split:
dev_input_ids_list, dev_attn_mask_list, dev_spans_list, dev_labels_list = [], [], [], []
for _, row in df_dev.iterrows():
    text = row["student_input"]
    enc = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids = enc["input_ids"][0].tolist()
    s_id  = tokenizer.convert_tokens_to_ids("<s>")
    es_id = tokenizer.convert_tokens_to_ids("</s>")
    try:
        a = ids.index(s_id)
        b = ids.index(es_id, a + 1)
    except ValueError:
        a, b = -1, -1

    dev_input_ids_list.append(enc["input_ids"].squeeze(0))
    dev_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    dev_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    dev_labels_list.append(
        torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long)
    )

dev_input_ids      = torch.stack(dev_input_ids_list)      # [N_dev, MAX_LEN]
dev_attention_mask = torch.stack(dev_attn_mask_list)       # [N_dev, MAX_LEN]
dev_spans          = torch.stack(dev_spans_list)           # [N_dev, 2]
dev_labels         = torch.stack(dev_labels_list)          # [N_dev]

# 9) Build TensorDatasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_spans,
    train_labels
)
dev_dataset   = TensorDataset(
    dev_input_ids,
    dev_attention_mask,
    dev_spans,
    dev_labels
)

# 10) Custom collate_fn for DataLoader / Trainer
def collate_fn(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    spans          = torch.stack([item[2] for item in batch])
    labels         = torch.stack([item[3] for item in batch])
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "spans":          spans,
        "labels":         labels
    }

# 11) Instantiate model
config = RobertaConfig.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)
model = RobertaForERCWithFcls.from_pretrained(
    MODEL_CHECKPOINT,
    config=config
).to(DEVICE)

# 12) Freeze first 8 Transformer blocks of RoBERTa-large
for idx, layer in enumerate(model.roberta.encoder.layer):
    if idx < 8:
        for param in layer.parameters():
            param.requires_grad = False
# Now layers 0..7 are frozen. Only layers 8..23 + MLP head train.

# 13) Prepare evaluation metrics (accuracy + weighted F1)
accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    # labels may already be a NumPy array
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    return {
        "accuracy":     accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted":  f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

# 14) TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)

# 15) Initialize Trainer & fine-tune end-to-end
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student PLM (with Fcls MLP & first‐8 layers frozen) fine‐tuning complete.")
print("Checkpoint saved to:", OUTPUT_DIR)

# 16) (Optional) Final evaluation on validation set
metrics = trainer.evaluate(eval_dataset=dev_dataset)
print("Validation results:", metrics)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForERCWithFcls were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.0,,0.138087,0.033509


KeyboardInterrupt: 

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# FULL STUDENT PLM TRAINING (FREEZING FIRST 8 ROBERTA-LARGE LAYERS,
#   LOCATING spans on the same single tokenization)
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries (run once)
!pip install transformers datasets evaluate

# 2) Imports & Mount Drive
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    RobertaConfig,
    PreTrainedModel,
    RobertaModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')

# 3) Configuration / file paths / hyperparameters
BASE_DIR         = "/content/drive/MyDrive/MELD"
TRAIN_CSV        = os.path.join(BASE_DIR, "train_for_student_ss.csv")
DEV_CSV          = os.path.join(BASE_DIR, "dev_for_student_ss.csv")
OUTPUT_DIR       = os.path.join(BASE_DIR, "student_roberta_large_Fcls")
MODEL_CHECKPOINT = "roberta-large"
EMOTIONS         = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NUM_LABELS       = len(EMOTIONS)
MAX_LEN          = 128
BATCH_SIZE       = 16
EPOCHS           = 4
LR               = 2e-5
WEIGHT_DECAY     = 0.01
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4) Define Roberta-large + Fcls→MLP model (unchanged)
class RobertaForERCWithFcls(PreTrainedModel):
    """
    RoBERTa‐based model that:
    - Receives (input_ids, attention_mask, spans) each forward
    - Runs RoBERTa
    - Mean‐pools over [0:a), [a:b+1], [b+1:end] to form Fp, Fq, Ff
    - Concatenates [Fp; Fq; Ff] ∈ ℝ^{3H}
    - Feeds through a two‐layer MLP head (3H→H→num_labels)
    """
    config_class = RobertaConfig

    def __init__(self, config: RobertaConfig):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        hidden_size = config.hidden_size        # 1024 for roberta-large
        num_labels  = config.num_labels         # 7

        # Two-layer MLP: (3H → H) → Tanh → Dropout → (H → num_labels)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(3 * hidden_size, hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.Linear(hidden_size, num_labels)
        )
        self.post_init()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                spans=None,   # Tensor shape [B,2]
                labels=None,
                **kwargs
               ):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hs = outputs.last_hidden_state  # [B, S, H]
        B, S, H = hs.size()

        # Build Fp, Fq, Ff by mean‐pooling up to spans
        Fp = torch.zeros((B, H), device=hs.device)
        Fq = torch.zeros((B, H), device=hs.device)
        Ff = torch.zeros((B, H), device=hs.device)

        for i, (a, b) in enumerate(spans.tolist()):
            # a,b are guaranteed valid indices within [0, S-1]
            if a > 0:
                Fp[i] = hs[i, :a, :].mean(dim=0)
            if (b >= a) and (a >= 0):
                Fq[i] = hs[i, a : (b + 1), :].mean(dim=0)
            if (b + 1) < S and (b >= 0):
                Ff[i] = hs[i, (b + 1) : S, :].mean(dim=0)

        Fcls = torch.cat([Fp, Fq, Ff], dim=1)  # [B, 3H]
        logits = self.classifier(Fcls)         # [B, num_labels]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

# 5) Load CSVs into pandas
df_train = pd.read_csv(TRAIN_CSV)   # columns: "student_input", "Emotion"
df_dev   = pd.read_csv(DEV_CSV)

# 6) Prepare tokenizer (We rely on roberta-large’s vocab,
#    which already contains "<s>" and "</s>" as single tokens)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 7) Manually build tensors for the train split, locating spans in the same tokenization
train_input_ids_list    = []
train_attn_mask_list    = []
train_spans_list        = []
train_labels_list       = []

for _, row in df_train.iterrows():
    text = row["student_input"]
    # (a) Tokenize ONCE, with add_special_tokens=False so we do NOT get a second <s> or </s>
    enc = tokenizer(
        text,
        add_special_tokens=False,   # <<-- crucial, so we only see your manually inserted <s>/</s>
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids      = enc["input_ids"][0].tolist()        # e.g. [ 0, 634,  ... , 2, 1 ]
    attention= enc["attention_mask"][0]             # [MAX_LEN]

    # (b) Locate "<s>" and "</s>" in that same ids-list
    s_id   = tokenizer.convert_tokens_to_ids("<s>")
    es_id  = tokenizer.convert_tokens_to_ids("</s>")
    try:
        a = ids.index(s_id)
        b = ids.index(es_id, a + 1)
    except ValueError:
        a, b = -1, -1  # ideally this never happens if every student_input has both <s> and </s>

    # (c) Append to our lists
    train_input_ids_list.append(enc["input_ids"].squeeze(0))
    train_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    train_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    train_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

# (d) Stack them into big tensors
train_input_ids      = torch.stack(train_input_ids_list)      # [N_train, MAX_LEN]
train_attention_mask = torch.stack(train_attn_mask_list)       # [N_train, MAX_LEN]
train_spans          = torch.stack(train_spans_list)           # [N_train, 2]
train_labels         = torch.stack(train_labels_list)          # [N_train]

# 8) Same for dev split
dev_input_ids_list    = []
dev_attn_mask_list    = []
dev_spans_list        = []
dev_labels_list       = []

for _, row in df_dev.iterrows():
    text = row["student_input"]
    enc = tokenizer(
        text,
        add_special_tokens=False,  # <<-- same as above
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids      = enc["input_ids"][0].tolist()
    attention= enc["attention_mask"][0]
    s_id   = tokenizer.convert_tokens_to_ids("<s>")
    es_id  = tokenizer.convert_tokens_to_ids("</s>")
    try:
        a = ids.index(s_id)
        b = ids.index(es_id, a + 1)
    except ValueError:
        a, b = -1, -1

    dev_input_ids_list.append(enc["input_ids"].squeeze(0))
    dev_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    dev_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    dev_labels_list.append(torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long))

dev_input_ids      = torch.stack(dev_input_ids_list)      # [N_dev, MAX_LEN]
dev_attention_mask = torch.stack(dev_attn_mask_list)       # [N_dev, MAX_LEN]
dev_spans          = torch.stack(dev_spans_list)           # [N_dev, 2]
dev_labels         = torch.stack(dev_labels_list)          # [N_dev]

# 9) Build TensorDatasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_spans,
    train_labels
)
dev_dataset   = TensorDataset(
    dev_input_ids,
    dev_attention_mask,
    dev_spans,
    dev_labels
)

# 10) Custom collate_fn for DataLoader / Trainer
def collate_fn(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    spans          = torch.stack([item[2] for item in batch])
    labels         = torch.stack([item[3] for item in batch])
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "spans":          spans,
        "labels":         labels
    }

# 11) Instantiate model
config = RobertaConfig.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)
model = RobertaForERCWithFcls.from_pretrained(
    MODEL_CHECKPOINT,
    config=config
).to(DEVICE)

# 12) Freeze first 8 Transformer blocks of RoBERTa-large
for idx, layer in enumerate(model.roberta.encoder.layer):
    if idx < 8:
        for param in layer.parameters():
            param.requires_grad = False
# Now layers 0..7 are frozen. Only layers 8..23 + MLP head train.

# 13) Prepare evaluation metrics (accuracy + weighted F1)
accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    return {
        "accuracy":     accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted":  f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

# 14) TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)

# 15) Initialize Trainer & fine-tune end-to-end
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ── Sanity check forward‐pass on 4 examples before training ─────────────────────
model.eval()
with torch.no_grad():
    iids = train_input_ids[:4].to(DEVICE)
    am   = train_attention_mask[:4].to(DEVICE)
    sp   = train_spans[:4].to(DEVICE)
    lb   = train_labels[:4].to(DEVICE)
    out  = model(input_ids=iids, attention_mask=am, spans=sp, labels=lb)
    print("Forward check logits.shape:", out["logits"].shape)   # should be [4,7]
    print("Forward check sample loss:", out["loss"].item())     # should be > 0 and not nan

# 16) Now train
trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student PLM (with Fcls MLP & first‐8 layers frozen) fine-tuning complete.")
print("Checkpoint saved to:", OUTPUT_DIR)

# 17) (Optional) Final evaluation on validation set
metrics = trainer.evaluate(eval_dataset=dev_dataset)
print("Validation results:", metrics)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForERCWithFcls were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Forward check logits.shape: torch.Size([4, 7])
Forward check sample loss: nan


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.0,,0.138087,0.033509


KeyboardInterrupt: 

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# FULL STUDENT PLM TRAINING (RoBERTa‐large, freeze first 8 layers,
#   manually locate <s>…</s> _after_ RoBERTa adds its own, so no NaNs)
# ───────────────────────────────────────────────────────────────────────────────

# 1) Install required libraries (run once)
!pip install transformers datasets evaluate

# 2) Imports & Mount Drive
import os
import torch
import pandas as pd
from torch.utils.data import TensorDataset
from transformers import (
    RobertaConfig,
    PreTrainedModel,
    RobertaModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')

# 3) Configuration / file paths / hyperparameters
BASE_DIR         = "/content/drive/MyDrive/MELD"
TRAIN_CSV        = os.path.join(BASE_DIR, "train_for_student_ss.csv")
DEV_CSV          = os.path.join(BASE_DIR, "dev_for_student_ss.csv")
OUTPUT_DIR       = os.path.join(BASE_DIR, "student_roberta_large_Fcls")
MODEL_CHECKPOINT = "roberta-large"
EMOTIONS         = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
NUM_LABELS       = len(EMOTIONS)
MAX_LEN          = 128
BATCH_SIZE       = 16
EPOCHS           = 4
LR               = 2e-5
WEIGHT_DECAY     = 0.01
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4) Define RoBERTa‐large + Fcls→MLP model (unchanged)
class RobertaForERCWithFcls(PreTrainedModel):
    """
    RoBERTa-based model that:
      - Receives (input_ids, attention_mask, spans) each forward
      - Runs RoBERTa
      - Mean-pools over [0:a), [a:b+1], [b+1:end] to form Fp, Fq, Ff
      - Concatenates [Fp; Fq; Ff] ∈ ℝ^{3H}
      - Feeds through a two-layer MLP head (3H→H→num_labels)
    """
    config_class = RobertaConfig

    def __init__(self, config: RobertaConfig):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        hidden_size = config.hidden_size        # 1024 for roberta-large
        num_labels  = config.num_labels         # 7

        # Two-layer MLP: (3H → H) → Tanh → Dropout → (H → num_labels)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(3 * hidden_size, hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(config.hidden_dropout_prob),
            torch.nn.Linear(hidden_size, num_labels)
        )
        self.post_init()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                spans=None,   # Tensor shape [B,2]
                labels=None,
                **kwargs
               ):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        hs = outputs.last_hidden_state  # [B, S, H]
        B, S, H = hs.size()

        # Build Fp, Fq, Ff by mean-pooling up to spans
        Fp = torch.zeros((B, H), device=hs.device)
        Fq = torch.zeros((B, H), device=hs.device)
        Ff = torch.zeros((B, H), device=hs.device)

        for i, (a, b) in enumerate(spans.tolist()):
            # a,b are valid indices within [0..S-1]
            if a > 0:
                Fp[i] = hs[i, :a, :].mean(dim=0)
            if (b >= a) and (a >= 0):
                Fq[i] = hs[i, a : (b + 1), :].mean(dim=0)
            if (b + 1) < S and (b >= 0):
                Ff[i] = hs[i, (b + 1) : S, :].mean(dim=0)

        Fcls = torch.cat([Fp, Fq, Ff], dim=1)  # [B, 3H]
        logits = self.classifier(Fcls)         # [B, num_labels]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels),
                labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

# 5) Load CSVs into pandas
df_train = pd.read_csv(TRAIN_CSV)   # must have columns: "student_input", "Emotion"
df_dev   = pd.read_csv(DEV_CSV)

# 6) Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 7) Helper to locate *your* "<s>" / "</s>" in a single encoded sequence
def find_user_span_indices(ids: list, s_id: int, es_id: int):
    # ids is the full token list AFTER tokenization with add_special_tokens=True
    # There are two occurrences of s_id: at index 0 (RoBERTa's own) and at your inserted "<s>"
    # We want the user's inserted one → the second occurrence of s_id.
    s_positions = [i for i, x in enumerate(ids) if x == s_id]
    if len(s_positions) < 2:
        return -1, -1
    a = s_positions[1]  # pick the second <s>

    # For "</s>", there are two occurrences: one user-inserted, one at the very end (index len(ids)-1).
    # We want the user-inserted one, which will be the first in the list strictly < (len(ids)-1).
    es_positions = [i for i, x in enumerate(ids) if x == es_id]
    b = -1
    for pos in es_positions:
        if pos != (len(ids) - 1):
            b = pos
            break
    return a, b

# 8) Manually build tensors for the train split
train_input_ids_list    = []
train_attn_mask_list    = []
train_spans_list        = []
train_labels_list       = []

s_id  = tokenizer.convert_tokens_to_ids("<s>")
es_id = tokenizer.convert_tokens_to_ids("</s>")

for _, row in df_train.iterrows():
    text = row["student_input"]

    # (a) Tokenize ONCE, with add_special_tokens=True so RoBERTa injects its own <s> … </s>
    enc = tokenizer(
        text,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids       = enc["input_ids"][0].tolist()        # length=MAX_LEN
    attention = enc["attention_mask"][0]             # length=MAX_LEN

    # (b) Locate your inserted "<s>" and "</s>" in that same ids-list
    a, b = find_user_span_indices(ids, s_id, es_id)
    # If (a,b)==(-1,-1), it means something went wrong—ideally this never happens
    if (a, b) == (-1, -1):
        print("WARNING: could not find user <s></s> in:", text)

    # (c) Append to lists
    train_input_ids_list.append(enc["input_ids"].squeeze(0))       # Tensor([MAX_LEN])
    train_attn_mask_list.append(enc["attention_mask"].squeeze(0))  # Tensor([MAX_LEN])
    train_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    train_labels_list.append(
        torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long)
    )

# (d) Stack into big tensors
train_input_ids      = torch.stack(train_input_ids_list)      # [N_train, MAX_LEN]
train_attention_mask = torch.stack(train_attn_mask_list)       # [N_train, MAX_LEN]
train_spans          = torch.stack(train_spans_list)           # [N_train, 2]
train_labels         = torch.stack(train_labels_list)          # [N_train]

# 9) Same for dev split
dev_input_ids_list    = []
dev_attn_mask_list    = []
dev_spans_list        = []
dev_labels_list       = []

for _, row in df_dev.iterrows():
    text = row["student_input"]
    enc  = tokenizer(
        text,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ids       = enc["input_ids"][0].tolist()
    attention = enc["attention_mask"][0]

    a, b = find_user_span_indices(ids, s_id, es_id)
    if (a, b) == (-1, -1):
        print("WARNING (dev): could not find user <s></s> in:", text)

    dev_input_ids_list.append(enc["input_ids"].squeeze(0))
    dev_attn_mask_list.append(enc["attention_mask"].squeeze(0))
    dev_spans_list.append(torch.tensor([a, b], dtype=torch.long))
    dev_labels_list.append(
        torch.tensor(EMOTIONS.index(row["Emotion"]), dtype=torch.long)
    )

dev_input_ids      = torch.stack(dev_input_ids_list)      # [N_dev, MAX_LEN]
dev_attention_mask = torch.stack(dev_attn_mask_list)       # [N_dev, MAX_LEN]
dev_spans          = torch.stack(dev_spans_list)           # [N_dev, 2]
dev_labels         = torch.stack(dev_labels_list)          # [N_dev]

# 10) Build TensorDatasets
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_spans,
    train_labels
)
dev_dataset   = TensorDataset(
    dev_input_ids,
    dev_attention_mask,
    dev_spans,
    dev_labels
)

# 11) Custom collate_fn for DataLoader / Trainer
def collate_fn(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    spans          = torch.stack([item[2] for item in batch])
    labels         = torch.stack([item[3] for item in batch])
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "spans":          spans,
        "labels":         labels
    }

# 12) Instantiate model
config = RobertaConfig.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS
)
model = RobertaForERCWithFcls.from_pretrained(
    MODEL_CHECKPOINT,
    config=config
).to(DEVICE)

# 13) Freeze first 8 Transformer blocks of RoBERTa-large
for idx, layer in enumerate(model.roberta.encoder.layer):
    if idx < 8:
        for param in layer.parameters():
            param.requires_grad = False
# Now only layers 8..23 (plus the MLP head) are trainable

# 14) Prepare evaluation metrics (accuracy + weighted F1)
accuracy    = evaluate.load("accuracy")
f1_weighted = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    return {
        "accuracy":     accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted":  f1_weighted.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

# 15) TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
)

# 16) Initialize Trainer & fine-tune
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ── Sanity‐check forward‐pass on 4 examples before training ─────────────────────
model.eval()
with torch.no_grad():
    iids = train_input_ids[:4].to(DEVICE)
    am   = train_attention_mask[:4].to(DEVICE)
    sp   = train_spans[:4].to(DEVICE)
    lb   = train_labels[:4].to(DEVICE)
    out  = model(input_ids=iids, attention_mask=am, spans=sp, labels=lb)
    print("Forward check logits.shape:", out["logits"].shape)  # should be [4,7]
    print("Forward check sample loss:", out["loss"].item())    # should be a positive number, not nan

# 17) Now train
trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Student PLM (with Fcls MLP & first-8 layers frozen) fine-tuning complete.")
print("Checkpoint saved to:", OUTPUT_DIR)

# 18) (Optional) Final evaluation on validation set
metrics = trainer.evaluate(eval_dataset=dev_dataset)
print("Validation results:", metrics)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForERCWithFcls were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Forward check logits.shape: torch.Size([4, 7])
Forward check sample loss: nan


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 