In [None]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from google.colab import drive

# Mount Drive (if needed)
drive.mount('/content/drive')

# ─── Paths & Globals ─────────────────────────────────────────────────────────
base = "/content/drive/MyDrive/MELD"
# Directory of the fine-tuned model that is loaded as the encoder below
checkpoint = os.path.join(base, "teacher_roberta_erc")
# Output directory for the pooled segment embeddings (created if missing)
outs_dir = os.path.join(base, "segment_embeddings")
os.makedirs(outs_dir, exist_ok=True)

# ─── Device & Encoder ─────────────────────────────────────────────────────────
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# local_files_only=True: load from the Drive directory only, never from the Hub
encoder = AutoModel.from_pretrained(checkpoint, local_files_only=True)
encoder = encoder.to(device)
encoder.eval()  # inference mode

# ─── Tokenizer & Marker IDs ───────────────────────────────────────────────────
# NOTE(review): the tokenizer comes from "roberta-base", not from `checkpoint`;
# confirm the fine-tuned model uses the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
s_id = tokenizer.convert_tokens_to_ids("<s>")    # id of the start marker
es_id = tokenizer.convert_tokens_to_ids("</s>")  # id of the end marker

# ─── Load the CSVs ─────────────────────────────────────────────────────────────
# One DataFrame per split, keyed by split name
dfs = {
    name: pd.read_csv(f"{base}/{name}_with_context.csv")
    for name in ("train", "dev", "test")
}

# ─── 1) Recompute query spans correctly ───────────────────────────────────────
def find_span(text):
    """Locate the marked query inside ``text`` in token coordinates.

    The text is tokenized without automatically added special tokens and
    truncated to 128 tokens, so the only ``s_id`` / ``es_id`` occurrences are
    the ones written into the text itself.

    Returns:
        ``(start, end)`` where ``start`` is the index of the first ``s_id``
        token and ``end`` the index of the first ``es_id`` token after it, or
        ``(-1, -1)`` when either marker is missing (for example, cut off by
        truncation).
    """
    token_ids = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=128,
    )["input_ids"]

    if s_id not in token_ids:
        return (-1, -1)
    start = token_ids.index(s_id)

    try:
        end = token_ids.index(es_id, start + 1)
    except ValueError:
        # start marker present but no end marker after it
        return (-1, -1)
    return (start, end)

# Add a `query_span` column to every split and write the result next to the
# input CSVs as `<split>_with_spans.csv`.
for split in dfs:
    df = dfs[split]
    # The column is added to the frame stored in `dfs` (same object), so the
    # dictionary entry already reflects the change.
    df["query_span"] = df["bert_input"].apply(find_span)
    df.to_csv(f"{base}/{split}_with_spans.csv", index=False)

# ─── 2) Build DataLoader ──────────────────────────────────────────────────────
def make_loader(df, batch_size=32):
    """Wrap ``df`` in a DataLoader that tokenizes each batch on the fly.

    Args:
        df: DataFrame with a ``bert_input`` text column and a ``query_span``
            column of ``(start, end)`` pairs.
        batch_size: number of rows per batch.

    Returns:
        A DataLoader whose batches are ``(input_ids, attention_mask, spans)``;
        ``spans`` is a long tensor with one ``(start, end)`` row per example.
    """
    def collate(batch):
        texts, span_rows = [], []
        for example in batch:
            texts.append(example["bert_input"])
            span_rows.append(example["query_span"])

        # Same tokenizer settings as find_span, so span indices stay valid
        encoded = tokenizer(
            texts,
            padding="longest",
            truncation=True,
            max_length=128,
            add_special_tokens=False,
            return_tensors="pt",
        )
        span_tensor = torch.tensor(span_rows, dtype=torch.long)
        return encoded["input_ids"], encoded["attention_mask"], span_tensor

    dataset = Dataset.from_pandas(df)
    return DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

loaders = {split: make_loader(df) for split, df in dfs.items()}

# ─── 3) Inference Loop: extract & save Fp, Fq, Ff ─────────────────────────────
# For each example with query span (a, b) the encoder's last hidden states are
# mean-pooled over three segments:
#   Fp = tokens before the span     [0, a)
#   Fq = the span itself            [a, b]
#   Ff = real tokens after the span (b, n_real)
# A missing segment, or an invalid span (-1, -1), leaves the vector at zero.
# Fcls is the concatenation [Fp | Fq | Ff].
for split, loader in loaders.items():
    all_Fp, all_Fq, all_Ff = [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, spans in loader:
            input_ids      = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            hs = encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).last_hidden_state  # [B, S, H]
            B, S, H = hs.size()

            # Number of non-padding tokens per row. Batches are padded to the
            # longest row, so positions past this count must not be pooled.
            # NOTE(review): assumes the tokenizer pads on the right (real
            # tokens first) — confirm tokenizer.padding_side.
            n_real = attention_mask.sum(dim=1).tolist()

            Fp = torch.zeros((B, H), device=device)
            Fq = torch.zeros((B, H), device=device)
            Ff = torch.zeros((B, H), device=device)

            # Spans are only used as Python ints, so they stay on the CPU.
            for i, (a, b) in enumerate(spans.tolist()):
                if a < 0 or b < a:
                    # Invalid span: pooling an empty slice would give NaN
                    continue
                last = n_real[i]
                if a > 0:
                    Fp[i] = hs[i, :a].mean(dim=0)
                Fq[i] = hs[i, a : b+1].mean(dim=0)
                if b+1 < last:
                    Ff[i] = hs[i, b+1 : last].mean(dim=0)

            all_Fp.append(Fp.cpu())
            all_Fq.append(Fq.cpu())
            all_Ff.append(Ff.cpu())

    Fp_tensor = torch.cat(all_Fp, dim=0)
    Fq_tensor = torch.cat(all_Fq, dim=0)
    Ff_tensor = torch.cat(all_Ff, dim=0)
    # Same result as concatenating per batch and then stacking the batches
    Fcls_tensor = torch.cat([Fp_tensor, Fq_tensor, Ff_tensor], dim=1)  # [N, 3*H]

    torch.save(Fp_tensor, os.path.join(outs_dir, f"{split}_Fp.pt"))
    torch.save(Fq_tensor, os.path.join(outs_dir, f"{split}_Fq.pt"))
    torch.save(Ff_tensor, os.path.join(outs_dir, f"{split}_Ff.pt"))
    torch.save(Fcls_tensor, os.path.join(outs_dir, f"{split}_Fcls.pt"))
    print(f"[{split}] saved Fcls shape:", Fcls_tensor.shape)
    print(f"[{split}] saved shapes:",
          Fp_tensor.shape, Fq_tensor.shape, Ff_tensor.shape)


In [None]:
# Display the input text alongside its span column for index labels 0 through 20.
# `.loc` slicing is label-based and inclusive, so this selects up to 21 rows, not 10.
# NOTE(review): `df` is not defined in this cell; it is presumably the loop
# variable left over from an earlier cell — confirm which split it holds.
display(df.loc[:20, ["bert_input", "query_span"]])


Unnamed: 0,bert_input,query_span
0,<s> Mark <mask> says: Why do all you’re coffee...,"(0, 22)"
1,<s> Rachel <mask> says: Oh. That’s so Monica c...,"(0, 43)"
2,Rachel says: Oh. That’s so Monica can keep tra...,"(40, 51)"
3,"<s> Joey <mask> says: Come on, Lydia, you can ...","(0, 16)"
4,"Joey says: Come on, Lydia, you can do it.<s> J...","(14, 22)"
5,Joey says: Push!<s> Joey <mask> says: Push 'em...,"(6, 26)"
6,"Joey says: Push 'em out, push 'em out, harder,...","(18, 37)"
7,"Joey says: Push 'em out, push 'em out, way out...","(17, 40)"
8,Joey says: Let's get that ball and really move...,"(21, 39)"
9,"Joey says: Let's— I was just—yeah, right.<s> ...","(16, 24)"


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1) Load your concatenated features tensor
# weights_only=True restricts unpickling to tensors and plain containers; it
# matches how the later cells of this notebook load the same files.
base = "/content/drive/MyDrive/MELD/segment_embeddings"
Fcls_train = torch.load(f"{base}/train_Fcls.pt", weights_only=True)  # [N, 3*H]
Fcls_dev   = torch.load(f"{base}/dev_Fcls.pt", weights_only=True)
Fcls_test  = torch.load(f"{base}/test_Fcls.pt", weights_only=True)

# 2) Define the Fine-Grained MLP head exactly as used in training
class FineGrainedHead(nn.Module):
    """Two-layer classifier: Linear -> Tanh -> Dropout -> Linear (logits)."""

    def __init__(self, input_dim: int, hidden_dim: int, num_labels: int, dropout: float = 0.3):
        super().__init__()
        # Attribute names and creation order are kept: they determine the
        # state_dict keys and the order in which parameters are initialised.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.act = nn.Tanh()
        self.drop = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_labels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map features ``x`` to unnormalised class scores."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        hidden = self.drop(hidden)
        return self.fc2(hidden)

# 3) Instantiate and load any pretrained head weights if available
import os  # local import: this cell's import block does not bring in `os`

hidden_size = 768
input_dim   = 3 * hidden_size
num_labels  = 7
head = FineGrainedHead(input_dim, hidden_size, num_labels)

# Without saved weights the head is randomly initialised and its softmax
# output carries no information, so weights are loaded when the file exists
# and the user is warned otherwise.
head_weights_path = "/content/drive/MyDrive/MELD/head_weights.pt"
head_is_trained = os.path.exists(head_weights_path)
if head_is_trained:
    head.load_state_dict(torch.load(head_weights_path, map_location="cpu"))
else:
    print(f"WARNING: {head_weights_path} not found; the head is randomly "
          "initialised, so the probabilities below are not meaningful.")

head.eval()  # disables dropout

# 4) Compute probabilities for each split
with torch.no_grad():
    logits_train = head(Fcls_train)                 # [N_train, num_labels]
    probs_train  = F.softmax(logits_train, dim=1)   # [N_train, num_labels]

    logits_dev = head(Fcls_dev)
    probs_dev  = F.softmax(logits_dev, dim=1)

    logits_test = head(Fcls_test)
    probs_test  = F.softmax(logits_test, dim=1)

# 5) Inspect shapes and a few sample probabilities
n_preview = 25  # the printed label is derived from this so the two cannot drift apart
print("Train probabilities shape:", probs_train.shape)
print(f"Sample train probabilities (first {n_preview} rows):")
print(probs_train[:n_preview])

# 6) (Optional) Convert to NumPy and save for later use:
probs_train_np = probs_train.cpu().numpy()
if head_is_trained:
    torch.save(probs_train, f"{base}/train_probs.pt")
else:
    # Do not persist the output of an untrained head
    print("Skipped saving train_probs.pt because no head weights were loaded.")



Train probabilities shape: torch.Size([9988, 7])
Sample train probabilities (first 5 rows):
tensor([[0.1246, 0.1137, 0.1698, 0.1739, 0.1121, 0.1311, 0.1747],
        [0.1090, 0.1288, 0.1657, 0.1701, 0.1105, 0.1356, 0.1804],
        [0.1508, 0.0997, 0.1920, 0.1528, 0.1049, 0.1153, 0.1843],
        [0.1132, 0.1250, 0.1912, 0.1698, 0.0939, 0.1135, 0.1932],
        [0.1702, 0.1649, 0.1332, 0.1454, 0.0939, 0.1233, 0.1691],
        [0.1379, 0.1199, 0.1685, 0.1637, 0.0903, 0.1247, 0.1952],
        [0.1244, 0.1171, 0.1616, 0.1549, 0.1084, 0.1604, 0.1732],
        [0.1251, 0.1129, 0.1754, 0.1414, 0.1071, 0.1418, 0.1962],
        [0.1315, 0.1214, 0.1620, 0.1412, 0.1348, 0.1336, 0.1754],
        [0.1408, 0.1283, 0.1605, 0.1478, 0.1127, 0.1279, 0.1821],
        [0.1634, 0.1201, 0.1124, 0.1289, 0.1220, 0.1438, 0.2093],
        [0.1345, 0.1072, 0.1934, 0.1429, 0.1142, 0.1278, 0.1800],
        [0.1765, 0.1622, 0.1090, 0.1076, 0.1624, 0.1292, 0.1530],
        [0.1188, 0.1011, 0.1967, 0.1446, 0.1149, 0

In [None]:
import os
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

# ─── Configuration ────────────────────────────────────────────────────────────
base = "/content/drive/MyDrive/MELD"
features_dir = os.path.join(base, "features")  # NOTE(review): not used in this cell
head_save_path = os.path.join(base, "head_finetuned.pt")

# Hyperparameters
hidden_size = 768
input_dim   = 3 * hidden_size
num_labels  = 7
dropout     = 0.3
lr          = 1e-3
batch_size  = 32
epochs      = 10
seed        = 42
device      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed before the head is created and before the shuffling loader is used, so
# weight initialisation and batch order are repeatable across runs.
torch.manual_seed(seed)

# ─── 1) Load Fcls features and labels ─────────────────────────────────────────
# Features (weights_only=True: only tensors/plain containers are unpickled)
base1 = "/content/drive/MyDrive/MELD/segment_embeddings"
Fcls_train = torch.load(f"{base1}/train_Fcls.pt", weights_only=True)  # [N, 3*H]
Fcls_dev   = torch.load(f"{base1}/dev_Fcls.pt", weights_only=True)

# Labels from CSV
df_train = pd.read_csv(os.path.join(base, "train_with_spans.csv"))
df_dev   = pd.read_csv(os.path.join(base, "dev_with_spans.csv"))

# Map emotions to integer IDs
EMOTIONS = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
label_map = {emo: i for i, emo in enumerate(EMOTIONS)}

# Series.map yields NaN for any value missing from label_map; casting NaN to
# an integer tensor would silently produce a garbage class id, so fail loudly.
label_tensors = {}
for split_name, frame in (("train", df_train), ("dev", df_dev)):
    label_ids = frame["Emotion"].map(label_map)
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, "Emotion"].astype(str).unique())
        raise ValueError(
            f"{split_name}: {int(unmapped.sum())} rows have Emotion values "
            f"outside EMOTIONS: {unknown}"
        )
    label_tensors[split_name] = torch.tensor(label_ids.to_numpy(), dtype=torch.long)
labels_train = label_tensors["train"]
labels_dev   = label_tensors["dev"]

# ─── 2) Create DataLoaders ────────────────────────────────────────────────────
# TensorDataset requires features and labels to have the same number of rows.
train_ds = TensorDataset(Fcls_train, labels_train)
dev_ds   = TensorDataset(Fcls_dev,   labels_dev)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
dev_loader   = DataLoader(dev_ds,   batch_size=batch_size)

# ─── 3) Define Fine-Grained MLP Head ──────────────────────────────────────────
class FineGrainedHead(nn.Module):
    """Two-layer classifier: Linear -> Tanh -> Dropout -> Linear (logits).

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        num_labels: number of output classes.
        dropout: dropout probability applied after the Tanh. Defaults to 0.3,
            the same default the other definitions of this class in the
            notebook use, so all of them can be constructed the same way.
    """

    def __init__(self, input_dim: int, hidden_dim: int, num_labels: int, dropout: float = 0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.act = nn.Tanh()
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class scores for the feature batch ``x``."""
        x = self.act(self.fc1(x))
        x = self.drop(x)
        return self.fc2(x)

# Instantiate
head = FineGrainedHead(input_dim, hidden_size, num_labels, dropout).to(device)

# ─── 4) Training Setup ─────────────────────────────────────────────────────────
optimizer = optim.AdamW(head.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Dev accuracy goes up and down between epochs, so the weights of the best
# epoch are remembered and saved instead of whatever the last epoch produced.
best_dev_acc = -1.0
best_epoch   = 0
best_state   = None

# ─── 5) Training Loop ─────────────────────────────────────────────────────────
for epoch in range(1, epochs+1):
    head.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        logits = head(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch average is per example
        total_loss += loss.item() * X_batch.size(0)
    avg_train_loss = total_loss / len(train_loader.dataset)

    # Evaluate on dev
    head.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in dev_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            logits = head(X_batch)
            preds = logits.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    dev_acc = correct / total

    print(f"Epoch {epoch}/{epochs} • Train Loss: {avg_train_loss:.4f} • Dev Acc: {dev_acc:.4f}")

    if dev_acc > best_dev_acc:
        best_dev_acc, best_epoch = dev_acc, epoch
        # Clone to CPU: state_dict() returns live references that later
        # optimizer steps would otherwise overwrite.
        best_state = {k: v.detach().cpu().clone() for k, v in head.state_dict().items()}

# ─── 6) Save the Fine-Tuned Head ───────────────────────────────────────────────
if best_state is not None:
    head.load_state_dict(best_state)
torch.save(head.state_dict(), head_save_path)
print(f"Saved fine-tuned head weights to {head_save_path}")
print(f"Saved weights are from epoch {best_epoch} (best dev acc: {best_dev_acc:.4f})")


Epoch 1/10 • Train Loss: 0.8883 • Dev Acc: 0.6101
Epoch 2/10 • Train Loss: 0.8309 • Dev Acc: 0.6137
Epoch 3/10 • Train Loss: 0.8056 • Dev Acc: 0.6083
Epoch 4/10 • Train Loss: 0.8062 • Dev Acc: 0.6119
Epoch 5/10 • Train Loss: 0.8019 • Dev Acc: 0.6191
Epoch 6/10 • Train Loss: 0.7998 • Dev Acc: 0.6074
Epoch 7/10 • Train Loss: 0.7937 • Dev Acc: 0.6056
Epoch 8/10 • Train Loss: 0.7964 • Dev Acc: 0.6047
Epoch 9/10 • Train Loss: 0.7780 • Dev Acc: 0.6209
Epoch 10/10 • Train Loss: 0.7756 • Dev Acc: 0.6056
Saved fine-tuned head weights to /content/drive/MyDrive/MELD/head_finetuned.pt


In [None]:
from torch import nn
import numpy as np

# This cell uses `torch` and `pd`; import them here so it does not depend on
# an earlier cell having been run.
import torch
import pandas as pd

# ─── Paths ───────────────────────────────────────────────────────────────────
base         = "/content/drive/MyDrive/MELD"
base1 = "/content/drive/MyDrive/MELD/segment_embeddings"
head_path    = f"{base}/head_finetuned.pt"
fcls_path    = f"{base1}/train_Fcls.pt"
input_csv    = f"{base}/train_with_context.csv"
# The file this cell writes. Previously this variable named a different file
# than the one actually saved at the end of the cell.
output_csv   = f"{base}/train_with_all_confidences.csv"

# ─── 1) Load Fcls features ────────────────────────────────────────────────────
Fcls_train = torch.load(fcls_path, weights_only=True)  # [N, 3*H]

# ─── 2) Define the MLP head architecture ─────────────────────────────────────
class FineGrainedHead(nn.Module):
    """Two-layer classifier: Linear -> Tanh -> Dropout -> Linear (logits)."""

    def __init__(self, input_dim, hidden_dim, num_labels, dropout=0.3):
        super().__init__()
        self.fc1  = nn.Linear(input_dim, hidden_dim)
        self.act  = nn.Tanh()
        self.drop = nn.Dropout(dropout)
        self.fc2  = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        """Return unnormalised class scores for the feature batch ``x``."""
        x = self.act(self.fc1(x))
        x = self.drop(x)
        return self.fc2(x)

# ─── 3) Instantiate and load saved weights ────────────────────────────────────
hidden_size = 768
input_dim   = 3 * hidden_size
num_labels  = 7

head = FineGrainedHead(input_dim, hidden_size, num_labels)
# weights_only=True, consistent with how the features are loaded above
head.load_state_dict(torch.load(head_path, map_location="cpu", weights_only=True))
head.eval()  # disables dropout

# ─── 4) Compute logits and probabilities ─────────────────────────────────────
with torch.no_grad():
    logits = head(Fcls_train)                   # [N, num_labels]
    probs  = torch.softmax(logits, dim=1).cpu().numpy()  # [N, num_labels]

# ─── 5) Map *all* confidences back to the DataFrame ───────────────────────────
df = pd.read_csv(input_csv)

# Predictions are attached by row position, so both sources must line up.
if len(df) != probs.shape[0]:
    raise ValueError(
        f"{input_csv} has {len(df)} rows but {fcls_path} has {probs.shape[0]} feature rows"
    )

EMOTIONS    = ["anger","disgust","fear","joy","neutral","sadness","surprise"]
pred_ids    = np.argmax(probs, axis=1)
pred_labels = [EMOTIONS[i] for i in pred_ids]
pred_conf   = probs.max(axis=1)

# 5a) keep the top prediction + its confidence
df["pred_label"]      = pred_labels
df["pred_confidence"] = pred_conf

# 5b) add one column per emotion
for idx, emo in enumerate(EMOTIONS):
    df[f"conf_{emo}"] = probs[:, idx]

# 6) Save the extended DataFrame
df.to_csv(output_csv, index=False)
print(f"✅ Saved train confidences to: {output_csv}  (shape: {df.shape})")


In [None]:
from torch import nn
import torch
import numpy as np
import pandas as pd

# ─── Paths ───────────────────────────────────────────────────────────────────
base  = "/content/drive/MyDrive/MELD"
base1 = "/content/drive/MyDrive/MELD/segment_embeddings"

# Inputs: trained head weights, dev features, dev CSV
head_path = base + "/head_finetuned.pt"
fcls_dev  = base1 + "/dev_Fcls.pt"
input_csv = base + "/dev_with_context.csv"
# Output: dev CSV extended with prediction columns
output_csv = base + "/dev_with_all_confidences.csv"

# ─── 1) Load dev Fcls features ────────────────────────────────────────────────
# Expected shape [N_dev, 3*H]; weights_only=True limits unpickling to tensors
# and plain containers.
Fcls_dev_tensor = torch.load(fcls_dev, weights_only=True)

# ─── 2) Define the MLP head architecture ─────────────────────────────────────
class FineGrainedHead(nn.Module):
    """Two-layer classifier: Linear -> Tanh -> Dropout -> Linear (logits)."""

    def __init__(self, input_dim, hidden_dim, num_labels, dropout=0.3):
        super().__init__()
        # Attribute names and creation order are kept so that the keys of a
        # saved state_dict still match this module.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.act = nn.Tanh()
        self.drop = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_labels)

    def forward(self, x):
        """Return unnormalised class scores for the feature batch ``x``."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        hidden = self.drop(hidden)
        return self.fc2(hidden)

hidden_size = 768
input_dim   = 3 * hidden_size
num_labels  = 7

# Instantiate and load saved head weights
head = FineGrainedHead(input_dim, hidden_size, num_labels)
# weights_only=True, consistent with how the features are loaded in this cell
head.load_state_dict(torch.load(head_path, map_location="cpu", weights_only=True))
head.eval()  # disables dropout

# ─── 3) Compute logits and probabilities for dev ───────────────────────────────
with torch.no_grad():
    logits_dev = head(Fcls_dev_tensor)                     # [N_dev, 7]
    probs_dev  = torch.softmax(logits_dev, dim=1).cpu().numpy()  # [N_dev, 7]

# ─── 4) Load original dev DataFrame ─────────────────────────────────────────────
df_dev = pd.read_csv(input_csv)

# Predictions are attached by row position, so both sources must line up.
if len(df_dev) != probs_dev.shape[0]:
    raise ValueError(
        f"{input_csv} has {len(df_dev)} rows but {fcls_dev} has "
        f"{probs_dev.shape[0]} feature rows"
    )

EMOTIONS = ["anger","disgust","fear","joy","neutral","sadness","surprise"]

# 4a) Top‐predicted label and its confidence
pred_ids_dev    = np.argmax(probs_dev, axis=1)          # [N_dev]
pred_labels_dev = [EMOTIONS[i] for i in pred_ids_dev]   # list of length N_dev
pred_conf_dev   = probs_dev.max(axis=1)                 # [N_dev]

df_dev["pred_label"]      = pred_labels_dev
df_dev["pred_confidence"] = pred_conf_dev

# 4b) One column per emotion confidence
for idx, emo in enumerate(EMOTIONS):
    df_dev[f"conf_{emo}"] = probs_dev[:, idx]

# ─── 5) Save the dev DataFrame with all confidences ────────────────────────────
df_dev.to_csv(output_csv, index=False)
print(f"✅ Saved dev confidences to: {output_csv}  (shape: {df_dev.shape})")


✅ Saved dev confidences to: /content/drive/MyDrive/MELD/dev_with_all_confidences.csv  (shape: (1108, 21))


In [None]:
import os
import pandas as pd

base = "/content/drive/MyDrive/MELD"
# NOTE(review): no cell visible in this notebook writes
# train_with_predictions.csv — confirm this file is current.
preds_file = os.path.join(base, "train_with_predictions.csv")
p = 0.7  # confidence threshold used by build_ss_input

# Adverb inserted into a context line for each predicted emotion
emo_adverb = dict(
    anger="angrily",
    disgust="disgustedly",
    fear="fearfully",
    joy="joyfully",
    neutral="neutrally",
    sadness="sadly",
    surprise="surprisingly",
)

# 1) Load your teacher‐annotated train CSV, ordered by dialogue then utterance
df = (
    pd.read_csv(preds_file)
    .sort_values(["Dialogue_ID","Utterance_ID"])
    .reset_index(drop=True)
)

# 2) Precompute each speaker’s turn‐list per dialogue
#    A dict: (did, speaker) → sorted list of utterance_IDs they actually speak
from collections import defaultdict
speaker_turns = defaultdict(list)
for _, row in df.iterrows():
    turn_key = (row.Dialogue_ID, row.Speaker)
    speaker_turns[turn_key].append(row.Utterance_ID)
for turn_ids in speaker_turns.values():
    turn_ids.sort()

# 3) Build single‐speaker student_input using speaker’s own ±1 window
def build_ss_input(row):
    """Build the student input string for one utterance row.

    The result is the marked query utterance, preceded and followed by the
    same speaker's previous and next utterance in the dialogue (when they
    exist). A context utterance carries its predicted emotion as an adverb
    only if that utterance's own ``pred_confidence`` is at least ``p``.
    """
    dialogue_id, speaker = row.Dialogue_ID, row.Speaker

    # This speaker's utterance ids in the dialogue, and where this row sits
    own_turns = speaker_turns[(dialogue_id, speaker)]
    here = own_turns.index(row.Utterance_ID)

    def describe_turn(turn_pos):
        # Outside the speaker's turn list: no context on that side
        if not 0 <= turn_pos < len(own_turns):
            return ""
        in_dialogue = df.Dialogue_ID == dialogue_id
        is_turn = df.Utterance_ID == own_turns[turn_pos]
        neighbour = df[in_dialogue & is_turn].iloc[0]
        if neighbour.pred_confidence >= p:
            adverb = emo_adverb[neighbour.pred_label]
            return f"{speaker} {adverb} says: {neighbour.Utterance}"
        return f"{speaker} says: {neighbour.Utterance}"

    # previous / query / next, with empty context segments dropped
    segments = [
        describe_turn(here - 1),
        f"<s> {speaker} <mask> says: {row.Utterance} </s>",
        describe_turn(here + 1),
    ]
    return " ".join(filter(None, segments))

# Build the student input for every row (row-wise apply)
df["student_input"] = df.apply(lambda record: build_ss_input(record), axis=1)

# 4) Save out
out_csv = os.path.join(base, "train_for_student_ss.csv")
df.to_csv(out_csv, index=False)
print(f"Saved single‐speaker (speaker‐window) inputs: {out_csv}")


Saved single‐speaker (speaker‐window) inputs: /content/drive/MyDrive/MELD/train_for_student_ss.csv


In [None]:
import os
import pandas as pd
from collections import defaultdict

# ─── CONFIG ───────────────────────────────────────────────────────────────────
base = "/content/drive/MyDrive/MELD"
# Input: train CSV with prediction columns; output: train CSV with student inputs
preds_file = os.path.join(base, "train_with_all_confidences.csv")  # <-- update this!
out_file = os.path.join(base, "train_for_student_ss.csv")
p = 0.7  # confidence threshold

# Adverb inserted into a context line for each predicted emotion
emo_adverb = dict(
    anger="angrily",
    disgust="disgustedly",
    fear="fearfully",
    joy="joyfully",
    neutral="neutrally",
    sadness="sadly",
    surprise="surprisingly",
)

# ─── 1) Load and sort ──────────────────────────────────────────────────────────
df = (
    pd.read_csv(preds_file)
    .sort_values(["Dialogue_ID", "Utterance_ID"])
    .reset_index(drop=True)
)

# ─── 2) Precompute speaker‐turn lists ─────────────────────────────────────────
# (Dialogue_ID, Speaker) → sorted list of the Utterance_IDs spoken by that speaker
speaker_turns = defaultdict(list)
for _, row in df.iterrows():
    turn_key = (row.Dialogue_ID, row.Speaker)
    speaker_turns[turn_key].append(row.Utterance_ID)
for turn_ids in speaker_turns.values():
    turn_ids.sort()

# ─── 3) Build single‐speaker student_input ────────────────────────────────────
def build_ss_input(row):
    """Build the student input string for one utterance row.

    If the row's own ``pred_confidence`` is below ``p`` (or is NaN), the
    original ``bert_input`` is returned unchanged. Otherwise the result is the
    marked query utterance, preceded and followed by the same speaker's
    previous and next utterance in the dialogue (when they exist), each
    annotated with the adverb of its predicted emotion.

    Only the query's confidence is thresholded; context utterances are always
    annotated, whatever their own confidence.
    """
    # `not (x >= p)` instead of `x < p`: a NaN confidence compares False both
    # ways, and must take the fallback rather than be treated as confident.
    if not (row.pred_confidence >= p):
        return row.bert_input   # fallback if query low‐confidence

    did, uid, spk = row.Dialogue_ID, row.Utterance_ID, row.Speaker
    query = f"<s> {spk} <mask> says: {row.Utterance} </s>"

    # This speaker's utterance ids in the dialogue, and where this row sits
    turns = speaker_turns[(did, spk)]
    pos   = turns.index(uid)

    def fmt(idx):
        # Outside the speaker's turn list: no context on that side
        if idx < 0 or idx >= len(turns):
            return ""
        other_id = turns[idx]
        other    = df[(df.Dialogue_ID==did)&(df.Utterance_ID==other_id)].iloc[0]
        # The query is known to be confident here, so the former
        # "no adverb" branch could never run and has been removed.
        return f"{spk} {emo_adverb[other.pred_label]} says: {other.Utterance}"

    past   = fmt(pos-1)
    future = fmt(pos+1)
    return " ".join(seg for seg in (past, query, future) if seg)

# Build the student input for every row (row-wise apply)
df["student_input"] = df.apply(lambda record: build_ss_input(record), axis=1)

# ─── 4) Save out ───────────────────────────────────────────────────────────────
df.to_csv(out_file, index=False)
print(f"Saved single-speaker student inputs (query-threshold) to:\n{out_file}")


Saved single-speaker student inputs (query-threshold) to:
/content/drive/MyDrive/MELD/train_for_student_ss.csv


In [None]:
import os
import pandas as pd
from collections import defaultdict

# ─── CONFIG ───────────────────────────────────────────────────────────────────
base = "/content/drive/MyDrive/MELD"
# Input: dev CSV with prediction columns; output: dev CSV with student inputs
preds_file = os.path.join(base, "dev_with_all_confidences.csv")  # <-- update this!
out_file = os.path.join(base, "dev_for_student_ss.csv")
p = 0.7  # confidence threshold

# Adverb inserted into a context line for each predicted emotion
emo_adverb = dict(
    anger="angrily",
    disgust="disgustedly",
    fear="fearfully",
    joy="joyfully",
    neutral="neutrally",
    sadness="sadly",
    surprise="surprisingly",
)

# ─── 1) Load and sort ──────────────────────────────────────────────────────────
df = (
    pd.read_csv(preds_file)
    .sort_values(["Dialogue_ID", "Utterance_ID"])
    .reset_index(drop=True)
)

# ─── 2) Precompute speaker‐turn lists ─────────────────────────────────────────
# (Dialogue_ID, Speaker) → sorted list of the Utterance_IDs spoken by that speaker
speaker_turns = defaultdict(list)
for _, row in df.iterrows():
    turn_key = (row.Dialogue_ID, row.Speaker)
    speaker_turns[turn_key].append(row.Utterance_ID)
for turn_ids in speaker_turns.values():
    turn_ids.sort()

# ─── 3) Build single‐speaker student_input ────────────────────────────────────
def build_ss_input(row):
    """Build the student input string for one utterance row.

    If the row's own ``pred_confidence`` is below ``p`` (or is NaN), the
    original ``bert_input`` is returned unchanged. Otherwise the result is the
    marked query utterance, preceded and followed by the same speaker's
    previous and next utterance in the dialogue (when they exist), each
    annotated with the adverb of its predicted emotion.

    Only the query's confidence is thresholded; context utterances are always
    annotated, whatever their own confidence.
    """
    # `not (x >= p)` instead of `x < p`: a NaN confidence compares False both
    # ways, and must take the fallback rather than be treated as confident.
    if not (row.pred_confidence >= p):
        return row.bert_input   # fallback if query low‐confidence

    did, uid, spk = row.Dialogue_ID, row.Utterance_ID, row.Speaker
    query = f"<s> {spk} <mask> says: {row.Utterance} </s>"

    # This speaker's utterance ids in the dialogue, and where this row sits
    turns = speaker_turns[(did, spk)]
    pos   = turns.index(uid)

    def fmt(idx):
        # Outside the speaker's turn list: no context on that side
        if idx < 0 or idx >= len(turns):
            return ""
        other_id = turns[idx]
        other    = df[(df.Dialogue_ID==did)&(df.Utterance_ID==other_id)].iloc[0]
        # The query is known to be confident here, so the former
        # "no adverb" branch could never run and has been removed.
        return f"{spk} {emo_adverb[other.pred_label]} says: {other.Utterance}"

    past   = fmt(pos-1)
    future = fmt(pos+1)
    return " ".join(seg for seg in (past, query, future) if seg)

# Build the student input for every row (row-wise apply)
df["student_input"] = df.apply(lambda record: build_ss_input(record), axis=1)

# ─── 4) Save out ───────────────────────────────────────────────────────────────
df.to_csv(out_file, index=False)
print(f"Saved single-speaker student inputs (query-threshold) to:\n{out_file}")


Saved single-speaker student inputs (query-threshold) to:
/content/drive/MyDrive/MELD/dev_for_student_ss.csv
