In [None]:
# All imports for the notebook live in this single cell (stdlib first, then
# third-party), so a fresh "Restart & Run All" has every dependency in scope.
import os
import re
from collections import defaultdict

import pandas as pd
from google.colab import drive

# Mount Google Drive; the notebook reads and writes files under /content/drive.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# ===== configuration =====
# Root folder on the mounted Drive that holds every input and output CSV.
BASE = "/content/drive/MyDrive/MELD"

# Inputs: teacher predictions. Each file needs dialogue id, utterance id,
# speaker, utterance text, predicted label and prediction confidence columns.
TEACH_TRAIN = os.path.join(BASE, "teacher_predictions_train.csv")
TEACH_DEV = os.path.join(BASE, "teacher_predictions_dev.csv")

# Inputs: raw MELD splits, used as the source of the gold Emotion label.
RAW_TRAIN = os.path.join(BASE, "train_with_context.csv")
RAW_DEV = os.path.join(BASE, "dev_with_context.csv")

# Outputs: one paragraph-style student input per utterance.
OUT_TRAIN = os.path.join(BASE, "train_for_student_paragraph.csv")
OUT_DEV = os.path.join(BASE, "dev_for_student_paragraph.csv")

# Knobs for the suggestive context text.
TAU = 0.70    # minimum teacher confidence required to insert an adverb
K_PAST = 2    # how many preceding utterances go into the paragraph
K_FUT = 2     # how many following utterances go into the paragraph

# Emotion label -> adverb placed between the speaker name and "says:".
EMO_ADVERB = {
    "anger": "angrily",
    "disgust": "disgustedly",
    "fear": "fearfully",
    "joy": "joyfully",
    "neutral": "neutrally",
    "sadness": "sadly",
    "surprise": "surprisingly",
}

def pick_col(df, candidates):
    """Resolve which of several possible column names ``df`` actually uses.

    Candidates are tried in order. For each one, an exact match against
    ``df.columns`` wins; otherwise a case-insensitive match is accepted and
    the column's real spelling is returned.

    Args:
        df: object exposing ``columns`` (an iterable of string names).
        candidates: column names to look for, in priority order.

    Returns:
        The name of the matching column exactly as it appears in ``df``.

    Raises:
        KeyError: if no candidate matches any column.
    """
    # Lower-cased name -> real name; a later column overrides an earlier one
    # when both share the same lower-cased spelling.
    by_lower = {}
    for column in df.columns:
        by_lower[column.lower()] = column

    for wanted in candidates:
        if wanted in df.columns:
            return wanted
        folded = wanted.lower()
        if folded in by_lower:
            return by_lower[folded]

    raise KeyError(f"Need one of {candidates}, have {list(df.columns)}")

def norm(s):
    """Return ``str(s)`` with each whitespace run squeezed to one space and the ends trimmed."""
    as_text = str(s)
    squeezed = re.sub(r"\s+", " ", as_text)
    return squeezed.strip()

def format_context_line(spk, text, pred, conf):
    """Render one context utterance as 'Speaker [adverb] says: text'.

    The adverb is looked up in EMO_ADVERB from ``str(pred)`` and is only
    inserted when ``conf`` is not missing and is at least TAU. If the
    confidence is missing or too low, or the label has no adverb, the plain
    'Speaker says: text' form is returned.
    """
    # Evaluated left to right so float() is never applied to a missing value.
    is_confident = pd.notna(conf) and float(conf) >= TAU
    adverb = EMO_ADVERB.get(str(pred)) if is_confident else None
    if not adverb:
        return f"{spk} says: {text}"
    return f"{spk} {adverb} says: {text}"

def build_paragraph_rows(teach_csv, raw_csv, out_csv):
    """Build one masked-query paragraph per utterance and save them as CSV.

    For every utterance in the teacher-prediction file, the paragraph is made
    of up to K_PAST preceding context lines, the query line
    ``<s> Speaker <mask> says: text </s>``, and up to K_FUT following context
    lines, all from the same dialogue and joined with single spaces. Context
    lines come from ``format_context_line``; the query line never carries an
    adverb. The gold label is taken from the raw split by joining on
    dialogue id and utterance id.

    Args:
        teach_csv: path of the teacher-prediction CSV (dialogue id, utterance
            id, speaker, utterance text, predicted label, confidence).
        raw_csv: path of the raw split CSV that holds the gold emotion.
        out_csv: destination CSV; written with the columns ``student_input``
            and ``Emotion``.

    Returns:
        None. The result is written to ``out_csv`` and a summary is printed.

    Raises:
        KeyError: if a required column cannot be found in either input.
        pandas.errors.MergeError: if (dialogue id, utterance id) is not
            unique in either input.
    """
    teacher = pd.read_csv(teach_csv)
    raw = pd.read_csv(raw_csv)

    # Canonical name -> actual column name in the teacher file.
    teacher_cols = {
        "DID":  pick_col(teacher, ["Dialogue_ID", "conv_id", "Conversation_ID", "dialogue_id"]),
        "UID":  pick_col(teacher, ["Utterance_ID", "utt_id", "Utterance_ID_in_Dialogue", "utterance_id"]),
        "SPK":  pick_col(teacher, ["Speaker", "speaker", "speaker_id"]),
        "UTT":  pick_col(teacher, ["Utterance", "utterance", "text"]),
        "PRED": pick_col(teacher, ["pred", "pred_label", "coarse_pred", "teacher_pred"]),
        "CONF": pick_col(teacher, ["conf", "pred_confidence", "coarse_conf", "teacher_confidence"]),
    }
    # Canonical name -> actual column name in the raw file (gold label source).
    gold_cols = {
        "DID": pick_col(raw, ["Dialogue_ID", "conv_id", "Conversation_ID", "dialogue_id"]),
        "UID": pick_col(raw, ["Utterance_ID", "utt_id", "Utterance_ID_in_Dialogue", "utterance_id"]),
        "EMO": pick_col(raw, ["Emotion", "emotion", "label"]),
    }

    # Project and rename BEFORE merging. If the teacher file also carries an
    # Emotion/label column, merging the unrenamed frames would suffix both
    # copies (_x/_y) and no "EMO" column would exist afterwards.
    teacher_view = teacher[list(teacher_cols.values())].rename(
        columns={actual: canon for canon, actual in teacher_cols.items()}
    )
    gold_view = raw[list(gold_cols.values())].rename(
        columns={actual: canon for canon, actual in gold_cols.items()}
    )

    df = teacher_view.merge(
        gold_view,
        on=["DID", "UID"],
        how="left",
        validate="one_to_one",
    )

    # The left join keeps utterances without a gold label; make that visible
    # instead of silently emitting the string "nan" as a class label.
    n_unlabeled = int(df["EMO"].isna().sum())
    if n_unlabeled:
        print(
            f"⚠ {n_unlabeled} utterance(s) from {teach_csv} have no gold Emotion "
            f"in {raw_csv}; their Emotion is written as 'nan'"
        )

    # One global sort puts every dialogue's utterances in order, so the groups
    # below need no further sorting.
    df = df.sort_values(["DID", "UID"]).reset_index(drop=True)

    paragraphs = []
    for _, dialogue in df.groupby("DID", sort=False):
        speakers = [str(spk) for spk in dialogue["SPK"]]
        texts = [norm(utt) for utt in dialogue["UTT"]]
        labels = [str(emo) for emo in dialogue["EMO"]]

        # Build each utterance's context form once; it is reused by every
        # neighbouring window that includes it.
        context_lines = [
            format_context_line(spk, txt, pred, conf)
            for spk, txt, pred, conf in zip(
                speakers, texts, dialogue["PRED"], dialogue["CONF"]
            )
        ]

        n_utts = len(speakers)
        for i in range(n_utts):
            left = max(0, i - K_PAST)
            right = min(n_utts, i + 1 + K_FUT)

            # Exactly one masked line per paragraph: the query utterance.
            query_line = f"<s> {speakers[i]} <mask> says: {texts[i]} </s>"
            pieces = context_lines[left:i] + [query_line] + context_lines[i + 1:right]

            paragraphs.append({
                "student_input": " ".join(pieces),
                "Emotion": labels[i],
            })

    # Fixing the columns keeps the CSV header even when there are no rows.
    out_df = pd.DataFrame(paragraphs, columns=["student_input", "Emotion"])
    out_df.to_csv(out_csv, index=False)
    print(f"✓ Saved → {out_csv}  (rows: {len(out_df)})")

# Build the student-input CSVs for both splits: train first, then dev.
for _split_paths in (
    (TEACH_TRAIN, RAW_TRAIN, OUT_TRAIN),
    (TEACH_DEV, RAW_DEV, OUT_DEV),
):
    build_paragraph_rows(*_split_paths)


Mounted at /content/drive
✓ Saved → /content/drive/MyDrive/MELD/train_for_student_paragraph.csv  (rows: 9988)
✓ Saved → /content/drive/MyDrive/MELD/dev_for_student_paragraph.csv  (rows: 1108)
