In [7]:
import csv, json, re, random, pathlib
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# --- Reproducibility --------------------------------------------------------
# One shared seed for both the stdlib and the NumPy global generators.
SEED = 17
np.random.seed(SEED)
random.seed(SEED)

# --- Paths ------------------------------------------------------------------
ROOT = pathlib.Path(".")
RAW = ROOT.joinpath("data_raw")      # input folder: holds dialogues_*.txt
PROC = ROOT.joinpath("data_proc")    # output folder: created below if missing
PROC.mkdir(exist_ok=True, parents=True)

# --- Emotion mapping (DailyDialog) -------------------------------------------
# The position of a name in this list is its integer emotion id.
EMO_ID2NAME = dict(enumerate([
    "no_emotion",
    "anger",
    "disgust",
    "fear",
    "happiness",
    "sadness",
    "surprise",
]))
# Reverse lookup: emotion name -> integer id.
EMO_NAME2ID = dict(zip(EMO_ID2NAME.values(), EMO_ID2NAME.keys()))


**Dataset Loading**

In [11]:
# The three files are read in parallel: line i of each is treated as
# belonging to the same dialogue.
text_path, emo_path, act_path = (
    RAW / file_name
    for file_name in (
        "dialogues_text.txt",
        "dialogues_emotion.txt",
        "dialogues_act.txt",
    )
)

# Read every file fully as UTF-8 and split it into a list of lines.
texts, emos, acts = (
    source.read_text(encoding="utf-8").splitlines()
    for source in (text_path, emo_path, act_path)
)

# All three files must contain the same number of lines.
assert len(texts) == len(emos) == len(acts), "Mismatch: texts/emos/acts line counts differ."
print(f"Loaded {len(texts):,} dialogues.")

Loaded 13,118 dialogues.


**Flattening the dialogues to utterance level**

In [13]:
def clean_utt(u: str) -> str:
    """Normalise the whitespace of a single utterance.

    Leading and trailing whitespace is removed, then every remaining run of
    whitespace characters is replaced by one space.

    Args:
        u: Raw utterance text.

    Returns:
        The utterance with trimmed ends and single-space separation.
    """
    return re.sub(r"\s+", " ", u.strip())

rows = []        # one dict per utterance, turned into a DataFrame below
bad_align = 0    # dialogues whose utterance and label counts disagree

for d_id, (t_line, e_line) in enumerate(zip(texts, emos)):
    # "__eou__" separates utterances; whitespace-only pieces are dropped.
    utts = [clean_utt(piece) for piece in t_line.split("__eou__") if piece.strip()]
    # Labels are whitespace-separated integers; split() never yields "".
    e_labels = list(map(int, e_line.split()))

    if len(utts) != len(e_labels):
        # Count the mismatch and keep only the leading pairs both lists share.
        bad_align += 1
        m = min(len(utts), len(e_labels))
        del utts[m:], e_labels[m:]

    rows.extend(
        {
            "dialog_id": d_id,
            "turn_id": turn_id,
            "utterance": utt,
            "emotion_id": emo_id,
            "emotion": EMO_ID2NAME.get(emo_id, "unknown"),
        }
        for turn_id, (utt, emo_id) in enumerate(zip(utts, e_labels))
    )

print(f"Total utterances: {len(rows):,} | dialogues with length mismatch trimmed: {bad_align}")
df_all = pd.DataFrame(rows)
df_all.head(10)

Total utterances: 102,979 | dialogues with length mismatch trimmed: 1


Unnamed: 0,dialog_id,turn_id,utterance,emotion_id,emotion
0,0,0,The kitchen stinks .,2,disgust
1,0,1,I'll throw out the garbage .,0,no_emotion
2,1,0,"So Dick , how about getting some coffee for to...",4,happiness
3,1,1,Coffee ? I don ’ t honestly like that kind of ...,2,disgust
4,1,2,"Come on , you can at least try a little , besi...",0,no_emotion
5,1,3,What ’ s wrong with that ? Cigarette is the th...,1,anger
6,1,4,"Not for me , Dick .",0,no_emotion
7,2,0,Are things still going badly with your housegu...,0,no_emotion
8,2,1,Getting worse . Now he ’ s eating me out of ho...,1,anger
9,2,2,"Leo , I really think you ’ re beating around t...",0,no_emotion
