In [10]:
import csv, json, re, random, pathlib
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from scipy.sparse import save_npz, load_npz

In [11]:
# Reproducibility
SEED = 17
random.seed(SEED)
np.random.seed(SEED)

# Paths
ROOT = pathlib.Path(".")
RAW  = ROOT / "data_raw"      # contains dialogues_*.txt
PROC = ROOT / "data_proc"     # will be created
PROC.mkdir(parents=True, exist_ok=True)

# Emotion mapping (DailyDialog)
EMO_ID2NAME = {
    0: "no_emotion",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "happiness",
    5: "sadness",
    6: "surprise"
}
EMO_NAME2ID = {v:k for k,v in EMO_ID2NAME.items()}


**Dataset Loading**

In [12]:
text_path = RAW / "dialogues_text.txt"
emo_path  = RAW / "dialogues_emotion.txt"
act_path  = RAW / "dialogues_act.txt"          

texts = text_path.read_text(encoding="utf-8").splitlines()
emos  = emo_path.read_text(encoding="utf-8").splitlines()
acts  = act_path.read_text(encoding="utf-8").splitlines()

assert len(texts) == len(emos) == len(acts), "Mismatch: texts/emos/acts line counts differ."
print(f"Loaded {len(texts):,} dialogues.")

Loaded 13,118 dialogues.


**Flatenning the dialogues to utterance level**

In [13]:
def clean_utt(u: str) -> str:
    u = u.strip()
    u = re.sub(r"\s+", " ", u)  # collapse whitespace
    return u

rows = []
bad_align = 0

for d_id, (t_line, e_line) in enumerate(zip(texts, emos)):
    utts = [clean_utt(u) for u in t_line.split("__eou__") if u.strip()]
    e_labels = [int(x) for x in e_line.split() if x != ""]
    
    if len(utts) != len(e_labels):
        bad_align += 1
        m = min(len(utts), len(e_labels))
        utts, e_labels = utts[:m], e_labels[:m]
    
    for turn_id, (utt, emo_id) in enumerate(zip(utts, e_labels)):
        rows.append({
            "dialog_id": d_id,
            "turn_id": turn_id,
            "utterance": utt,
            "emotion_id": emo_id,
            "emotion": EMO_ID2NAME.get(emo_id, "unknown")
        })



In [14]:
print(f"Total utterances: {len(rows):,} | dialogues with length mismatch trimmed: {bad_align}")
df_all = pd.DataFrame(rows)
df_all.head(10)

Total utterances: 102,979 | dialogues with length mismatch trimmed: 1


Unnamed: 0,dialog_id,turn_id,utterance,emotion_id,emotion
0,0,0,The kitchen stinks .,2,disgust
1,0,1,I'll throw out the garbage .,0,no_emotion
2,1,0,"So Dick , how about getting some coffee for to...",4,happiness
3,1,1,Coffee ? I don ’ t honestly like that kind of ...,2,disgust
4,1,2,"Come on , you can at least try a little , besi...",0,no_emotion
5,1,3,What ’ s wrong with that ? Cigarette is the th...,1,anger
6,1,4,"Not for me , Dick .",0,no_emotion
7,2,0,Are things still going badly with your housegu...,0,no_emotion
8,2,1,Getting worse . Now he ’ s eating me out of ho...,1,anger
9,2,2,"Leo , I really think you ’ re beating around t...",0,no_emotion


In [15]:
master_csv = PROC / "dailydialog_utterances.csv"
df_all.to_csv(master_csv, index=False, encoding="utf-8")
with open(PROC / "emotion_label_map.json", "w", encoding="utf-8") as f:
    json.dump(EMO_ID2NAME, f, indent=2, ensure_ascii=False)

print("Saved:", master_csv)
print("Label map:", PROC / "emotion_label_map.json")

Saved: data_proc/dailydialog_utterances.csv
Label map: data_proc/emotion_label_map.json


In [16]:
print("Dialogs:", df_all["dialog_id"].nunique())
print("Utterances:", len(df_all))

print("\nEmotion distribution (counts):")
print(df_all["emotion"].value_counts())

print("\nEmotion distribution (proportions):")
print((df_all["emotion"].value_counts(normalize=True)*100).round(2).astype(str) + "%")


Dialogs: 13118
Utterances: 102979

Emotion distribution (counts):
emotion
no_emotion    85572
happiness     12885
surprise       1823
sadness        1150
anger          1022
disgust         353
fear            174
Name: count, dtype: int64

Emotion distribution (proportions):
emotion
no_emotion     83.1%
happiness     12.51%
surprise       1.77%
sadness        1.12%
anger          0.99%
disgust        0.34%
fear           0.17%
Name: proportion, dtype: object


In [18]:
# dialogue-level split (no leakage)
dialog_ids = df_all["dialog_id"].unique().tolist()
random.shuffle(dialog_ids)

n = len(dialog_ids)
n_train = int(0.8 * n)
n_val   = int(0.1 * n)
train_ids = set(dialog_ids[:n_train])
val_ids   = set(dialog_ids[n_train:n_train+n_val])
test_ids  = set(dialog_ids[n_train+n_val:])

def subset(df, ids):
    return df[df["dialog_id"].isin(ids)].copy()

train_df = subset(df_all, train_ids)
val_df   = subset(df_all, val_ids)
test_df  = subset(df_all, test_ids)

# Attach split column
train_df["split"] = "train"
val_df["split"]   = "val"
test_df["split"]  = "test"

print(f"Train/Val/Test Dialogues: {len(train_ids)} / {len(val_ids)} / {len(test_ids)}")
print(f"Train/Val/Test Utterances: {len(train_df)} / {len(val_df)} / {len(test_df)}")

# Save
train_df.to_csv(PROC/"train.csv", index=False, encoding="utf-8")
val_df.to_csv(PROC/"val.csv", index=False, encoding="utf-8")
test_df.to_csv(PROC/"test.csv", index=False, encoding="utf-8")
print("\nWrote split files to:", PROC)

Train/Val/Test Dialogues: 10494 / 1311 / 1313
Train/Val/Test Utterances: 82687 / 10268 / 10024

Wrote split files to: data_proc


**Preprocessing**

In [19]:
# Advanced Text Cleaning & Normalization
# Download necessary NLTK data
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Contraction mapping
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
    "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def advanced_clean_utt(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = ' '.join([CONTRACTION_MAP.get(t, t) for t in text.split()])
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stop words
    clean_tokens = [
        lemmatizer.lemmatize(token) for token in tokens 
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(clean_tokens)

# Apply cleaning function
for df in [train_df, val_df, test_df]:
    df['cleaned_utterance'] = df['utterance'].apply(advanced_clean_utt)

print("Cleaned training data sample:")
train_df[['utterance', 'cleaned_utterance']].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venusikhakolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned training data sample:


Unnamed: 0,utterance,cleaned_utterance
0,The kitchen stinks .,kitchen stink
1,I'll throw out the garbage .,ill throw garbage
7,Are things still going badly with your housegu...,thing still going badly houseguest
8,Getting worse . Now he ’ s eating me out of ho...,getting worse eating house home tried talking ...
9,"Leo , I really think you ’ re beating around t...",leo really think beating around bush guy know ...


**handling imbalance in training set**

In [None]:


# Original distribution in training set
print("Original training set distribution:")
print(train_df['emotion'].value_counts())

# Separate majority and minority classes
majority_class = train_df[train_df['emotion'] == 'no_emotion']
minority_classes = train_df[train_df['emotion'] != 'no_emotion']

# Get the size of the next largest class
undersample_size = len(train_df[train_df['emotion'] == 'happiness'])

# Undersample the majority class
majority_undersampled = majority_class.sample(
    n=undersample_size, 
    random_state=SEED
)

# Combine with minority classes to create a balanced training set
train_df_balanced = pd.concat([majority_undersampled, minority_classes])

# Shuffle the balanced dataset
train_df_balanced = train_df_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("\nBalanced training set distribution:")
print(train_df_balanced['emotion'].value_counts())

# Save the balanced training set
balanced_train_csv = PROC / "train_balanced.csv"
train_df_balanced.to_csv(balanced_train_csv, index=False, encoding="utf-8")
print(f"\nSaved balanced training set to: {balanced_train_csv}")

Original training set distribution:
emotion
no_emotion    68524
happiness     10513
surprise       1455
sadness         947
anger           832
disgust         273
fear            143
Name: count, dtype: int64

Balanced training set distribution:
emotion
no_emotion    10513
happiness     10513
surprise       1455
sadness         947
anger           832
disgust         273
fear            143
Name: count, dtype: int64

Saved balanced training set to: data_proc/train_balanced.csv


**Vectorization for model training**


In [21]:
# TF-IDF Vectorization

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use unigrams and bigrams
    max_features=5000,       # Keep top 5k features
    sublinear_tf=True        # Apply sublinear TF scaling
)

# Fit on the balanced training data and transform all splits
X_train = tfidf_vectorizer.fit_transform(train_df_balanced['cleaned_utterance'])
X_val = tfidf_vectorizer.transform(val_df['cleaned_utterance'])
X_test = tfidf_vectorizer.transform(test_df['cleaned_utterance'])

# Get the labels
y_train = train_df_balanced['emotion_id']
y_val = val_df['emotion_id']
y_test = test_df['emotion_id']

# Save the vectorizer and the processed data
joblib.dump(tfidf_vectorizer, PROC / 'tfidf_vectorizer.joblib')

# Save the sparse matrices
save_npz(PROC / 'X_train.npz', X_train)
save_npz(PROC / 'X_val.npz', X_val)
save_npz(PROC / 'X_test.npz', X_test)

# Save the labels
y_train.to_csv(PROC / 'y_train.csv', index=False, header=True)
y_val.to_csv(PROC / 'y_val.csv', index=False, header=True)
y_test.to_csv(PROC / 'y_test.csv', index=False, header=True)

print("TF-IDF vectorization complete.")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nSaved vectorizer to: {PROC / 'tfidf_vectorizer.joblib'}")
print(f"Saved data matrices to: {PROC}/")

TF-IDF vectorization complete.
X_train shape: (24676, 5000)
X_val shape: (10268, 5000)
X_test shape: (10024, 5000)

Saved vectorizer to: data_proc/tfidf_vectorizer.joblib
Saved data matrices to: data_proc/
