In [1]:
import csv, json, re, random, pathlib
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from scipy.sparse import save_npz, load_npz

In [2]:
# Reproducibility
SEED = 17
random.seed(SEED)
np.random.seed(SEED)

# Paths
ROOT = pathlib.Path(".")
RAW  = ROOT / "data_raw"      # contains dialogues_*.txt
PROC = ROOT / "data_proc"     # will be created
PROC.mkdir(parents=True, exist_ok=True)

# Emotion mapping (DailyDialog)
EMO_ID2NAME = {
    0: "no_emotion",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "happiness",
    5: "sadness",
    6: "surprise"
}
EMO_NAME2ID = {v:k for k,v in EMO_ID2NAME.items()}


**Dataset Loading**

In [3]:
# --- Load the Correct Dataset ---

# Load the new, balanced, and augmented dataset created in the previous notebook
df_all = pd.read_csv(PROC / "dailydialog_balanced_augmented.csv")

# The 'dialog_id' column is no longer needed for splitting, but we need a unique ID for each utterance
# to ensure a clean train/val/test split without data leakage.
df_all = df_all.reset_index().rename(columns={'index': 'utterance_id'})

print("Loaded the CORRECT augmented and balanced dataset.")
display(df_all.head())

print(f"\nTotal utterances: {len(df_all):,}")
print("\nEmotion distribution:")
print(df_all["emotion"].value_counts())

Loaded the CORRECT augmented and balanced dataset.


Unnamed: 0,utterance_id,utterance,emotion
0,0,You're right but I'll miss him jumping on me.,sadness
1,1,I used to play,fear
2,2,It was a kind of sixth sense .,no_emotion
3,3,"no, looked everywhere",sadness
4,4,Do you have sport shirts for ladies ?,no_emotion



Total utterances: 109,947

Emotion distribution:
emotion
no_emotion    33124
happiness     33124
sadness       18654
fear           7928
anger          7576
surprise       7144
disgust        2397
Name: count, dtype: int64


In [4]:
# --- Create Numerical Labels ---

from sklearn.preprocessing import LabelEncoder

# The models need a numerical ID for the emotion labels.
# We'll create an 'emotion_id' column.

label_encoder = LabelEncoder()

# Fit the encoder on the full set of emotion labels to ensure consistency
df_all['emotion_id'] = label_encoder.fit_transform(df_all['emotion'])

# Save the mapping from ID to emotion name for later use
emo_id2name = {i: label for i, label in enumerate(label_encoder.classes_)}
with open(PROC / "emotion_label_map.json", "w") as f:
    json.dump(emo_id2name, f)

print("Created 'emotion_id' column.")
print("Emotion to ID mapping:", emo_id2name)
display(df_all.head())

Created 'emotion_id' column.
Emotion to ID mapping: {0: 'anger', 1: 'disgust', 2: 'fear', 3: 'happiness', 4: 'no_emotion', 5: 'sadness', 6: 'surprise'}


Unnamed: 0,utterance_id,utterance,emotion,emotion_id
0,0,You're right but I'll miss him jumping on me.,sadness,5
1,1,I used to play,fear,2
2,2,It was a kind of sixth sense .,no_emotion,4
3,3,"no, looked everywhere",sadness,5
4,4,Do you have sport shirts for ladies ?,no_emotion,4


Total utterances: 102,979 | dialogues with length mismatch trimmed: 1


Unnamed: 0,dialog_id,turn_id,utterance,emotion_id,emotion
0,0,0,The kitchen stinks .,2,disgust
1,0,1,I'll throw out the garbage .,0,no_emotion
2,1,0,"So Dick , how about getting some coffee for to...",4,happiness
3,1,1,Coffee ? I don ’ t honestly like that kind of ...,2,disgust
4,1,2,"Come on , you can at least try a little , besi...",0,no_emotion
5,1,3,What ’ s wrong with that ? Cigarette is the th...,1,anger
6,1,4,"Not for me , Dick .",0,no_emotion
7,2,0,Are things still going badly with your housegu...,0,no_emotion
8,2,1,Getting worse . Now he ’ s eating me out of ho...,1,anger
9,2,2,"Leo , I really think you ’ re beating around t...",0,no_emotion


Saved: data_proc/dailydialog_utterances.csv
Label map: data_proc/emotion_label_map.json


Dialogs: 13118
Utterances: 102979

Emotion distribution (counts):
emotion
no_emotion    85572
happiness     12885
surprise       1823
sadness        1150
anger          1022
disgust         353
fear            174
Name: count, dtype: int64

Emotion distribution (proportions):
emotion
no_emotion     83.1%
happiness     12.51%
surprise       1.77%
sadness        1.12%
anger          0.99%
disgust        0.34%
fear           0.17%
Name: proportion, dtype: object


In [5]:
# --- Split and Clean the Data ---

from sklearn.model_selection import train_test_split

# First, split into 80% train and 20% temporary (for val/test)
train_df, temp_df = train_test_split(
    df_all, 
    test_size=0.2, 
    random_state=SEED, 
    stratify=df_all['emotion']  # Stratify to maintain emotion distribution
)

# Split the temporary 20% into 10% validation and 10% test (50/50 split of temp_df)
val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    random_state=SEED, 
    stratify=temp_df['emotion'] # Stratify again
)

# --- Text Cleaning Function and Dependencies ---

# Download necessary NLTK data
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Contraction mapping
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
    "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def advanced_clean_utt(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = ' '.join([CONTRACTION_MAP.get(t, t) for t in text.split()])
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stop words
    clean_tokens = [
        lemmatizer.lemmatize(token) for token in tokens 
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(clean_tokens)

for df in [train_df, val_df, test_df]:
    df['cleaned_utterance'] = df['utterance'].apply(advanced_clean_utt)

# Attach split column for reference
train_df['split'] = "train"
val_df['split']   = "val"
test_df['split']  = "test"

print(f"Train/Val/Test Utterances: {len(train_df)} / {len(val_df)} / {len(test_df)}")

# Save the split files
train_df.to_csv(PROC/"train.csv", index=False, encoding="utf-8")
val_df.to_csv(PROC/"val.csv", index=False, encoding="utf-8")
test_df.to_csv(PROC/"test.csv", index=False, encoding="utf-8")
print("\nWrote split files to:", PROC)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venusikhakolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Train/Val/Test Utterances: 87957 / 10995 / 10995

Wrote split files to: data_proc


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venusikhakolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned training data sample:
                                           utterance  \
0                               The kitchen stinks .   
1                       I'll throw out the garbage .   
7  Are things still going badly with your housegu...   
8  Getting worse . Now he ’ s eating me out of ho...   
9  Leo , I really think you ’ re beating around t...   

                                   cleaned_utterance  
0                                      kitchen stink  
1                                  ill throw garbage  
7                 thing still going badly houseguest  
8  getting worse eating house home tried talking ...  
9  leo really think beating around bush guy know ...  

Updated val.csv and test.csv with cleaned utterances.


In [6]:
# TF-IDF Vectorization

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use unigrams and bigrams
    max_features=5000,       # Keep top 5k features
    sublinear_tf=True        # Apply sublinear TF scaling
)

# Fit on the balanced training data and transform all splits
X_train = tfidf_vectorizer.fit_transform(train_df['cleaned_utterance'])
X_val = tfidf_vectorizer.transform(val_df['cleaned_utterance'])
X_test = tfidf_vectorizer.transform(test_df['cleaned_utterance'])

# Get the labels
y_train = train_df['emotion_id']
y_val = val_df['emotion_id']
y_test = test_df['emotion_id']

# Save the vectorizer and the processed data
joblib.dump(tfidf_vectorizer, PROC / 'tfidf_vectorizer.joblib')

# Save the sparse matrices
save_npz(PROC / 'X_train.npz', X_train)
save_npz(PROC / 'X_val.npz', X_val)
save_npz(PROC / 'X_test.npz', X_test)

# Save the labels
y_train.to_csv(PROC / 'y_train.csv', index=False, header=True)
y_val.to_csv(PROC / 'y_val.csv', index=False, header=True)
y_test.to_csv(PROC / 'y_test.csv', index=False, header=True)

print("TF-IDF vectorization complete.")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nSaved vectorizer to: {PROC / 'tfidf_vectorizer.joblib'}")
print(f"Saved data matrices to: {PROC}/")

TF-IDF vectorization complete.
X_train shape: (87957, 5000)
X_val shape: (10995, 5000)
X_test shape: (10995, 5000)

Saved vectorizer to: data_proc/tfidf_vectorizer.joblib
Saved data matrices to: data_proc/


**Preprocessing**

In [7]:
# Advanced Text Cleaning & Normalization
# Download necessary NLTK data
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Contraction mapping
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
    "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def advanced_clean_utt(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = ' '.join([CONTRACTION_MAP.get(t, t) for t in text.split()])
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stop words
    clean_tokens = [
        lemmatizer.lemmatize(token) for token in tokens 
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(clean_tokens)

# Apply cleaning function
for df in [train_df, val_df, test_df]:
    df['cleaned_utterance'] = df['utterance'].apply(advanced_clean_utt)

# --- FIX: Save the updated dataframes with cleaned text ---
val_df.to_csv(PROC/"val.csv", index=False, encoding="utf-8")
test_df.to_csv(PROC/"test.csv", index=False, encoding="utf-8")

print("Cleaned training data sample:")
print(train_df[['utterance', 'cleaned_utterance']].head())
print("\nUpdated val.csv and test.csv with cleaned utterances.")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venusikhakolli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned training data sample:
                                               utterance  \
9782   I could believe it, Did my friend really just ...   
70409  Oh no, would you like to talk about it? I real...   
22370                Thanks . I really appreciate that .   
50586  I received a lot of emails from potential clie...   
30742  My work involves various routine bookkeeping a...   

                                       cleaned_utterance  
9782               could believe friend really say loved  
70409             oh would like talk really hope alright  
22370                           thanks really appreciate  
50586  received lot email potential client answer tim...  
30742  work involves various routine bookkeeping basi...  

Updated val.csv and test.csv with cleaned utterances.


**handling imbalance in training set**

In [8]:


# Original distribution in training set
print("Original training set distribution:")
print(train_df['emotion'].value_counts())

# Separate majority and minority classes
majority_class = train_df[train_df['emotion'] == 'no_emotion']
minority_classes = train_df[train_df['emotion'] != 'no_emotion']

# Get the size of the next largest class
undersample_size = len(train_df[train_df['emotion'] == 'happiness'])

# Undersample the majority class
majority_undersampled = majority_class.sample(
    n=undersample_size, 
    random_state=SEED
)

# Combine with minority classes to create a balanced training set
train_df_balanced = pd.concat([majority_undersampled, minority_classes])

# Shuffle the balanced dataset
train_df_balanced = train_df_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("\nBalanced training set distribution:")
print(train_df_balanced['emotion'].value_counts())

# Save the balanced training set
balanced_train_csv = PROC / "train_balanced.csv"
train_df_balanced.to_csv(balanced_train_csv, index=False, encoding="utf-8")
print(f"\nSaved balanced training set to: {balanced_train_csv}")

Original training set distribution:
emotion
happiness     26499
no_emotion    26499
sadness       14923
fear           6342
anger          6061
surprise       5715
disgust        1918
Name: count, dtype: int64

Balanced training set distribution:
emotion
happiness     26499
no_emotion    26499
sadness       14923
fear           6342
anger          6061
surprise       5715
disgust        1918
Name: count, dtype: int64

Saved balanced training set to: data_proc/train_balanced.csv


**Vectorization for model training**


In [9]:
# TF-IDF Vectorization

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use unigrams and bigrams
    max_features=5000,       # Keep top 5k features
    sublinear_tf=True        # Apply sublinear TF scaling
)

# Fit on the balanced training data and transform all splits
X_train = tfidf_vectorizer.fit_transform(train_df_balanced['cleaned_utterance'])
X_val = tfidf_vectorizer.transform(val_df['cleaned_utterance'])
X_test = tfidf_vectorizer.transform(test_df['cleaned_utterance'])

# Get the labels
y_train = train_df_balanced['emotion_id']
y_val = val_df['emotion_id']
y_test = test_df['emotion_id']

# Save the vectorizer and the processed data
joblib.dump(tfidf_vectorizer, PROC / 'tfidf_vectorizer.joblib')

# Save the sparse matrices
save_npz(PROC / 'X_train.npz', X_train)
save_npz(PROC / 'X_val.npz', X_val)
save_npz(PROC / 'X_test.npz', X_test)

# Save the labels
y_train.to_csv(PROC / 'y_train.csv', index=False, header=True)
y_val.to_csv(PROC / 'y_val.csv', index=False, header=True)
y_test.to_csv(PROC / 'y_test.csv', index=False, header=True)

print("TF-IDF vectorization complete.")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nSaved vectorizer to: {PROC / 'tfidf_vectorizer.joblib'}")
print(f"Saved data matrices to: {PROC}/")

TF-IDF vectorization complete.
X_train shape: (87957, 5000)
X_val shape: (10995, 5000)
X_test shape: (10995, 5000)

Saved vectorizer to: data_proc/tfidf_vectorizer.joblib
Saved data matrices to: data_proc/
