In [1]:
import sys, json
print('sys.executable:', sys.executable)
print('python version:', sys.version)
try:
    import tensorflow as tf
    print('tensorflow version:', tf.__version__)
except Exception as e:
    print('tensorflow import error:', repr(e))

sys.executable: /Users/anshureddy/Desktop/NLP Project/nlp-team17/.venv/bin/python
python version: 3.12.6 (v3.12.6:a4a2d2b0d85, Sep  6 2024, 16:08:03) [Clang 13.0.0 (clang-1300.0.29.30)]
tensorflow version: 2.16.2
tensorflow version: 2.16.2


In [2]:
import csv, json, re, random, pathlib
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from scipy.sparse import save_npz, load_npz

In [3]:
# Reproducibility
SEED = 17
random.seed(SEED)
np.random.seed(SEED)

# Paths
ROOT = pathlib.Path(".")
RAW  = ROOT / "data_raw"      # contains dialogues_*.txt
PROC = ROOT / "data_proc"     # will be created
PROC.mkdir(parents=True, exist_ok=True)

# Emotion mapping (DailyDialog)
EMO_ID2NAME = {
    0: "no_emotion",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "happiness",
    5: "sadness",
    6: "surprise"
}
EMO_NAME2ID = {v:k for k,v in EMO_ID2NAME.items()}


**Dataset Loading**

In [4]:
text_path = RAW / "dialogues_text.txt"
emo_path  = RAW / "dialogues_emotion.txt"
act_path  = RAW / "dialogues_act.txt"          

texts = text_path.read_text(encoding="utf-8").splitlines()
emos  = emo_path.read_text(encoding="utf-8").splitlines()
acts  = act_path.read_text(encoding="utf-8").splitlines()

assert len(texts) == len(emos) == len(acts), "Mismatch: texts/emos/acts line counts differ."
print(f"Loaded {len(texts):,} dialogues.")

Loaded 13,118 dialogues.


**Flatenning the dialogues to utterance level**

In [5]:
def clean_utt(u: str) -> str:
    u = u.strip()
    u = re.sub(r"\s+", " ", u)  # collapse whitespace
    return u

rows = []
bad_align = 0

for d_id, (t_line, e_line) in enumerate(zip(texts, emos)):
    utts = [clean_utt(u) for u in t_line.split("__eou__") if u.strip()]
    e_labels = [int(x) for x in e_line.split() if x != ""]
    
    if len(utts) != len(e_labels):
        bad_align += 1
        m = min(len(utts), len(e_labels))
        utts, e_labels = utts[:m], e_labels[:m]
    
    for turn_id, (utt, emo_id) in enumerate(zip(utts, e_labels)):
        rows.append({
            "dialog_id": d_id,
            "turn_id": turn_id,
            "utterance": utt,
            "emotion_id": emo_id,
            "emotion": EMO_ID2NAME.get(emo_id, "unknown")
        })



In [6]:
print(f"Total utterances: {len(rows):,} | dialogues with length mismatch trimmed: {bad_align}")
df_all = pd.DataFrame(rows)
df_all.head(10)

Total utterances: 102,979 | dialogues with length mismatch trimmed: 1


Unnamed: 0,dialog_id,turn_id,utterance,emotion_id,emotion
0,0,0,The kitchen stinks .,2,disgust
1,0,1,I'll throw out the garbage .,0,no_emotion
2,1,0,"So Dick , how about getting some coffee for to...",4,happiness
3,1,1,Coffee ? I don ’ t honestly like that kind of ...,2,disgust
4,1,2,"Come on , you can at least try a little , besi...",0,no_emotion
5,1,3,What ’ s wrong with that ? Cigarette is the th...,1,anger
6,1,4,"Not for me , Dick .",0,no_emotion
7,2,0,Are things still going badly with your housegu...,0,no_emotion
8,2,1,Getting worse . Now he ’ s eating me out of ho...,1,anger
9,2,2,"Leo , I really think you ’ re beating around t...",0,no_emotion


In [7]:
master_csv = PROC / "dailydialog_utterances.csv"
df_all.to_csv(master_csv, index=False, encoding="utf-8")
with open(PROC / "emotion_label_map.json", "w", encoding="utf-8") as f:
    json.dump(EMO_ID2NAME, f, indent=2, ensure_ascii=False)

print("Saved:", master_csv)
print("Label map:", PROC / "emotion_label_map.json")

Saved: data_proc/dailydialog_utterances.csv
Label map: data_proc/emotion_label_map.json


In [8]:
print("Dialogs:", df_all["dialog_id"].nunique())
print("Utterances:", len(df_all))

print("\nEmotion distribution (counts):")
print(df_all["emotion"].value_counts())

print("\nEmotion distribution (proportions):")
print((df_all["emotion"].value_counts(normalize=True)*100).round(2).astype(str) + "%")


Dialogs: 13118
Utterances: 102979

Emotion distribution (counts):
emotion
no_emotion    85572
happiness     12885
surprise       1823
sadness        1150
anger          1022
disgust         353
fear            174
Name: count, dtype: int64

Emotion distribution (proportions):
emotion
no_emotion     83.1%
happiness     12.51%
surprise       1.77%
sadness        1.12%
anger          0.99%
disgust        0.34%
fear           0.17%
Name: proportion, dtype: object


In [9]:
# dialogue-level split (no leakage)
dialog_ids = df_all["dialog_id"].unique().tolist()
random.shuffle(dialog_ids)

n = len(dialog_ids)
n_train = int(0.8 * n)
n_val   = int(0.1 * n)
train_ids = set(dialog_ids[:n_train])
val_ids   = set(dialog_ids[n_train:n_train+n_val])
test_ids  = set(dialog_ids[n_train+n_val:])

def subset(df, ids):
    return df[df["dialog_id"].isin(ids)].copy()

train_df = subset(df_all, train_ids)
val_df   = subset(df_all, val_ids)
test_df  = subset(df_all, test_ids)

# Attach split column
train_df["split"] = "train"
val_df["split"]   = "val"
test_df["split"]  = "test"

print(f"Train/Val/Test Dialogues: {len(train_ids)} / {len(val_ids)} / {len(test_ids)}")
print(f"Train/Val/Test Utterances: {len(train_df)} / {len(val_df)} / {len(test_df)}")

# Save
train_df.to_csv(PROC/"train.csv", index=False, encoding="utf-8")
val_df.to_csv(PROC/"val.csv", index=False, encoding="utf-8")
test_df.to_csv(PROC/"test.csv", index=False, encoding="utf-8")
print("\nWrote split files to:", PROC)

Train/Val/Test Dialogues: 10494 / 1311 / 1313
Train/Val/Test Utterances: 82687 / 10268 / 10024

Wrote split files to: data_proc


**Preprocessing**

In [10]:
# Advanced Text Cleaning & Normalization
# Download necessary NLTK data
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Contraction mapping
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
    "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def advanced_clean_utt(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = ' '.join([CONTRACTION_MAP.get(t, t) for t in text.split()])
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stop words
    clean_tokens = [
        lemmatizer.lemmatize(token) for token in tokens 
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(clean_tokens)

# Apply cleaning function
for df in [train_df, val_df, test_df]:
    df['cleaned_utterance'] = df['utterance'].apply(advanced_clean_utt)

print("Cleaned training data sample:")
train_df[['utterance', 'cleaned_utterance']].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anshureddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned training data sample:


Unnamed: 0,utterance,cleaned_utterance
0,The kitchen stinks .,kitchen stink
1,I'll throw out the garbage .,ill throw garbage
7,Are things still going badly with your housegu...,thing still going badly houseguest
8,Getting worse . Now he ’ s eating me out of ho...,getting worse eating house home tried talking ...
9,"Leo , I really think you ’ re beating around t...",leo really think beating around bush guy know ...


**handling imbalance in training set**

In [11]:


# Original distribution in training set
print("Original training set distribution:")
print(train_df['emotion'].value_counts())

# Separate majority and minority classes
majority_class = train_df[train_df['emotion'] == 'no_emotion']
minority_classes = train_df[train_df['emotion'] != 'no_emotion']

# Get the size of the next largest class
undersample_size = len(train_df[train_df['emotion'] == 'happiness'])

# Undersample the majority class
majority_undersampled = majority_class.sample(
    n=undersample_size, 
    random_state=SEED
)

# Combine with minority classes to create a balanced training set
train_df_balanced = pd.concat([majority_undersampled, minority_classes])

# Shuffle the balanced dataset
train_df_balanced = train_df_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("\nBalanced training set distribution:")
print(train_df_balanced['emotion'].value_counts())

# Save the balanced training set
balanced_train_csv = PROC / "train_balanced.csv"
train_df_balanced.to_csv(balanced_train_csv, index=False, encoding="utf-8")
print(f"\nSaved balanced training set to: {balanced_train_csv}")

Original training set distribution:
emotion
no_emotion    68524
happiness     10513
surprise       1455
sadness         947
anger           832
disgust         273
fear            143
Name: count, dtype: int64

Balanced training set distribution:
emotion
no_emotion    10513
happiness     10513
surprise       1455
sadness         947
anger           832
disgust         273
fear            143
Name: count, dtype: int64

Saved balanced training set to: data_proc/train_balanced.csv


**Vectorization for model training**


In [12]:
# TF-IDF Vectorization

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use unigrams and bigrams
    max_features=5000,       # Keep top 5k features
    sublinear_tf=True        # Apply sublinear TF scaling
)

# Fit on the balanced training data and transform all splits
X_train = tfidf_vectorizer.fit_transform(train_df_balanced['cleaned_utterance'])
X_val = tfidf_vectorizer.transform(val_df['cleaned_utterance'])
X_test = tfidf_vectorizer.transform(test_df['cleaned_utterance'])

# Get the labels
y_train = train_df_balanced['emotion_id']
y_val = val_df['emotion_id']
y_test = test_df['emotion_id']

# Save the vectorizer and the processed data
joblib.dump(tfidf_vectorizer, PROC / 'tfidf_vectorizer.joblib')

# Save the sparse matrices
save_npz(PROC / 'X_train.npz', X_train)
save_npz(PROC / 'X_val.npz', X_val)
save_npz(PROC / 'X_test.npz', X_test)

# Save the labels
y_train.to_csv(PROC / 'y_train.csv', index=False, header=True)
y_val.to_csv(PROC / 'y_val.csv', index=False, header=True)
y_test.to_csv(PROC / 'y_test.csv', index=False, header=True)

print("TF-IDF vectorization complete.")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nSaved vectorizer to: {PROC / 'tfidf_vectorizer.joblib'}")
print(f"Saved data matrices to: {PROC}/")

TF-IDF vectorization complete.
X_train shape: (24676, 5000)
X_val shape: (10268, 5000)
X_test shape: (10024, 5000)

Saved vectorizer to: data_proc/tfidf_vectorizer.joblib
Saved data matrices to: data_proc/


In [13]:
# Baseline training: Logistic Regression on TF-IDF features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

# Train classifier
clf = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    max_iter=2000,
    #class_weight='balanced',
    random_state=SEED,
    n_jobs=-1
)
print("Training LogisticRegression baseline...")
clf.fit(X_train, y_train)

# Validation metrics
y_val_pred = clf.predict(X_val)
val_report = classification_report(y_val, y_val_pred, output_dict=True)
val_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Macro-F1: {val_macro_f1:.4f}")
print(classification_report(y_val, y_val_pred))

# Test metrics
y_test_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
test_macro_f1 = f1_score(y_test, y_test_pred, average='macro')
print(f"Test Macro-F1: {test_macro_f1:.4f}")
print(classification_report(y_test, y_test_pred))

# Save model and metrics
joblib.dump(clf, PROC / 'logistic_baseline.joblib')
metrics = {
    'validation': val_report,
    'validation_macro_f1': float(val_macro_f1),
    'test': test_report,
    'test_macro_f1': float(test_macro_f1)
}
with open(PROC / 'baseline_metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
print(f"Saved model and metrics to: {PROC}")

# Confusion matrix plot (test)
cm = confusion_matrix(y_test, y_test_pred, labels=sorted(EMO_ID2NAME.keys()))
cm_norm = cm.astype('float') / (cm.sum(axis=1)[:, np.newaxis] + 1e-12)
plt.figure(figsize=(8,6))
plt.imshow(cm_norm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Normalized Confusion Matrix (test)')
plt.colorbar()
labels = [EMO_ID2NAME[i] for i in sorted(EMO_ID2NAME.keys())]
plt.xticks(np.arange(len(labels)), labels, rotation=45, ha='right')
plt.yticks(np.arange(len(labels)), labels)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig(PROC / 'confusion_matrix_test.png', dpi=150)
plt.close()
print(f"Saved confusion matrix to: {PROC / 'confusion_matrix_test.png'}")

Training LogisticRegression baseline...
Validation Macro-F1: 0.3035
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      8663
           1       0.18      0.05      0.08        80
           2       0.67      0.09      0.15        47
           3       0.00      0.00      0.00        12
           4       0.33      0.72      0.45      1183
           5       0.21      0.25      0.23       106
           6       0.38      0.34      0.36       177

    accuracy                           0.76     10268
   macro avg       0.39      0.32      0.30     10268
weighted avg       0.83      0.76      0.78     10268

Test Macro-F1: 0.3351
              precision    recall  f1-score   support

           0       0.92      0.78      0.85      8385
           1       0.45      0.09      0.15       110
           2       0.38      0.09      0.15        33
           3       1.00      0.05      0.10        19
           4       0.34      0.75      0.4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Saved confusion matrix to: data_proc/confusion_matrix_test.png


In [14]:
# Inference demo: load baseline and show predictions for a few test dialogues
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json

# Paths
vec_path = PROC / 'tfidf_vectorizer.joblib'
model_path = PROC / 'logistic_baseline.joblib'
test_csv = PROC / 'test.csv'
out_dir = PROC

# Load artifacts
vec = joblib.load(vec_path)
clf = joblib.load(model_path)

df_test = pd.read_csv(test_csv)

# Ensure cleaned_utterance exists — fall back to applying basic cleaning if missing
if 'cleaned_utterance' not in df_test.columns:
    print('`cleaned_utterance` column missing in test set — using simple clean fallback')
    def simple_clean(s):
        s = str(s).lower()
        s = re.sub(r"[^a-zA-Z\s]", "", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s
    df_test['cleaned_utterance'] = df_test['utterance'].fillna('').apply(simple_clean)

# Choose a few sample dialogues from the test set (if fewer dialogs exist, take all)
unique_dialogs = df_test['dialog_id'].drop_duplicates()
count = min(3, len(unique_dialogs))
sample_dialogs = unique_dialogs.sample(n=count, random_state=SEED).tolist()

results = {}
for did in sample_dialogs:
    d = df_test[df_test['dialog_id'] == did].sort_values('turn_id')
    texts = d['cleaned_utterance'].fillna(d['utterance']).tolist()
    X = vec.transform(texts)
    probs = clf.predict_proba(X)
    preds = clf.predict(X)
    pred_ids = [int(p) for p in preds]
    true_ids = d['emotion_id'].astype(int).tolist()

    # Build a small DataFrame of results
    row_df = pd.DataFrame({
        'turn_id': d['turn_id'].tolist(),
        'utterance': d['utterance'].tolist(),
        'true_emotion': [EMO_ID2NAME.get(i,'unk') for i in true_ids],
        'pred_emotion': [EMO_ID2NAME.get(i,'unk') for i in pred_ids],
        'pred_max_prob': probs.max(axis=1)
    })

    results[did] = {
        'rows': row_df,
        'probs': probs.tolist()
    }

    print(f"\nDialogue {did} — {len(texts)} turns")
    display(row_df)

    # Plot timeline: probability for top-3 predicted classes per utterance
    top_k = 3
    top_class_indices = np.argsort(np.mean(probs, axis=0))[-top_k:][::-1]
    plt.figure(figsize=(8,3))
    for idx in top_class_indices:
        plt.plot(np.arange(len(texts)), probs[:, idx], marker='o', label=EMO_ID2NAME.get(idx))
    plt.xticks(np.arange(len(texts)), row_df['turn_id'])
    plt.xlabel('Turn (index)')
    plt.ylabel('Predicted probability')
    plt.title(f'Dialogue {did} — top {top_k} class probabilities')
    plt.legend(loc='best')
    plt.tight_layout()
    plot_path = out_dir / f'timeline_dialog_{did}.png'
    plt.savefig(plot_path, dpi=150)
    plt.close()
    print(f"Saved timeline plot: {plot_path}")

# Save a compact JSON of sample predictions
out_preds = {str(did): results[did]['rows'].to_dict(orient='records') for did in results}
with open(out_dir / 'sample_dialog_predictions.json', 'w', encoding='utf-8') as f:
    json.dump(out_preds, f, indent=2, ensure_ascii=False)

print('\nSaved sample predictions to:', out_dir / 'sample_dialog_predictions.json')

`cleaned_utterance` column missing in test set — using simple clean fallback

Dialogue 9542 — 16 turns


Unnamed: 0,turn_id,utterance,true_emotion,pred_emotion,pred_max_prob
0,0,"Excuse me , sir . Can I help you ?",no_emotion,no_emotion,0.937777
1,1,I'd be glad to have your help . Could you make...,no_emotion,happiness,0.657833
2,2,I'm sorry . We can not regulate the air-condit...,no_emotion,no_emotion,0.748632
3,3,Good idea .,no_emotion,happiness,0.95916
4,4,"Your wet towel , sir .",no_emotion,no_emotion,0.818046
5,5,Thank you . Could I have something cold to dri...,no_emotion,happiness,0.517129
6,6,"Yes . We have mineral water , orange juice , c...",no_emotion,no_emotion,0.865185
7,7,"Orange juice , please .",no_emotion,no_emotion,0.868913
8,8,Would you like some ice in your drink ?,no_emotion,happiness,0.551966
9,9,Yes .,no_emotion,no_emotion,0.678648


Saved timeline plot: data_proc/timeline_dialog_9542.png

Dialogue 5269 — 12 turns


Unnamed: 0,turn_id,utterance,true_emotion,pred_emotion,pred_max_prob
0,0,Do you like flowers ?,happiness,happiness,0.600869
1,1,"Of course , I like .",happiness,happiness,0.65814
2,2,What's your favorite flower ?,happiness,happiness,0.758778
3,3,Forsythia . It's also called winter jasmine wh...,happiness,no_emotion,0.513889
4,4,Spring is a lively season .,happiness,happiness,0.678546
5,5,Yes . How about you ?,happiness,no_emotion,0.678648
6,6,I admire plum blossoms very much . It seems th...,happiness,no_emotion,0.597562
7,7,You have a perfect taste !,happiness,happiness,0.932757
8,8,It is the symbol of laughing at hoar frost and...,happiness,happiness,0.566749
9,9,But it used to be ignored by many people .,no_emotion,no_emotion,0.832052


Saved timeline plot: data_proc/timeline_dialog_5269.png

Dialogue 6343 — 6 turns


Unnamed: 0,turn_id,utterance,true_emotion,pred_emotion,pred_max_prob
0,0,Maybe we all will be all things to all men .,no_emotion,no_emotion,0.760013
1,1,How terrible !,no_emotion,sadness,0.408615
2,2,"But for the life , we'll be changed by this so...",no_emotion,no_emotion,0.655885
3,3,"I really don't want to go into the world , I f...",no_emotion,sadness,0.336459
4,4,Don't be silly . We have reached the age to ta...,no_emotion,no_emotion,0.56678
5,5,I know .,no_emotion,no_emotion,0.520597


Saved timeline plot: data_proc/timeline_dialog_6343.png

Saved sample predictions to: data_proc/sample_dialog_predictions.json


In [15]:
# BiLSTM + GloVe baseline (end-to-end)
import os
import pathlib
import joblib
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Paths & reproducibility
SEED = 17
np.random.seed(SEED)
tf.random.set_seed(SEED)
ROOT = pathlib.Path('.')
PROC = ROOT / 'data_proc'
PROC.mkdir(parents=True, exist_ok=True)

# Files
train_csv = PROC / 'train.csv'
val_csv   = PROC / 'val.csv'
test_csv  = PROC / 'test.csv'
label_enc_path = PROC / 'label_encoder.pkl'
tokenizer_path = PROC / 'bilstm_tokenizer.joblib'
model_path = PROC / 'bilstm_glove.h5'
preds_path = PROC / 'bilstm_glove_predictions.csv'

# GloVe file path candidates
glove_candidates = [
    ROOT / 'glove.6B.100d.txt',
    PROC / 'glove.6B.100d.txt',
    pathlib.Path('/usr/local/share/glove.6B.100d.txt'),
    pathlib.Path('glove.6B.100d.txt')
]
glove_path = None
for p in glove_candidates:
    if p.exists():
        glove_path = p
        break
if glove_path is None:
    raise FileNotFoundError(
        'GloVe file not found. Place "glove.6B.100d.txt" in the project root or data_proc.\n'
        'You can download from: https://nlp.stanford.edu/projects/glove/'
    )
print(f'Using GloVe file: {glove_path}')

# Load CSVs
train = pd.read_csv(train_csv)
val = pd.read_csv(val_csv)
test = pd.read_csv(test_csv)
print('Loaded splits: ', len(train), len(val), len(test))

# Ensure required columns exist
for df_name, df in [('train', train), ('val', val), ('test', test)]:
    if 'utterance' not in df.columns or 'emotion' not in df.columns:
        raise ValueError(f"{df_name}.csv must contain 'utterance' and 'emotion' columns")

# Load or create LabelEncoder
if label_enc_path.exists():
    print('Loading existing label encoder...')
    label_encoder = joblib.load(label_enc_path)
else:
    print('label_encoder.pkl not found — fitting a new LabelEncoder on training labels and saving it')
    label_encoder = LabelEncoder()
    label_encoder.fit(train['emotion'].astype(str).tolist())
    joblib.dump(label_encoder, label_enc_path)
    print(f'Saved label encoder to: {label_enc_path}')

# Prepare integer labels
y_train = label_encoder.transform(train['emotion'].astype(str).tolist())
y_val   = label_encoder.transform(val['emotion'].astype(str).tolist())
y_test  = label_encoder.transform(test['emotion'].astype(str).tolist())
num_labels = len(label_encoder.classes_)
print('Num labels:', num_labels)

# Tokenizer + sequences
all_train_texts = train['utterance'].astype(str).tolist()
MAX_WORDS = 20000
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(all_train_texts)
word_index = tokenizer.word_index
vocab_size = min(MAX_WORDS, len(word_index)) + 1
print('Vocab size (used):', vocab_size)

# Save tokenizer
joblib.dump(tokenizer, tokenizer_path)
print(f'Saved tokenizer to: {tokenizer_path}')

# Convert to sequences and pad
MAXLEN = 50
X_train = pad_sequences(tokenizer.texts_to_sequences(train['utterance'].astype(str).tolist()), maxlen=MAXLEN, padding='post', truncating='post')
X_val   = pad_sequences(tokenizer.texts_to_sequences(val['utterance'].astype(str).tolist()), maxlen=MAXLEN, padding='post', truncating='post')
X_test  = pad_sequences(tokenizer.texts_to_sequences(test['utterance'].astype(str).tolist()), maxlen=MAXLEN, padding='post', truncating='post')
print('Sequences shapes:', X_train.shape, X_val.shape, X_test.shape)

# Build embedding matrix from GloVe
EMB_DIM = 100
embeddings_index = {}
print('Loading GloVe vectors (this may take a moment)...')
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        if coefs.shape[0] == EMB_DIM:
            embeddings_index[word] = coefs

print(f'Loaded {len(embeddings_index):,} glove vectors')

embedding_matrix = np.zeros((vocab_size, EMB_DIM), dtype='float32')
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
# Note: index 0 stays zeros (padding)
print('Built embedding matrix:', embedding_matrix.shape)

# Build model
def build_bilstm(vocab_size, emb_dim, maxlen, embedding_matrix, num_labels):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=emb_dim,
                        weights=[embedding_matrix],
                        input_length=maxlen,
                        trainable=False))
    model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_labels, activation='softmax'))
    return model

model = build_bilstm(vocab_size=vocab_size, emb_dim=EMB_DIM, maxlen=MAXLEN, embedding_matrix=embedding_matrix, num_labels=num_labels)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# Callbacks
checkpoint = ModelCheckpoint(str(model_path), monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, mode='max')

# Train
EPOCHS = 8
BATCH = 32
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH,
    callbacks=[checkpoint, early],
    verbose=2
)

# Load best model (checkpoint ensured)
if os.path.exists(model_path):
    print('Loading best saved model...')
    model = tf.keras.models.load_model(str(model_path))

# Evaluation on test set
y_proba = model.predict(X_test, batch_size= BATCH)
y_pred = np.argmax(y_proba, axis=1)

test_acc = accuracy_score(y_test, y_pred)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f'Test accuracy: {test_acc:.4f}')
print(f'Test macro F1: {test_macro_f1:.4f}')
print('\nClassification report (test):')
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Save predictions (with dialog_id / turn_id if present)
out_df = test.copy()
out_df['pred_emotion_bilstm'] = label_encoder.inverse_transform(y_pred)
cols = ['utterance', 'emotion', 'pred_emotion_bilstm']
for c in ['dialog_id', 'turn_id']:
    if c in out_df.columns:
        cols.insert(0, c)
out_df.to_csv(preds_path, index=False, encoding='utf-8', columns=cols)
print(f'Saved predictions to: {preds_path}')

# Save tokenizer & label encoder already done; save model architecture/weights path printed
print('Artifacts saved:')
print('-', model_path)
print('-', tokenizer_path)
print('-', label_enc_path)

Using GloVe file: data_proc/glove.6B.100d.txt
Loaded splits:  82687 10268 10024
Loading existing label encoder...
Num labels: 7
Vocab size (used): 17862
Saved tokenizer to: data_proc/bilstm_tokenizer.joblib
Sequences shapes: (82687, 50) (10268, 50) (10024, 50)
Loading GloVe vectors (this may take a moment)...
Loaded 400,000 glove vectors
Built embedding matrix: (17862, 100)


2025-11-17 20:09:46.057623: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-11-17 20:09:46.057664: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-11-17 20:09:46.057679: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-11-17 20:09:46.057711: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-17 20:09:46.057726: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/8


2025-11-17 20:09:47.233347: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from None to 0.86005, saving model to data_proc/bilstm_glove.h5




2584/2584 - 88s - 34ms/step - accuracy: 0.8390 - loss: 0.5240 - val_accuracy: 0.8601 - val_loss: 0.4453
Epoch 2/8

Epoch 2: val_accuracy improved from 0.86005 to 0.86473, saving model to data_proc/bilstm_glove.h5




2584/2584 - 84s - 32ms/step - accuracy: 0.8491 - loss: 0.4689 - val_accuracy: 0.8647 - val_loss: 0.4289
Epoch 3/8

Epoch 3: val_accuracy improved from 0.86473 to 0.86560, saving model to data_proc/bilstm_glove.h5




2584/2584 - 83s - 32ms/step - accuracy: 0.8522 - loss: 0.4516 - val_accuracy: 0.8656 - val_loss: 0.4201
Epoch 4/8

Epoch 4: val_accuracy improved from 0.86560 to 0.86697, saving model to data_proc/bilstm_glove.h5




2584/2584 - 83s - 32ms/step - accuracy: 0.8540 - loss: 0.4378 - val_accuracy: 0.8670 - val_loss: 0.4154
Epoch 5/8

Epoch 5: val_accuracy did not improve from 0.86697
2584/2584 - 83s - 32ms/step - accuracy: 0.8547 - loss: 0.4257 - val_accuracy: 0.8663 - val_loss: 0.4174
Epoch 6/8

Epoch 6: val_accuracy did not improve from 0.86697
2584/2584 - 83s - 32ms/step - accuracy: 0.8570 - loss: 0.4123 - val_accuracy: 0.8659 - val_loss: 0.4170
Epoch 7/8

Epoch 7: val_accuracy did not improve from 0.86697
2584/2584 - 84s - 32ms/step - accuracy: 0.8586 - loss: 0.4016 - val_accuracy: 0.8664 - val_loss: 0.4227




Loading best saved model...
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step
Test accuracy: 0.8651
Test macro F1: 0.2645

Classification report (test):
              precision    recall  f1-score   support

       anger       1.00      0.01      0.02       110
     disgust       0.00      0.00      0.00        33
        fear       0.00      0.00      0.00        19
   happiness       0.67      0.41      0.51      1189
  no_emotion       0.88      0.97      0.92      8385
     sadness       0.00      0.00      0.00        97
    surprise       0.62      0.30      0.40       191

    accuracy                           0.87     10024
   macro avg       0.45      0.24      0.26     10024
weighted avg       0.84      0.87      0.84     10024

Saved predictions to: data_proc/bilstm_glove_predictions.csv
Artifacts saved:
- data_proc/bilstm_glove.h5
- data_proc/bilstm_tokenizer.joblib
- data_proc/label_encoder.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
