In [1]:
# --- 1. Imports and Setup ---

import pandas as pd
import numpy as np
import pathlib

# Paths
ROOT = pathlib.Path(".")
RAW = ROOT / "data_raw"
PROC = ROOT / "data_proc"
PROC.mkdir(exist_ok=True)

# Original dataset (DailyDialog)
ORIGINAL_DATASET_PATH = PROC / "dailydialog_utterances.csv"

# New dataset
NEW_DATASET_PATH = RAW / "emotion-emotion_69k.csv"

print("Setup complete.")

Setup complete.


In [2]:
# --- 2. Load Datasets ---

# Load the original DailyDialog dataset
df_original = pd.read_csv(ORIGINAL_DATASET_PATH)

# Load the new emotion dataset
df_new = pd.read_csv(NEW_DATASET_PATH)

print("Original Dataset (DailyDialog):")
display(df_original.head())

print("\nNew Dataset (emotion_69k):")
display(df_new.head())

Original Dataset (DailyDialog):


Unnamed: 0,dialog_id,turn_id,utterance,emotion_id,emotion
0,0,0,The kitchen stinks .,2,disgust
1,0,1,I'll throw out the garbage .,0,no_emotion
2,1,0,"So Dick , how about getting some coffee for to...",4,happiness
3,1,1,Coffee ? I don ’ t honestly like that kind of ...,2,disgust
4,1,2,"Come on , you can at least try a little , besi...",0,no_emotion



New Dataset (emotion_69k):


Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


In [3]:
# --- 3. Inspect Emotions in New Dataset ---

# Get the unique emotion labels from the new dataset
unique_emotions_new = df_new['emotion'].unique()

print("Unique emotions in the new dataset:")
for emotion in unique_emotions_new:
    print(f"- {emotion}")

Unique emotions in the new dataset:
- sentimental
- afraid
- proud
- faithful
- terrified
- joyful
- angry
- sad
- jealous
- grateful
- prepared
- embarrassed
- excited
- annoyed
- lonely
- ashamed
- guilty
- surprised
- nostalgic
- confident
- furious
- disappointed
- caring
- trusting
- disgusted
- anticipating
- anxious
- hopeful
- content
- impressed
- apprehensive
- devastated
-  I really killed it!
-  we were in a different country
- t even like scary things
- t believe I like the show Power so much. I was never really into shows like that
- nan
-  time to jump on the motorcycle and go cruising!
-  a boy.  I hear all these different labor stories that aren't exactly reassuring!  
- t believe my daughter taught herself how to play the ukelele. I was amazed
- t think I wold like super heroes
- m so mad with my brother. He stole from me and didn't think I would notice. 
-  but what I didn't know was that he was working in the next room with the door open.  He approached and asked wh

In [4]:
# --- 4. Map and Clean New Dataset ---

# Define the mapping from new emotions to our target emotions
emotion_mapping = {
    # Anger
    'angry': 'anger',
    'furious': 'anger',
    'annoyed': 'anger',

    # Disgust
    'disgusted': 'disgust',

    # Fear
    'afraid': 'fear',
    'terrified': 'fear',
    'anxious': 'fear',
    'apprehensive': 'fear',

    # Happiness
    'joyful': 'happiness',
    'excited': 'happiness',
    'proud': 'happiness',
    'grateful': 'happiness',
    'content': 'happiness',
    'impressed': 'happiness',
    'confident': 'happiness',
    'hopeful': 'happiness',
    'caring': 'happiness', # Often expressed with positive sentiment
    'trusting': 'happiness',

    # Sadness
    'sad': 'sadness',
    'lonely': 'sadness',
    'disappointed': 'sadness',
    'ashamed': 'sadness',
    'guilty': 'sadness',
    'embarrassed': 'sadness',
    'devastated': 'sadness',
    'nostalgic': 'sadness', # Can be bittersweet, but often leans towards sadness/longing
    'sentimental': 'sadness',

    # Surprise
    'surprised': 'surprise',
    'anticipating': 'surprise',

    # Default to no_emotion for others
    'faithful': 'no_emotion',
    'prepared': 'no_emotion',
    'jealous': 'no_emotion', # Could be anger, but let's be conservative
}

# Apply the mapping
df_new['emotion_mapped'] = df_new['emotion'].str.strip().map(emotion_mapping).fillna('no_emotion')

# The text data seems to be in the 'empathetic_dialogues' column. Let's use that.
# We also need to clean it up a bit.
def clean_new_text(text):
    if not isinstance(text, str):
        return ""
    # Remove the 'Customer :' and 'Agent :' prefixes
    text = re.sub(r'(Customer|Agent)\s*:\s*', '', text)
    # A single utterance might be better, let's take the first line
    return text.split('\n')[0].strip()

import re
df_new['utterance'] = df_new['empathetic_dialogues'].apply(clean_new_text)

# Select and rename columns to match the original dataset format
df_new_formatted = df_new[['utterance', 'emotion_mapped']].rename(columns={'emotion_mapped': 'emotion'})

# Remove rows with empty utterances
df_new_formatted = df_new_formatted[df_new_formatted['utterance'] != ""]

print("New dataset after mapping and cleaning:")
display(df_new_formatted.head())

print("\nValue counts of mapped emotions:")
print(df_new_formatted['emotion'].value_counts())

New dataset after mapping and cleaning:


Unnamed: 0,utterance,emotion
0,I remember going to see the fireworks with my ...,sadness
1,This was a best friend. I miss her.,sadness
2,We no longer talk.,sadness
3,"Was this a friend you were in love with, or ju...",sadness
4,Where has she gone?,sadness



Value counts of mapped emotions:
emotion
happiness     20239
sadness       17504
fear           7754
anger          6554
surprise       5321
no_emotion     5220
disgust        2044
Name: count, dtype: int64


In [5]:
# --- 5. Combine, Shuffle, and Save ---

# Select relevant columns from the original dataset
df_original_subset = df_original[['utterance', 'emotion']]

# Concatenate the two dataframes
df_combined = pd.concat([df_original_subset, df_new_formatted], ignore_index=True)

# Shuffle the combined dataframe
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final combined dataset
COMBINED_DATASET_PATH = PROC / "dailydialog_plus_emotion69k.csv"
df_combined.to_csv(COMBINED_DATASET_PATH, index=False, encoding='utf-8')

print(f"Combined dataset shape: {df_combined.shape}")
print(f"Saved combined dataset to: {COMBINED_DATASET_PATH}")

print("\nFinal emotion distribution in the combined dataset:")
print(df_combined['emotion'].value_counts())

Combined dataset shape: (167615, 2)
Saved combined dataset to: data_proc/dailydialog_plus_emotion69k.csv

Final emotion distribution in the combined dataset:
emotion
no_emotion    90792
happiness     33124
sadness       18654
fear           7928
anger          7576
surprise       7144
disgust        2397
Name: count, dtype: int64


In [11]:
# --- 6. Balance the Combined Dataset via Undersampling ---

# Separate the majority class ('no_emotion') from the minority classes
df_majority = df_combined[df_combined['emotion'] == 'no_emotion']
df_minority = df_combined[df_combined['emotion'] != 'no_emotion']

# Determine the size of the next largest class ('happiness')
target_size = len(df_combined[df_combined['emotion'] == 'happiness'])

# Randomly undersample the majority class to match the target size
df_majority_undersampled = df_majority.sample(
    n=target_size, 
    random_state=42
)

# Combine the undersampled majority class with the minority classes
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the final balanced dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final balanced dataset
BALANCED_DATASET_PATH = PROC / "dailydialog_balanced_augmented.csv"
df_balanced.to_csv(BALANCED_DATASET_PATH, index=False, encoding='utf-8')

print(f"Final balanced dataset shape: {df_balanced.shape}")
print(f"Saved balanced dataset to: {BALANCED_DATASET_PATH}")

print("\nFinal emotion distribution in the balanced dataset:")
print(df_balanced['emotion'].value_counts())

Final balanced dataset shape: (109947, 2)
Saved balanced dataset to: data_proc/dailydialog_balanced_augmented.csv

Final emotion distribution in the balanced dataset:
emotion
no_emotion    33124
happiness     33124
sadness       18654
fear           7928
anger          7576
surprise       7144
disgust        2397
Name: count, dtype: int64


In [6]:
df_combined[df_combined['emotion'] == 'no_emotion']

Unnamed: 0,utterance,emotion
0,I was thinking about seeing a movie .,no_emotion
2,"Mary , how was your date with John ?",no_emotion
3,I mean did they reward your 25 years of servic...,no_emotion
4,I don ’ t know . Let me see .,no_emotion
5,Can I help you ?,no_emotion
...,...,...
167601,I'll say the movie's starting in the minute .,no_emotion
167602,I'm afraid not ! Now this is just between you ...,no_emotion
167606,"But if you placed a larger order , we would in...",no_emotion
167608,"Really , that ’ s cool . Let me see what you ’...",no_emotion
