In [1]:
!pip install pandas numpy matplotlib scikit-learn torch torchvision torchaudio tqdm

python(67349) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd 
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torchaudio
from tqdm import tqdm

In [44]:
# Build the kagglehub cache location from the current user's home directory instead of
# hard-coding one machine's absolute path.
DATASET_DIR = (
    Path.home()
    / ".cache/kagglehub/datasets/samuelsamsudinng"
    / "iemocap-emotion-speech-database/versions/1"
)
# Kept as a plain string so existing string-based uses of `dataset_path` keep working.
dataset_path = str(DATASET_DIR)
df = pd.read_csv(DATASET_DIR / "iemocap_full_dataset.csv")

print(f"Original dataset size: {len(df)}")

Original dataset size: 10039


In [3]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

NameError: name 'df' is not defined

In [46]:
# List the columns available in the frame read from the CSV.
df.columns

Index(['session', 'method', 'gender', 'emotion', 'n_annotators', 'agreement',
       'path'],
      dtype='object')

In [47]:
# Count rows per emotion label.
df['emotion'].value_counts()

emotion
xxx    2507
fru    1849
neu    1708
ang    1103
sad    1084
exc    1041
hap     595
sur     107
fea      40
oth       3
dis       2
Name: count, dtype: int64

In [28]:
target_emotions = ['hap', 'sad', 'ang', 'neu', 'fru', 'exc']

# Keep only rows whose label is one of the target emotions; copy so later edits
# never touch the original frame.
keep_mask = df['emotion'].isin(target_emotions)
df_filtered = df.loc[keep_mask].copy()

n_removed = len(df) - len(df_filtered)
print("Filtered dataset size: ", len(df_filtered))
print("Filtered emotion distribution: ", df_filtered['emotion'].value_counts())
print(f"Removed {n_removed} utterances (xxx, sur, fea, oth, dis)")

Filtered dataset size:  7380
Filtered emotion distribution:  emotion
fru    1849
neu    1708
ang    1103
sad    1084
exc    1041
hap     595
Name: count, dtype: int64
Removed 2659 utterances (xxx, sur, fea, oth, dis)


### Handling class imbalance by merging excited class (1041) with happy (595)

In [48]:
# Per-class counts of the filtered frame, taken before 'exc' is merged into 'hap'.
class_counts = df_filtered['emotion'].value_counts()
print("Class counts before merging: ", class_counts)

Class counts before merging:  emotion
fru    1849
neu    1708
ang    1103
sad    1084
exc    1041
hap     595
Name: count, dtype: int64


In [30]:
# Relabel 'exc' as 'hap' on a new frame so `df_filtered` itself stays unchanged.
df_filtered_merge = df_filtered.assign(
    emotion=df_filtered['emotion'].replace({'exc': 'hap'})
)

class_counts_after = df_filtered_merge['emotion'].value_counts()
print("Class counts after merging exc -> hap: ", class_counts_after)

Class counts after merging exc -> hap:  emotion
fru    1849
neu    1708
hap    1636
ang    1103
sad    1084
Name: count, dtype: int64


### Checking imbalance ratio

In [49]:
# Ratio between the most and the least frequent class in `class_counts_after`.
largest_class, smallest_class = class_counts_after.max(), class_counts_after.min()
imbalance_ratio = largest_class / smallest_class
print(f"Imbalance ratio: {imbalance_ratio:.2f}")

Imbalance ratio: 1.71


### Creating Train/Val/Test Splits (Session-Based)

Train/Validation: Sessions 1-4 (90% train, 10% validation, stratified by emotion)

Test: Session 5

In [51]:
# Later cells read an `audio_path` column (see get_audio_stats), but no visible cell
# creates it, so a fresh Restart & Run All would fail. Derive it here when it is missing.
if 'audio_path' in df_filtered_merge.columns:
    split_source = df_filtered_merge
else:
    # NOTE(review): assumes `path` is relative to the dataset root in `dataset_path` —
    # TODO confirm; if the wav files live under a sub-folder, add it to the prefix.
    split_source = df_filtered_merge.assign(
        audio_path=f"{dataset_path}/" + df_filtered_merge['path']
    )

# Session-based hold-out: sessions 1-4 feed train/validation, session 5 is the test set.
train_val_df = split_source[split_source['session'].isin([1, 2, 3, 4])].copy()
test_df = split_source[split_source['session'] == 5].copy()

# 90/10 split of sessions 1-4, stratified on the emotion label; the fixed seed keeps it repeatable.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    stratify=train_val_df['emotion'],
    random_state=42
)

In [52]:
# Name -> frame mapping so both summary tables are built from one place.
split_frames = {"Train": train_df, "Validation": val_df, "Test": test_df}
n_total = len(df_filtered_merge)

# Row counts per split and their share of the filtered, merged dataset.
split_stats = pd.DataFrame({
    "Split": list(split_frames),
    "Samples": [len(frame) for frame in split_frames.values()],
    "Percentage": [len(frame) / n_total * 100 for frame in split_frames.values()],
})

print("Split Summary")
display(split_stats)

# Per-emotion counts in each split; a label absent from a split is shown as 0.
emotion_distribution = (
    pd.DataFrame({
        name: frame['emotion'].value_counts()
        for name, frame in split_frames.items()
    })
    .fillna(0)
    .astype(int)
)

print("Emotion Distribution (Counts)")
display(emotion_distribution)

Split Summary


Unnamed: 0,Split,Samples,Percentage
0,Train,5182,70.216802
1,Validation,576,7.804878
2,Test,1622,21.97832


Emotion Distribution (Counts)


Unnamed: 0_level_0,Train,Validation,Test
emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ang,840,93,170
fru,1321,147,381
hap,1075,119,442
neu,1191,133,384
sad,755,84,245


In [54]:
# Show which columns the training split carries.
train_df.columns

Index(['session', 'method', 'gender', 'emotion', 'n_annotators', 'agreement',
       'path', 'audio_path'],
      dtype='object')

In [53]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,session,method,gender,emotion,n_annotators,agreement,path,audio_path
5854,4,script,M,neu,3,2,Session4/sentences/wav/Ses04M_script01_1/Ses04...,/Users/vanilla/.cache/kagglehub/datasets/samue...
7732,4,impro,M,fru,3,3,Session4/sentences/wav/Ses04M_impro02/Ses04M_i...,/Users/vanilla/.cache/kagglehub/datasets/samue...
2836,2,impro,F,hap,4,2,Session2/sentences/wav/Ses02F_impro03/Ses02F_i...,/Users/vanilla/.cache/kagglehub/datasets/samue...
2853,2,impro,F,neu,3,2,Session2/sentences/wav/Ses02F_impro03/Ses02F_i...,/Users/vanilla/.cache/kagglehub/datasets/samue...
1090,1,script,F,fru,3,2,Session1/sentences/wav/Ses01F_script01_1/Ses01...,/Users/vanilla/.cache/kagglehub/datasets/samue...


### Audio Stats

*(We still haven't received the audio files from the source; the data on HF has everything but the audio files.)*

In [None]:
def get_audio_stats(df_subset, sample_size=100, random_state=42):
    """Return the durations (in seconds) of a random sample of audio files.

    Args:
        df_subset: DataFrame with an ``audio_path`` column holding paths that
            ``torchaudio.load`` can open.
        sample_size: Maximum number of rows to sample; capped at ``len(df_subset)``.
        random_state: Seed passed to ``DataFrame.sample`` so the same files are
            picked on every run.

    Returns:
        1-D float numpy array with one duration per file that loaded successfully.
        The array is empty when no file could be loaded.

    Raises:
        KeyError: if ``df_subset`` has no ``audio_path`` column.
    """
    # Fail early with a clear message; otherwise the lookup fails inside the
    # error handler below and surfaces as a confusing nested KeyError.
    if 'audio_path' not in df_subset.columns:
        raise KeyError(
            "df_subset has no 'audio_path' column; add it before calling get_audio_stats"
        )

    sample = df_subset.sample(min(sample_size, len(df_subset)), random_state=random_state)

    durations = []
    for audio_path in tqdm(sample['audio_path'], total=len(sample), desc="Analyzing audio"):
        try:
            waveform, sr = torchaudio.load(audio_path)
            # shape[1] is taken as the number of frames (torchaudio's default
            # channels-first layout); frames / sample rate gives seconds.
            durations.append(waveform.shape[1] / sr)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error loading {audio_path}: {e}")

    return np.array(durations, dtype=float)

print("Analyzing audio durations (sampling 100 files)...")
durations = get_audio_stats(train_df, sample_size=100)

if durations.size == 0:
    # Every load failed (e.g. the audio files are not on disk). min()/max() raise
    # ValueError on an empty array, so report the situation instead of crashing.
    print("No audio files could be loaded; skipping duration statistics.")
else:
    print("Audio Duration Statistics:")
    print(f"Mean: {durations.mean():.2f}s")
    print(f"Std:  {durations.std():.2f}s")
    print(f"Min:  {durations.min():.2f}s")
    print(f"Max:  {durations.max():.2f}s")
    print(f"Median: {np.median(durations):.2f}s")