# Speech Emotion Recognition - Exploratory Data Analysis

**Author:** Tharun Ponnam  
**Email:** tharunponnam007@gmail.com  
**Dataset:** MSP-Podcast Corpus

This notebook explores the MSP-Podcast dataset and demonstrates the feature extraction pipeline for speech emotion recognition.

## Contents
1. Dataset Overview
2. Class Distribution Analysis
3. Audio Signal Analysis
4. Feature Extraction Pipeline
5. Feature Statistics
6. Data Augmentation Visualization

In [None]:
# Core libraries
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio, display

# Audio processing
import librosa
import librosa.display

# Add parent directory to path
sys.path.insert(0, '..')

# Custom modules
from src.data.preprocessing import AudioFeatureExtractor
from src.data.augmentation import AudioAugmentor

# Plotting configuration: grid theme plus notebook-wide defaults for
# figure size and font size, applied in a single rcParams update.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

print("Libraries loaded successfully!")

## 1. Dataset Overview

The MSP-Podcast corpus contains naturalistic emotional speech from podcast recordings.

In [None]:
# Dataset configuration: integer class id -> emotion name.
# Ids are assigned by position in the list, starting at 0.
EMOTION_LABELS = dict(enumerate([
    'Angry',
    'Happy',
    'Sad',
    'Neutral',
    'Fear',
    'Disgust',
    'Surprise',
    'Contempt',
]))

# Fixed hex colour per emotion name so every figure shares one palette.
EMOTION_COLORS = dict(
    Angry='#FF6B6B',
    Happy='#4ECDC4',
    Sad='#6B5B95',
    Neutral='#88D8B0',
    Fear='#F7DC6F',
    Disgust='#A0522D',
    Surprise='#FF69B4',
    Contempt='#708090',
)

print("Emotion categories:")
for class_id, class_name in EMOTION_LABELS.items():
    print(f"  {class_id}: {class_name}")

In [None]:
# Location of the label table (adjust path as needed).
LABELS_PATH = '../data/labels.csv'

if not os.path.exists(LABELS_PATH):
    print("Note: Labels file not found. Using synthetic data for demonstration.")
    # No label file available: build a stand-in table. The global NumPy seed
    # is fixed so the synthetic table is identical on every run.
    np.random.seed(42)
    n_samples = 90103

    # The random draws happen in the order emotion -> duration -> split;
    # keeping that order keeps the seeded values unchanged.
    synthetic_ids = [f'MSP-PODCAST_{i:05d}' for i in range(n_samples)]
    synthetic_emotions = np.random.choice(
        list(EMOTION_LABELS.keys()), n_samples,
        p=[0.12, 0.15, 0.10, 0.35, 0.08, 0.05, 0.08, 0.07],
    )
    synthetic_durations = np.random.exponential(4.5, n_samples)
    synthetic_splits = np.random.choice(
        ['train', 'val', 'test'], n_samples, p=[0.8, 0.1, 0.1]
    )

    df = pd.DataFrame({
        'file_id': synthetic_ids,
        'emotion': synthetic_emotions,
        'duration': synthetic_durations,
        'split': synthetic_splits,
    })

    print(f"Created synthetic dataset with {n_samples} samples")
    display(df.head(10))
else:
    # Label file present: load it and show its shape, columns and first rows.
    df = pd.read_csv(LABELS_PATH)
    print(f"Dataset shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    display(df.head(10))

## 2. Class Distribution Analysis

In [None]:
# Map emotion indices to labels
df['emotion_label'] = df['emotion'].map(EMOTION_LABELS)

# Values missing from EMOTION_LABELS map to NaN and are dropped by
# value_counts(); report them instead of letting them vanish silently.
n_unmapped = int(df['emotion_label'].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} rows have an 'emotion' value not in EMOTION_LABELS")

# Overall class distribution: bar chart (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
emotion_counts = df['emotion_label'].value_counts()
colors = [EMOTION_COLORS[e] for e in emotion_counts.index]

axes[0].bar(emotion_counts.index, emotion_counts.values, color=colors, edgecolor='black')
axes[0].set_xlabel('Emotion')
axes[0].set_ylabel('Count')
axes[0].set_title('Class Distribution')
axes[0].tick_params(axis='x', rotation=45)

# Add count labels. The gap above each bar scales with the tallest bar, so
# the labels stay just above the bars for any dataset size.
label_offset = 0.01 * emotion_counts.max()
for i, count in enumerate(emotion_counts.values):
    axes[0].text(i, count + label_offset, f'{count:,}', ha='center', fontsize=9)

# Pie chart
axes[1].pie(emotion_counts.values, labels=emotion_counts.index, colors=colors,
            autopct='%1.1f%%', startangle=90)
axes[1].set_title('Class Distribution (%)')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/class_distribution.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

print("\nClass Statistics:")
print(emotion_counts.to_string())

In [None]:
# Distribution by split: rows are data splits, columns are emotion labels,
# cells are sample counts (0 where a combination never occurs).
split_dist = df.groupby(['split', 'emotion_label']).size().unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(12, 6))

# One bar per emotion within each split, coloured with the shared palette
split_dist.plot(kind='bar', ax=ax, color=[EMOTION_COLORS[e] for e in split_dist.columns],
                edgecolor='black')

ax.set_xlabel('Split')
ax.set_ylabel('Count')
ax.set_title('Class Distribution by Data Split')
# Legend sits outside the axes so it does not cover the bars
ax.legend(title='Emotion', bbox_to_anchor=(1.02, 1), loc='upper left')
ax.tick_params(axis='x', rotation=0)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/split_distribution.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 3. Audio Signal Analysis

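Let's visualize audio signals and their properties for different emotions. The clips used below are synthetic tones generated for demonstration, not recordings from the corpus.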

In [None]:
# Generate synthetic audio samples for demonstration
# In practice, you would load real audio files

def generate_emotional_audio(emotion, duration=3.0, sr=16000, rng=None):
    """Generate a synthetic tone with rough, emotion-like characteristics.

    Args:
        emotion: Emotion name. 'Angry', 'Happy', 'Sad' and 'Neutral' each have
            a dedicated recipe; any other value yields a plain 250 Hz tone.
        duration: Clip length in seconds.
        sr: Sample rate in Hz.
        rng: Optional NumPy random generator (any object exposing
            ``standard_normal``) used for the additive noise. When omitted,
            the global ``np.random`` state is used, as before.

    Returns:
        1-D ``float32`` array with ``int(sr * duration)`` samples. The signal
        is not normalised or clipped; the 'Angry' recipe can exceed +/-1.0.
    """
    n_samples = int(sr * duration)
    # Sample instants spaced exactly 1/sr apart. linspace(0, duration, n)
    # includes the endpoint and stretches the spacing to duration/(n-1).
    t = np.arange(n_samples) / sr

    if emotion == 'Angry':
        # Higher pitch, more energy, faster variations
        phase = 2 * np.pi * 350 * t
        audio = 0.8 * np.sin(phase)
        audio += 0.3 * np.sin(2 * phase)  # second harmonic
        audio *= (1 + 0.5 * np.sin(2 * np.pi * 8 * t))  # Fast modulation
    elif emotion == 'Happy':
        # Variable pitch: 300 Hz +/- 50 Hz, swept at 0.5 Hz
        freq = 300 + 50 * np.sin(2 * np.pi * 0.5 * t)
        # Phase must be the integral of the instantaneous frequency.
        # Multiplying a time-varying freq by t makes the real pitch f + t*f',
        # which drifts far outside the intended range as t grows.
        phase = 2 * np.pi * np.cumsum(freq) / sr
        audio = 0.6 * np.sin(phase)
        audio += 0.2 * np.sin(1.5 * phase)  # partial at 1.5x the pitch
    elif emotion == 'Sad':
        # Lower pitch, slower variations, less energy
        audio = 0.4 * np.sin(2 * np.pi * 180 * t)
        audio *= np.exp(-0.3 * t)  # Decay envelope
    elif emotion == 'Neutral':
        # Steady, moderate pitch and energy
        audio = 0.5 * np.sin(2 * np.pi * 220 * t)
    else:
        # Fallback for emotions without a dedicated recipe
        audio = 0.5 * np.sin(2 * np.pi * 250 * t)

    # Add some noise for realism
    if rng is None:
        noise = np.random.randn(n_samples)
    else:
        noise = rng.standard_normal(n_samples)
    audio += 0.05 * noise

    return audio.astype(np.float32)

# Generate samples: one synthetic clip per emotion, all at the same sample rate
sample_rate = 16000
emotions_to_plot = ['Angry', 'Happy', 'Sad', 'Neutral']

audio_samples = {}
for emotion_name in emotions_to_plot:
    audio_samples[emotion_name] = generate_emotional_audio(emotion_name, sr=sample_rate)

print("Generated synthetic audio samples for visualization.")

In [None]:
# Visualize waveforms: one panel per emotion on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# zip pairs each clip with a panel and stops at whichever runs out first,
# so extra clips cannot index past the grid.
for ax, (emotion, audio) in zip(axes, audio_samples.items()):
    time = np.arange(len(audio)) / sample_rate
    ax.plot(time, audio, color=EMOTION_COLORS[emotion], linewidth=0.5)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    ax.set_title(f'{emotion} - Waveform')
    # Fit the x-axis to the clip's actual duration instead of assuming 3 s
    ax.set_xlim([0, len(audio) / sample_rate])

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/waveforms.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Visualize mel spectrograms: one panel per emotion on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# The same hop length must be used for analysis and display, otherwise the
# time axis is mislabelled; keep it in one place.
hop_length = 512

# zip stops at whichever of panels/clips runs out first
for ax, (emotion, audio) in zip(axes, audio_samples.items()):
    # Compute mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=sample_rate, n_mels=128, hop_length=hop_length
    )
    # Convert power to dB relative to this clip's own maximum
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    img = librosa.display.specshow(
        mel_spec_db, sr=sample_rate, hop_length=hop_length,
        x_axis='time', y_axis='mel', ax=ax, cmap='magma'
    )
    ax.set_title(f'{emotion} - Mel Spectrogram')
    # Attach the colorbar through the figure rather than pyplot's global state
    fig.colorbar(img, ax=ax, format='%+2.0f dB')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/mel_spectrograms.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Feature Extraction Pipeline

In [None]:
# Build the project's feature extractor with the settings used in this notebook
extractor = AudioFeatureExtractor(sample_rate=16000, n_mfcc=40, n_mels=128, hop_length=512)

# Echo the configuration by reading it back from the extractor object
print("Feature Extractor Configuration:")
for title, attribute in (
    ("Sample Rate", "sample_rate"),
    ("N_MFCC", "n_mfcc"),
    ("N_Mels", "n_mels"),
    ("Hop Length", "hop_length"),
):
    print(f"  {title}: {getattr(extractor, attribute)}")

In [None]:
# Run the extractor on every synthetic clip and keep the result per emotion
features_dict = {}

for emotion_name in audio_samples:
    extracted = extractor.extract(audio_samples[emotion_name])
    features_dict[emotion_name] = extracted
    print(f"{emotion_name}: Feature shape = {extracted.shape}")

In [None]:
# Visualize MFCCs: one panel per emotion on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# The same hop length must be used for analysis and display, otherwise the
# time axis is mislabelled; keep it in one place.
hop_length = 512

# zip stops at whichever of panels/clips runs out first
for ax, (emotion, audio) in zip(axes, audio_samples.items()):
    # Compute MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40, hop_length=hop_length)

    img = librosa.display.specshow(
        mfccs, sr=sample_rate, hop_length=hop_length,
        x_axis='time', ax=ax, cmap='coolwarm'
    )
    ax.set_title(f'{emotion} - MFCCs')
    ax.set_ylabel('MFCC Coefficient')
    # Attach the colorbar through the figure rather than pyplot's global state
    fig.colorbar(img, ax=ax)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/mfccs.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 5. Feature Statistics

In [None]:
# Summarise each emotion's feature array: global mean/std/min/max over all
# entries, plus the sizes of its first two axes.
# NOTE(review): 'Frames'/'Features' assume extract() returns a 2-D
# (frames, features) array; confirm against AudioFeatureExtractor.
stats = [
    {
        'Emotion': emotion_name,
        'Mean': feature_array.mean(),
        'Std': feature_array.std(),
        'Min': feature_array.min(),
        'Max': feature_array.max(),
        'Frames': feature_array.shape[0],
        'Features': feature_array.shape[1],
    }
    for emotion_name, feature_array in features_dict.items()
]

stats_df = pd.DataFrame(stats)
display(stats_df)

In [None]:
# Feature correlation analysis
# Use neutral audio for demonstration
neutral_features = features_dict['Neutral']

# Use at most the first 20 feature columns. Capping at the real column count
# keeps the tick labels and title correct when there are fewer than 20.
n_features = min(20, neutral_features.shape[1])

# corrcoef treats rows as variables, hence the transpose of the column slice.
# A column with zero variance produces NaN entries in the matrix.
corr_matrix = np.corrcoef(neutral_features[:, :n_features].T)
tick_labels = list(range(1, n_features + 1))  # 1-based feature indices

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
            xticklabels=tick_labels, yticklabels=tick_labels)
plt.title(f'Feature Correlation Matrix (First {n_features} Features)')
plt.xlabel('Feature Index')
plt.ylabel('Feature Index')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/feature_correlation.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 6. Data Augmentation Visualization

In [None]:
# Initialize augmentor
augmentor = AudioAugmentor()

# Get a sample audio
original_audio = audio_samples['Neutral']

# Apply augmentations; each entry is a waveform keyed by its panel title
augmented = {
    'Original': original_audio,
    'Noise (SNR=20dB)': augmentor.add_noise(original_audio, snr_db=20),
    'Time Stretch (1.2x)': augmentor.time_stretch(original_audio, rate=1.2),
    'Pitch Shift (+3)': augmentor.pitch_shift(original_audio, sample_rate, n_steps=3)
}

# Visualize: one panel per variant on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# zip stops at whichever of panels/variants runs out first
for ax, (name, audio) in zip(axes, augmented.items()):
    time = np.arange(len(audio)) / sample_rate
    ax.plot(time, audio, linewidth=0.5)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    ax.set_title(name)
    # Show at most 3 s; variants with a different length end where they end
    ax.set_xlim([0, min(3, len(audio)/sample_rate)])

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = '../assets/screenshots/augmentation_examples.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## Summary

This notebook demonstrated:

1. **Dataset Analysis**: Class distribution shows natural imbalance typical of emotion datasets
2. **Audio Visualization**: Waveforms and spectrograms of the synthetic demonstration clips illustrate the kind of emotion-specific patterns to look for in real recordings
3. **Feature Extraction**: 180-dimensional features combining MFCCs, mel-spectrograms, and prosodic features
4. **Data Augmentation**: Various techniques to improve model robustness

Key insights:
- Neutral emotion dominates (~35% in the synthetic fallback data; re-check against the real labels), requiring careful handling of class imbalance
- Different emotions show distinct spectral characteristics
- Prosodic features (pitch, energy) provide complementary information to spectral features