In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

#### Setting path to RAVDESS

Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.wav

03-01-05-01-02-02-12.wav

we'll only keep 01, 03, 04, 05, 06, 07

In [2]:
DATA_PATH = "../datasets/ravdess"

emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust"
}

file_paths = []
emotions = []

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith(".wav"):
            emotion_code = file.split("-")[2]
            
            if emotion_code in emotion_map:
                file_paths.append(os.path.join(root, file))
                emotions.append(emotion_map[emotion_code])

# Create dataframe
df = pd.DataFrame({
    "path": file_paths,
    "emotion": emotions
})

df.head()


Unnamed: 0,path,emotion
0,../datasets/ravdess\Actor_01\03-01-01-01-01-01...,neutral
1,../datasets/ravdess\Actor_01\03-01-01-01-01-02...,neutral
2,../datasets/ravdess\Actor_01\03-01-01-01-02-01...,neutral
3,../datasets/ravdess\Actor_01\03-01-01-01-02-02...,neutral
4,../datasets/ravdess\Actor_01\03-01-03-01-01-01...,happy


In [3]:
df["emotion"].value_counts()

emotion
happy      192
sad        192
angry      192
fearful    192
disgust    192
neutral     96
Name: count, dtype: int64

In [4]:
print("Total samples:", len(df))

Total samples: 1056


### Audio standardization

In [5]:
SAMPLE_RATE = 22050
DURATION = 3 
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

def load_audio(file_path):
    signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    
    if len(signal) > SAMPLES_PER_TRACK:
        signal = signal[:SAMPLES_PER_TRACK]
    else:
        padding = SAMPLES_PER_TRACK - len(signal)
        signal = np.pad(signal, (0, padding))
        
    return signal

### Extract MFCC features

In [6]:
def extract_mfcc(signal, sr=SAMPLE_RATE, n_mfcc=40):
    mfcc = librosa.feature.mfcc(
        y=signal,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=2048,
        hop_length=512
    )
    
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    
    combined = np.stack((mfcc, delta, delta2), axis=-1)
    
    return combined


In [7]:
sample_signal = load_audio(df["path"].iloc[0])
mfcc = extract_mfcc(sample_signal)

print("MFCC shape:", mfcc.shape)


MFCC shape: (40, 130, 3)


### Encode labels

Actor column

In [8]:
df["actor"] = df["path"].apply(
    lambda x: x.split("-")[-1].replace(".wav", "")
)

In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["label"] = le.fit_transform(df["emotion"])

from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.2, random_state=42)

train_idx, test_idx = next(
    gss.split(df, df["label"], groups=df["actor"])
)

train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

### Speaker-based Train/Test split

In [10]:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(test_size=0.2, random_state=42)

train_idx, test_idx = next(
    gss.split(df, df["label"], groups=df["actor"])
)

train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

### Feature Extraction

In [11]:
X_train, y_train = [], []
X_test, y_test = [], []

# Training set
for _, row in train_df.iterrows():
    signal = load_audio(row["path"])
    features = extract_mfcc(signal)
    X_train.append(features)
    y_train.append(row["label"])

# Test set
for _, row in test_df.iterrows():
    signal = load_audio(row["path"])
    features = extract_mfcc(signal)
    X_test.append(features)
    y_test.append(row["label"])

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (836, 40, 130, 3)
Test shape: (220, 40, 130, 3)


### Normalization

In [12]:
mean = np.mean(X_train)
std = np.std(X_train)

X_train = (X_train - mean) / (std + 1e-8)
X_test = (X_test - mean) / (std + 1e-8)

### One-Hot Encoding

In [13]:
from tensorflow.keras.utils import to_categorical

num_classes = len(np.unique(y_train))

y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)


### Model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, BatchNormalization,
    Flatten, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam

input_shape = X_train.shape[1:]

model = Sequential()

# Block 1
model.add(Conv2D(64, (3,3), activation='relu', padding='same', input_shape=input_shape))
model.add(BatchNormalization())
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.25))

# Block 2
model.add(Conv2D(128, (3,3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.3))

# Block 3
model.add(Conv2D(256, (3,3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#### Model compilation

In [15]:
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=4,
    min_lr=1e-6
)
early_stop = EarlyStopping(
    monitor='val_loss',      # watch validation loss
    patience=8,              # wait 8 epochs before stopping
    restore_best_weights=True
)


In [16]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.argmax(y_train, axis=1)),
    y=np.argmax(y_train, axis=1)
)

class_weights = dict(enumerate(class_weights))
print(class_weights)


{0: np.float64(0.9166666666666666), 1: np.float64(0.9166666666666666), 2: np.float64(0.9166666666666666), 3: np.float64(0.9166666666666666), 4: np.float64(1.8333333333333333), 5: np.float64(0.9166666666666666)}


### Training the model

In [17]:
history = model.fit(
    X_train, y_train,
    epochs=120,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    # callbacks=[early_stop, lr_scheduler]
)

Epoch 1/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 538ms/step - accuracy: 0.2093 - loss: 3.3865 - val_accuracy: 0.0909 - val_loss: 1.8067
Epoch 2/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 525ms/step - accuracy: 0.2524 - loss: 1.7658 - val_accuracy: 0.0909 - val_loss: 1.8252
Epoch 3/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 526ms/step - accuracy: 0.2883 - loss: 1.6646 - val_accuracy: 0.1318 - val_loss: 1.8433
Epoch 4/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 526ms/step - accuracy: 0.2907 - loss: 1.6354 - val_accuracy: 0.1364 - val_loss: 1.8469
Epoch 5/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 531ms/step - accuracy: 0.3158 - loss: 1.6035 - val_accuracy: 0.1727 - val_loss: 1.8125
Epoch 6/120
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 527ms/step - accuracy: 0.3349 - loss: 1.5718 - val_accuracy: 0.1727 - val_loss: 1.8373
Epoch 7/120
[1m