In [15]:
import os

import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

#### Setting path to RAVDESS

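RAVDESS file names encode seven dash-separated, two-digit fields:

`Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.wav`

For example, in `03-01-05-01-02-02-12.wav` the third field (`05`) is the emotion code, which maps to "angry".

We'll only keep the emotion codes 01, 03, 04, 05, 06 and 07 (neutral, happy, sad, angry, fearful and disgust).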

In [16]:
DATA_PATH = "../datasets/ravdess"

# Emotion codes kept for this experiment, mapped to human-readable labels.
# Any file whose code is not a key here is dropped.
emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust"
}


def collect_ravdess_files(data_path, code_to_emotion):
    """Recursively collect the ``.wav`` files whose emotion code is wanted.

    The emotion code is read from the third dash-separated field of the
    file name.

    Args:
        data_path: Directory to search recursively.
        code_to_emotion: Mapping from emotion code to label. Files whose
            code is not a key of this mapping are skipped.

    Returns:
        A ``(paths, labels)`` tuple of two equally long lists. Entries are
        ordered by sorted directory name, then sorted file name, so the
        result does not depend on the filesystem's listing order.
    """
    paths = []
    labels = []

    for root, dirs, files in os.walk(data_path):
        # os.walk honours in-place edits of `dirs` when walking top-down,
        # so sorting here pins the traversal order.
        dirs.sort()

        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            fields = file.split("-")
            if len(fields) < 3:
                # Not a RAVDESS-style name (no emotion field); skip it
                # instead of failing with an IndexError.
                continue

            emotion_code = fields[2]
            if emotion_code in code_to_emotion:
                paths.append(os.path.join(root, file))
                labels.append(code_to_emotion[emotion_code])

    return paths, labels


file_paths, emotions = collect_ravdess_files(DATA_PATH, emotion_map)

# Create dataframe
df = pd.DataFrame({
    "path": file_paths,
    "emotion": emotions
})

df.head()


Unnamed: 0,path,emotion
0,../datasets/ravdess\Actor_01\03-01-01-01-01-01...,neutral
1,../datasets/ravdess\Actor_01\03-01-01-01-01-02...,neutral
2,../datasets/ravdess\Actor_01\03-01-01-01-02-01...,neutral
3,../datasets/ravdess\Actor_01\03-01-01-01-02-02...,neutral
4,../datasets/ravdess\Actor_01\03-01-03-01-01-01...,happy


In [17]:
# Class balance: number of clips per emotion label.
emotion_column = df["emotion"]
emotion_column.value_counts()

emotion
happy      384
sad        384
fearful    384
angry      384
disgust    384
neutral    192
Name: count, dtype: int64

In [18]:
# Size of the filtered dataset (one row per collected audio file).
n_samples = df.shape[0]
print("Total samples:", n_samples)

Total samples: 2112


### Audio standardization

In [19]:
SAMPLE_RATE = 22050
DURATION = 3
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION


def load_audio(file_path, sr=SAMPLE_RATE, duration=DURATION):
    """Load an audio file and force it to a fixed number of samples.

    The file is decoded with ``librosa.load`` at ``sr`` samples per second.
    The signal is then truncated to, or zero-padded at the end up to,
    ``duration`` seconds.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load``. Defaults to
            ``SAMPLE_RATE``.
        duration: Target length in seconds. Defaults to ``DURATION``.

    Returns:
        The signal with exactly ``round(rate * duration)`` samples, where
        ``rate`` is the sampling rate reported by ``librosa.load``. With the
        defaults this is ``SAMPLES_PER_TRACK`` samples.
    """
    signal, loaded_sr = librosa.load(file_path, sr=sr)

    # Use the rate librosa reports rather than the requested one, so the
    # length is right whichever rate the signal was actually loaded at.
    target_len = int(round(loaded_sr * duration))

    if len(signal) > target_len:
        return signal[:target_len]

    # Shorter (or equal) clips are right-padded with zeros.
    return np.pad(signal, (0, target_len - len(signal)))

### Extract MFCC

In [20]:
def extract_mfcc(signal, sr=SAMPLE_RATE, n_mfcc=40, n_fft=2048, hop_length=512):
    """Compute the MFCC matrix of an audio signal.

    Thin wrapper around ``librosa.feature.mfcc`` that makes the analysis
    settings tunable per call.

    Args:
        signal: Audio time series, passed to librosa as ``y``.
        sr: Sampling rate of ``signal``. Defaults to ``SAMPLE_RATE``.
        n_mfcc: Number of MFCC coefficients to return.
        n_fft: FFT window size, forwarded to librosa.
        hop_length: Number of samples between successive frames, forwarded
            to librosa.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns. For the fixed-length
        clips produced in this notebook with the default settings, that is
        an array of shape ``(40, 130)``.
    """
    return librosa.feature.mfcc(
        y=signal,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )

In [21]:
# Sanity check: run the load -> MFCC pipeline on the first clip and look at
# the shape of the resulting feature matrix.
first_clip_path = df["path"].iloc[0]
sample_signal = load_audio(first_clip_path)
mfcc = extract_mfcc(sample_signal)

print("MFCC shape:", mfcc.shape)


MFCC shape: (40, 130)


### Encode labels

In [None]:
# Encode each emotion name as an integer class id in a new "label" column.
# LabelEncoder is imported in the notebook's top import cell.
le = LabelEncoder()
df["label"] = le.fit_transform(df["emotion"])

# le.classes_[i] is the emotion name that was encoded as label i.
print(le.classes_)
df.head()

In [23]:
# Build the model inputs: one MFCC matrix per clip (X) and the matching
# integer label (y), both in DataFrame row order.
X = np.array([extract_mfcc(load_audio(clip_path)) for clip_path in df["path"]])
y = np.array(list(df["label"]))

print("Feature shape:", X.shape)
print("Label shape:", y.shape)


Feature shape: (2112, 40, 130)
Label shape: (2112,)


In [24]:
# Append a trailing singleton axis: (samples, n_mfcc, frames) ->
# (samples, n_mfcc, frames, 1).
# Guarded on the rank so that re-running this cell does not keep adding
# extra axes to an already expanded X.
if X.ndim == 3:
    X = X[..., np.newaxis]

print("New feature shape:", X.shape)

New feature shape: (2112, 40, 130, 1)
