<a href="https://colab.research.google.com/github/tWiLighT-xY91/project-prep-topics/blob/main/auodio-emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using Librosa to create a tiny classifier of emotions. Dataset Used: RAVDESS dataset.

In [20]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import IPython.display as ipd
# Importing latest dataset from Kagglehub
import kagglehub
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/ravdess-emotional-speech-audio


In [21]:
# List files in the dataset directory
audio_files = glob.glob(os.path.join(path, '**', '*.wav'), recursive=True)
print("First 5 audio files:")
for i in range(min(5, len(audio_files))):
    print(audio_files[i])

# Select an audio file to play
if len(audio_files) > 0:
    selected_file = audio_files[0]
    print("\nPlaying selected file:", selected_file)
    display(ipd.Audio(selected_file))
else:
    print("No audio files found in the dataset directory.")

First 5 audio files:
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-08-01-01-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-01-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-07-02-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-07-01-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-02-01-02.wav

Playing selected file: /kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-08-01-01-01-02.wav


In [22]:
for file in audio_files[:10]:
    print(os.path.basename(file))


03-01-08-01-01-01-02.wav
03-01-01-01-01-01-02.wav
03-01-07-02-01-02-02.wav
03-01-07-01-01-02-02.wav
03-01-01-01-02-01-02.wav
03-01-06-02-02-01-02.wav
03-01-04-01-02-01-02.wav
03-01-01-01-01-02-02.wav
03-01-02-01-01-02-02.wav
03-01-03-01-01-01-02.wav


In [23]:
# Function to extract features from the audios using mfcc

emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

def extract_features(file_path):
  try:
    y, sr = librosa.load(file_path, duration = 3, offset = 0.5)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=80)
    return np.mean(mfcc.T, axis=0)
  except Exception as e:
    print(f"Error extracting features from {file_path}: {e}")
    return None

X = []
y = []

for file in audio_files:
    try:
        basename = os.path.basename(file)
        parts = basename.split("-")

        emotion_code = parts[2]  # ✅ confirmed correct index!
        emotion = emotion_map.get(emotion_code)

        if emotion is None:
            print(f"Skipping unknown emotion: {emotion_code}")
            continue

        features = extract_features(file)
        X.append(features)
        y.append(emotion)

    except Exception as e:
        print(f"Skipping file {file}: {e}")

X = np.array(X)
y = np.array(y)


In [24]:
# Building the model

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Normalizing the data:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Handling Class Imbalance:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)

#Training the model
model = MLPClassifier(hidden_layer_sizes=(256,128,64), alpha=0.01,activation='relu',solver = 'adam', max_iter=700)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification report: ",classification_report(y_test, y_pred))

Accuracy: 0.9201388888888888
Classification report:                precision    recall  f1-score   support

           0       0.95      0.95      0.95        87
           1       0.95      0.95      0.95        84
           2       0.93      1.00      0.96        75
           3       0.94      0.97      0.95        62
           4       0.86      0.86      0.86        69
           5       0.78      0.83      0.81        35
           6       0.89      0.91      0.90        89
           7       1.00      0.84      0.91        75

    accuracy                           0.92       576
   macro avg       0.91      0.91      0.91       576
weighted avg       0.92      0.92      0.92       576



In [26]:
# Saving the model!
import pickle
with open("emotion_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
