In [3]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import soundfile
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
DATA_PATH = '/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions'

In [5]:
import random

def extract_feature(file_name, mfcc=True, chroma=True, mel=True, augment=False):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

        if augment:
            if random.random() < 0.5:
                noise = 0.005 * np.random.randn(len(X))
                X = X + noise
            if random.random() < 0.5:
                X = librosa.effects.pitch_shift(X, sr=sample_rate, n_steps=random.choice([-2, 2]))

        result = np.array([])

        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            stft = np.abs(librosa.stft(X))
            chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma_feat))
        if mel:
            mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel_feat))

    return result


In [6]:
def load_data(data_path=DATA_PATH, test_size=0.2):
    x, y = [], []
    for emotion_label in os.listdir(data_path):
        emotion_path = os.path.join(data_path, emotion_label)
        if not os.path.isdir(emotion_path):
            continue
        for file in os.listdir(emotion_path):
            if not file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(emotion_path, file)
            try:
                features = extract_feature(file_path, augment=True)
                x.append(features)
                y.append(emotion_label)
            except Exception as e:
                print(f"❌ Skipping {file_path}: {e}")
    return train_test_split(np.array(x), y, test_size=test_size, random_state=42)


In [7]:
x_train, x_test, y_train, y_test = load_data()

❌ Skipping /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Happy/03-01-03-01-02-01-20.wav: operands could not be broadcast together with shapes (166566,2) (166566,) 
❌ Skipping /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Fearful/03-01-06-01-01-02-20.wav: operands could not be broadcast together with shapes (209809,2) (209809,) 


In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (400,), (200, 100)],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [64, 128, 270],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 400]
}

# Initialize base model
mlp = MLPClassifier(epsilon=1e-08)

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best model and hyperparameters
best_model = grid_search.best_estimator_
print("Best parameters found:", grid_search.best_params_)



Best parameters found: {'alpha': 0.001, 'batch_size': 128, 'hidden_layer_sizes': (400,), 'learning_rate': 'constant', 'max_iter': 400}


In [12]:
from sklearn.metrics import precision_recall_fscore_support

y_pred = best_model.predict(x_test)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Micro-Precision: {precision:.3f}")
print(f"Micro-Recall:    {recall:.3f}")
print(f"Micro-F1 Score:  {f1:.3f}")
print(classification_report(y_test, y_pred))

print("Best Hyperparameters:", grid_search.best_params_)


Micro-Precision: 0.614
Micro-Recall:    0.614
Micro-F1 Score:  0.614
              precision    recall  f1-score   support

       Angry       0.58      0.81      0.67       410
     Fearful       0.58      0.50      0.54       420
       Happy       0.69      0.42      0.52       459
         Sad       0.64      0.75      0.69       421

    accuracy                           0.61      1710
   macro avg       0.62      0.62      0.61      1710
weighted avg       0.62      0.61      0.60      1710

Best Hyperparameters: {'alpha': 0.001, 'batch_size': 128, 'hidden_layer_sizes': (400,), 'learning_rate': 'constant', 'max_iter': 400}
