In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import soundfile
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
DATA_PATH = '/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions'

In [3]:
import random

def extract_feature(file_name, mfcc=True, chroma=True, mel=True, augment=False):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

        if augment:
            if random.random() < 0.5:
                noise = 0.005 * np.random.randn(len(X))
                X = X + noise
            if random.random() < 0.5:
                X = librosa.effects.pitch_shift(X, sr=sample_rate, n_steps=random.choice([-2, 2]))

        result = np.array([])

        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            stft = np.abs(librosa.stft(X))
            chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma_feat))
        if mel:
            mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel_feat))

    return result


In [4]:
def load_data(data_path=DATA_PATH, test_size=0.2):
    x, y = [], []
    for emotion_label in os.listdir(data_path):
        emotion_path = os.path.join(data_path, emotion_label)
        if not os.path.isdir(emotion_path):
            continue
        for file in os.listdir(emotion_path):
            if not file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(emotion_path, file)
            try:
                features = extract_feature(file_path, augment=True)
                x.append(features)
                y.append(emotion_label)
            except Exception as e:
                print(f"❌ Skipping {file_path}: {e}")
    return train_test_split(np.array(x), y, test_size=test_size, random_state=42)


In [5]:
x_train, x_test, y_train, y_test = load_data()

❌ Skipping /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Happy/03-01-03-01-02-01-20.wav: operands could not be broadcast together with shapes (166566,2) (166566,) 




❌ Skipping /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Fearful/03-01-06-01-01-02-20.wav: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define parameter grid
param_grid = {
    'n_neighbors': [3, 4, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Base model
knn = KNeighborsClassifier()

# Grid search
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best model
best_knn = grid_search.best_estimator_

Traceback (most recent call last):
  File "/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/.venv/lib/python3.12/site-packages/sklearn/metrics/

In [9]:
from sklearn.metrics import precision_recall_fscore_support

y_pred=best_knn.predict(x_test)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Micro-Precision: {precision:.3f}")
print(f"Micro-Recall:    {recall:.3f}")
print(f"Micro-F1 Score:  {f1:.3f}")
print(classification_report(y_test, y_pred))

print("Best Hyperparameters:", grid_search.best_params_)

Micro-Precision: 0.602
Micro-Recall:    0.602
Micro-F1 Score:  0.602
              precision    recall  f1-score   support

       Angry       0.69      0.66      0.68       410
     Fearful       0.54      0.43      0.48       420
       Happy       0.56      0.58      0.57       459
         Sad       0.61      0.75      0.67       421

    accuracy                           0.60      1710
   macro avg       0.60      0.60      0.60      1710
weighted avg       0.60      0.60      0.60      1710

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
