In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import soundfile
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
DATA_PATH = '/home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions'

In [3]:
import random

def extract_feature(file_name, mfcc=True, chroma=True, mel=True, augment=False):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

        if augment:
            if random.random() < 0.5:
                noise = 0.005 * np.random.randn(len(X))
                X = X + noise
            if random.random() < 0.5:
                X = librosa.effects.pitch_shift(X, sr=sample_rate, n_steps=random.choice([-2, 2]))

        result = np.array([])

        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            stft = np.abs(librosa.stft(X))
            chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma_feat))
        if mel:
            mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel_feat))

    return result


In [4]:
def load_data(data_path=DATA_PATH, test_size=0.2):
    x, y = [], []
    for emotion_label in os.listdir(data_path):
        emotion_path = os.path.join(data_path, emotion_label)
        if not os.path.isdir(emotion_path):
            continue
        for file in os.listdir(emotion_path):
            if not file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(emotion_path, file)
            try:
                features = extract_feature(file_path, augment=True)
                x.append(features)
                y.append(emotion_label)
            except Exception as e:
                print(f"❌ Skipping {file_path}: {e}")
    return train_test_split(np.array(x), y, test_size=test_size, random_state=42)


In [None]:
x_train, x_test, y_train, y_test = load_data()

❌ Skipping /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Happy/03-01-03-01-02-01-20.wav: operands could not be broadcast together with shapes (166566,2) (166566,) 




In [None]:
clf2=MLPClassifier(alpha=0.01, batch_size=270, epsilon=1e-08, hidden_layer_sizes=(400,), learning_rate='adaptive', max_iter=400)
clf2.fit(x_train,y_train)

In [None]:
y_pred = clf2.predict(x_test)

print("Training set score: {:.3f}".format(clf2.score(x_train, y_train)))
print("Test set score: {:.3f}".format(clf2.score(x_test, y_test)))
print(classification_report(y_test, y_pred))


Training set score: 0.742
Test set score: 0.574


In [None]:
def extract_feature_rnn(file_path, max_len=400):
    import soundfile
    import librosa
    import numpy as np

    try:
        with soundfile.SoundFile(file_path) as f:
            X = f.read(dtype="float32")
            sr = f.samplerate
            mfcc = librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40)  # shape: (40, T)

        mfcc = mfcc.T  # shape: (T, 40)

        if mfcc.shape[0] < max_len:
            pad_width = max_len - mfcc.shape[0]
            mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
        else:
            mfcc = mfcc[:max_len, :]

        return mfcc  # shape: (max_len, 40)
    except Exception as e:
        print(f" Error in {file_path}: {e}")
        return None


In [None]:
def load_data_rnn(data_path, test_size=0.2):
    import os
    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = [], []
    for label in os.listdir(data_path):
        emotion_path = os.path.join(data_path, label)
        if not os.path.isdir(emotion_path):
            continue
        for file in os.listdir(emotion_path):
            if not file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(emotion_path, file)
            feat = extract_feature_rnn(file_path)
            if feat is not None and feat.shape == (400, 40):
                X.append(feat)
                y.append(label)

    X = np.stack(X)  # shape: (samples, 400, 40)
    y = np.array(y)
    return train_test_split(X, y, test_size=test_size, random_state=42)


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = load_data_rnn(DATA_PATH)

encoder = LabelEncoder()
y_train_enc = to_categorical(encoder.fit_transform(y_train))
y_test_enc = to_categorical(encoder.transform(y_test))
num_classes = y_train_enc.shape[1]
    

2025-05-03 19:15:09.401899: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-03 19:15:09.409297: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 19:15:09.470213: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 19:15:09.507895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746317709.543941    4467 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746317709.55

 Error in /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Happy/03-01-03-01-02-01-20.wav: operands could not be broadcast together with remapped shapes [original->remapped]: (2,2)  and requested shape (3,2)
 Error in /home/verethragna/Documents/itmd-524/Audio-Emotion-Classification/src/Emotions/Fearful/03-01-06-01-01-02-20.wav: operands could not be broadcast together with remapped shapes [original->remapped]: (2,2)  and requested shape (3,2)


In [None]:
from tensorflow.keras.models import Sequential
import tensorflow as tf

model = Sequential()
model.add(tf.keras.layers.Conv1D(2048, kernel_size=5, strides=1, padding='same',
                                 activation='relu', input_shape=(400, 40)))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Conv1D(1024, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Conv1D(512, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'))
model.add(tf.keras.layers.BatchNormalization())

#model.add(tf.keras.layers.LSTM(256, return_sequences=True))
#model.add(tf.keras.layers.LSTM(128))

model.add(tf.keras.layers.Flatten())

model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model.fit(X_train, y_train_enc, epochs=10, batch_size=32, validation_data=(X_test, y_test_enc))
loss, accuracy = model.evaluate(X_test, y_test_enc)
y_pred_probs = model.predict(X_test)

y_pred = tf.argmax(y_pred_probs, axis=1)
y_true = tf.argmax(y_test_enc, axis=1)


print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=encoder.classes_))

 
print(classification_report(y_test, y_pred))




Epoch 1/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m638s[0m 3s/step - accuracy: 0.2769 - loss: 1.7481 - val_accuracy: 0.3596 - val_loss: 1.3322
Epoch 2/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m665s[0m 3s/step - accuracy: 0.3598 - loss: 1.3467 - val_accuracy: 0.4497 - val_loss: 1.2429
Epoch 3/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 3s/step - accuracy: 0.4049 - loss: 1.2995 - val_accuracy: 0.5064 - val_loss: 1.1882
Epoch 4/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m672s[0m 3s/step - accuracy: 0.4126 - loss: 1.2668 - val_accuracy: 0.4591 - val_loss: 1.1886
Epoch 5/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m690s[0m 3s/step - accuracy: 0.4128 - loss: 1.2508 - val_accuracy: 0.4673 - val_loss: 1.1433
Epoch 6/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 3s/step - accuracy: 0.4312 - loss: 1.2300 - val_accuracy: 0.4766 - val_loss: 1.1229
Epoch 7/10
[1m214/214

ValueError: Mix of label input types (string and number)