In [1]:

import tensorflow as tf
import numpy as np
import os
import librosa
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Dataset path: root folder whose sub-folders each hold the audio files of one class.
# Defaults to the Kaggle input location used by this notebook; set the
# TESS_DATASET_DIR environment variable to point at another copy of the data
# without editing the notebook.
dataset_folder = os.environ.get(
    "TESS_DATASET_DIR",
    "/kaggle/input/speech-emotion-recognition-en/Tess",
)

# Function to load and extract features from audio
def extract_features(file_path, sr=22050, n_mfcc=40):
    """Load an audio file and return its MFCC matrix.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Target sample rate for loading. ``None`` keeps the file's
            native rate.
        n_mfcc: Number of MFCC coefficients to compute per frame.

    Returns:
        The MFCC array transposed to ``(time_steps, n_mfcc)``.
    """
    # Use the rate librosa actually loaded at, not the requested one: they are
    # equal when `sr` is a number, but with sr=None only the returned value is
    # a usable sample rate for the MFCC computation.
    audio, sample_rate = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.T  # Transpose to get (time_steps, features)

# Prepare dataset
# Walk every class sub-folder of dataset_folder and collect one MFCC matrix
# (from extract_features) plus one integer class index per .wav file.
features, labels = [], []
label_map = {}  # class index -> sub-folder name

# Only directories count as classes. Filtering BEFORE enumerate keeps the
# indices contiguous (0 .. n_classes-1) even when stray files sit next to the
# class folders; with gaps, len(label_map) would no longer match the largest
# label and the later to_categorical(..., len(label_map)) calls would fail.
class_folders = sorted(
    entry
    for entry in os.listdir(dataset_folder)
    if os.path.isdir(os.path.join(dataset_folder, entry))
)

# NOTE(review): folder names are used verbatim as classes. With the layout
# printed further down in this notebook (OAF_* / YAF_* folders) the same
# emotion recorded by two speakers becomes two separate classes - confirm
# that this is intended.
for idx, emotion in enumerate(class_folders):
    emotion_folder = os.path.join(dataset_folder, emotion)
    label_map[idx] = emotion
    # os.listdir returns entries in arbitrary order; sorting fixes the sample
    # order so the seeded train/val/test splits are reproducible across runs.
    for file in sorted(os.listdir(emotion_folder)):
        # Case-insensitive match so ".WAV" files are not silently skipped.
        if file.lower().endswith(".wav"):
            file_path = os.path.join(emotion_folder, file)
            mfcc = extract_features(file_path)
            features.append(mfcc)
            labels.append(idx)
                

In [3]:
# Show the class-index -> folder-name mapping built while loading the dataset.
print(f"{label_map}")

{0: 'OAF_Fear', 1: 'OAF_Pleasant_surprise', 2: 'OAF_Sad', 3: 'OAF_angry', 4: 'OAF_disgust', 5: 'OAF_happy', 6: 'OAF_neutral', 7: 'YAF_angry', 8: 'YAF_disgust', 9: 'YAF_fear', 10: 'YAF_happy', 11: 'YAF_neutral', 12: 'YAF_pleasant_surprised', 13: 'YAF_sad'}


In [5]:
# The clip with the most MFCC frames fixes the common sequence length.
max_len = max(len(clip) for clip in features)
print(max_len)

# Zero-pad each clip at the end of its time axis up to max_len, then stack
# everything into a single (n_clips, max_len, n_features) array.
x_data = np.stack([
    np.pad(clip, [(0, max_len - len(clip)), (0, 0)], mode='constant')
    for clip in features
])
y_labels = np.asarray(labels)

# Split dataset into train (70%), validation (15%) and test (15%) sets.
# Both splits are stratified on the class index so every subset keeps the
# class proportions of the data it was drawn from; this needs at least two
# samples per class in the array being split.
x_train, x_temp, y_train, y_temp = train_test_split(
    x_data, y_labels, test_size=0.3, random_state=42, stratify=y_labels
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Convert integer class indices to one-hot vectors (one column per class).
num_classes = len(label_map)
y_train = to_categorical(y_train, num_classes)
y_val = to_categorical(y_val, num_classes)
y_test = to_categorical(y_test, num_classes)

# Hyperparameters
input_dim = x_data.shape[2]  # Feature size inferred from dataset (last axis)
seq_len = x_data.shape[1]  # Padded sequence length inferred from dataset
hidden_dim = 128  # Units in every LSTM layer
output_dim = len(label_map)  # One output unit per class in label_map
num_layers = 2  # Number of stacked LSTM layers (must be >= 1)
learning_rate = 0.001
batch_size = 32
epochs = 5

if num_layers < 1:
    raise ValueError("num_layers must be at least 1")

# Define LSTM Model
model = Sequential()
# Declare the input shape with an explicit Input object instead of passing
# input_shape= to the first layer.
model.add(tf.keras.Input(shape=(seq_len, input_dim)))
for layer_idx in range(num_layers):
    # Every LSTM except the last must emit the full sequence so the next LSTM
    # receives 3-D input; the last one emits only its final state for the
    # classifier. This keeps the stack valid for any num_layers >= 1, not
    # just for num_layers == 2.
    is_last_lstm = layer_idx == num_layers - 1
    model.add(LSTM(hidden_dim, return_sequences=not is_last_lstm))
model.add(Dense(output_dim, activation='softmax'))

# Compile Model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


129


In [6]:
# Train the model, reporting validation metrics after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
)

# Evaluate Model on the held-out test split (unpacked as loss, then accuracy).
test_metrics = model.evaluate(x=x_test, y=y_test)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.4f}")
print("Training Complete!")


Epoch 1/5
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.2613 - loss: 2.1541 - val_accuracy: 0.6405 - val_loss: 0.9233
Epoch 2/5
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7758 - loss: 0.6507 - val_accuracy: 0.8143 - val_loss: 0.4844
Epoch 3/5
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8771 - loss: 0.3204 - val_accuracy: 0.6690 - val_loss: 0.9111
Epoch 4/5
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8285 - loss: 0.4511 - val_accuracy: 0.8881 - val_loss: 0.2682
Epoch 5/5
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9265 - loss: 0.1838 - val_accuracy: 0.9548 - val_loss: 0.1463
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9690 - loss: 0.1008
Test Accuracy: 0.9595
Training Complete!


In [7]:
def predict_emotion(file_path):
    """Predict the class label of a single audio file with the trained model.

    The file's MFCC matrix (from ``extract_features``) is brought to exactly
    ``max_len`` frames - truncated if longer, zero-padded at the end if
    shorter - and passed to ``model.predict`` as a batch of one.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The ``label_map`` entry for the highest-scoring class. The same value
        is also printed.
    """
    mfcc = extract_features(file_path)
    # A clip longer than the training sequence length would give np.pad a
    # negative pad width (ValueError); keep only the first max_len frames.
    mfcc = mfcc[:max_len]
    mfcc_padded = np.pad(mfcc, ((0, max_len - mfcc.shape[0]), (0, 0)), mode='constant')
    mfcc_padded = np.expand_dims(mfcc_padded, axis=0)  # Add batch dimension
    prediction = model.predict(mfcc_padded)
    # Batch of one: take the arg-max of its only row as a plain int key.
    predicted_label = int(np.argmax(prediction, axis=-1)[0])
    emotion = label_map[predicted_label]
    print(f"Predicted Emotion: {emotion}")
    return emotion

In [None]:
# predict_emotion("/kaggle/input/speech-emotion-recognition-en/Tess/YAF_disgust/YAF_back_disgust.wav")

In [None]:
# model.save("audio_emotion_model.keras")