In [None]:
import zipfile

# Path to the ZIP file and the folder to extract to
zip_file_path = '/content/drive/MyDrive/Speech Emotion Recognition.zip'  # Replace with your ZIP file path
extract_to_folder = '/content/SER'  # Replace with your desired extraction folder

# Extract files
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_folder)

print(f'Files extracted to {extract_to_folder}')


Files extracted to /content/SER


In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Function to extract features from an audio file
def extract_features(audio_path, max_len=22050*5):  # 5 seconds of audio at 22050Hz
    # Load the audio file
    audio, sample_rate = librosa.load(audio_path, sr=None, duration=5)  # Limit to 5 seconds
    # Ensure consistent length by padding or truncating
    if len(audio) < max_len:
        padding = max_len - len(audio)
        audio = np.pad(audio, (0, padding), 'constant')
    else:
        audio = audio[:max_len]

    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs = np.mean(mfccs.T, axis=0)  # Average across time axis

    return mfccs

# Function to load the dataset and extract features
def load_data(dataset_path):
    features = []
    labels = []

    # Iterate through each folder and file
    for label in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, label)

        # Only process if it's a folder
        if os.path.isdir(folder_path):
            for file in os.listdir(folder_path):
                if file.endswith('.wav'):  # Only process .wav files
                    file_path = os.path.join(folder_path, file)
                    try:
                        mfccs = extract_features(file_path)
                        features.append(mfccs)
                        labels.append(label)
                    except Exception as e:
                        print(f"Error processing {file}: {e}")
                        continue

    return np.array(features), np.array(labels)

# Path to your dataset folder
dataset_path = '/content/SER/Speech Emotion Recognition/TESS Toronto emotional speech set data'

# Load the data
X, y = load_data(dataset_path)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = to_categorical(label_encoder.fit_transform(y))

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Reshape features for CNN input (CNN expects 4D input)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1, 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1, 1)



In [None]:
# Build a CNN model
model = Sequential()

# Add convolutional layers
model.add(Conv2D(32, (1, 1), activation='relu', padding='same', input_shape=(X_train.shape[1], 1, 1)))  # Kernel size (1,1) for MFCC features
model.add(MaxPooling2D((1, 1)))  # Pooling with (1,1) to avoid reducing dimensions too much
model.add(Dropout(0.3))

model.add(Conv2D(64, (1, 1), activation='relu', padding='same'))
model.add(MaxPooling2D((1, 1)))  # Pooling with (1,1)
model.add(Dropout(0.3))

model.add(Conv2D(128, (1, 1), activation='relu', padding='same'))
model.add(MaxPooling2D((1, 1)))  # Pooling with (1,1)
model.add(Dropout(0.3))

# Flatten the output
model.add(Flatten())

# Add dense layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

# Output layer (adjust output size according to the number of emotions)
num_classes = y_encoded.shape[1]  # Number of unique emotions
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save the model for future use
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the path in your Google Drive where the model will be saved
model_save_path = '/content/drive/My Drive/speech_emotion_recognition_model.h5'

# Save the model to Google Drive
model.save(model_save_path)

print(f"Model saved to: {model_save_path}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.1031 - loss: 9.5568 - val_accuracy: 0.4411 - val_loss: 2.0153
Epoch 2/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.3370 - loss: 2.0039 - val_accuracy: 0.9089 - val_loss: 0.3867
Epoch 3/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6743 - loss: 1.0134 - val_accuracy: 0.9482 - val_loss: 0.1431
Epoch 4/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7956 - loss: 0.5843 - val_accuracy: 0.9625 - val_loss: 0.1022
Epoch 5/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8682 - loss: 0.3949 - val_accuracy: 0.9893 - val_loss: 0.0460
Epoch 6/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8976 - loss: 0.3287 - val_accuracy: 0.9875 - val_loss: 0.0438
Epoch 7/30
[1m70/70[0m [32m━━━━━━━━━



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to: /content/drive/My Drive/speech_emotion_recognition_model.h5


In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Load the trained model
from tensorflow.keras.models import load_model

# Load the model
model = load_model('/content/drive/MyDrive/speech_emotion_recognition_model.h5')

# Make predictions on the test set
predictions = model.predict(X_test)

# Convert predictions from probabilities to class indices
predicted_classes = np.argmax(predictions, axis=1)

# Convert actual labels from one-hot encoding to class indices
actual_classes = np.argmax(y_test, axis=1)

# Decode the labels back to their original form
predicted_labels = label_encoder.inverse_transform(predicted_classes)
actual_labels = label_encoder.inverse_transform(actual_classes)

# Print actual and predicted labels for each sample
print("Actual vs. Predicted Emotions:")
for i in range(len(actual_labels)):
    print(f"Audio {i+1}: Actual = {actual_labels[i]}, Predicted = {predicted_labels[i]}")

# Alternatively, you can use a classification report for a summary
print("\nClassification Report:")
print(classification_report(actual_labels, predicted_labels))




[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step
Actual vs. Predicted Emotions:
Audio 1: Actual = OAF_Pleasant_surprise, Predicted = OAF_Pleasant_surprise
Audio 2: Actual = OAF_Fear, Predicted = OAF_Fear
Audio 3: Actual = OAF_happy, Predicted = OAF_happy
Audio 4: Actual = OAF_Pleasant_surprise, Predicted = OAF_Pleasant_surprise
Audio 5: Actual = YAF_neutral, Predicted = YAF_neutral
Audio 6: Actual = OAF_angry, Predicted = OAF_angry
Audio 7: Actual = YAF_sad, Predicted = YAF_sad
Audio 8: Actual = YAF_pleasant_surprised, Predicted = YAF_pleasant_surprised
Audio 9: Actual = YAF_fear, Predicted = YAF_fear
Audio 10: Actual = OAF_Pleasant_surprise, Predicted = OAF_Pleasant_surprise
Audio 11: Actual = OAF_disgust, Predicted = OAF_disgust
Audio 12: Actual = OAF_disgust, Predicted = OAF_disgust
Audio 13: Actual = OAF_happy, Predicted = OAF_happy
Audio 14: Actual = OAF_Pleasant_surprise, Predicted = OAF_Pleasant_surprise
Audio 15: Actual = OAF_happy, Predicted = OAF_happ