In [None]:
# Required Libraries
import csv
import os
from zipfile import ZipFile

import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# RAVDESS Emotional speech audio
# https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio

# Location of the downloaded dataset archive.
data='/content/audio_speech_actors_01-24.zip'

# The archive handle is named `archive`, not `zip`: a `with ... as zip` at
# notebook scope rebinds the builtin zip() to a ZipFile object, and that
# binding survives the `with` block and breaks any later zip(...) call.
with ZipFile(data,'r') as archive:
  # No target path is given, so files are extracted into the current
  # working directory.
  archive.extractall()
  print('dataset is extracted successfully')

dataset is extracted successfully


In [None]:
# Let's read a sample audio using librosa
# The call returns two values, unpacked here as the audio data and its
# sample rate. No `sr` argument is passed, unlike extract_features below,
# which passes sr=None.
audio_file_path='/content/audio_speech_actors_01-24/Actor_06/03-01-06-02-01-02-06.wav'
librosa_audio_data,librosa_sample_rate=librosa.load(audio_file_path)

In [None]:
# Emotion class mapping (from ID to name)
emotion_mapping = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Folder containing the actor directories
parent_folder = 'audio_speech_actors_01-24'  # Adjust this path to your folder

# Prepare to write the CSV file
csv_filename = 'audio_emotion_data.csv'

# Filenames are seven '-'-separated fields:
#   modality-speech-emotion-intensity-statement-repetition-actor.wav
# Only the emotion field (index 2) is needed for the metadata table.
EXPECTED_FILENAME_FIELDS = 7
EMOTION_FIELD_INDEX = 2

# Open the CSV file in write mode
with open(csv_filename, mode='w', newline='') as csvfile:
    fieldnames = ['Filename', 'Folder', 'Class ID', 'Class Name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the CSV row order (and therefore the later seeded
    # train/test split) reproducible across machines and re-runs.
    for actor_folder in sorted(os.listdir(parent_folder)):
        actor_folder_path = os.path.join(parent_folder, actor_folder)

        # Ensure we're only looking at directories
        if not os.path.isdir(actor_folder_path):
            continue

        for file in sorted(os.listdir(actor_folder_path)):
            if not file.endswith('.wav'):
                continue

            parts = file.split('-')

            # A stray .wav that does not follow the naming scheme would
            # otherwise raise IndexError or be silently mislabelled.
            if len(parts) != EXPECTED_FILENAME_FIELDS:
                print(f"Skipping file with unexpected name format: {file}")
                continue

            emotion_class_id = parts[EMOTION_FIELD_INDEX]
            # IDs missing from the mapping are kept, labelled 'Unknown'.
            emotion_class_name = emotion_mapping.get(emotion_class_id, 'Unknown')

            # Write row to CSV
            writer.writerow({
                'Filename': file,
                'Folder': actor_folder,
                'Class ID': emotion_class_id,
                'Class Name': emotion_class_name
            })

print(f"CSV file '{csv_filename}' has been created successfully.")

CSV file 'audio_emotion_data.csv' has been created successfully.


In [None]:
# Load the metadata CSV written by the previous cell for inspection.
import pandas as pd  # NOTE(review): pandas is already imported in the first cell; this re-import is redundant
audio_df=pd.read_csv('audio_emotion_data.csv')
# Preview the first ten rows.
audio_df.head(10)

Unnamed: 0,Filename,Folder,Class ID,Class Name
0,03-01-02-01-02-01-01.wav,Actor_01,2,calm
1,03-01-08-01-02-02-01.wav,Actor_01,8,surprised
2,03-01-05-01-01-01-01.wav,Actor_01,5,angry
3,03-01-07-02-02-02-01.wav,Actor_01,7,disgust
4,03-01-03-02-02-02-01.wav,Actor_01,3,happy
5,03-01-01-01-02-02-01.wav,Actor_01,1,neutral
6,03-01-02-02-02-01-01.wav,Actor_01,2,calm
7,03-01-08-01-01-01-01.wav,Actor_01,8,surprised
8,03-01-08-02-02-01-01.wav,Actor_01,8,surprised
9,03-01-04-02-02-02-01.wav,Actor_01,4,sad


In [None]:
# Count how many rows each emotion label has (class-balance check).
audio_df['Class Name'].value_counts()

Unnamed: 0_level_0,count
Class Name,Unnamed: 1_level_1
calm,192
surprised,192
angry,192
disgust,192
happy,192
sad,192
fearful,192
neutral,96


In [None]:
# Number of rows whose emotion label is missing.
audio_df['Class Name'].isnull().sum()

0

In [None]:
# Root directory of the audio files; the feature-extraction loop joins it
# with each row's Folder and Filename to build the full path.
audio_dataset_path='/content/audio_speech_actors_01-24'
# Metadata table read from the same CSV file that audio_df was loaded from.
metadata=pd.read_csv('audio_emotion_data.csv')
metadata.head()

Unnamed: 0,Filename,Folder,Class ID,Class Name
0,03-01-02-01-02-01-01.wav,Actor_01,2,calm
1,03-01-08-01-02-02-01.wav,Actor_01,8,surprised
2,03-01-05-01-01-01-01.wav,Actor_01,5,angry
3,03-01-07-02-02-02-01.wav,Actor_01,7,disgust
4,03-01-03-02-02-02-01.wav,Actor_01,3,happy


In [None]:
# Feature Extraction Function
def extract_features(file, n_mfcc=40, sr=None):
    """Return the time-averaged MFCC vector for one audio file.

    Parameters
    ----------
    file : str or path-like
        Audio file, passed straight to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute; this is also the length of
        the returned vector.
    sr : number or None, default None
        Sample rate forwarded to ``librosa.load``. ``None`` keeps the
        file's native rate.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc``: each coefficient averaged over all
        time frames.

    Raises
    ------
    ValueError
        If the file decodes to zero samples, since averaging over no frames
        would only produce NaNs.
    """
    audio, sample_rate = librosa.load(file, sr=sr)
    if audio.size == 0:
        raise ValueError(f"No audio samples found in file: {file}")

    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so axis 0 is time, then average over it: one value per coefficient.
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

In [None]:
# Extract Features and Labels
features = []
labels = []

# Resolve the dataset root once instead of once per row.
dataset_root = os.path.abspath(audio_dataset_path)

# itertuples(name=None) yields plain tuples in the selected column order,
# avoiding the per-row Series that iterrows() builds.
label_columns = ["Folder", "Filename", "Class Name"]
for folder, wav_name, class_label in metadata[label_columns].itertuples(index=False, name=None):
    file_name = os.path.join(dataset_root, str(folder), str(wav_name))
    try:
        features_ = extract_features(file_name)
    except Exception as e:
        # Best effort: report the unreadable file and keep going, so one bad
        # clip does not abort the whole extraction run.
        print(f"Error encountered while parsing file: {file_name}, Error: {e}")
        continue
    # Feature and label are appended together so the two lists stay aligned.
    features.append(features_)
    labels.append(class_label)

# Fail here with a clear message rather than later inside scikit-learn/Keras.
if not features:
    raise RuntimeError(
        "No features could be extracted; check audio_dataset_path and the metadata CSV."
    )

# Convert to Numpy Arrays
X = np.array(features)
label_names = np.array(labels)

# Encode Labels (string class names -> integer ids 0..n_classes-1)
labelencoder = LabelEncoder()
y_encoded = labelencoder.fit_transform(label_names)

# Class Weights: 'balanced' weighting computed from the integer ids, then
# stored as {class index: weight} for use as Keras `class_weight`.
class_weights = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weights_dict = dict(enumerate(class_weights))

# One-Hot Encoding of Labels
# dtype is explicit because pandas >= 2.0 returns boolean dummy columns by
# default, while the loss expects numeric targets.
y = pd.get_dummies(y_encoded, dtype=np.float32).values

In [None]:
# Train-Test Split (stratified so every emotion keeps its share in both sets)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature Scaling
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ANN Model
# The input shape is declared with an explicit Input layer instead of
# `input_shape=` on the first Dense layer, which triggers a Keras warning
# for Sequential models.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),

    # First dense block
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # Hidden Layers
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # Output Layer: one softmax unit per one-hot label column
    Dense(y_train.shape[1], activation='softmax'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# --- Compile ---------------------------------------------------------------
optimizer = Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# --- Callbacks -------------------------------------------------------------
# Both callbacks monitor validation loss: one stops training and restores the
# best weights, the other halves the learning rate down to a floor of 1e-6.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True,
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=1e-6,
)
training_callbacks = [early_stopping, lr_scheduler]

# --- Train -----------------------------------------------------------------
fit_options = {
    'batch_size': 32,
    'epochs': 650,
    'validation_split': 0.1,
    'class_weight': class_weights_dict,
    'callbacks': training_callbacks,
    'verbose': 1,
}
history = model.fit(X_train_scaled, y_train, **fit_options)

# --- Evaluate --------------------------------------------------------------
evaluation = model.evaluate(X_test_scaled, y_test, verbose=0)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Epoch 1/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 155ms/step - accuracy: 0.1295 - loss: 3.2796 - val_accuracy: 0.1293 - val_loss: 2.0972 - learning_rate: 1.0000e-04
Epoch 2/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1337 - loss: 3.2715 - val_accuracy: 0.1293 - val_loss: 2.1054 - learning_rate: 1.0000e-04
Epoch 3/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1395 - loss: 3.2729 - val_accuracy: 0.1379 - val_loss: 2.1045 - learning_rate: 1.0000e-04
Epoch 4/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1434 - loss: 3.1443 - val_accuracy: 0.1466 - val_loss: 2.0963 - learning_rate: 1.0000e-04
Epoch 5/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1368 - loss: 3.0941 - val_accuracy: 0.1724 - val_loss: 2.0800 - learning_rate: 1.0000e-04
Epoch 6/650
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
# Building a Predictive System (Testing Some Audio Data)

filename='/content/audio_speech_actors_01-24/Actor_06/03-01-06-02-01-02-06.wav'

# Reuse the training-time extractor so the clip gets the same loading
# (sr=None) and the same MFCC settings as the training data. Loading with
# librosa's default `sr` here would give features the model never saw.
mfccs_scaled = extract_features(filename)
print(mfccs_scaled)

# The model expects a batch dimension: (1, n_features).
mfccs_scaled_reshaped=mfccs_scaled.reshape(1,-1)
print(mfccs_scaled_reshaped.shape)

# The network was trained on StandardScaler-transformed features, so the
# same fitted scaler must be applied before predicting. Feeding raw MFCC
# means gives meaningless predictions.
mfccs_standardized = scaler.transform(mfccs_scaled_reshaped)
predicted_label=model.predict(mfccs_standardized)

# Get the index of the class with the highest probability
predicted_index = np.argmax(predicted_label, axis=1)

# Map the integer class index back to its emotion name.
prediction_class=labelencoder.inverse_transform(predicted_index)
print(f"Predicted Emotion: {prediction_class[0]}")

[-3.7622406e+02  5.6681504e+00 -4.9888969e+01 -1.2137798e+01
 -1.6262897e+01 -2.2261307e+01 -1.8829643e+01 -3.7612352e+00
 -4.0016875e+00  2.0488115e+01  1.1183810e+01  6.0422525e+00
  1.8064806e+00 -7.6966333e+00 -9.7748804e+00  8.4445751e-01
  2.2397048e+00  1.6709417e+00 -2.8776655e+00  3.3189039e+00
  8.8195601e+00 -3.1297016e+00 -6.6021667e+00  2.1168038e-01
 -2.1119630e+00 -1.3039175e-01  2.8393514e+00  4.9852858e+00
 -1.6064435e+00  4.8101797e+00  2.2546568e+00  1.3394228e+00
  2.3413353e+00  6.7924485e+00  1.5943357e+00  5.1745687e+00
  7.7707714e-01  1.5902046e+00  1.7126160e+00  2.4909937e+00]
(1, 40)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Predicted Emotion: calm
