In [25]:
# importing required packages

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import numpy as np
import soundfile
import librosa
import pickle
import glob
import os

In [10]:
# defining a function to extract mfcc, chroma and mel features from sound file

def extract_feature(file_name, mfcc, chroma, mel):
    # reading the soundfile
    
    with soundfile.SoundFile(file_name) as sound_file:
        sound = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        
        if chroma:
            # grabbing Short-Time Fourier Transform of sound
            stft = np.abs(librosa.stft(sound))
        result = np.array([])
        
        #mfcc: Mel Frequency Cepstral Coefficient, represents the short-term power spectrum of a sound
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=sound, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        
        # chroma: Pertains to the 12 different pitch classes
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        
        # mel: Mel Spectrogram Frequency
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(sound, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    
    #hstack(): stacks arrays in sequence horizontally - column fashion
    return result

In [11]:
# let's define a dictionary to hold numbers and emotions available in RAVDESS dataset
# and also a list to hold the emotions we want

emotions = {

    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"

}

# emotions we will observe
observed_emotions = ["calm", "happy", "fearful", "disgust", "sad"]

In [12]:
# let's load the dataset and extract features from each sound file

def load_data(test_size=0.2):
    
    features = []
    emotions_ = []
    
    # we will grab all the audio files using below command and pattern with glob
    for file in glob.glob("D:\\adity\\Projects\\speech_emotion_recognition\\ravdess-data\\Actor_*\\*.wav"):
        
        file_name = os.path.basename(file)
        # since the 3rd value in audio file name signifies emotion in the file
        emotion = emotions[file_name.split("-")[2]]
        
        if emotion not in observed_emotions:
             continue
        
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        features.append(feature)
        emotions_.append(emotion)
        
    return train_test_split(np.array(features), emotions_, test_size=test_size, random_state=9)

In [45]:
# splitting the dataset
x_train, x_test, y_train, y_test = load_data(test_size=0.2)

print((x_train.shape[0], x_test.shape[0]))

(614, 154)


In [46]:
# Number of features extracted
print("[INFO] Features extracted: {}".format(x_train.shape[1]))

[INFO] Features extracted: 180


In [47]:
# let's initialize our model MLPClassifier aka Multi-layer Perceptron Classifier
# This is a feedforward ANN model

print("[INFO] Initializing MLPClassifier model...")
model = MLPClassifier(alpha=0.1, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate="adaptive", max_iter=500)

[INFO] Initializing MLPClassifier model...


In [48]:
# let's begin training

#print("[INFO] The training begins...")
model.fit(x_train, y_train)
#print("[INFO] The training terminated...")

MLPClassifier(activation='relu', alpha=0.1, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [49]:
# let's predict for test set

print("[INFO] Predictions begin...")
y_preds = model.predict(x_test)

[INFO] Predictions begin...


In [50]:
# calculating model accuracy

accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)
print("[INFO] Accuracy: {:.2f}%".format(accuracy * 100))

[INFO] Accuracy: 74.68%


In [51]:
# let's save our model using joblib
import joblib

print("[INFO] Saving model to disk...")
joblib.dump(model, "spe_motion_model.sav")

[INFO] Saving model to disk...


['spe_motion_model.sav']

In [None]:
#NOTE :: Don't re run the entire script if you simply want to test the model. Run cells below from here. Or else the saved model will be over-written by the new one... If you want to do that then you may..

In [None]:
# NOTE: for loading this trained model use joblib.load(model_name)

In [13]:
# let's it on an unknown sample
import joblib

print("[INFO] Loading model for predictions...")
load_model = joblib.load("spe_motion_model.sav")

[INFO] Loading model for predictions...


In [14]:
# Loading the sample audio .. you can use argparse to get input's location from user ... Use your own sample audio by placing in project directory and have fun...

print("[INFO] Loading the sample audio...")
audio = "audio.wav"
feature = extract_feature(audio, mfcc=True, chroma=True, mel=True)

[INFO] Loading the sample audio...


In [18]:
print("[INFO] Evaluating predictions...")
pred = load_model.predict(feature.reshape(1, -1))

[INFO] Evaluating predictions...


In [24]:
# let's display the result

print("[INFO] Emotion for given Audio: {}".format(pred[0]))

[INFO] Emotion for given Audio: fearful
