# Load data

We import the data from Kaggle

In [11]:
from google.colab import files
!pip install -q kaggle

In [12]:
uploaded = files.upload()


Saving kaggle.json to kaggle.json


In [13]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [14]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Downloading ravdess-emotional-speech-audio.zip to /content
 97% 417M/429M [00:04<00:00, 83.8MB/s]
100% 429M/429M [00:04<00:00, 107MB/s] 


In [15]:
!apt-get install zipfile


Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package zipfile


In [16]:
from zipfile import ZipFile
file_name="ravdess-emotional-speech-audio.zip"
with ZipFile(file_name,'r')as zip : 
  zip.extractall()
  print ("done")

done


In [17]:
! kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess

Downloading toronto-emotional-speech-set-tess.zip to /content
 99% 424M/428M [00:04<00:00, 48.8MB/s]
100% 428M/428M [00:04<00:00, 90.8MB/s]


In [18]:
from zipfile import ZipFile
file_name="toronto-emotional-speech-set-tess.zip"
with ZipFile(file_name,'r')as zip : 
  zip.extractall()
  print ("done")

done


#Extract Feature

In [19]:
def extract_feature(file_name, **kwargs):
   
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        X = librosa.to_mono(X)
        sample_rate = 22050
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result

#  data preparation 

**Ravdess**

In [20]:
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split

# all emotions on RAVDESS dataset
int2emotion = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# we allow only these  four emotions
AVAILABLE_EMOTIONS = {
    "angry",
    "sad",
    "neutral",
    "surprised"
}

In [21]:
def load_data(test_size=0.2):
    X, y = [], []
    
    for file in glob.glob("Actor_*/*.wav"):
        # get the base name of the audio file
        basename = os.path.basename(file)
        print(basename)
        # get the emotion label
        emotion = int2emotion[basename.split("-")[2]]
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to data
        X.append(features)
        l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}
        y.append(l[emotion])
   
  # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [None]:
X_trainr, X_test, y_trainr, y_test = load_data(test_size=0.2)
# print some details
# number of samples in training data
print("[+] Number of training samples:", X_trainr.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", X_test.shape[0])
print("[+] Number of features:", X_trainr.shape[1])

In [13]:
type(y_trainr)

list



*   Number of training samples: 537
*   Number of testing samples: 135
*   Number of features: 180



In [14]:
import numpy as np
X_trainr = np.asarray(X_trainr)
y_trainr= np.asarray(y_trainr)
X_test=np.array(X_test)
y_test=np.array(y_test)

In [15]:
X_trainr.shape,y_trainr.shape,X_test.shape,y_test.shape

((537, 180), (537,), (135, 180), (135,))

**Tess**

In [22]:
#TESS LOAD DATA
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split

# all emotions on RAVDESS dataset
int2emotiontess = {
    "ne": "neutral",
   
    "ha": "happy",
    "sa": "sad",
    "an": "angry",
    "fe": "fearful",
    "di": "disgust",
    "ps": "surprised"
}

# we allow only these  four emotions
AVAILABLE_EMOTIONStess = {
    "angry",
    "sad",
    "neutral",
    "surprised"
}

In [23]:
def load_data_tess():
    X, y = [], []
    
    for file in glob.glob("TESS Toronto emotional speech set data/*/*.wav"):
        # get the base name of the audio file
        basename = os.path.basename(file)
        print(basename)
        # get the emotion label
        emotion = int2emotiontess[basename.split("_")[2][0:2]]
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONStess:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to data
        X.append(features)
        l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}
        y.append(l[emotion])
   
  # split the data to training and testing and return it
    return (np.array(X), y, )

In [None]:
import glob 
import os
X_traint, y_traint= load_data_tess()
# print some details
# number of samples in training data
print("[+] Number of training samples:", X_traint.shape[0])
# number of samples in testing data
print("[+] Number of features:", X_traint.shape[1])

In [19]:
import numpy as np
y_traint= np.asarray(y_traint)
print("[+] Number of testing samples:",y_traint.shape)


[+] Number of testing samples: (1600,)


In [20]:
print(X_traint.shape)
print(y_traint.shape)

(1600, 180)
(1600,)


**Ravdess+Tess**

In [21]:
import numpy as np 

X_train= np.concatenate((X_trainr, X_traint), axis=0)

y_train= np.concatenate((y_trainr, y_traint), axis=0)
X_train.shape,y_train.shape

((2137, 180), (2137,))

In [22]:
x_traincnn = np.expand_dims(X_train, axis=2)
x_testcnn = np.expand_dims(X_test, axis=2)

In [23]:
x_traincnn.shape,x_testcnn.shape


((2137, 180, 1), (135, 180, 1))

# Deep Learning Approach

**Training**

In [None]:
import keras
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint

model = Sequential()

model.add(Conv1D(128, 5,padding='same',input_shape=(180,1)))        
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=(8)))



model.add(Conv1D(128, 5,padding='same',))                           
model.add(Activation('relu'))
model.add(Dropout(0.1))

model.add(Flatten())
model.add(Dense(8))                                                 
model.add(Activation('softmax'))
opt = keras.optimizers.RMSprop(lr=0.00005, rho=0.9, epsilon=None, decay=0.0)

In [None]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping

cnnhistory=model.fit(x_traincnn, y_train, batch_size=20, epochs=500, validation_data=(x_testcnn, y_test),callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500


In [None]:

loss, acc = model.evaluate(x_testcnn, y_test)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 75.56%


Restored model, accuracy: 75.56%

**Evaluation**

In [None]:
y_pred=model.predict(x_testcnn)

In [None]:
y_pred=np.argmax(y_pred, axis=1)

In [None]:
y_pred

array([0, 4, 4, 1, 4, 1, 3, 3, 1, 1, 0, 0, 0, 4, 4, 4, 4, 1, 4, 1, 1, 1,
       1, 3, 0, 0, 4, 3, 3, 1, 4, 0, 4, 4, 3, 4, 4, 4, 0, 0, 0, 1, 4, 1,
       4, 3, 1, 3, 3, 3, 0, 0, 1, 4, 0, 1, 1, 3, 4, 4, 0, 0, 0, 4, 1, 1,
       1, 0, 0, 0, 3, 4, 1, 0, 0, 0, 0, 0, 3, 0, 4, 0, 1, 4, 4, 4, 4, 3,
       1, 3, 0, 0, 3, 1, 3, 1, 4, 1, 3, 3, 0, 4, 0, 4, 4, 0, 1, 4, 4, 4,
       4, 4, 0, 1, 1, 4, 0, 4, 3, 3, 0, 1, 4, 3, 4, 0, 1, 0, 4, 4, 0, 1,
       1, 3, 1])

In [None]:
y_test

array([0., 4., 4., 1., 4., 1., 3., 3., 1., 1., 0., 0., 0., 4., 4., 4., 4.,
       1., 4., 3., 1., 1., 1., 3., 0., 0., 4., 1., 3., 1., 4., 0., 4., 4.,
       3., 4., 4., 4., 0., 0., 3., 1., 4., 1., 4., 3., 1., 3., 3., 3., 0.,
       4., 1., 4., 0., 1., 1., 1., 4., 4., 0., 0., 0., 4., 1., 1., 1., 0.,
       0., 0., 3., 4., 1., 0., 0., 0., 0., 0., 3., 0., 4., 0., 1., 0., 1.,
       4., 4., 1., 1., 1., 0., 1., 3., 1., 4., 1., 4., 1., 0., 3., 0., 4.,
       0., 4., 4., 0., 1., 4., 4., 4., 4., 4., 0., 1., 1., 4., 0., 4., 0.,
       3., 0., 1., 4., 3., 4., 0., 1., 0., 4., 4., 0., 1., 3., 3., 1.])

In [None]:
import pandas as pd
confusion_matrix=pd.crosstab(y_test,y_pred,rownames=["Actual"],colnames=["Predicted"])
print(confusion_matrix)


Predicted   0   1   3   4
Actual                   
0.0        34   0   2   1
1.0         1  31   4   1
3.0         1   2  16   0
4.0         1   0   1  40


* **The accuracy** of the model is basically the total number of correct predictions divided by total number of predictions.
* **The precision** of a class define how trustable is the result when the model answer that a point belongs to that class.
* **The recall** of a class expresses how well the model is able to detect that class.
* **The F1** score of a class is given by the harmonic mean of precision and recall (2×precision×recall / (precision + recall)), it combines precision and recall of a class in one metric.

For a given class, the different combinations of recall and precision have the following meanings :
* **high recall + high precision** : the class is perfectly handled by the model
* **low recall + high precision** : the model can’t detect the class well but is highly trustable when it does
* **high recall + low precision** : the class is well detected but the model also include points of other classes in it
* **low recall + low precision** : the class is poorly handled by the model


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
        l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}


              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92        37
         1.0       0.94      0.84      0.89        37
         3.0       0.70      0.84      0.76        19
         4.0       0.95      0.95      0.95        42

    accuracy                           0.90       135
   macro avg       0.88      0.89      0.88       135
weighted avg       0.90      0.90      0.90       135



***The Classifier predictes well the emotion of surprise ,sadness ,anger  but it gets  confused when it comes to predict the neutral emotion  ***

# Machine Learning Approach 

Ravdess

In [None]:
X_trainr, X_test, y_trainr, y_test = load_data(test_size=0.2)
# print some details
# number of samples in training data
print("[+] Number of training samples:", X_trainr.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", X_test.shape[0])
print("[+] Number of features:", X_trainr.shape[1])

In [25]:
import numpy as np
X_trainr = np.asarray(X_trainr)
y_trainr= np.asarray(y_trainr)
X_test=np.array(X_test)
y_test=np.array(y_test)

In [26]:
X_trainr.shape,y_trainr.shape,X_test.shape,y_test.shape

((537, 180), (537,), (135, 180), (135,))

Tess

In [None]:
import glob 
import os
X_traint, y_traint= load_data_tess()
# print some details
# number of samples in training data
print("[+] Number of training samples:", X_traint.shape[0])
# number of samples in testing data
print("[+] Number of features:", X_traint.shape[1])

In [28]:
import numpy as np
y_traint= np.asarray(y_traint)
print("[+] Number of testing samples:",y_traint.shape)

[+] Number of testing samples: (1600,)


In [29]:
print(X_traint.shape)
print(y_traint.shape)

(1600, 180)
(1600,)


Ravdess+Tess

In [30]:
import numpy as np 

X_train= np.concatenate((X_trainr, X_traint), axis=0)

y_train= np.concatenate((y_trainr, y_traint), axis=0)
X_train.shape,y_train.shape

((2137, 180), (2137,))

**Support vector machine**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(X_train, y_train)


In [34]:
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [35]:
from sklearn.metrics import classification_report
grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

         0.0       0.67      0.70      0.68        37
         1.0       0.63      0.70      0.67        37
         3.0       0.73      0.58      0.65        19
         4.0       0.88      0.83      0.85        42

    accuracy                           0.73       135
   macro avg       0.73      0.70      0.71       135
weighted avg       0.73      0.73      0.73       135



**RandomForest**

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

model_rf = RandomForestClassifier(max_depth=8,n_estimators =350)
model_rf.fit(X_train,y_train)

parameters = { "max_depth": [3, 4,5, 7,8, 9, 11, 13],"n_estimators": [150,200,250,300,350]}
model_gr= GridSearchCV(model_rf,parameters,cv=5, scoring="accuracy")
model_gr.fit(X_train,y_train)
print('Best parameters', model_gr.best_params_)
print("accuracy score of the best_estimator",model_gr.best_score_)

Best parameters {'max_depth': 13, 'n_estimators': 150}
accuracy score of the best_estimator 0.8859791197005844


In [46]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pandas as pd

c_p = model_gr.predict(X_test) 

print(accuracy_score(y_true=y_test,y_pred=c_p))
print("***********************")

print(classification_report(y_test,c_p)) 
print("***********************")

# creating a confusion matrix 
confusion_matrix=pd.crosstab(y_test,c_p,rownames=["Actual"],colnames=["Predicted"])
print(confusion_matrix)

0.7333333333333333
***********************
              precision    recall  f1-score   support

         0.0       0.62      0.74      0.68        31
         1.0       0.70      0.83      0.76        46
         3.0       0.86      0.60      0.71        20
         4.0       0.87      0.68      0.76        38

    accuracy                           0.73       135
   macro avg       0.76      0.71      0.73       135
weighted avg       0.75      0.73      0.73       135

***********************
Predicted  0.0  1.0  3.0  4.0
Actual                       
0.0         23    5    1    2
1.0          5   38    1    2
3.0          0    8   12    0
4.0          9    3    0   26


**From Classification metrics we conclude that deep learning model does a better job in classifiying the emotions  we are going to use it as our final model**


#Demo

In [None]:
audioinput,sr=librosa.load("Actor_01/03-01-05-01-01-01-01.wav")
sr

22050

In [None]:
features = extract_feature("Actor_01/03-01-05-01-01-01-01.wav", mfcc=True, chroma=True, mel=True)
x=[]
x.append(features)
x=np.array(x)

In [None]:
 l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}
 x = np.expand_dims(x, axis=2)

res=model.predict(x)
 res=np.argmax(res, axis=1)
 key_list = list(l.keys())
val_list = list(l.values())
position = val_list.index(res[0])

In [None]:
print(key_list[position])


angry


In [None]:
from IPython.display import Audio
Audio("Actor_01/03-01-05-02-01-01-01.wav")

In [None]:
audio_input,sr=librosa.load("sample.wav",sr=22050)

In [None]:
features = extract_feature("sample.wav", mfcc=True, chroma=True, mel=True)
x=[]
x.append(features)
x=np.array(x)

  n_fft, y.shape[-1]


In [None]:
 l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}


In [None]:
x = np.expand_dims(x, axis=2)

res=model.predict(x)

In [None]:
 res=np.argmax(res, axis=1)

In [None]:
res

array([0])

In [None]:
key_list = list(l.keys())
val_list = list(l.values())
position = val_list.index(res[0])

In [None]:
print(key_list[position])


surprised


#Load Model

In [None]:
print(model.summary())


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 180, 128)          768       
_________________________________________________________________
activation (Activation)      (None, 180, 128)          0         
_________________________________________________________________
dropout (Dropout)            (None, 180, 128)          0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 22, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 22, 128)           82048     
_________________________________________________________________
activation_1 (Activation)    (None, 22, 128)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 22, 128)           0

In [None]:

model.save("emtion_recognition.h5")

In [None]:
from keras.models import load_model
emo_reco=load_model("emtion_recognition.h5")

In [None]:
import soundfile
import librosa
import numpy as np
features = extract_feature("sample.wav", mfcc=True, chroma=True, mel=True)
x=[]
x.append(features)
x=np.array(x)

  n_fft, y.shape[-1]


In [None]:
x = np.expand_dims(x, axis=2)

res=emo_reco.predict(x)

In [None]:
 l={'surprised':0.0,'sad':1.0,'neutral':3.0,'angry':4.0}
 res=np.argmax(res, axis=1)
key_list = list(l.keys())
val_list = list(l.values())
position = val_list.index(res[0])

In [None]:
print(key_list[position])


surprised


In [None]:
type(key_list[position])

str