### Installation

In [1]:
!pip install librosa soundfile numpy sklearn pyaudio



In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.0-cp310-cp310-win_amd64.whl (10.4 MB)
     --------------------------------------- 10.4/10.4 MB 11.1 MB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-1.5.0


In [3]:
!pip install keras tensorflow

In [4]:
!pip install pickle-mixin

Collecting pickle-mixin
  Downloading pickle-mixin-1.0.2.tar.gz (5.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pickle-mixin
  Building wheel for pickle-mixin (setup.py): started
  Building wheel for pickle-mixin (setup.py): finished with status 'done'
  Created wheel for pickle-mixin: filename=pickle_mixin-1.0.2-py3-none-any.whl size=6008 sha256=b45be5bd227e8dc006f24f476e5b96c36edaf2c32e86490ec1328f81dfa5ccf4
  Stored in directory: c:\users\born creative\appdata\local\pip\cache\wheels\3e\c6\e9\d1b0a34e1efc6c3ec9c086623972c6de6317faddb2af0a619c
Successfully built pickle-mixin
Installing collected packages: pickle-mixin
Successfully installed pickle-mixin-1.0.2


#### Imports

In [5]:
import librosa
import soundfile
import os, glob, pickle, sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pickle

#imports for cnn
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint


import warnings
# Silence every warning category, but only when the interpreter was started without
# explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is hidden unconditionally, regardless of the check above.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### Preparing Data

Functions to augment the audio and extract features from the existing dataset.

In [6]:
def noise(data):
    """Return a copy of ``data`` with random Gaussian noise mixed in.

    The noise amplitude is a random fraction (below 3.5%) of the largest
    sample value in ``data``. The input array itself is not modified.
    """
    peak = np.amax(data)
    amplitude = 0.035 * np.random.uniform() * peak
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch an audio signal with librosa.

    Args:
        data: audio time series to stretch.
        rate: stretch factor handed to ``librosa.effects.time_stretch``
            (per librosa's docs, values below 1 slow the signal down).

    Returns:
        The stretched audio time series.
    """
    # `rate` is keyword-only in librosa >= 0.10; passing it positionally raises
    # TypeError there. The keyword form is accepted by older releases as well.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly rotate ``data`` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer; samples pushed past one end wrap around to the other.
    """
    offset = int(1000 * np.random.uniform(low=-5, high=5))
    rolled = np.roll(data, offset)
    return rolled

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal with librosa.

    Args:
        data: audio time series to shift.
        sampling_rate: sampling rate of ``data``.
        pitch_factor: forwarded as ``n_steps`` to ``librosa.effects.pitch_shift``,
            i.e. the (possibly fractional) number of steps to shift by.

    Returns:
        The pitch-shifted audio time series.
    """
    # `sr` and `n_steps` are keyword-only in librosa >= 0.10; the positional call
    # raises TypeError there. Keywords are accepted by older releases as well.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

def extract_features(data,sample_rate):
    """Build one flat feature vector for an audio clip.

    Five librosa features are computed, each averaged over its second axis,
    and concatenated in this order: zero-crossing rate, chroma (from the STFT
    magnitude), MFCC, root-mean-square energy, mel spectrogram.

    Args:
        data: audio time series.
        sample_rate: sampling rate of ``data``.

    Returns:
        A 1-D float64 array holding all averaged features side by side.
    """
    def frame_mean(matrix):
        # Average each feature row across the matrix's second axis.
        return np.mean(matrix.T, axis=0)

    zcr = frame_mean(librosa.feature.zero_crossing_rate(y=data))

    # Chroma is derived from the magnitude of the short-time Fourier transform.
    magnitude = np.abs(librosa.stft(data))
    chroma = frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    mfcc = frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate))
    rms = frame_mean(librosa.feature.rms(y=data))
    mel = frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty array is float64, which keeps the stacked result float64
    # even when every individual feature comes back as float32.
    return np.hstack((np.array([]), zcr, chroma, mfcc, rms, mel))

def get_features(path):
    """Load one audio file and return feature rows for three variants of it.

    Rows, in order: the unmodified clip, the clip with added noise, and the
    clip after time-stretching followed by pitch-shifting.

    Args:
        path: location of the audio file to load.

    Returns:
        A 2-D array with three rows, one feature vector per variant.
    """
    # Only a 2.5 s window starting 0.6 s into the file is loaded; the original
    # author's intent is to skip the silence at the start and end of each clip.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    plain_row = extract_features(data, sample_rate)
    noisy_row = extract_features(noise(data), sample_rate)
    stretched_and_pitched = pitch(stretch(data), sample_rate)
    augmented_row = extract_features(stretched_and_pitched, sample_rate)

    return np.vstack((plain_row, noisy_row, augmented_row))

In [9]:
# Feature rows (X) and their matching emotion labels (Y), filled by the loop below.
X,Y=[],[]

# Emotion codes used in the dataset's file names, mapped to readable labels.
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

# Dataset root. Set the RAVDESS_DIR environment variable to run on another machine;
# without it the original author's local path is used.
ravdess_dir = os.environ.get(
    "RAVDESS_DIR",
    "F:\\01 Code\\01 Projects\\SpeechEmotionRecognitionApp\\ser_model\\speech-emotion-recognition-ravdess-data",
)

# glob returns files in a filesystem-dependent order; sorting keeps the row order
# (and therefore the seeded train/test split later on) reproducible.
wav_paths = sorted(glob.glob(os.path.join(ravdess_dir, "Actor_*", "*.wav")))

for file in wav_paths:
    file_name = os.path.basename(file)
    # The third dash-separated field of the file name is the emotion code.
    emotion = emotions[file_name.split("-")[2]]

    # get_features returns several rows per file (original plus augmented variants);
    # every row gets the same label.
    features = get_features(file)
    X.extend(features)
    Y.extend([emotion] * len(features))

# Number of audio files processed.
i = len(wav_paths)
print(i)

1440


In [10]:
# Peek at the first ten labels collected by the loading loop.
Y[:10]

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral']

In [11]:
# Wrap the feature rows and the labels in DataFrames for inspection and label encoding.
Dataframe_x = pd.DataFrame(X)
Dataframe_y = pd.DataFrame(Y)

In [12]:
# (rows, feature columns) of the feature table.
Dataframe_x.shape



(4320, 162)

In [13]:
# Preview the first feature rows.
Dataframe_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,0.224306,0.66419,0.693038,0.66539,0.673116,0.696442,0.684787,0.706183,0.748844,0.782297,...,8.778047000000001e-17,6.888786000000001e-17,6.993841000000001e-17,7.601734000000001e-17,7.305700000000001e-17,6.833248e-17,7.254372000000001e-17,7.962737e-17,7.395194000000001e-17,7.358765000000001e-17
1,0.2931,0.733478,0.800266,0.78663,0.782051,0.796719,0.733336,0.673699,0.707605,0.742666,...,1.055862e-05,1.116067e-05,1.099605e-05,1.075941e-05,1.087752e-05,1.097741e-05,1.106519e-05,1.124121e-05,1.107555e-05,1.155873e-05
2,0.169383,0.579495,0.662042,0.674849,0.631609,0.623019,0.687452,0.671907,0.692694,0.712601,...,2.503228e-15,1.88516e-15,1.743947e-15,1.590055e-15,1.477827e-15,1.528373e-15,1.689925e-15,1.679742e-15,1.602365e-15,1.500745e-15
3,0.196533,0.652948,0.692924,0.664361,0.648762,0.686783,0.688136,0.68301,0.735986,0.759067,...,8.858628e-17,7.385022e-17,6.953341000000001e-17,7.445123000000001e-17,7.997667e-17,7.75453e-17,8.110750000000001e-17,7.647289000000001e-17,7.359221e-17,7.287654000000001e-17
4,0.302043,0.758702,0.816364,0.80183,0.758661,0.770253,0.740508,0.65049,0.692977,0.726915,...,3.317626e-05,3.241854e-05,3.268253e-05,3.434911e-05,3.221454e-05,3.269329e-05,3.364182e-05,3.396521e-05,3.20878e-05,3.358697e-05


In [14]:
# One-hot encode the emotion labels and convert the result to a dense array.
# NOTE(review): encoded_Y is not consumed by the train/test split or by the scikit-learn
# models in the visible cells (they use the string labels in Y) — presumably intended for
# the Keras model; confirm it is still needed.
encoder = OneHotEncoder()
encoded_Y = encoder.fit_transform(np.array(Dataframe_y).reshape(-1,1)).toarray()


In [15]:
# Seeded, shuffled split of the feature rows and string labels (default test fraction).
# NOTE(review): X holds three rows per audio file (the original plus two augmented
# variants, see get_features) and this split is row-wise, so variants of the same clip can
# end up in both the training and the test set. That likely inflates the accuracies
# reported below — consider splitting by file before augmenting.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)

In [16]:
# Fit the scaler on the training rows only, then apply the same transform to the test rows.
# x_train / x_test are overwritten here, so re-running this cell on its own would scale
# data that has already been scaled; re-run the split cell first.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [25]:
# Persist the fitted scaler to disk with pickle.
filename = 'std_scaler.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() left the handle open.
with open(filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Support Vector Machine

In [17]:
from sklearn.svm import SVC

# Linear-kernel SVM; probability=True makes predict_proba available on the fitted model.
svm_model = SVC(kernel='linear', C=1, probability=True)
svm_model.fit(x_train, y_train)

# Predicted emotion label for every test row.
svm_predictions = svm_model.predict(x_test)

In [18]:
# Display the predicted labels.
svm_predictions

array(['surprised', 'sad', 'fearful', ..., 'surprised', 'happy', 'sad'],
      dtype='<U9')

In [19]:
# Accuracy of the SVM predictions against the test labels.
print(accuracy_score(y_true=y_test,y_pred=svm_predictions))

0.5740740740740741


In [20]:
# Per-class probability estimates for every test row (one column per class).
svm_model.predict_proba(x_test)

array([[0.02368857, 0.00108656, 0.01761809, ..., 0.00254003, 0.04819453,
        0.56481911],
       [0.14663265, 0.01412265, 0.05230928, ..., 0.0221292 , 0.39429783,
        0.0384749 ],
       [0.17757695, 0.00194729, 0.01283811, ..., 0.00243276, 0.00660411,
        0.0524984 ],
       ...,
       [0.04182418, 0.00341243, 0.27626148, ..., 0.01675358, 0.10557688,
        0.47280359],
       [0.0300042 , 0.18224874, 0.07555156, ..., 0.02477436, 0.03890998,
        0.02182713],
       [0.00903113, 0.30398158, 0.01377796, ..., 0.12682741, 0.41299421,
        0.01351434]])

In [21]:
# Cross-check the accuracy above by counting exact label matches by hand.
test_predict = svm_model.predict(x_test)

# Number of test rows whose predicted label equals the true label.
i = sum(1 for truth, guess in zip(y_test, test_predict) if truth == guess)

print(i)
print(len(y_test))

620
1080


In [22]:
# Class labels known to the fitted SVM.
svm_model.classes_

array(['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad',
       'surprised'], dtype='<U9')

In [24]:
# Persist the trained SVM to disk with pickle.
filename = 'svm_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [None]:
# librosa.load expects one concrete file path and does not expand wildcards, so the
# pattern is resolved with glob first and the first match (sorted, for a deterministic
# choice) is loaded.
# NOTE(review): the intent appears to be loading a single sample clip — confirm.
sample_pattern = 'F:\\01 Code\\01 Projects\\SpeechEmotionRecognitionApp\\ser_model\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav'
sample_files = sorted(glob.glob(sample_pattern))
if not sample_files:
    raise FileNotFoundError("No .wav files match " + sample_pattern)
data, sr = librosa.load(sample_files[0])

## Random Forest


In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
  
# Create the random-forest classifier: 100 trees, with a fixed random_state.
classifier = RandomForestClassifier(n_estimators = 100, random_state = 0) 
  


In [69]:
# Fit the classifier on the training features and labels.
classifier.fit(x_train, y_train)   



In [70]:
# Predicted emotion label for every test row, from the random forest.
rt_predictions = classifier.predict(x_test) 

In [71]:
# Display the random-forest predictions.
print(rt_predictions)

['fearful' 'sad' 'fearful' ... 'surprised' 'fearful' 'sad']


In [72]:
# Per-class probability estimates for every test row (one column per class).
classifier.predict_proba(x_test)

array([[0.04, 0.  , 0.05, ..., 0.  , 0.02, 0.13],
       [0.1 , 0.12, 0.15, ..., 0.13, 0.23, 0.14],
       [0.18, 0.  , 0.04, ..., 0.  , 0.03, 0.02],
       ...,
       [0.07, 0.02, 0.2 , ..., 0.  , 0.07, 0.43],
       [0.09, 0.18, 0.12, ..., 0.02, 0.06, 0.16],
       [0.  , 0.13, 0.08, ..., 0.14, 0.38, 0.06]])

In [74]:
# Accuracy of the random-forest predictions against the test labels.
print(accuracy_score(y_true=y_test,y_pred=rt_predictions))
# Optional per-class diagnostics, currently disabled:
#print(classification_report(y_test,rt_predictions)) 
#print(confusion_matrix(y_test,rt_predictions) )

0.687037037037037
