### Installation

In [1]:
!pip install librosa soundfile numpy sklearn pyaudio



In [2]:
!pip install pandas



In [3]:
!pip install keras tensorflow



In [4]:
!pip install pickle-mixin



#### Imports

In [6]:
import librosa
import soundfile
import os, glob, pickle, sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pickle

#imports for cnn
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint


import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### Preparing Data

Functions to add augmentation and extract Features from existing dataset

In [7]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def extract_features(data,sample_rate):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally
    

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally
    

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally
   

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally
    

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data,sample_rate)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data,sample_rate)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch,sample_rate)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [39]:
X,Y=[],[]

#emotions in dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
i = 0
for file in glob.glob("E:\\Code\\Projects\\SpeechEmotionRecognition\\ser_model\\Notebooks\\Ravdess\\Actor_*\\*.wav"):
    file_name=os.path.basename(file)
    i+=1
    # Emotion From File 
    emotion=emotions[file_name.split("-")[2]]
    
    # Feature Extraction
    features = get_features(file)
    for ele in features:
        X.append(ele)
        Y.append(emotion)
print(i)

1440


In [9]:
df = pd.read_csv('ravdess.csv');
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,153,154,155,156,157,158,159,160,161,0.1
0,0,0.224306,0.66419,0.693038,0.66539,0.673116,0.696442,0.684787,0.706183,0.748844,...,6.888786000000001e-17,6.993841000000001e-17,7.601734000000001e-17,7.305700000000001e-17,6.833248e-17,7.254372000000001e-17,7.962737e-17,7.395194000000001e-17,7.358765000000001e-17,neutral
1,1,0.284351,0.744315,0.799083,0.779027,0.777929,0.788401,0.70703,0.673835,0.716503,...,5.998793e-06,6.341882e-06,6.158174e-06,6.040658e-06,6.382624e-06,6.5906e-06,6.014139e-06,6.10716e-06,5.836667e-06,neutral
2,2,0.169383,0.579495,0.662042,0.674849,0.631609,0.623019,0.687452,0.671907,0.692694,...,1.88516e-15,1.743947e-15,1.590055e-15,1.477827e-15,1.528373e-15,1.689925e-15,1.679742e-15,1.602365e-15,1.500745e-15,neutral
3,3,0.196533,0.652948,0.692924,0.664361,0.648762,0.686783,0.688136,0.68301,0.735986,...,7.385022e-17,6.953341000000001e-17,7.445123000000001e-17,7.997667e-17,7.75453e-17,8.110750000000001e-17,7.647289000000001e-17,7.359221e-17,7.287654000000001e-17,neutral
4,4,0.301604,0.760249,0.804322,0.790674,0.760021,0.781235,0.71909,0.647474,0.713795,...,3.702991e-05,3.877967e-05,3.796563e-05,3.779248e-05,3.834673e-05,3.640969e-05,3.681771e-05,3.691485e-05,3.721758e-05,neutral


In [37]:
X = df.loc[:-1,]
Y = df.loc[-2:,]

In [40]:
X.shape
Y.shape

AttributeError: 'list' object has no attribute 'shape'

In [41]:
Dataframe_x = pd.DataFrame(X)
Dataframe_y = pd.DataFrame(Y)

In [42]:
Dataframe_x.shape



(4320, 162)

In [43]:
Dataframe_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,0.283226,0.739174,0.76011,0.738748,0.712194,0.687986,0.667279,0.694,0.742275,0.762594,...,3.812479e-06,4.312694e-06,3.293364e-06,2.149554e-06,2.260914e-06,4.56252e-06,4.985616e-06,1.324619e-06,1.469305e-07,3.053402e-09
1,0.300456,0.77349,0.810015,0.785742,0.792041,0.794837,0.698452,0.676887,0.717388,0.752915,...,2.034097e-05,2.171456e-05,2.113365e-05,1.914379e-05,1.984291e-05,2.242395e-05,2.293313e-05,1.922284e-05,1.701253e-05,1.705024e-05
2,0.181467,0.629995,0.730929,0.743651,0.680889,0.670727,0.6737,0.669033,0.709932,0.745144,...,7.15189e-07,8.582505e-07,9.570447e-07,7.74154e-07,5.212421e-07,3.252106e-07,4.883136e-07,2.35367e-07,2.240905e-08,9.753157e-11
3,0.25453,0.676571,0.72497,0.725722,0.668644,0.645344,0.67147,0.631493,0.681748,0.709889,...,7.128941e-06,6.987414e-06,7.038922e-06,6.659573e-06,6.937638e-06,1.117492e-05,5.767251e-06,2.024577e-06,1.944936e-07,2.70124e-09
4,0.330042,0.821314,0.833043,0.800141,0.802373,0.787649,0.668301,0.683149,0.732381,0.765758,...,0.0001429369,0.0001483695,0.000150982,0.0001511496,0.0001564441,0.0001654324,0.0001517133,0.000149424,0.0001497178,0.000150628


In [44]:
encoder = OneHotEncoder()
encoded_Y = encoder.fit_transform(np.array(Dataframe_y).reshape(-1,1)).toarray()


In [46]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)

In [47]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
filename = 'std_scaler.sav'
pickle.dump(scaler, open(filename, 'wb'))

## Support Vector Machine

In [48]:
from sklearn.svm  import SVC
svm_model = SVC(kernel='linear',C=1,probability=True).fit(x_train,y_train)
svm_predictions = svm_model.predict(x_test)

In [49]:
svm_predictions

array(['surprised', 'sad', 'angry', ..., 'surprised', 'happy', 'sad'],
      dtype='<U9')

In [50]:
print(accuracy_score(y_true=y_test,y_pred=svm_predictions))

0.5601851851851852


In [51]:
svm_model.predict_proba(x_test)

array([[0.0265221 , 0.00161056, 0.01857477, ..., 0.00290552, 0.05310171,
        0.50946981],
       [0.0850001 , 0.0071763 , 0.02606908, ..., 0.04127773, 0.57570557,
        0.03390906],
       [0.48494695, 0.00297846, 0.02992359, ..., 0.00351004, 0.00853511,
        0.03089058],
       ...,
       [0.03368501, 0.00222517, 0.19871574, ..., 0.01192587, 0.10788399,
        0.55196187],
       [0.03966366, 0.09570775, 0.11818455, ..., 0.02224944, 0.0478838 ,
        0.01273284],
       [0.00430238, 0.38269946, 0.01082627, ..., 0.13868054, 0.36127199,
        0.00528005]])

In [None]:
test_predict = svm_model.predict(x_test)
# for x in range(len(y_test)):
#     print(y_test[x] +"->"+test_predict[x])
i = 0 
for x in range(len(y_test)):
    if y_test[x] == test_predict[x]:
        i+=1
print(i)
print(len(y_test))

In [None]:
svm_model.classes_

In [None]:
filename = 'svm_model.sav'
pickle.dump(svm_model, open(filename, 'wb'))

In [None]:
data, sr = librosa.load('F:\\01 Code\\01 Projects\\SpeechEmotionRecognitionApp\\ser_model\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav')

## Random Forest


In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
  
 # create regressor object 
classifier = RandomForestClassifier(n_estimators = 100, random_state = 0) 
  


In [54]:
# fit the regressor with x and y data 
classifier.fit(x_train, y_train)   



In [55]:
rt_predictions = classifier.predict(x_test) 

In [56]:
print(rt_predictions)

['fearful' 'sad' 'fearful' ... 'surprised' 'calm' 'sad']


In [57]:
classifier.predict_proba(x_test)

array([[0.08, 0.  , 0.09, ..., 0.  , 0.04, 0.2 ],
       [0.14, 0.1 , 0.15, ..., 0.08, 0.21, 0.13],
       [0.05, 0.  , 0.01, ..., 0.  , 0.1 , 0.06],
       ...,
       [0.04, 0.03, 0.14, ..., 0.04, 0.07, 0.52],
       [0.09, 0.2 , 0.1 , ..., 0.04, 0.06, 0.15],
       [0.  , 0.19, 0.04, ..., 0.13, 0.43, 0.07]])

In [58]:
print(accuracy_score(y_true=y_test,y_pred=rt_predictions))
#print(classification_report(y_test,rt_predictions)) 
# creating a confusion matrix 
#print(confusion_matrix(y_test,rt_predictions) )



0.6722222222222223


In [59]:
filename = 'rf_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

### CNN

In [60]:
model=Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=8, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 162, 256)          1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 81, 256)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 81, 256)           327936    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 41, 256)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 41, 128)           163968    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 21, 128)          0

In [66]:
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

AttributeError: 'list' object has no attribute 'shape'

In [65]:
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
history=model.fit(x_train, y_train, batch_size=64, epochs=50, validation_data=(x_test, y_test), callbacks=[rlrp])

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'str'>"})