In [1]:
from keras.utils import np_utils
from speechemotionrecognition.mlmodel import NN, SVM, RF
from speechemotionrecognition.utilities import get_data, class_labels
import sys
from keras import Sequential
from keras.layers import LSTM as lstm, Dense, Dropout, Conv2D, Flatten, \
    BatchNormalization, Activation, MaxPooling2D
import numpy as np
import scipy.io.wavfile as wav
import os
import speechpy
from sklearn.model_selection import train_test_split








Using TensorFlow backend.


In [2]:
class_labels = ["Neutral", "Angry", "Happy", "Sad"]
mslen = 32000  


In [3]:
def read_wav(filename):
    """
    Read the wav file and return corresponding data
    :param filename: name of the file
    :return: return tuple containing sampling frequency and signal
    """
    return wav.read(filename)



In [4]:
def get_data(dataset_path, flatten=True, mfcc_len=39):
    """
    Read the files get the data perform the test-train split and return them to the caller
    :param dataset_path: path to the dataset folder
    :param mfcc_len: Number of mfcc features to take for each frame
    :param flatten: Boolean specifying whether to flatten the data or not
    :return: 4 arrays, x_train x_test y_train y_test
    """
    data = []
    labels = []
    max_fs = 0
    s = 0
    cnt = 0
    cur_dir = os.getcwd()
    #print('curdir', cur_dir)
    os.chdir(dataset_path)
    for i, directory in enumerate(class_labels):
        #print( "started reading folder", directory)
        os.chdir(directory)
        for filename in os.listdir('.'):
            fs, signal = read_wav(filename)
            #print((signal))
            #print(fs)
            max_fs = max(max_fs, fs)
            s_len = len(signal)
            #print(s_len)
            # pad the signals to have same size if lesser than required
            # else slice them
            if s_len < mslen:
                pad_len = mslen - s_len
                pad_rem = pad_len % 2
                pad_len /= 2
                p=int(pad_len)
                               
                
                signal = np.pad(signal, (p + pad_rem), 'constant', constant_values=0)
            else:
                pad_len = s_len - mslen
                pad_len /= 2
                p=int(pad_len)
                signal = signal[p:p + mslen]
                
            mfcc = speechpy.feature.mfcc(signal, fs, num_cepstral=mfcc_len)
            

            if flatten:
                # Flatten the data
                mfcc = mfcc.flatten()
            data.append(mfcc)
            labels.append(i)
            cnt += 1
        #print ("ended reading folder", directory)
        os.chdir('..')
    os.chdir(cur_dir)
    
    #x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    return  np.array(labels), np.array(data)


In [5]:
dataset_path = 'dataset'


In [6]:
labels, ds = get_data(dataset_path=dataset_path, flatten=False)
print(ds.shape)


nsamples, nx, ny = ds.shape
d2 = ds.reshape((nsamples,nx*ny))
print(d2.shape)
x_train, x_test, y_train, y_test = train_test_split(d2, labels, test_size=0.2, random_state=42)

#y_train = np_utils.to_categorical(y_train)
#y_test = np_utils.to_categorical(y_test)


    



(339, 198, 39)
(339, 7722)


In [7]:
from sklearn.svm import SVC
svm=SVC(kernel="linear",C=0.05,random_state=101)
svm.fit(x_train,y_train)
y_pred6=svm.predict(x_test)
print("svm \n")
print("accuracy")
print(svm.score(x_test,y_test))



svm 

accuracy
0.8088235294117647


In [8]:
k=(svm.n_support_)
print(k)

[56 89 63 41]


In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

bnb = BernoulliNB(binarize=0.10)
bnb.fit(x_train, y_train)
y_pred4 = bnb.predict(x_test)

print("naive baiyes\n")
print("accuracy")
print(bnb.score(x_test,y_test))
#print("\n")


naive baiyes

accuracy
0.6617647058823529


In [10]:
x_train, x_test, y_train, y_test = train_test_split(ds, labels, test_size=0.2, random_state=42)
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

print(x_train.shape)

model= Sequential()
model.add(Conv2D(64, (17,4),input_shape=(x_train.shape[1],x_train.shape[2],1)))
model.add((Activation('relu')))
model.add(MaxPooling2D(pool_size=(10, 2)))


model.add(Conv2D(32, (3, 3),))
model.add((Activation('relu')))
model.add(MaxPooling2D(pool_size=(2, 2)))


model.add(Flatten())
model.add(Dense(32))

model.add(Dense(4))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


           

(271, 198, 39)
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 182, 36, 64)       4416      
_________________________________________________________________
activation_1 (Activation)    (None, 182, 36, 64)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 18, 18, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 32)        18464     
_________________________________________________________________
activation_2 (Activation)    (None, 16, 16, 32)        0         
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 8, 8, 32)          0         
_______________________________________________________

In [11]:
print(x_train.shape)
print(x_test.shape)


(271, 198, 39)
(68, 198, 39)


In [12]:
x_train = x_train.reshape(x_train.shape[0],x_train.shape[1],x_train.shape[2],1)
x_test=x_test.reshape(x_test.shape[0],x_test.shape[1],x_test.shape[2],1)




In [13]:
print(x_train.shape)
print(x_test.shape)


(271, 198, 39, 1)
(68, 198, 39, 1)


In [14]:
model.fit(x_train,y_train,batch_size=32,epochs=10,validation_split=0.065)


Instructions for updating:
Use tf.cast instead.
Train on 253 samples, validate on 18 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c3303beb8>

In [44]:
loss, acc = model.evaluate(x_test, y_test)

print(loss)
print(acc)

0.323242338265
0.860294117647


In [45]:
best_acc = 0
for i in range(50):
    # Shuffle the data for each epoch in unison inspired from https://stackoverflow.com/a/4602224
    p = np.random.permutation(len(x_train))
    x_train = x_train[p]
    y_train = y_train[p]
    model.fit(x_train, y_train, batch_size=32, epochs=1)
    loss, acc = model.evaluate(x_test, y_test)
    if acc > best_acc:
        best_acc = acc
    print(i)    
trained = True



Epoch 1/1
0
Epoch 1/1
1
Epoch 1/1
2
Epoch 1/1
3
Epoch 1/1
4
Epoch 1/1
5
Epoch 1/1
6
Epoch 1/1
7
Epoch 1/1
8
Epoch 1/1
9
Epoch 1/1
10
Epoch 1/1
11
Epoch 1/1
12
Epoch 1/1
13
Epoch 1/1
14
Epoch 1/1
15
Epoch 1/1
16
Epoch 1/1
17
Epoch 1/1
18
Epoch 1/1
19
Epoch 1/1
20
Epoch 1/1
21
Epoch 1/1
22
Epoch 1/1
23
Epoch 1/1
24
Epoch 1/1
25
Epoch 1/1
26
Epoch 1/1
27
Epoch 1/1
28
Epoch 1/1
29
Epoch 1/1
30
Epoch 1/1
31
Epoch 1/1
32
Epoch 1/1
33
Epoch 1/1
34
Epoch 1/1
35
Epoch 1/1
36
Epoch 1/1
37
Epoch 1/1
38
Epoch 1/1
39
Epoch 1/1
40
Epoch 1/1
41
Epoch 1/1
42
Epoch 1/1
43
Epoch 1/1
44
Epoch 1/1
45
Epoch 1/1
46
Epoch 1/1
47
Epoch 1/1
48
Epoch 1/1
49


In [46]:
print(best_acc)

0.933823529412


In [51]:
import librosa    
filename="C:/Users/vaibhav singh/Desktop/emotion/speech-emotion-recognition-master/angry.wav"
filename2="C:/Users/vaibhav singh/Desktop/emotion/speech-emotion-recognition-master/happy.wav"


In [61]:
def get_data_fx(file):
    data=[]
    max_fs=0
    signal, fs = librosa.load(file, sr=16000)
    max_fs = max(max_fs, fs)
    s_len = len(signal)
    if s_len < mslen:
        pad_len = mslen - s_len
        pad_rem = pad_len % 2
        pad_len /= 2
        p=int(pad_len)
        signal = np.pad(signal, (p + pad_rem), 'constant', constant_values=0)
    else:
        pad_len = s_len - mslen
        pad_len /= 2
        p=int(pad_len)
        signal = signal[p:p + mslen]
    mfcc = speechpy.feature.mfcc(signal, fs, num_cepstral=39)
    print(mfcc.shape)
    data.append(mfcc)
    x_trainn=np.array(data)
    x_trainn = x_trainn.reshape(x_trainn.shape[0],x_trainn.shape[1],x_trainn.shape[2],1)
    pr=model.predict(x_trainn).tolist()
    print(pr)


    

    


In [62]:
get_data_fx(filename)
get_data_fx(filename2)


(198, 39)
[[1.9956779340191133e-08, 0.9933685660362244, 0.006631484720855951, 2.1043168985102056e-08]]
(198, 39)
[[0.9999994039535522, 3.0564244468678226e-08, 1.3592926961791818e-07, 5.186241196497576e-07]]


In [136]:
x_train, x_test, y_train, y_test = train_test_split(ds, labels, test_size=0.2, random_state=42)
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

model2 = Sequential()
model2.add(lstm(128, input_shape=(198,39)))
model2.add(Dropout(0.5))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(16, activation='tanh'))
model2.add(Dense(4, activation='softmax'))       
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())







_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (None, 128)               86016     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 32)                4128      
_________________________________________________________________
dense_35 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_36 (Dense)             (None, 4)                 68        
Total params: 90,740
Trainable params: 90,740
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model2.fit(x_train,y_train,batch_size=32,epochs=10,validation_split=0.065)
