In [1]:
import numpy as np
import os

In [2]:
import librosa
import soundfile
import glob

In [8]:
path = "./toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data"

In [14]:
directory = os.listdir(path)

In [15]:
directory

['OAF_sad',
 'YAF_disgust',
 'YAF_sad',
 'OAF_Pleasant_surprise',
 'YAF_angry',
 'YAF_happy',
 'OAF_neutral',
 'OAF_fear',
 'OAF_disgust',
 'YAF_neutral',
 'YAF_fear',
 'OAF_happy',
 'YAF_pleasant_surprise',
 'OAF_angry']

In [33]:
X  = []
audios = []
labels = []

In [34]:
for dir_name in directory : 
    emotion = (dir_name.split('_')[-1])
    path_for_audio = path + "/" + dir_name
    directory_audios=  os.listdir(path_for_audio)
    for audio in directory_audios : 
        audio_loc = path_for_audio + "/" +  audio
        x,sr = librosa.load(audio_loc)
        audios.append(x)
        labels.append(emotion)

    

In [35]:
labels=np.asarray(labels)

In [36]:
labels.shape

(2800,)

In [37]:
audios=np.asarray(audios)

In [38]:
audios.shape

(2800,)

In [40]:
# np.save('TESS_audios.npy', audios)
# np.save('TESS_labels.npy', labels)

In [42]:
audios = np.load('TESS_audios.npy', allow_pickle=True)
labels = np.load('TESS_labels.npy',allow_pickle=True)

In [47]:
emotion_dict = {
    0 : "angry",
    1 : "disgust",
    2 : "fear",
    3 : "happy",
    4 : "neutral",
    5 : "sad",
    6 : "surprise"
}

In [48]:
def extract_feature(audio, mfcc, chroma, mel):
    X = audio
    sample_rate = 22050
    if chroma:
        stft=np.abs(librosa.stft(X))
        result=np.array([])
    if mfcc:
        mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result=np.hstack((result, mfccs))
    if chroma:
        chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result=np.hstack((result, chroma))
    if mel:
        mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result=np.hstack((result, mel))
    return result

In [51]:
def load_data(test_size=0.2):
    X_new,Y_new=[],[]
    idx = 0;
    for audio in audios:
#         emotion = emotion_dict[labels[idx]]
#         print(emotion)
        idx +=1
#         if emotion not in observed_emotions:
#             continue

        feature=extract_feature(audio, mfcc=True, chroma=True, mel=True)
        X_new.append(feature)
#         Y_new.append(emotion)
        
    return (X_new,Y_new)

In [52]:
DATA = load_data()

In [58]:
X_new = np.array(DATA[0])
Y_new = np.array(labels)

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_validate

In [60]:
X_new_train, X_new_test, Y_new_train, Y_new_test  = train_test_split(X_new, Y_new,test_size=0.10,random_state=3)


In [61]:
print((X_new_train.shape[0], X_new_test.shape[0]))


(2520, 280)


In [62]:
model=MLPClassifier(alpha=0.01, batch_size=264, epsilon=1e-08,
                    hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=700, 
                    activation='tanh',learning_rate_init=0.001, verbose=True, early_stopping=True)


In [63]:
cvresults=cross_validate(model, X_new_train, Y_new_train, cv=10, return_train_score=True)

Iteration 1, loss = 2.06346126
Validation score: 0.246696
Iteration 2, loss = 1.48739203
Validation score: 0.748899
Iteration 3, loss = 1.08100292
Validation score: 0.788546
Iteration 4, loss = 0.82750438
Validation score: 0.854626
Iteration 5, loss = 0.64131445
Validation score: 0.903084
Iteration 6, loss = 0.49886492
Validation score: 0.973568
Iteration 7, loss = 0.38901661
Validation score: 0.982379
Iteration 8, loss = 0.29333309
Validation score: 0.977974
Iteration 9, loss = 0.21992714
Validation score: 1.000000
Iteration 10, loss = 0.16868391
Validation score: 1.000000
Iteration 11, loss = 0.12946237
Validation score: 0.995595
Iteration 12, loss = 0.10225903
Validation score: 1.000000
Iteration 13, loss = 0.07972999
Validation score: 1.000000
Iteration 14, loss = 0.06252932
Validation score: 1.000000
Iteration 15, loss = 0.04979532
Validation score: 0.995595
Iteration 16, loss = 0.03912992
Validation score: 1.000000
Iteration 17, loss = 0.03302605
Validation score: 1.000000
Iterat

Iteration 21, loss = 0.01771974
Validation score: 0.986784
Iteration 22, loss = 0.01665342
Validation score: 0.986784
Iteration 23, loss = 0.01397542
Validation score: 0.991189
Iteration 24, loss = 0.01248608
Validation score: 0.986784
Iteration 25, loss = 0.01094943
Validation score: 0.986784
Iteration 26, loss = 0.00988471
Validation score: 0.995595
Iteration 27, loss = 0.00933399
Validation score: 0.991189
Iteration 28, loss = 0.00844110
Validation score: 0.991189
Iteration 29, loss = 0.00881231
Validation score: 0.995595
Iteration 30, loss = 0.00745516
Validation score: 0.995595
Iteration 31, loss = 0.00699434
Validation score: 0.991189
Iteration 32, loss = 0.00660368
Validation score: 0.991189
Iteration 33, loss = 0.00627602
Validation score: 0.995595
Iteration 34, loss = 0.00606822
Validation score: 0.995595
Iteration 35, loss = 0.00585695
Validation score: 0.995595
Iteration 36, loss = 0.00554295
Validation score: 0.995595
Iteration 37, loss = 0.00530120
Validation score: 0.9955

Validation score: 0.748899
Iteration 3, loss = 1.07064698
Validation score: 0.819383
Iteration 4, loss = 0.84487729
Validation score: 0.828194
Iteration 5, loss = 0.66586812
Validation score: 0.881057
Iteration 6, loss = 0.52449303
Validation score: 0.933921
Iteration 7, loss = 0.40173725
Validation score: 0.973568
Iteration 8, loss = 0.30458858
Validation score: 0.977974
Iteration 9, loss = 0.23368815
Validation score: 0.986784
Iteration 10, loss = 0.17895399
Validation score: 0.986784
Iteration 11, loss = 0.13366063
Validation score: 0.982379
Iteration 12, loss = 0.09868352
Validation score: 0.991189
Iteration 13, loss = 0.07631197
Validation score: 0.991189
Iteration 14, loss = 0.06008609
Validation score: 0.991189
Iteration 15, loss = 0.04807556
Validation score: 0.991189
Iteration 16, loss = 0.03979966
Validation score: 0.991189
Iteration 17, loss = 0.03674605
Validation score: 0.991189
Iteration 18, loss = 0.02951703
Validation score: 0.991189
Iteration 19, loss = 0.02609328
Vali

In [75]:
cvresults

{'fit_time': array([2.43099666, 3.06758952, 3.89309216, 3.58353281, 3.91978693,
        3.25590992, 2.10922766, 2.96107459, 2.70141983, 2.64397192]),
 'score_time': array([0.00683379, 0.01501417, 0.02613616, 0.00625968, 0.01269102,
        0.00679255, 0.00597191, 0.0059135 , 0.00625181, 0.00534225]),
 'test_score': array([0.98832685, 0.99215686, 1.        , 1.        , 0.98804781,
        0.99203187, 0.98804781, 1.        , 0.988     , 0.992     ]),
 'train_score': array([0.98895272, 0.99779249, 1.        , 0.99955908, 0.99955928,
        0.99603349, 0.99647422, 1.        , 0.99779736, 1.        ])}

In [81]:
cvresults=cross_validate(model, X_new_test, cv=10)

TypeError: fit() missing 1 required positional argument: 'y'

In [70]:
# model.fit(X_new_train, Y_new_train)

Iteration 1, loss = 1.89715393
Validation score: 0.515873
Iteration 2, loss = 1.25315977
Validation score: 0.797619
Iteration 3, loss = 0.89023691
Validation score: 0.896825
Iteration 4, loss = 0.66762342
Validation score: 0.952381
Iteration 5, loss = 0.51851537
Validation score: 0.940476
Iteration 6, loss = 0.39837951
Validation score: 0.964286
Iteration 7, loss = 0.30572224
Validation score: 0.972222
Iteration 8, loss = 0.22984140
Validation score: 0.980159
Iteration 9, loss = 0.17808065
Validation score: 0.984127
Iteration 10, loss = 0.13961951
Validation score: 0.976190
Iteration 11, loss = 0.10894228
Validation score: 0.992063
Iteration 12, loss = 0.08178644
Validation score: 0.988095
Iteration 13, loss = 0.06403653
Validation score: 0.984127
Iteration 14, loss = 0.04894362
Validation score: 0.992063
Iteration 15, loss = 0.03818546
Validation score: 0.992063
Iteration 16, loss = 0.03084244
Validation score: 0.988095
Iteration 17, loss = 0.02502780
Validation score: 0.988095
Iterat

MLPClassifier(activation='tanh', alpha=0.01, batch_size=264, beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=700, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [71]:
Y_pred = model.predict(X_new_test)

In [72]:
cn = confusion_matrix(Y_pred,Y_new_test)

In [73]:
cn

array([[39,  0,  0,  0,  0,  0,  0],
       [ 0, 57,  0,  0,  0,  0,  0],
       [ 0,  0, 37,  0,  0,  0,  0],
       [ 0,  0,  0, 39,  0,  0,  0],
       [ 0,  0,  0,  0, 38,  0,  0],
       [ 0,  0,  0,  0,  0, 33,  0],
       [ 0,  1,  0,  0,  0,  0, 36]])