In [1]:
import keras
from keras.datasets import mnist
from keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense
import numpy as np
from sklearn.model_selection import train_test_split

import librosa 
from librosa import feature
import soundfile
import os, glob, pickle
import re #split strings

from sklearn import preprocessing

# Two lists of librosa feature-extraction callables (from `librosa.feature`).
# NOTE(review): neither list is referenced by the cells visible in this
# notebook section — confirm they are still needed.
fn_list_i = [
 feature.chroma_stft,
 feature.spectral_centroid,
 feature.spectral_bandwidth,
 feature.spectral_rolloff
]
 
fn_list_ii = [
 feature.zero_crossing_rate
]
# feature.rmse was left disabled (kept only as a comment).

# Emotion names keyed by zero-based emotion index (0..7).
emotions = dict(enumerate((
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
)))

# Binary stress target, keyed by zero-based emotion index:
#   0 = not stressed -> calm (1), happy (2)
#   1 = stressed     -> sad (3), fearful (5)
stress_emotions = {
    **dict.fromkeys((1, 2), 0),
    **dict.fromkeys((3, 5), 1),
}

# Emotion indices that take part in the stress task: exactly the keys above.
selected_emotions = set(stress_emotions)

# Number of target classes: 2 (stressed / not stressed); 8 would be all emotions.
num_classes = 2

In [2]:
# Corpus preprocessing
def preprocessing_data(filename):
    """Load one audio file and return its normalised waveform.

    The file is decoded with ``librosa.load`` as mono audio at a sampling
    rate of 16000 Hz, then passed through ``preprocessing.normalize``
    (scikit-learn) with its default settings.

    Parameters
    ----------
    filename
        Path of the audio file to read.

    Returns
    -------
    list
        ``[signal, sample_rate]``: the normalised signal and the sampling
        rate reported by ``librosa.load``.
    """
    # Steps 1 and 2: mono down-mix and the 16 kHz sampling rate are both
    # requested from the loader.
    waveform, sample_rate = librosa.load(filename, mono=True, sr=16000)
    # Step 3: normalisation. The waveform is wrapped as the single row of a
    # 2-D input and unwrapped again by the one-element unpacking.
    (scaled_waveform,) = preprocessing.normalize([waveform])
    # TODO: further steps: framing, windowing, Voice Activity Detector (VAD),
    # noise reduction.
    return [scaled_waveform, sample_rate]


In [3]:
def TEO(signal):
    """Apply the discrete Teager Energy Operator (TEO) to a 1-D signal.

    For every interior sample the output is
    ``signal[i]**2 - signal[i + 1] * signal[i - 1]``. The first and last
    samples have no neighbour on one side and are copied through unchanged,
    so the output has the same length as the input.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of samples (list, tuple or ``np.ndarray``).

    Returns
    -------
    np.ndarray
        New array with the same length as ``signal``; the input is never
        modified.
    """
    samples = np.asarray(signal)
    # Start from a copy: the boundary samples pass through as-is and the
    # caller's array stays untouched.
    energy = samples.copy()
    # Inputs with fewer than three samples have no interior points.
    if samples.size > 2:
        # Vectorised form of the per-sample formula: one NumPy expression
        # instead of a Python-level loop over every audio sample.
        energy[1:-1] = samples[1:-1] ** 2 - samples[2:] * samples[:-2]
    return energy

In [4]:
def get_feature_vector(X_file_data,sample_rate):
    """Summarise one audio signal as a fixed-length 1-D feature vector.

    Several librosa feature matrices are computed, each is reduced along its
    second axis (the matrix is transposed and reduced over axis 0), and the
    per-feature statistics are concatenated in this order:

    1. MFCC (``n_mfcc=60``) mean
    2. MFCC variance
    3. MFCC standard deviation
    4. chroma mean (computed from the STFT magnitude)
    5. mel-spectrogram mean
    6. spectral-contrast mean (computed from the STFT magnitude)
    7. tonnetz mean (computed on the harmonic component of the signal)

    Parameters
    ----------
    X_file_data : np.ndarray
        Audio samples of one recording.
    sample_rate : int
        Sampling rate of ``X_file_data`` in Hz.

    Returns
    -------
    np.ndarray
        1-D float64 vector of concatenated statistics.
    """
    # Magnitude spectrogram shared by the chroma and spectral-contrast features.
    stft = np.abs(librosa.stft(X_file_data))

    # The MFCC matrix is computed once and reused for all three statistics.
    mfcc = librosa.feature.mfcc(y=X_file_data, sr=sample_rate, n_mfcc=60)
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    # The audio is passed as `y=`: recent librosa releases accept the signal
    # by keyword only.
    mel = librosa.feature.melspectrogram(y=X_file_data, sr=sample_rate)
    contrast = librosa.feature.spectral_contrast(S=stft, sr=sample_rate)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(X_file_data), sr=sample_rate)

    # Order matters: it defines the column layout of the feature matrix.
    parts = [
        np.mean(mfcc.T, axis=0),
        np.var(mfcc.T, axis=0),
        np.std(mfcc.T, axis=0),
        np.mean(chroma.T, axis=0),
        np.mean(mel.T, axis=0),
        np.mean(contrast.T, axis=0),
        np.mean(tonnetz.T, axis=0),
    ]

    # Always return float64, whatever dtype librosa produced for each part.
    return np.concatenate(parts).astype(np.float64, copy=False)

In [5]:
def load_data(test_size=0.2, data_glob="/media/yonel/730D-8298/DATASETS/RAVDESS/Actor_*/*.wav"):
    """Build the feature matrix and binary stress labels from the audio corpus.

    File names are expected to look like ``03-01-01-01-01-01-01.wav``: the
    third dash-separated field is the 1-based emotion code. Only files whose
    zero-based emotion index is in ``selected_emotions`` are processed, and
    their label is looked up in ``stress_emotions``.

    Parameters
    ----------
    test_size : float
        Unused by this function; kept so existing calls keep working.
    data_glob : str
        Glob pattern that selects the ``.wav`` files to load. Defaults to the
        location used so far, so calling ``load_data()`` is unchanged.

    Returns
    -------
    tuple of np.ndarray
        ``(x, y)``: one feature vector per selected file and the matching
        stress labels.

    Raises
    ------
    FileNotFoundError
        If ``data_glob`` matches no file (e.g. the dataset drive is missing).
    """
    # glob returns paths in arbitrary order; sorting makes the sample order,
    # and therefore any seeded split made from it, reproducible.
    paths = sorted(glob.glob(data_glob))
    if not paths:
        raise FileNotFoundError("No audio files matched pattern: %s" % data_glob)

    x,y=[],[]
    for path in paths:
        file_name = os.path.basename(path) #            03-01-01-01-01-01-01.wav
        print('Progress:  %s' % (file_name), end='\r')
        file_name_no_type = file_name.split(".")[0] #   03-01-01-01-01-01-01
        list_emotions = file_name_no_type.split("-") #  ['03', '01', '01', '01', '01', '01', '01']
        # Shift the emotion code so it runs from 0 to 7 (8 emotions).
        emotion = int(list_emotions[2])-1

        if emotion not in selected_emotions:
            continue

        label = stress_emotions[emotion]
        signal, sample_rate = preprocessing_data(path)
        # Named `features` so the imported `feature` module is not shadowed.
        features = get_feature_vector(signal, sample_rate)
        x.append(features)
        y.append(label)

    return np.array(x), np.array(y)

In [6]:
# Scaler instance; it is fitted on X_raw in the normalisation cell.
min_max_scaler = preprocessing.MinMaxScaler()
# Extract the feature matrix and the stress labels for the selected files.
X_raw,Y_raw = load_data()
# Blank lines so the shapes are printed below the '\r'-terminated progress line.
print("\n")
print("X_data:", X_raw.shape, "Y_data:",  Y_raw.shape)

Progress:  03-01-08-02-02-02-24.wav

X_data: (768, 333) Y_data: (768,)


### Normalización de datos

In [78]:
# Labels are kept as integer class ids; the one-hot alternative
# to_categorical(Y_raw, num_classes) is left disabled.
Y = Y_raw
# Rescale the features with the MinMaxScaler created earlier.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split made further down, so test-set statistics leak into
# training — consider fitting it on the training portion only.
X = min_max_scaler.fit_transform(X_raw)
print("X_data:", X.shape, "Y_data:",  Y.shape)

X_data: (768, 333) Y_data: (768,)


### Reducción de dimensionalidad con PCA

In [79]:
from sklearn.decomposition import PCA
# Project the scaled features onto 250 principal components.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn may choose its randomized SVD solver for this data shape and
# component count; without a seed the projection, and every score computed
# from it, can change from run to run.
# NOTE(review): this cell overwrites X, so run it exactly once after the
# scaling cell; re-running it applies PCA to already-projected data.
X =  PCA(n_components=250, random_state=10).fit_transform(X)
print("X_data:", X.shape, "Y_data:",  Y.shape)

X_data: (768, 250) Y_data: (768,)


In [80]:
# Hold out 30% of the samples for evaluation; random_state=10 seeds the
# shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=10) 
print("Train:", X_train.shape, Y_train.shape, "val & Test:", X_test.shape, Y_test.shape)

Train: (537, 250) (537,) val & Test: (231, 250) (231,)


In [81]:
# SVM classifier. The kernel used here is Gaussian (RBF), not linear.
from sklearn.svm import SVC
clf_SVC = SVC(kernel='rbf') # Gaussian (RBF) kernel; no other hyper-parameter is set here

# Fit on the training split.
clf_SVC.fit(X_train,Y_train)

SVC()

In [82]:
# clf_SVC was already fitted on (X_train, Y_train) in the previous cell, so it
# is only scored here instead of being trained a second time.
# The messages say "RBF" because clf_SVC is built with kernel='rbf'.
print('Accuracy of RBF SVC on training set: {:.2f}'.format(clf_SVC.score(X_train, Y_train)))
print('Accuracy of RBF SVC on test set: {:.2f}'.format(clf_SVC.score(X_test, Y_test)))

Accuracy of linear SVC on training set: 0.93
Accuracy of linear SVC on test set: 0.81


### Tuning

In [83]:
#tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np

# Candidate values for the SVC regularisation parameter C:
# 100 points evenly spaced on a base-2 log scale, from 2**0 up to 2**10.
c_SVC = np.logspace(0, 10, 100, base=2, dtype='float64')
print('the generated array of c values')
print(c_SVC)
# Parameter grid keyed by the estimator argument name.
param_grid_S = dict(C=c_SVC)

the generated array of c values
[1.00000000e+00 1.07252413e+00 1.15030800e+00 1.23373308e+00
 1.32320850e+00 1.41917304e+00 1.52209732e+00 1.63248610e+00
 1.75088073e+00 1.87786182e+00 2.01405211e+00 2.16011948e+00
 2.31678026e+00 2.48480272e+00 2.66501086e+00 2.85828845e+00
 3.06558332e+00 3.28791207e+00 3.52636502e+00 3.78211156e+00
 4.05640590e+00 4.35059319e+00 4.66611616e+00 5.00452215e+00
 5.36747075e+00 5.75674188e+00 6.17424455e+00 6.62202624e+00
 7.10228290e+00 7.61736977e+00 8.16981285e+00 8.76232139e+00
 9.39780109e+00 1.00793684e+01 1.08103658e+01 1.15943781e+01
 1.24352503e+01 1.33371059e+01 1.43043679e+01 1.53417796e+01
 1.64544288e+01 1.76477719e+01 1.89276611e+01 2.03003732e+01
 2.17726400e+01 2.33516817e+01 2.50452420e+01 2.68616263e+01
 2.88097423e+01 3.08991436e+01 3.31400770e+01 3.55435321e+01
 3.81212958e+01 4.08860094e+01 4.38512315e+01 4.70315038e+01
 5.04424225e+01 5.41007151e+01 5.80243221e+01 6.22324854e+01
 6.67458420e+01 7.15865259e+01 7.67782761e+01 8.23465

In [84]:
print("\n Array of means \n")
# Cross-validated (cv=20) accuracy for every C in the grid. fit() returns the
# search object itself, so construction and fitting are chained.
clf = GridSearchCV(
    estimator=clf_SVC,
    param_grid=param_grid_S,
    scoring='accuracy',
    cv=20,
).fit(X_train, Y_train)
# Per-candidate mean and standard deviation of the validation score.
means, stds = (clf.cv_results_[key] for key in ('mean_test_score', 'std_test_score'))
print(means)


 Array of means 

[0.75769231 0.75769231 0.7707265  0.77635328 0.78190883 0.78746439
 0.78561254 0.78753561 0.78938746 0.79309117 0.79878917 0.79878917
 0.80249288 0.80804843 0.80619658 0.81182336 0.81189459 0.81189459
 0.81374644 0.81189459 0.81566952 0.81566952 0.81381766 0.81381766
 0.81574074 0.81574074 0.81574074 0.81574074 0.81574074 0.81574074
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259 0.81759259
 0.81759259 0.81759259 0.81759259 0.8175925

In [85]:
# Evaluate the fitted grid search on the held-out split.
y_true = Y_test
y_pred = clf.predict(X_test)
print( '\nClassification report\n' )
# Per-class precision, recall and F1.
print(classification_report(y_true=y_true, y_pred=y_pred))


Classification report

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       114
           1       0.86      0.85      0.85       117

    accuracy                           0.85       231
   macro avg       0.85      0.85      0.85       231
weighted avg       0.85      0.85      0.85       231

