### Installation

In [None]:
!pip install librosa soundfile numpy sklearn pyaudio

In [49]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.0-cp310-cp310-win_amd64.whl (10.4 MB)
     -------------------------------------- 10.4/10.4 MB 249.7 kB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-1.5.0


In [163]:
import warnings
warnings.filterwarnings('ignore')

#### Imports

In [149]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Preparing Data

Functions to add augmentation and extract Features from existing dataset

In [150]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def extract_features(data,sample_rate):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally
    

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally
    

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally
   

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally
    

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data,sample_rate)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data,sample_rate)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch,sample_rate)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [162]:
X,Y=[],[]

#emotions in dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
i = 0
for file in glob.glob("..\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav"):
    file_name=os.path.basename(file)
    i+=1
    # Emotion From File 
    emotion=emotions[file_name.split("-")[2]]
    
    # Feature Extraction
    features = get_features(file)
    for ele in features:
        X.append(ele)
        Y.append(emotion)
print(i)

  return librosa.effects.time_stretch(data, rate)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)


1440


In [164]:
Y[:10]

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral']

In [165]:
Dataframe_x = pd.DataFrame(X)
Dataframe_y = pd.DataFrame(Y)

In [166]:
Dataframe_x.shape



(4320, 162)

In [167]:
Dataframe_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,0.224306,0.66419,0.693038,0.66539,0.673116,0.696442,0.684787,0.706183,0.748844,0.782297,...,8.778047000000001e-17,6.888786000000001e-17,6.993841000000001e-17,7.601734000000001e-17,7.305700000000001e-17,6.833248e-17,7.254372000000001e-17,7.962737e-17,7.395194000000001e-17,7.358765000000001e-17
1,0.326063,0.797169,0.838089,0.818593,0.808306,0.821393,0.730495,0.671541,0.727748,0.766877,...,8.18899e-05,7.478722e-05,7.986107e-05,7.532304e-05,7.343673e-05,7.295405e-05,7.710532e-05,7.870138e-05,7.814987e-05,7.755212e-05
2,0.169383,0.579495,0.662042,0.674849,0.631609,0.623019,0.687452,0.671907,0.692694,0.712601,...,2.503228e-15,1.88516e-15,1.743947e-15,1.590055e-15,1.477827e-15,1.528373e-15,1.689925e-15,1.679742e-15,1.602365e-15,1.500745e-15
3,0.196533,0.652948,0.692924,0.664361,0.648762,0.686783,0.688136,0.68301,0.735986,0.759067,...,8.858628e-17,7.385022e-17,6.953341000000001e-17,7.445123000000001e-17,7.997667e-17,7.75453e-17,8.110750000000001e-17,7.647289000000001e-17,7.359221e-17,7.287654000000001e-17
4,0.281331,0.763708,0.77861,0.742306,0.747656,0.755848,0.648114,0.66803,0.709139,0.745,...,7.994031e-06,7.831942e-06,8.176215e-06,8.002242e-06,7.604738e-06,7.781833e-06,7.566834e-06,7.094236e-06,7.800415e-06,8.211715e-06


In [168]:
encoder = OneHotEncoder()
encoded_Y = encoder.fit_transform(np.array(Dataframe_y).reshape(-1,1)).toarray()


In [169]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)

In [170]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Support Vector Machine

In [178]:
from sklearn.svm  import SVC
svm_model = SVC(kernel='linear',C=1,probability=True).fit(x_train,y_train)
svm_predictions = svm_model.predict(x_test)

In [179]:
print(accuracy_score(y_true=y_test,y_pred=svm_predictions))

0.5601851851851852


In [212]:
svm_model.predict_proba(x_test)

array([[0.02225061, 0.00091374, 0.01273491, ..., 0.00196093, 0.04013307,
        0.58696896],
       [0.16783919, 0.01415094, 0.04466222, ..., 0.02567973, 0.35941953,
        0.03727406],
       [0.07239241, 0.00136246, 0.00339217, ..., 0.00168242, 0.00660219,
        0.01240351],
       ...,
       [0.05366456, 0.00257321, 0.28922011, ..., 0.01198721, 0.07280021,
        0.50482598],
       [0.03708848, 0.10965868, 0.08388042, ..., 0.02936245, 0.04135935,
        0.01483191],
       [0.007531  , 0.31793564, 0.00971219, ..., 0.1176742 , 0.39429721,
        0.01187161]])

'fearful'

In [216]:
test_predict = svm_model.predict(x_test)
# for x in range(len(y_test)):
#     print(y_test[x] +"->"+test_predict[x])
i = 0 
for x in range(len(y_test)):
    if y_test[x] == test_predict[x]:
        i+=1
print(i)
print(len(y_test))

605
1080


In [211]:
svm_model.classes_

array(['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad',
       'surprised'], dtype='<U9')