### Installation

In [1]:
!pip install librosa soundfile numpy sklearn pyaudio



In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.0-cp310-cp310-win_amd64.whl (10.4 MB)
     --------------------------------------- 10.4/10.4 MB 11.1 MB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-1.5.0


In [3]:
import warnings
warnings.filterwarnings('ignore')

#### Imports

In [4]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Preparing Data

Functions to add augmentation and extract Features from existing dataset

In [5]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def extract_features(data,sample_rate):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally
    

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally
    

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally
   

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally
    

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data,sample_rate)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data,sample_rate)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch,sample_rate)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [60]:
X,Y=[],[]

#emotions in dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
i = 0
for file in glob.glob("C:\\Users\\rohit\\Downloads\\ravdessAudio\\Actor_*\\*.wav"):
    file_name=os.path.basename(file)
    i+=1
    # Emotion From File 
    emotion=emotions[file_name.split("-")[2]]
    
    # Feature Extraction
    features = get_features(file)
    for ele in features:
        X.append(ele)
        Y.append(emotion)
print(i)

1440


In [61]:
Y[:10]

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral']

In [62]:
Dataframe_x = pd.DataFrame(X)
Dataframe_y = pd.DataFrame(Y)

In [63]:
Dataframe_x.shape



(4320, 162)

In [64]:
Dataframe_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,0.283226,0.739174,0.76011,0.738748,0.712194,0.687986,0.667279,0.694,0.742275,0.762594,...,3.81248e-06,4.312694e-06,3.293364e-06,2.149553e-06,2.260914e-06,4.56252e-06,4.985617e-06,1.324619e-06,1.469305e-07,3.053402e-09
1,0.311126,0.77946,0.818881,0.796993,0.801266,0.81057,0.715827,0.666192,0.712326,0.74605,...,3.835843e-05,3.811369e-05,3.850856e-05,3.554462e-05,3.45282e-05,3.813979e-05,3.840054e-05,3.431616e-05,3.306224e-05,3.247374e-05
2,0.181467,0.629995,0.730929,0.743651,0.680889,0.670727,0.6737,0.669033,0.709932,0.745144,...,7.15189e-07,8.582505e-07,9.570447e-07,7.74154e-07,5.212421e-07,3.252106e-07,4.883136e-07,2.35367e-07,2.240905e-08,9.753157e-11
3,0.25453,0.676571,0.72497,0.725722,0.668644,0.645344,0.67147,0.631493,0.681748,0.709889,...,7.128941e-06,6.987414e-06,7.038922e-06,6.659573e-06,6.937638e-06,1.117492e-05,5.767251e-06,2.024577e-06,1.944936e-07,2.70124e-09
4,0.28271,0.719157,0.769756,0.78052,0.746777,0.739917,0.741489,0.645501,0.673915,0.711552,...,1.394095e-05,1.415216e-05,1.450559e-05,1.451675e-05,1.433405e-05,1.846004e-05,1.333356e-05,9.180705e-06,7.212535e-06,7.446048e-06


In [65]:
encoder = OneHotEncoder()
encoded_Y = encoder.fit_transform(np.array(Dataframe_y).reshape(-1,1)).toarray()


In [66]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)

In [67]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Support Vector Machine

In [14]:
from sklearn.svm  import SVC
svm_model = SVC(kernel='linear',C=1,probability=True).fit(x_train,y_train)
svm_predictions = svm_model.predict(x_test)

In [38]:
svm_predictions

array(['surprised', 'sad', 'angry', ..., 'surprised', 'fearful', 'sad'],
      dtype='<U9')

In [15]:
print(accuracy_score(y_true=y_test,y_pred=svm_predictions))

0.5666666666666667


In [16]:
svm_model.predict_proba(x_test)

array([[0.02342421, 0.00132087, 0.01372079, ..., 0.00221445, 0.0507681 ,
        0.52996794],
       [0.10162292, 0.00814391, 0.02371009, ..., 0.02132989, 0.53016156,
        0.03219339],
       [0.46700087, 0.00146471, 0.02407404, ..., 0.00172739, 0.0126197 ,
        0.03542008],
       ...,
       [0.03339453, 0.00118936, 0.21605118, ..., 0.00553866, 0.0924504 ,
        0.56466461],
       [0.05566125, 0.03180724, 0.14081782, ..., 0.01501602, 0.04826041,
        0.01898504],
       [0.00384309, 0.39806908, 0.00853966, ..., 0.12012176, 0.38535673,
        0.00397381]])

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2698483599.py, line 1)

In [17]:
test_predict = svm_model.predict(x_test)
# for x in range(len(y_test)):
#     print(y_test[x] +"->"+test_predict[x])
i = 0 
for x in range(len(y_test)):
    if y_test[x] == test_predict[x]:
        i+=1
print(i)
print(len(y_test))

612
1080


In [18]:
svm_model.classes_

array(['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad',
       'surprised'], dtype='<U9')

## Random Forest


In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
  
 # create regressor object 
classifier = RandomForestClassifier(n_estimators = 100, random_state = 0) 
  


In [69]:
# fit the regressor with x and y data 
classifier.fit(x_train, y_train)   



In [70]:
rt_predictions = classifier.predict(x_test) 

In [71]:
print(rt_predictions)

['fearful' 'sad' 'fearful' ... 'surprised' 'fearful' 'sad']


In [72]:
classifier.predict_proba(x_test)

array([[0.04, 0.  , 0.05, ..., 0.  , 0.02, 0.13],
       [0.1 , 0.12, 0.15, ..., 0.13, 0.23, 0.14],
       [0.18, 0.  , 0.04, ..., 0.  , 0.03, 0.02],
       ...,
       [0.07, 0.02, 0.2 , ..., 0.  , 0.07, 0.43],
       [0.09, 0.18, 0.12, ..., 0.02, 0.06, 0.16],
       [0.  , 0.13, 0.08, ..., 0.14, 0.38, 0.06]])

In [74]:
print(accuracy_score(y_true=y_test,y_pred=rt_predictions))
#print(classification_report(y_test,rt_predictions)) 
# creating a confusion matrix 
#print(confusion_matrix(y_test,rt_predictions) )

0.687037037037037
