In [1]:
import librosa # used for sound and musical analysis
import soundfile # handling audio files
import os, glob # access files from folders, glob is used for acessing particular parsed files using filenames
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Feature Extraction
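We extract three types of features from each audio file: MFCCs, chroma, and the mel spectrogram. Each is averaged over time and the results are concatenated into a single feature vector.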

In [2]:
def extract_feature(file_name):
    """Build a fixed-length feature vector for one audio file.

    The file is read as float32 samples and three time-averaged descriptors
    are concatenated in this order: 40 MFCCs, the chroma features computed
    from the STFT magnitude, and the mel-spectrogram bands.

    Parameters
    ----------
    file_name : str or path-like
        Audio file readable by ``soundfile``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array holding the MFCC, chroma and mel means, in that order.
    """
    # Only the read needs the open handle; close the file before the heavy work.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns (frames, channels) for multi-channel audio, whereas
    # librosa expects the time axis last; average the channels down to mono.
    if X.ndim > 1:
        X = X.mean(axis=1)

    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    stft = np.abs(librosa.stft(y=X))
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # The audio must be passed as `y=`: recent librosa releases make it
    # keyword-only and raise TypeError on a positional argument.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Leading empty float64 array keeps the output dtype of the original
    # accumulator-based implementation (float64 rather than float32).
    return np.hstack((np.array([]), mfccs, chroma, mel))

In [3]:
# Emotion labels keyed by the two-digit code ('01' .. '08') that appears as the
# third dash-separated field of each audio file name.
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        ("neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"),
        start=1,
    )
}

# Only clips carrying one of these labels are kept when the dataset is loaded.
emotions_to_be_observed = ["happy", "sad", "angry"]

In [4]:
def load_data(test_size=0.2, data_glob=r"B:\Semesters\7th sem\BTP\Datasets\Emotion data\Actor_*\\*.wav"):
    """Load the audio dataset, extract features and split into train/test sets.

    Parameters
    ----------
    test_size : float
        Forwarded to ``train_test_split`` as the size of the test split.
    data_glob : str
        Glob pattern selecting the audio files. Defaults to the location used
        originally; pass another pattern to run the notebook elsewhere.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split`` (features as arrays, labels as lists of str).

    Raises
    ------
    ValueError
        If no file matching ``data_glob`` carries one of the labels in
        ``emotions_to_be_observed``.
    """
    x, y = [], []
    # glob returns files in an OS-dependent order; sorting makes the seeded
    # split below reproducible across machines and runs.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        # The third dash-separated field of the file name is the emotion code.
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in emotions_to_be_observed:
            continue
        x.append(extract_feature(file))
        y.append(emotion)
        # Progress line: zero-based index of the sample just added, and its label.
        print(len(y) - 1, emotion)

    if not x:
        raise ValueError(
            "No audio files with the selected emotions matched pattern: {!r}".format(data_glob)
        )

    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [5]:
# Extract features for the selected emotions and hold out 20% of them for testing.
x_train, x_test, y_train, y_test = load_data(test_size=0.2)

0 happy
1 happy
2 happy
3 happy
4 happy
5 happy
6 happy
7 happy
8 sad
9 sad
10 sad
11 sad
12 sad
13 sad
14 sad
15 sad
16 angry
17 angry
18 angry
19 angry
20 angry
21 angry
22 angry
23 angry
24 happy
25 happy
26 happy
27 happy
28 happy
29 happy
30 happy
31 happy
32 sad
33 sad
34 sad
35 sad
36 sad
37 sad
38 sad
39 sad
40 angry
41 angry
42 angry
43 angry
44 angry
45 angry
46 angry
47 angry
48 happy
49 happy
50 happy
51 happy
52 happy
53 happy
54 happy
55 happy
56 sad
57 sad
58 sad
59 sad
60 sad
61 sad
62 sad
63 sad
64 angry
65 angry
66 angry
67 angry
68 angry
69 angry
70 angry
71 angry
72 happy
73 happy
74 happy
75 happy
76 happy
77 happy
78 happy
79 happy
80 sad
81 sad
82 sad
83 sad
84 sad
85 sad
86 sad
87 sad
88 angry
89 angry
90 angry
91 angry
92 angry
93 angry
94 angry
95 angry
96 happy
97 happy
98 happy
99 happy
100 happy
101 happy
102 happy
103 happy
104 sad
105 sad
106 sad
107 sad
108 sad
109 sad
110 sad
111 sad
112 angry
113 angry
114 angry
115 angry
116 angry
117 angry
118 angry


In [7]:
# Number of samples in the training and test splits, as a (train, test) tuple.
print((len(x_train), len(x_test)))

(460, 116)


In [8]:
# Second dimension of the training matrix = length of each feature vector.
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


## Training using LDA Model

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Baseline classifier: LDA constructed with its default arguments.
model1 = LDA()

In [10]:
# Train the baseline model; the fitted estimator is the cell's displayed value.
model1.fit(X=x_train, y=y_train)

LinearDiscriminantAnalysis()

In [11]:
# Predicted emotion label for every held-out sample.
y_pred = model1.predict(X=x_test)

In [12]:
# Share of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 64.66%


Now we will fine-tune our model by changing the parameters of the LDA model.

In [13]:
# Variant 2: least-squares solver with shrinkage='auto'.
model2 = LDA(
    solver='lsqr',
    shrinkage='auto',
)

In [14]:
# Fit variant 2 and score it on the held-out split (fit() returns the estimator,
# so training and prediction can be chained).
y_pred = model2.fit(x_train, y_train).predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 68.97%


In [15]:
from sklearn.covariance import ShrunkCovariance
# Variant 3: eigen solver with a ShrunkCovariance estimator built with its
# default arguments.
model3 = LDA(
    solver='eigen',
    covariance_estimator=ShrunkCovariance(),
)

In [16]:
# Fit variant 3 and score it on the held-out split.
y_pred = model3.fit(x_train, y_train).predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 75.00%


In [17]:
# Variant 4: least-squares solver with a fixed shrinkage coefficient of 0.0025.
model4 = LDA(
    solver='lsqr',
    shrinkage=0.0025,
)

In [18]:
# Fit variant 4 and score it on the held-out split.
y_pred = model4.fit(x_train, y_train).predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 76.72%


In [19]:
import speech_recognition as sr
# Recognizer instance used below to capture audio from the microphone.
r = sr.Recognizer()

Let's check how well our model performs by recording our own live audio and predicting its emotion with our best-performing model.

In [22]:
# Capture one utterance; the microphone is only needed while listening.
with sr.Microphone() as source:
    print("Speak Anything :")
    test_rec = r.listen(source)

# The microphone has been released by now, so the slower disk write and
# inference below no longer keep the audio device open.
with open("test_rec.wav", "wb") as f:
    f.write(test_rec.get_wav_data())

# extract_feature returns a single vector; wrap it in a one-row array because
# predict() expects a batch of samples.
print(model4.predict(np.array([extract_feature("test_rec.wav")])))

Speak Anything :
['happy']
