In [1]:
import librosa 
from librosa import display
import pylab as plt
import os
import pandas as pd
import glob
import numpy as np
from numpy import savetxt
from numpy import asarray
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Build a metadata table with one row per RAVDESS recording.
# File stems are seven dash-separated numeric fields; this notebook uses field 3
# (index 2) as the emotion code and field 7 (index 6) as the actor id.
path = '../RAVDESS'

# Emotion code -> readable label.
EMOTION_NAMES = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                 5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}

emotion = []
emotion_code = []
gender = []
actor = []
file_path = []
for subdir, dirs, files in os.walk(path):
    for file in files:
        stem, ext = os.path.splitext(file)
        part = stem.split('-')
        # Skip anything that is not a well-formed recording name (hidden files,
        # notes, ...) instead of crashing while parsing it.
        if ext.lower() != '.wav' or len(part) != 7 or not all(p.isdigit() for p in part):
            continue
        code = int(part[2])
        actor_id = int(part[6])
        emotion.append(code)
        emotion_code.append(code)
        actor.append(actor_id)
        # Even actor ids are labelled female, odd ids male.
        gender.append("female" if actor_id % 2 == 0 else "male")
        # os.walk descends into sub-folders, so the path must be built from the
        # directory actually being visited, not from the dataset root.
        file_path.append(os.path.join(subdir, file))

audio_df = pd.DataFrame({
    'gender': gender,
    'emotion_code': emotion_code,
    # Codes without a known label are kept as-is.
    'emotion': [EMOTION_NAMES.get(code, code) for code in emotion],
    'actor': actor,
    'path': file_path,
})

In [3]:
# Compute one dB-scaled mel spectrogram per recording.
# Each clip is loaded at 44.1 kHz, skipping the first 0.5 s and keeping at most 3 s.
mel_spectrograms = []
for audio_path in audio_df.path:  # distinct name so the dataset-root `path` is not clobbered
    X, sample_rate = librosa.load(audio_path, res_type='kaiser_fast', duration=3, sr=44100, offset=0.5)

    # Mel-scaled power spectrogram (128 mel bands, frequencies capped at 8 kHz),
    # then converted from power to decibels.
    spectrogram = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128, fmax=8000)
    mel_spectrograms.append(librosa.power_to_db(spectrogram))

# Collect the arrays in a list and build the column once; growing a DataFrame
# row by row re-copies it on every insertion.
df = pd.DataFrame({'mel_spectrogram': pd.Series(mel_spectrograms, index=audio_df.index)})

audio_df['mel_spectrogram'] = df['mel_spectrogram']
print(audio_df.head())
print(audio_df.shape)

In [4]:
# `df2` is a second name for the same DataFrame object as `audio_df` (no copy is made here).
df2 = audio_df

In [5]:
# Keep only spectrograms with the expected number of time frames so that every
# sample has the same shape; clips with any other width are dropped.
EXPECTED_FRAMES = 259

# Iterate over whatever rows are present instead of a hard-coded row count, so
# the cell works for any dataset size and can be re-run safely.
has_expected_width = np.array(
    [spec.shape[1] == EXPECTED_FRAMES for spec in df2['mel_spectrogram']],
    dtype=bool,
)
to_drop = df2.index[~has_expected_width].tolist()

df2 = df2.drop(to_drop)
print(df2.shape)

In [6]:
# Split into train/test (80/20) and turn the spectrogram column into dense arrays.
SEED = 42  # fixed seed so the split is reproducible across kernel restarts
train, test = train_test_split(df2, test_size=0.2, random_state=SEED)

# Select columns by name rather than by position so a column reorder cannot
# silently pick the wrong feature or label.
temp_X_train = train['mel_spectrogram']
temp_X_test = test['mel_spectrogram']

y_train = train['emotion_code']
y_test = test['emotion_code']

# Stack the per-sample 2-D spectrograms into one 3-D array per split.
# np.stack raises if the shapes disagree instead of building a ragged array.
X_train = np.stack(list(temp_X_train))
X_test = np.stack(list(temp_X_test))


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [7]:
# Baseline: a classifier that ignores the features, used as a reference score
# for the CNN trained below.
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train)
# Mean accuracy of the baseline on the held-out split.
print(dummy_clf.score(X_test, y_test))

In [24]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# IPython shell escape (not plain Python): runs the `nvidia-smi` command-line tool.
!nvidia-smi

In [9]:
# Convert the NumPy/pandas data to tensors.
# torch.as_tensor accepts both arrays and tensors, so re-running this cell is
# harmless (torch.from_numpy raises when handed a tensor). The dtypes are made
# explicit: float32 features for the convolution layers, int64 class labels
# for CrossEntropyLoss.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(np.asarray(y_train), dtype=torch.long)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(np.asarray(y_test), dtype=torch.long)

In [10]:
# Report the shapes of the training features and labels.
print(X_train.shape)
print(y_train.shape)

# Shift the labels down by one so the emotion codes become zero-based class indices.
# NOTE(review): running this cell more than once shifts the labels again.
y_train = y_train.sub(1)
y_test = y_test.sub(1)

In [11]:
# Sanity check: largest training label and the number of test labels.
print(y_train.max())
print(y_test.size())

In [12]:
# Global mean over every element of the training set; used to normalize network inputs.
mean = torch.mean(X_train)

print(mean)

In [13]:
# Global standard deviation over every element of the training set; used to normalize network inputs.
std = torch.std(X_train)

print(std)

In [29]:
# Input geometry: each sample is one (N_MELS x N_FRAMES) dB mel spectrogram,
# fed to the network as a single-channel image.
N_MELS = 128
N_FRAMES = 259
CONV2_CHANNELS = 100
NUM_CLASSES = 8

# Each of the two 2x2 max-pools halves both spatial dims (rounding down), so the
# flattened feature size is channels * (128 // 4) * (259 // 4) = 100 * 32 * 64 = 204800.
FLAT_FEATURES = CONV2_CHANNELS * (N_MELS // 4) * (N_FRAMES // 4)

# Two conv/pool stages followed by a two-layer classifier head.
net = nn.Sequential(
        nn.Conv2d(1,   50,  kernel_size=3,  padding=1 ),
        nn.BatchNorm2d(50),
        nn.ReLU(inplace = True),
        nn.MaxPool2d(2,2),

        nn.Conv2d(50, CONV2_CHANNELS,  kernel_size=3,  padding=1 ),
        nn.Dropout(0.5),
        nn.ReLU(inplace = True),
        nn.MaxPool2d(2,2),
        nn.Flatten(start_dim=1),
        nn.Linear(FLAT_FEATURES, 100),
        nn.ReLU(inplace=True),
        nn.Linear(100, NUM_CLASSES),
)
net = net.to(device)

# Normalization statistics must live on the same device as the minibatches.
mean=mean.to(device)

std=std.to(device)
criterion = nn.CrossEntropyLoss()

# Initial learning rate and minibatch size used by the training/evaluation cells.
my_lr=0.001

bs= 10

In [33]:
def get_error( scores , labels ):
    """Return the misclassification rate of one minibatch.

    Args:
        scores: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of true class indices, one per row of ``scores``.

    Returns:
        A 0-dim float tensor: 1 - (correct predictions / batch size).
    """
    bs = scores.size(0)
    # The predicted class is the index of the highest score in each row.
    predicted_labels = scores.argmax(dim=1)
    # No per-batch printing here: this runs once per minibatch, so a debug
    # print would flood the notebook output during training.
    num_matches = (predicted_labels == labels).sum()

    return 1 - num_matches.float() / bs

In [45]:
def eval_on_test_set():
    """Return the misclassification rate of ``net`` over the whole test set.

    Reads the notebook globals ``X_test``, ``y_test``, ``net``, ``bs``,
    ``device``, ``mean`` and ``std``. The network is put in evaluation mode for
    the duration of the call (so Dropout is disabled and BatchNorm uses its
    running statistics) and its previous mode is restored afterwards.

    Returns:
        float: fraction of test samples that were misclassified.
    """
    # Use the real test-set size rather than a hard-coded count.
    num_test = len(X_test)
    misclassified = 0.0

    was_training = net.training
    net.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for i in range(0, num_test, bs):

                minibatch_data = X_test[i:i+bs].unsqueeze(dim=1)
                minibatch_label = y_test[i:i+bs]

                minibatch_data = minibatch_data.to(device)
                minibatch_label = minibatch_label.to(device)

                # Same normalization as used for training.
                inputs = (minibatch_data - mean)/std

                scores = net( inputs )

                error = get_error( scores , minibatch_label)

                # Weight each batch by its size so a short final batch does not
                # count as much as a full one.
                misclassified += error.item() * minibatch_label.size(0)
    finally:
        net.train(was_training)

    total_error = misclassified / num_test
    return total_error

In [30]:
# Train the network and record the per-epoch train/test error rates.
training_error = []
testing_error = []

# Use the real training-set size rather than a hard-coded count.
num_train = len(X_train)

# Plain SGD keeps no state between steps, so one optimizer whose learning rate
# is updated in place is equivalent to rebuilding it every epoch.
optimizer = torch.optim.SGD( net.parameters() , lr=my_lr )

for epoch in range(1,60):

    # Divide the learning rate by 1.5 every 5th epoch.
    if not epoch%5:
        my_lr = my_lr / 1.5
    for param_group in optimizer.param_groups:
        param_group['lr'] = my_lr

    # Make sure Dropout/BatchNorm are in training mode for this epoch.
    net.train()

    running_loss = 0
    running_error = 0
    num_samples = 0

    # Visit the training samples in a fresh random order each epoch.
    shuffled_indices = torch.randperm(num_train)

    for count in range(0, num_train, bs):

        optimizer.zero_grad()

        indices = shuffled_indices[count:count+bs]
        # Add the channel dimension expected by Conv2d.
        minibatch_data = X_train[indices].unsqueeze(dim=1)
        minibatch_label = y_train[indices]

        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)

        # Normalize with the training-set statistics.
        inputs = (minibatch_data - mean)/std

        scores = net( inputs )

        loss = criterion( scores , minibatch_label)

        loss.backward()

        optimizer.step()

        # Weight by batch size so a short final batch is averaged correctly.
        batch_size = minibatch_label.size(0)
        running_loss += loss.item() * batch_size

        error = get_error( scores.detach() , minibatch_label)
        running_error += error.item() * batch_size

        num_samples += batch_size

    # Checkpoint the weights on odd-numbered epochs.
    if epoch % 2 == 1:
        torch.save(net.state_dict(), f'./{epoch}.pth')

    total_loss = running_loss/num_samples
    total_error = running_error/num_samples

    print('epoch=',epoch,'\t lr=', my_lr  ,'\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
    testing_e = eval_on_test_set()
    print( 'error rate on test set =', testing_e*100 ,'percent')
    print(' ')
    training_error.append(total_error)
    testing_error.append(testing_e)

In [86]:
# Plot the per-epoch error rates. The recorded values are error rates (not
# losses), and the first recorded value belongs to epoch 1.
epochs = range(1, len(training_error) + 1)
plt.plot(epochs, training_error, color='darkorange', label='Training error')
plt.plot(epochs, testing_error, color='navajowhite', label='Test error')
plt.xlabel("Training epoch")
plt.ylabel("Error rate")
plt.title("Error rate per training epoch")
plt.legend()
# Best (lowest) test error reached in any epoch.
print(np.asarray(testing_error).min())

In [91]:
def eval_on_test_set():
    """Run ``net`` on the first test sample only and return its error (0.0 or 1.0).

    This definition replaces the earlier full-test-set ``eval_on_test_set`` in
    the notebook namespace. It reads the globals ``X_test``, ``y_test``,
    ``net``, ``device``, ``mean`` and ``std`` and delegates scoring to
    ``get_error``. The network is put in evaluation mode for the call, so
    Dropout does not randomize the prediction, and its previous mode is
    restored afterwards.

    Returns:
        float: error rate over the single evaluated sample.
    """
    bs = 1
    running_error = 0
    num_batches = 0

    was_training = net.training
    net.eval()
    try:
        # No gradients are needed for inference.
        with torch.no_grad():
            # A single iteration: only the sample at index 0 is evaluated.
            for i in range(0, 1, bs):

                minibatch_data = X_test[i:i+bs].unsqueeze(dim=1)
                minibatch_label = y_test[i:i+bs]

                minibatch_data = minibatch_data.to(device)
                minibatch_label = minibatch_label.to(device)

                # Same normalization as used for training.
                inputs = (minibatch_data - mean)/std

                scores = net( inputs )

                error = get_error( scores , minibatch_label)

                running_error += error.item()

                num_batches += 1
    finally:
        net.train(was_training)

    total_error = running_error/num_batches
    return total_error
def get_error( scores , labels ):
    """Return the batch error rate and plot the class probabilities of the first sample.

    This definition replaces the earlier ``get_error`` in the notebook namespace.

    Args:
        scores: 2-D tensor of raw per-class scores (logits), one row per sample.
        labels: 1-D tensor of true class indices, one per row of ``scores``.

    Returns:
        A 0-dim float tensor: 1 - (correct predictions / batch size).
    """
    # Class index k corresponds to emotion code k + 1, so the names must follow
    # the notebook's code order (1=neutral ... 8=surprise); any other order
    # attaches the wrong emotion to each plotted value.
    x = ["neutral", "calm", "happy", "sad", "angry", "fear", "disgust", "surprise"]

    bs = scores.size(0)

    predicted_labels = scores.argmax(dim=1)

    indicator = (predicted_labels == labels)
    num_matches = indicator.sum()

    # The network outputs logits; softmax turns them into the probabilities
    # that the axis label promises.
    y = torch.softmax(scores[0].detach(), dim=0).tolist()
    best = y.index(max(y))

    plt.scatter(x, y, 200)
    # Draw the most likely class again so it stands out in a second color.
    plt.scatter(x[best], y[best], 200)
    plt.xlabel("Emotions")
    plt.ylabel("Probability")
    plt.title("Predicted probability")

    return 1 - num_matches.float()/bs

# Evaluate the first test sample; the get_error defined in this cell also draws the score plot.
eval_on_test_set()