In [121]:
import librosa
import librosa.display
import glob
import torch
import torch.nn as nn
import os, json, random
import numpy as np


In [122]:
def normalize_audio(audio):
    """Scale each row of ``audio`` so that its largest absolute value is 1.

    Normalisation is done per row (per first-axis slice), not with a single
    global peak for the whole array.

    Args:
        audio: 2-D ``numpy`` array. It is modified in place.

    Returns:
        The same ``audio`` object that was passed in, after scaling.

    Notes:
        A row whose peak is exactly 0 (all zeros) is left unchanged. Dividing
        it by its peak would evaluate 0/0 and fill the row with NaN, which
        would then propagate through every downstream computation.
    """
    for ind, row in enumerate(audio):
        peak = np.max(np.abs(row))
        # `!= 0` (rather than `> 0`) keeps the previous behaviour for rows that
        # already contain NaN: they still come out as all-NaN.
        if peak != 0:
            audio[ind] = row / peak
    return audio


In [123]:
# --- Feature extraction -------------------------------------------------
# Note: despite its name, `file_name` holds the *directory* that is searched.
file_name = 'Test_Voices/' # folder dir
file_ext = '*.wav'
n_mfcc = 32      # number of MFCC coefficients computed per frame
samples = []     # list of (clip name, normalised MFCC matrix) tuples
longest = 0      # largest frame count seen so far; used later for padding

# glob returns paths in arbitrary, OS-dependent order; sorting makes the order
# of `samples` (and therefore of the written predictions) reproducible.
for fn in sorted(glob.glob(os.path.join(file_name, file_ext))):
    sound_clip, s = librosa.load(fn)
    mfcc = librosa.feature.mfcc(y=sound_clip, sr=s, n_mfcc=n_mfcc)
    # Transpose so that rows are frames and columns are the n_mfcc
    # coefficients, then normalise every frame independently.
    mfcc = normalize_audio(mfcc.T)
    # os.path.basename honours the platform's path separator; splitting on "/"
    # returned the whole path on Windows. Everything after the first "." is
    # still dropped, exactly as before.
    name = os.path.basename(fn).split(".")[0]
    longest = max(longest, mfcc.shape[0])
    samples.append((name, mfcc))


In [124]:
# Split `samples` into three parallel lists:
#   names - clip identifiers
#   sizes - true (unpadded) number of frames of each clip
#   mfccs - feature matrices, zero-padded at the end up to `longest` frames
names = [clip_name for clip_name, _ in samples]
sizes = [features.shape[0] for _, features in samples]
mfccs = [
    np.concatenate(
        (features, np.zeros((longest - features.shape[0], n_mfcc))),
        axis=0,
    )
    for _, features in samples
]

In [125]:
# `as_tensor` accepts both the Python list built above and an already
# converted tensor, so re-running this cell does not copy-construct a tensor
# from a tensor (which emits a warning).
sizes = torch.as_tensor(sizes)
# Merge the list of equally-shaped arrays into one ndarray first: handing
# torch.tensor a Python list of ndarrays takes a very slow conversion path.
tensor = torch.tensor(np.array(mfccs))

In [126]:
def gather_columns(y_out, x_lengths):
    """Pick, for every batch item, the vector at its last valid position.

    For batch item ``b`` the result row is ``y_out[b, x_lengths[b] - 1]``.

    Args:
        y_out: tensor indexed as ``[batch, position, ...]``.
        x_lengths: 1-D tensor holding the 1-based length of each batch item.
            Values are converted to integers with ``.long()``.

    Returns:
        Tensor whose first dimension has ``len(x_lengths)`` entries, one
        selected vector per batch item.

    Raises:
        ValueError: if any length is smaller than 1. Such a value used to wrap
            around to index ``-1`` and silently return the final position.
        IndexError: if a length exceeds the position dimension of ``y_out``.
    """
    lengths = x_lengths.long().detach()
    if lengths.numel() and int(lengths.min()) < 1:
        raise ValueError("x_lengths must contain only values >= 1")

    # Advanced indexing selects all rows at once, replacing the Python loop
    # and the tensor -> numpy round trip.
    last_positions = (lengths - 1).to(y_out.device)
    batch_indices = torch.arange(last_positions.shape[0], device=y_out.device)
    return y_out[batch_indices, last_positions]


class RNN(nn.Module):
    """GRU encoder followed by a two-layer classification head.

    Args:
        input_size: number of features per time step.
        num_classes: size of the output vector.
        hidden_size: GRU hidden-state size.
        dropout_rate: dropout probability applied after the first linear layer.
        batch_first: layout flag forwarded to ``nn.GRU``; ``True`` means the
            input is ``(batch, seq, feature)``, ``False`` means
            ``(seq, batch, feature)``.
    """

    def __init__(self, input_size, num_classes, hidden_size, dropout_rate=0.1, batch_first=True):
        super().__init__()

        # NOTE: these attribute names are the keys of the state_dict; renaming
        # them would make existing checkpoints fail to load.
        self.rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first)
        self.linear1 = nn.Linear(in_features=hidden_size, out_features=32)
        self.linear2 = nn.Linear(in_features=32, out_features=num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """Classify a batch of sequences.

        Args:
            x_in: input batch, laid out according to ``batch_first``.
            x_lengths: optional per-sequence lengths. When given, the GRU
                output at position ``length - 1`` of each sequence is used;
                otherwise the output at the final time step is used.
            apply_softmax: if ``True`` return softmax probabilities over the
                class dimension instead of raw scores.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        out, _ = self.rnn(x_in)

        # Everything below indexes `out` as (batch, seq, hidden). With
        # batch_first=False the GRU returns (seq, batch, hidden), so bring it
        # into the expected layout instead of slicing the wrong axis.
        if not self.rnn.batch_first:
            out = out.transpose(0, 1)

        if x_lengths is not None:
            output = gather_columns(out, x_lengths)
        else:
            output = out[:, -1, :]

        output = self.linear1(output)
        output = self.dropout(output)
        output = self.relu(output)
        output = self.linear2(output)
        if apply_softmax:
            output = self.softmax(output)
        return output

    

In [127]:
# Rebuild the network with the hyper-parameters used here (5 output classes,
# 64 hidden units) and restore its weights from disk.
model = RNN(input_size = n_mfcc, num_classes = 5, hidden_size = 64)
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA.
# SECURITY NOTE: torch.load unpickles the file; only load checkpoints that
# come from a trusted source.
checkpoint = torch.load('Model/model1.pth', map_location='cpu')
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [128]:
# eval() switches layers such as dropout to inference behaviour.
model.eval()
# No gradients are needed for prediction; no_grad avoids building the autograd
# graph and the memory that goes with it.
with torch.no_grad():
    output = model(x_in = tensor.float(), x_lengths = sizes, apply_softmax=True)
# Convert to nested Python lists (one list of class probabilities per clip).
output = output.tolist()

In [129]:
# Round every class probability to four decimal places.
output = [[round(prob, 4) for prob in row] for row in output]

In [130]:
# Write one line per clip ("<name> - p0, p1, ...") to predictions.txt and
# report the accuracy against labels derived from the clip names.
correct, total = 0, 0

# `with` guarantees the file is closed even if something in the loop raises.
with open('predictions.txt', 'w') as f:
    for name, out in zip(names, output):
        total += 1
        # The expected class comes from the first of the digits '1'..'4' that
        # occurs anywhere in the clip name (checked in that order); a name
        # with none of them is class 4.
        # NOTE(review): this is a plain substring test, so unrelated digits in
        # a file name also match - confirm against the naming scheme.
        label = next((idx for idx, digit in enumerate('1234') if digit in name), 4)
        # Take the argmax of the numeric probabilities. Doing it on their
        # string form compares text lexicographically, not numerically.
        correct += (1 if label == int(np.argmax(out)) else 0)
        f.write(name + ' - ' + ', '.join(str(num) for num in out) + '\n')

if total:
    print('Accuracy on test - {}%'.format(correct / total * 100))
else:
    # No clips were processed; avoid dividing by zero.
    print('Accuracy on test - n/a (no samples found)')

Accuracy on test - 88.39285714285714%
