# RNN

The dataset used in this notebook is generated with:

`python downloader.py --export-folder 'RNN_data' --splits 'train' --cliplength 10 --balance 1:1`

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

import numpy as np
import pandas as pd

#Import torch stuff.
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim import lr_scheduler

#pip install git+https://github.com/facebookresearch/WavAugment.git
import augment

import IPython.display as ipd
import matplotlib.pyplot as plt

from tqdm import trange, tqdm

import bom1.wakeword as wf
import bom1.bom1 as bom1

from sklearn.metrics import accuracy_score

import os

import pickle

In [None]:
#Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'This notebook is running on the {device.type}.')

if device.type == 'cuda':
    #Query the current CUDA device, then make device index 1 the default one.
    torch.cuda.current_device()
    torch.cuda.set_device(1)

In [None]:
#NOTE: This is an overly simple way of generating the targets!
def generate_targets(c, n = 1375, start = 6, end = 6.36, cliplength = 10):
    '''
    Build the frame-wise target vector for one clip.

    The clip is divided into n evenly spaced time steps from 0 to cliplength.
    For the positive class, every step whose time lies in the half-open window
    (start, end] is marked with 1; all other steps are 0.

    Parameters
    ----------
    c : the class of the clip, either 1 or 0. Anything that does not compare
        equal to 0 is treated as class 1.
    n : number of time steps in the returned target.
        NOTE(review): the default 1375 presumably matches the number of output
        steps the network produces for a 10 s clip - confirm if the feature
        extraction or the convolution changes.
    start, end : bounds (same unit as cliplength) of the window marked as positive.
    cliplength : time assigned to the last step.

    Returns
    -------
    torch.Tensor of shape (n,) holding zeros and ones.
    '''
    target = torch.zeros((n))

    if c == 0:
        #Class 0 - nothing to mark.
        return target

    #Class 1 - compute the time axis once and mark the steps inside the window.
    time = np.linspace(0, cliplength, n)
    inside = (time > start) & (time <= end)
    target[torch.from_numpy(inside)] = 1
    return target

In [None]:
class WakewordDataset(Dataset):
    '''
    Dataset over the sound files found in a single folder.

    Only files whose name contains 'C_1' are kept (at the moment we only want
    clips that actually contain a "Kan I se det"). For every kept file the
    metadata returned by wf.info_from_path is split into the ID, t1, t2 and
    target lists, index-aligned with paths.

    Parameters
    ----------
    f          : feature transform handed to wf.load_data.
    folder     : directory that is listed for sound files.
    sr         : accepted for interface compatibility; not used by this class.
    normalize  : forwarded to wf.load_data.
    transforms : forwarded to wf.load_data.
    '''

    def __init__(self, f, folder, sr = 22050, normalize = True, transforms=None):

        #Collect the positive clips. os.listdir gives no ordering guarantee,
        #so the paths are effectively unordered already at this point.
        self.paths = []
        for name in os.listdir(folder):
            if 'C_1' in name:
                self.paths.append(os.path.join(folder, name))

        #One metadata record per path, then one list per field.
        records     = [wf.info_from_path(p) for p in self.paths]
        self.ID     = [rec[0] for rec in records]
        self.t1     = [rec[1] for rec in records]
        self.t2     = [rec[2] for rec in records]
        self.target = [rec[3] for rec in records]

        self.transforms = transforms
        self.f          = f
        self.normalize  = normalize

    def __len__(self):
        #Number of kept sound files.
        return len(self.paths)

    def __getitem__(self, idx):
        '''
        Load sample idx.

        Returns the tuple (audio, sr, x, target, path, ID), where audio, sr and x
        come from wf.load_data and target is the frame-wise vector produced by
        generate_targets for the class of this clip.
        '''
        path = self.paths[idx]

        audio, sr, x = wf.load_data(path, f = self.f, transforms=self.transforms, normalize=self.normalize)

        return audio, sr, x, generate_targets(self.target[idx]), path, self.ID[idx]

In [None]:
#Training split. The audio is normalized when it is read with torchaudio,
#converted to mono and padded to sr * length of the clip (22050*10 samples).
folder = '/work3/s164419/01005WakeWordData/RNN_data/train/'
train_dataset = WakewordDataset(
    f=T.Spectrogram(hop_length=40),
    folder=folder,
    normalize=True,
    transforms=[
        #wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
        wf.TransformMono(),
        wf.Padder(22050*10),
    ],
)

#Validation split, built with exactly the same settings.
folder = '/work3/s164419/01005WakeWordData/RNN_data/val/'
val_dataset = WakewordDataset(
    f=T.Spectrogram(hop_length=40),
    folder=folder,
    normalize=True,
    transforms=[
        #wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
        wf.TransformMono(),
        wf.Padder(22050*10),
    ],
)

#Create the loaders: small batches on the CPU, large batches on the GPU.
batch_size = 16 if device.type == 'cpu' else 512

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [None]:
class RNN(nn.Module):
    '''
    Convolutional front-end followed by two stacked GRU layers and a linear
    read-out that emits two logits per time step.

    Expected input: nbatch x 1 x 201 x L (the GRU input size fixes the 201).
    Output: nbatch x L' x 2 with L' = floor((L - 6) / 4) + 1. No softmax is applied.
    '''

    def __init__(self):
        super().__init__()

        #Single-channel convolution along the time axis only (kernel 6, stride 4).
        conv_layers = [
            nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (1, 6), stride=(1, 4)),
            nn.BatchNorm2d(num_features = 1),
            nn.ReLU(),
            nn.Dropout(p=0.2),
        ]
        self.conv = nn.Sequential(*conv_layers)

        self.GRU1     = nn.GRU(input_size = 201, hidden_size = 128, batch_first = True)
        self.dropout1 = nn.Dropout(p=0.2)

        self.GRU2     = nn.GRU(input_size = 128, hidden_size = 128, batch_first = True)
        self.dropout2 = nn.Dropout(p=0.2)

        self.fc = nn.Linear(in_features = 128, out_features = 2, bias=False)

    def forward(self, x):
        #nbatch x 1 x 201 x L -> nbatch x 1 x 201 x L/4ish (one output channel),
        #drop the channel axis -> nbatch x 201 x L/4ish,
        #put time before features -> nbatch x L/4ish x 201.
        frames = self.conv(x).squeeze(1).permute(0, 2, 1)

        #nbatch x L/4ish x 201 -> nbatch x L/4ish x 128
        hidden, _ = self.GRU1(frames)
        hidden, _ = self.GRU2(self.dropout1(hidden))

        #Output - no softmax! nbatch x L/4ish x 2 logits.
        return self.fc(self.dropout2(hidden))

In [None]:
class SimpleRNN(nn.Module):
    '''
    Convolution over time, a 2-layer GRU and a two-layer fully connected head
    that emits two logits per time step.

    Expected input: nbatch x 1 x 201 x L (the GRU input size fixes the 201).
    Output: nbatch x L' x 2 with L' = floor((L - 15) / 4) + 1. No softmax is
    applied. The batch dimension is always kept, also for a batch of one.
    '''

    def __init__(self):
        super().__init__()

        self.conv = nn.Conv2d(in_channels = 1, out_channels = 1, kernel_size = (1, 15), stride = (1, 4))
        self.GRU  = nn.GRU(batch_first = True, input_size = 201, hidden_size = 128, num_layers = 2, dropout=0.2)

        self.fc1  = nn.Linear(in_features = 128, out_features = 64)
        self.fc2  = nn.Linear(in_features = 64,  out_features = 2)

        self.ReLU = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        x = self.conv(x)     # nbatch x 1 x 201 x L -> nbatch x 1 x 201 x L'
        x = self.ReLU(x)
        x = self.dropout(x)

        x = x.squeeze(1)     # drop the single channel -> nbatch x 201 x L'
        x = x.permute(0,2,1) # time before features    -> nbatch x L' x 201

        #batch_first=True, so the output is nbatch x L' x 128. It must NOT be
        #squeezed on dim 0: that would silently drop the batch axis whenever
        #the batch holds a single sample.
        x, _ = self.GRU(x)

        x    = self.fc1(x)
        x    = self.ReLU(x)
        x    = self.fc2(x)   # nbatch x L' x 2 logits
        return x

In [None]:
#The model to train, placed on the selected device.
rnn = SimpleRNN().to(device)

#Plain (unweighted) cross-entropy. A class-weighted alternative is kept for reference:
#weights = torch.tensor([1., 5.]).to(device)
#criterion = nn.CrossEntropyLoss(weight=weights)
criterion = nn.CrossEntropyLoss()

#SGD with momentum. Alternative kept for reference:
#optimizer = optim.Adam(rnn.parameters())
optimizer = optim.SGD(rnn.parameters(), momentum=0.9, lr=0.01)

#Multiply the learning rate by 0.1 every 7 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=7)

nepoch = 5

epochs, train_losses, test_losses = wf.train_rnn(
    rnn, criterion, optimizer, train_loader, device, nepoch,
    val_loader=val_loader,
    scheduler=scheduler,
    silent=False,
)

#Store the loss curves next to the model so they can be reloaded later.
with open('/work3/s164419/01005WakeWordData/models/RNN_V04_results.p', 'wb') as f:
    pickle.dump([epochs, train_losses, test_losses], f)

In [None]:
#Persist only the learned parameters (the state dict), not the whole module object.
torch.save(obj=rnn.state_dict(), f='/work3/s164419/01005WakeWordData/models/RNN_V04.pth')

# Load a pretrained model

In [None]:
#Set to True to reload the saved model and results from disk and plot the
#predictions for one batch. When False, only the loss curves already in
#memory (from the training cell) are plotted.
predict_batch = False

if predict_batch:

    rnn_pretrained = SimpleRNN()
    rnn_pretrained.load_state_dict(torch.load('/work3/s164419/01005WakeWordData/models/RNN_V04.pth', map_location=torch.device('cpu')))
    #Switch to inference mode; otherwise the dropout layers stay active and
    #the predictions are stochastic (torch.no_grad() does not disable dropout).
    rnn_pretrained.eval()

    #NOTE: pickle.load executes arbitrary code from the file - only load results written by this notebook.
    with open('/work3/s164419/01005WakeWordData/models/RNN_V04_results.p', 'rb') as f:
        epochs, train_losses, test_losses = pickle.load(f)


    audio, sr, x, target, path, ID = next(iter(train_loader))

    with torch.no_grad():
        outputs = rnn_pretrained(x)
        p = torch.softmax(outputs, dim=-1)

    fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(15,10))
    axs = axs.flatten()

    #One panel per sample: predicted probability of class 1 against the target.
    #Only as many panels as there are samples in the batch are filled.
    for k, ax in enumerate(axs[:len(p)]):
        ax.plot(p[k,:,1])
        ax.plot(target[k])
        #ax.set_title(path[k])

    plt.tight_layout()

#Loss curves. The validation losses are stored under the name test_losses.
plt.figure()
plt.plot(epochs, train_losses)
plt.plot(epochs, test_losses)

# Dummy - Train on a Single Utterance

In [None]:
#weights = torch.tensor([1., 5.]).to(device)
#criterion = nn.CrossEntropyLoss(weight = weights)

#scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
#rnn = SimpleRNN().to(device)
#
#weights = torch.tensor([1., 5.]).to(device)
#criterion = nn.CrossEntropyLoss(weight = weights)
#optimizer = optim.Adam(rnn.parameters())
#
#audio, sr, x, target, path, ID = next(iter(train_loader))
#
#fig = plt.figure()
#ax = plt.gca()
#
#for i in range(5000):
#    ipd.clear_output(wait=True)
#
#    inputs = x[:2,:,:,:]
#    targets = target[:2]
#    outputs = rnn(inputs)
#
#    # zero the parameter gradients
#    optimizer.zero_grad()
#
#    # forward + backward + optimize
#    #outputs = rnn(inputs.float())

#    #If we use CrossEntropyLoss
#    outputs = rnn(inputs.float()) 
#
#    loss = criterion(outputs.permute(0,2,1), targets.long()) #Permute according to https://discuss.pytorch.org/t/loss-functions-for-batches/20488/6
#    loss.backward()
#    optimizer.step()
#
#    p = torch.softmax(outputs, dim=-1)
#    
#    idx = np.random.randint(low=0, high=2)
#    plt.plot(p[idx,:, 1].detach())
#    plt.plot(target[idx].detach())
#    plt.title(i)
#    plt.show()