# RNN

Here, the dataset is generated using:

`python downloader.py --export-folder 'RNN_data' --splits 'train' --cliplength 10 --balance 1:1`

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

import numpy as np
import pandas as pd

#Import torch stuff.
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim import lr_scheduler

#pip install git+https://github.com/facebookresearch/WavAugment.git
import augment

import IPython.display as ipd
import matplotlib.pyplot as plt

from tqdm import trange, tqdm

#Append the path outside so we can load bom1.
import sys
sys.path.append('..')

import bom1.wakeword as wf
import bom1.bom1 as bom1
from   bom1.toolbox import WakewordDataset

from sklearn.metrics import accuracy_score

import os

import pickle

In [2]:
#Set the notebook to run on the GPU, if available.

#Index (among the GPUs visible to this process) that the notebook prefers to use.
PREFERRED_GPU = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'This notebook is running on the {device.type}.')

if device.type == 'cuda':
    #`device` carries no index, so tensors moved with .to(device) land on the
    #*current* CUDA device. Select the preferred GPU, but fall back to the last
    #visible one so the notebook does not crash when fewer GPUs are available.
    gpu_index = min(PREFERRED_GPU, torch.cuda.device_count() - 1)
    torch.cuda.set_device(gpu_index)

This notebook is running on the cuda.


In [3]:
def _build_wakeword_dataset(folder):
    """Create a WakewordDataset over `folder` with the shared preprocessing.

    The features are produced by T.Spectrogram(hop_length=40). Every clip goes
    through TransformMono and is then padded with Padder(22050*10), i.e.
    sr * length of the clip (per the original note).
    """
    return WakewordDataset(
        f=T.Spectrogram(hop_length=40),
        folder=folder,
        normalize=True, #normalize the audio when reading it with torchaudio.
        transforms=[
            #wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
            wf.TransformMono(),
            wf.Padder(22050*10),
        ],
    )


#Training split.
folder = '/work3/s164419/01005WakeWordData/RNN_data/train/'
train_dataset = _build_wakeword_dataset(folder)

#Validation split.
folder = '/work3/s164419/01005WakeWordData/RNN_data/val/'
val_dataset = _build_wakeword_dataset(folder)

#Create the loaders. A small batch is used on the CPU, a large one on the GPU.
batch_size = 16 if device.type == 'cpu' else 512

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [4]:
from bom1.models import RNN_V1

#Number of passes over the training set.
nepoch = 50

#The model, moved to the selected device.
rnn = RNN_V1()
rnn = rnn.to(device)

#Binary cross-entropy on the raw logits; positive targets are weighted by 20.
positive_weight = torch.Tensor([20]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=positive_weight)

#Adam over all model parameters.
optimizer = optim.Adam(params=rnn.parameters(), lr=1e-4)

In [None]:
#Output frames that are labelled positive for clips containing the catchphrase.
#Per the original note, 618:756 covers the entire duration (1s) of
#"Kan I se det", placed in the middle of the clip.
CATCHPHRASE_FRAMES = slice(618, 756)

#Folder the model weights are written to after every epoch.
MODEL_DIR = '/work3/s164419/01005WakeWordData/models/RNN_V1'


def make_targets(outputs, contains_catchphrase, frames=CATCHPHRASE_FRAMES):
    """Build the frame-wise 0/1 targets matching a batch of model outputs.

    Parameters
    ----------
    outputs : 2-D tensor; only its first two dimensions and device are used.
    contains_catchphrase : per-clip indicator; rows where it is non-zero are
        treated as positive clips.
    frames : slice along the second dimension set to 1 for positive clips.

    Returns
    -------
    Float tensor of shape (outputs.shape[0], outputs.shape[1]) on
    `outputs.device`: 1 inside `frames` for positive clips, 0 elsewhere.
    """
    targets = torch.zeros(outputs.shape[0], outputs.shape[1])
    targets[torch.where(contains_catchphrase)[0], frames] = 1
    return targets.to(outputs.device)


#Make sure the checkpoint folder exists before spending an epoch on training.
os.makedirs(MODEL_DIR, exist_ok=True)

train_losses = []
val_losses   = []

for epoch in tqdm(range(nepoch), total=nepoch, desc='Epoch'):

    train_loss = 0
    val_loss   = 0

    #Training mode must be re-enabled every epoch because validation below
    #switches the model to evaluation mode.
    rnn.train()
    for data in train_loader:
        #Reset the gradients.
        optimizer.zero_grad()

        #Fetch the data
        x, contains_catchphrase, path = data
        x = x.to(device)

        #Forward pass; drop the trailing singleton dimension of the output.
        outputs = rnn(x).squeeze(-1)

        #Construct the targets and calculate the loss.
        targets = make_targets(outputs, contains_catchphrase)
        loss = criterion(outputs, targets) #BCE Loss.

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    #Evaluation mode, so that train-only layers (e.g. dropout), if the model
    #has any, do not perturb the validation loss.
    rnn.eval()
    with torch.no_grad():
        #`data` is kept as the loop variable: later cells reuse the last
        #validation batch through it.
        for data in val_loader:
            #Fetch the data
            x, contains_catchphrase, path = data
            x = x.to(device)

            #Forward pass
            outputs = rnn(x).squeeze(-1)

            #Construct the targets and calculate the validation loss.
            targets = make_targets(outputs, contains_catchphrase)
            loss = criterion(outputs, targets) #BCE Loss.

            val_loss += loss.item()

    #Find the average batch loss.
    train_loss = train_loss / len(train_loader)
    val_loss   = val_loss / len(val_loader)

    #Append the training loss and the validation loss.
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    #Overwrite the checkpoint with the weights of the latest epoch.
    torch.save(rnn.state_dict(), os.path.join(MODEL_DIR, 'model.pth'))

    print(f'[{epoch}] Train loss: {train_loss}, Val loss: {val_loss}')
    

Epoch:   0%|                                             | 0/50 [00:00<?, ?it/s]

In [None]:
#Average batch loss per epoch: training in dashed blue, validation in dashed red.
for losses, line_format, label in (
    (train_losses, 'b--', 'train'),
    (val_losses, 'r--', 'val'),
):
    plt.plot(losses, line_format, label=label)

plt.legend()

In [None]:
#Re-run the model on the batch still held in `data` from the training cell
#(the last batch drawn from the validation loader).
x, contains_catchphrase, path = data

x = x.to(device)

#Forward pass. This is inference only, so use evaluation mode and skip
#gradient tracking to avoid holding the autograd graph of a full batch.
rnn.eval()
with torch.no_grad():
    outputs = rnn(x).squeeze(-1)

#Construct the targets.
targets = torch.zeros(torch.Size([outputs.shape[0], outputs.shape[1]]))
#618:756 is the same positive window that is used during training.
targets[torch.where(contains_catchphrase)[0], 618:756] = 1
targets = targets.to(device)

In [None]:
#Draw one random positive and one random negative clip from the current batch.
idx_positive = np.random.choice(torch.where(contains_catchphrase)[0])
idx_negative = np.random.choice(torch.where(contains_catchphrase==0)[0])

#(waveform title, prediction title, index into the batch) for each column.
examples = [
    ('Positive sound', 'Positive example', idx_positive),
    ('Negative sound', 'Negative example', idx_negative),
]

plt.figure(figsize=(20,5))

#Top row: the waveform of each clip, averaged over its channels.
for position, (sound_title, _, idx) in enumerate(examples, start=1):
    plt.subplot(2,2,position)
    audio, sr = torchaudio.load(path[idx])
    plt.plot(audio.mean(axis=0))
    plt.title(sound_title)

#Bottom row: sigmoid of the model output against the targets, with the 0.5
#level drawn as a dashed reference line.
for position, (_, example_title, idx) in enumerate(examples, start=3):
    plt.subplot(2,2,position)
    plt.title(example_title)
    plt.plot(torch.sigmoid(outputs[idx]).detach().cpu())
    plt.plot(targets[idx].detach().cpu())
    plt.axhline(0.5, linestyle='--', color='black')
    plt.ylim([0, 1])

plt.tight_layout()
plt.show()