# Dependencies

First, we import the dependencies used throughout the notebook.

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"

import numpy as np
import pandas as pd

#Import torch stuff.
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

#pip install git+https://github.com/facebookresearch/WavAugment.git
import augment

import IPython.display as ipd
import matplotlib.pyplot as plt

from tqdm.notebook import trange, tqdm

import bom1.wakeword as wf
import bom1.bom1 as bom1

from sklearn.metrics import accuracy_score

In [2]:
#Set the notebook to run on the GPU, if available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'This notebook is running on the {device.type}.')

if device.type == 'cuda':
    #Indices here count the *visible* GPUs starting from 0, so index 1 is the second
    #visible card. Fall back to index 0 when only one GPU is visible instead of
    #failing with an invalid device ordinal.
    torch.cuda.set_device(min(1, torch.cuda.device_count() - 1))

This notebook is running on the cuda.


# Defining Dataset and Dataloader

In [4]:
class WakewordDataset(Dataset):
    '''
    Dataset of sound clips stored as individual files in one folder.

    Every entry of `folder` is treated as one clip. The clip metadata
    (ID, t1, t2, target) is parsed from the file name with `wf.info_from_path`
    and the audio is loaded on demand with `wf.load_data`.

    Parameters
    ----------
    f : callable
        The function that takes audio and returns the spectrogram; forwarded
        to `wf.load_data` as `f`.
    folder : str
        Directory whose entries make up the dataset.
    sr : int
        Currently unused; kept so that existing callers keep working.
    normalize : bool
        Forwarded to `wf.load_data`.
    transforms : list or None
        Forwarded to `wf.load_data`.
    '''

    def __init__(self, f, folder, sr = 22050, normalize = True, transforms=None):
        #List the folder exactly once, so that the paths and the metadata parsed
        #from the file names are guaranteed to line up index by index.
        #os.listdir returns the entries in arbitrary order; that order is kept.
        filenames   = os.listdir(folder)
        self.paths  = [os.path.join(folder, name) for name in filenames]

        #One (ID, t1, t2, target) record per file name.
        folderinfo  = [wf.info_from_path(name) for name in filenames]
        self.ID     = [info[0] for info in folderinfo]
        self.t1     = [info[1] for info in folderinfo]
        self.t2     = [info[2] for info in folderinfo]
        self.target = [info[3] for info in folderinfo]

        self.transforms = transforms
        self.f          = f
        self.normalize  = normalize

    def __len__(self):
        '''Number of clips found in the folder.'''
        return len(self.paths)

    def __getitem__(self, idx):
        '''
        Load the clip at position `idx`.

        Returns the tuple (audio, sr, x, target, path, ID), where audio, sr and x
        are the three values returned by `wf.load_data` for the clip's path.
        '''
        path            = self.paths[idx]

        audio, sr, x    = wf.load_data(path, f = self.f, transforms=self.transforms, normalize=self.normalize)

        target          = self.target[idx]
        ID              = self.ID[idx]

        return audio, sr, x, target, path, ID

In [6]:
#Training set: spectrograms of the clips in the train folder.
train_dataset = WakewordDataset(
    folder     = '/work3/s164419/01005WakeWordData/every50_1s/train/',
    f          = T.Spectrogram(),
    normalize  = True, #normalize the audio when reading it with torchaudio.
    transforms = [
        #wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
        wf.TransformMono(),
        wf.Padder(22050), #sr * cliplength
    ],
)

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
)

# CNN with default spectrogram

Let us build a completely vanilla CNN whose input `x` is the spectrogram produced by `T.Spectrogram()` with its default settings.

In [12]:
#Time loading a single item (index 0) directly from the dataset, bypassing the DataLoader.
%timeit audio, sr, x, target, path, ID = train_dataset.__getitem__(0)

3.57 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Let's time how long it takes to read one batch from the training loader.

In [9]:
#Every timed loop calls iter(train_loader) again, so the measurement covers creating a
#new iterator as well as fetching one batch from it.
#NOTE(review): if the loader uses worker processes, this presumably includes their
#start-up cost - confirm before reading the figure as a steady-state per-batch time.
%timeit audio, sr, x, target, path, ID = next(iter(train_loader))

1.14 s ± 32.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Defining the Network

In [14]:
class CNN(nn.Module):
    '''
    Convolution-only network that maps a single-channel input to two class scores.

    All layers live in one flat `nn.Sequential` (`self.sequential`):

      * four stages of two padded 3x3 convolutions (16, 32, 64, 128 channels),
        each stage followed by 2x2 max pooling;
      * a fifth stage of two unpadded 3x3 convolutions (256 channels), no pooling;
      * a convolution with an (8, 2) kernel to 256*8*2 channels acting as the
        fully connected part, followed by a 1x1 convolution to 2 outputs.

    The first convolution of every stage is followed by Dropout2d(p=0.2).
    The output is not passed through a softmax.
    '''

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels, padding, dropout):
        '''Return [3x3 Conv2d, BatchNorm2d, ReLU], plus Dropout2d(p=0.2) when `dropout` is set.'''
        block = [
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=padding),
            nn.BatchNorm2d(num_features=out_channels),
            nn.ReLU(),
        ]
        if dropout:
            block.append(nn.Dropout2d(p=0.2))
        return block

    def __init__(self):
        super().__init__()

        layers = []

        #Padded stages: (conv + dropout), (conv), max pool.
        for width_in, width_out in [(1, 16), (16, 32), (32, 64), (64, 128)]:
            layers += self._conv_bn_relu(width_in, width_out, padding=1, dropout=True)
            layers += self._conv_bn_relu(width_out, width_out, padding=1, dropout=False)
            layers.append(nn.MaxPool2d(kernel_size=2))

        #Last stage: unpadded convolutions and no pooling afterwards.
        layers += self._conv_bn_relu(128, 256, padding=0, dropout=True)
        layers += self._conv_bn_relu(256, 256, padding=0, dropout=False)

        #Fully connected part, expressed as a convolution over an (8, 2) window.
        layers.append(nn.Conv2d(in_channels=256, out_channels=256*8*2, kernel_size=(8,2)))
        layers.append(nn.ReLU())

        #Output - no softmax!
        layers.append(nn.Conv2d(in_channels=256*8*2, out_channels=2, kernel_size=1))

        #Unpacked into one flat Sequential so every layer keeps its positional index.
        self.sequential = nn.Sequential(*layers)

    def forward(self, x):
        '''Run the layer stack, then drop axis 3 and axis 2 wherever they have size 1.'''
        scores = self.sequential(x)
        return scores.squeeze(3).squeeze(2)

## Constructing the dataloaders and the network, with a quick unit test

In [17]:
#Create the datasets.
def _make_dataset(split):
    '''Build the WakewordDataset for the folder of one split, e.g. 'train' or 'val'.'''
    return WakewordDataset(folder=f'/work3/s164419/01005WakeWordData/every50_1s/{split}/', f = T.Spectrogram(),
                           normalize=True, #normalize the audio when reading it with torchaudio.
                           transforms = [#wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
                                         wf.TransformMono(),
                                         wf.Padder(22050)]
                          )

train_dataset = _make_dataset('train')
val_dataset   = _make_dataset('val')
#test_dataset  = _make_dataset('test')

#Create the loaders.
batch_size = 256
train_loader  = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader    = DataLoader(val_dataset  , shuffle=True, batch_size=batch_size)
#test_loader   = DataLoader(test_dataset , shuffle=True, batch_size=batch_size)

#Let us load a batch for unit tests.
audio, sr, x, targets, paths, ids = next(iter(train_loader))

#Construct the network.
cnn = CNN()

#The network must return one pair of class scores per clip. The expected size is taken
#from the batch itself, so the check also holds when the dataset has fewer than
#batch_size clips. No gradients are needed for a shape check.
with torch.no_grad():
    assert cnn(x).shape == torch.Size([x.shape[0], 2])

## Training the Network

In [18]:
#Start from a freshly initialised network.
cnn = CNN()

#Cross-entropy on the raw network outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=1e-3)
#optimizer = optim.SGD(cnn.parameters(), lr=0.01, momentum = 0.9)

(epochs,
 train_losses, train_accs,
 val_losses, val_accs) = wf.train_cnn(
    cnn, criterion, optimizer,
    train_loader, val_loader,
    device, nepoch=10,
)

Epoch:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training:   0%|          | 0/5634 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Save the Model

In [None]:
#Persist only the weights (state dict), not the whole module object.
torch.save(
    cnn.state_dict(),
    '/work3/s164419/01005WakeWordData/models/cnn_1_to_3_1s.pth',
)

# Evaluate the Model on a Lecture

In [None]:
#Rebuild the architecture and restore the saved weights. map_location remaps the
#stored tensors onto `device`, so a checkpoint whose tensors were saved on a GPU can
#also be restored in a CPU-only session.
cnn = CNN()
cnn.load_state_dict(torch.load('/work3/s164419/01005WakeWordData/models/cnn_1_to_3_1s.pth', map_location=device))
cnn.eval().to(device)

In [None]:
#Only the third value returned by wf.get_splits() is used here.
_, _, test = wf.get_splits()

#Evaluate the lecture at position 25 among the unique IDs of that split.
ID = test['ID'].unique()[25]

with torch.no_grad():
    data = wf.load_data(
        f'/work3/s164419/01005WakeWordData/lectures/{ID}.wav',
        T.Spectrogram(),
        normalize=True,
        transforms=[wf.TransformMono()],
    )
    audio, sr, x = data

    #Add a batch axis and run the whole lecture through the network in one pass.
    outputs = cnn(x.unsqueeze(0).to(device))

    #Softmax across axis 0 once the batch axis is removed, then keep column 1 of the
    #transposed array.
    p = torch.softmax(outputs.squeeze(0), dim=0).detach().cpu().numpy().T[:, 1]

In [None]:
#The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old
#name was later removed, so use whichever of the two this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

#Spread the network outputs evenly over the duration of the lecture.
t = np.linspace(0, wf.lecture_durations()[ID], len(p))
plt.figure(figsize=(20,10))
plt.plot(t, p, alpha=0.5, label='Classified')
#Mark the midpoint of every annotated (t1, t2) interval of this lecture.
plt.vlines(test.loc[test['ID'] == ID][['t1', 't2']].mean(axis=1).to_numpy(), ymin=0, ymax=1, color='r', linestyles='--', label='GT')
plt.xlabel('Time [s]')
plt.ylabel('Probability of class 1')
plt.legend(loc='best')
plt.show()

In [None]:
#Timestamps of the last ten positions where the class-1 probability exceeds 0.9.
[
    bom1.seconds_to_timestamp(float(seconds))
    for seconds in t[p > 0.9]
][-10:]