In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"

import numpy as np
import pandas as pd
import pickle

#Import torch stuff.
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

#pip install git+https://github.com/facebookresearch/WavAugment.git
import augment

import IPython.display as ipd
import matplotlib.pyplot as plt

from tqdm import trange, tqdm

import bom1.wakeword as wf
import bom1.bom1 as bom1

from sklearn.metrics import accuracy_score

# Evaluate a Lecture

In [2]:
class CNN(nn.Module):
    '''
    Fully convolutional two-class classifier over single-channel 2D inputs.

    The feature extractor `conv` is five Conv-BatchNorm-ReLU-Dropout2d stages,
    each followed by a 2x2 max-pool, and then a final Conv-BatchNorm-ReLU stage.
    The classification head is expressed as two convolutions: `fc1` uses a
    (3, 14) kernel and `fc2` a (1, 1) kernel with two output channels.

    Module names and the positions inside `conv` are part of the checkpoint
    format (state_dict keys), so they must not be reordered.
    '''
    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, padding) of every stage that ends in a max-pool.
        pooled_stages = (
            (1,   16,  1),
            (16,  32,  1),
            (32,  64,  1),
            (64,  128, 1),
            (128, 256, 0),
        )

        layers = []
        for c_in, c_out, pad in pooled_stages:
            layers.extend([
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=1, padding=pad),
                nn.BatchNorm2d(num_features=c_out),
                nn.ReLU(),
                nn.Dropout2d(p=0.2),
                nn.MaxPool2d(kernel_size=2),
            ])

        # Final stage: no padding, no dropout and no pooling afterwards.
        layers.extend([
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(),
        ])

        self.conv = nn.Sequential(*layers)

        # "Fully connected" head written as convolutions.
        head_width = 512 * 3 * 14
        self.fc1 = nn.Conv2d(in_channels=512, out_channels=head_width, kernel_size=(3, 14))
        self.fc2 = nn.Conv2d(in_channels=head_width, out_channels=2, kernel_size=(1, 1))

        self.ReLU = torch.nn.ReLU()

    def forward(self, x):
        '''
        Run the network and return the raw (un-normalised) class scores.

        The two trailing dimensions are squeezed, which only removes them when
        they have size 1, i.e. when the feature map reaching `fc1` is exactly
        as large as its (3, 14) kernel.
        '''
        features = self.conv(x)
        hidden   = self.ReLU(self.fc1(features))

        # No activation after the last layer: the caller applies softmax/loss.
        logits = self.fc2(hidden)
        return logits.squeeze(-1).squeeze(-1)

In [3]:
class WakewordDataset(Dataset):
    '''
    Dataset over the sound files located directly inside `folder`.

    Parameters
    ----------
    f : object
        Forwarded as `f=` to `wf.load_data` for every item (the notebook passes
        a torchaudio spectrogram transform).
    folder : str
        Directory whose entries are used as samples. Every entry is passed to
        `wf.info_from_path`, so the folder should contain only sample files.
    sr : int
        Accepted for backward compatibility; currently not used by this class.
    normalize : bool
        Forwarded as `normalize=` to `wf.load_data`.
    transforms : list or None
        Forwarded as `transforms=` to `wf.load_data`.

    Each item is the tuple `(audio, sr, x, target, path, ID, t1, t2)`, where
    `audio, sr, x` come from `wf.load_data` and `ID, t1, t2, target` are the
    first four fields returned by `wf.info_from_path` for that path.
    '''
    def __init__(self, f, folder, sr = 22050, normalize = True, transforms=None):

        # os.listdir returns entries in arbitrary order; sort them so that the
        # index -> file mapping is the same on every run and every machine.
        self.paths  = [os.path.join(folder, name) for name in sorted(os.listdir(folder))]

        # One metadata record per path, split into one list per field.
        folderinfo  = [wf.info_from_path(path) for path in self.paths]
        self.ID     = [info[0] for info in folderinfo]
        self.t1     = [info[1] for info in folderinfo]
        self.t2     = [info[2] for info in folderinfo]
        self.target = [info[3] for info in folderinfo]

        self.transforms = transforms
        self.f          = f
        self.normalize  = normalize

    def __len__(self):
        '''Number of files found in the folder.'''
        return len(self.paths)

    def __getitem__(self, idx):
        '''Load sample `idx` from disk and return it together with its metadata.'''
        path            = self.paths[idx]
        audio, sr, x    = wf.load_data(path, f = self.f, transforms=self.transforms, normalize=self.normalize)
        target          = self.target[idx]
        ID              = self.ID[idx]
        t1, t2          = self.t1[idx], self.t2[idx]

        return audio, sr, x, target, path, ID, t1, t2

In [5]:
#Build the validation dataset and the loader that iterates over it.
val_folder = '/work3/s164419/01005WakeWordData/every50_1s_val_test/val/'

#Passed to the dataset as `f`.
val_spectrogram = T.Spectrogram(hop_length=40)

#Per-clip transforms; the augmentation is left disabled for evaluation.
val_transforms = [#wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
                  wf.TransformMono(),
                  wf.Padder(22050)]

val_dataset = WakewordDataset(f = val_spectrogram,
                              folder = val_folder,
                              normalize = True, #forwarded to wf.load_data when a file is read
                              transforms = val_transforms)

batch_size = 128
print(f'Batch size is {batch_size}')

#Keep the dataset order so predictions line up with the file listing.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Batch size is 128


In [6]:
#Set the notebook to run on the GPU, if available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'This notebook is running on the {device.type}.')

if device.type == 'cuda':
    #Use the second visible GPU when there is one, otherwise fall back to the
    #first; a hard-coded set_device(1) raises when only one GPU is visible.
    torch.cuda.set_device(min(1, torch.cuda.device_count() - 1))


cnn = CNN()
#NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
cnn.load_state_dict(torch.load('/work3/s164419/01005WakeWordData/models/CNN_1_to_5_1s_hoplength40.pth', map_location=device))

#Inference mode (fixed BatchNorm statistics, dropout disabled); the returned module is displayed as the cell output.
cnn.eval().to(device)

This notebook is running on the cpu.


CNN(
  (conv): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout2d(p=0.2, inplace=False)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Dropout2d(p=0.2, inplace=False)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): Dropout2d(p=0.2, inplace=False)
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Conv2d(64, 128, kernel_size=(3, 3), stride=(

In [7]:
#Set it up for evaluation on validation set.
cnn.eval()

#Collect one probability and one file path per validation clip.
ps    = []
paths = []

with torch.no_grad():
    for data in tqdm(val_loader, total=len(val_loader), desc="Validation"):

        #Each batch is (audio, sr, x, target, path, ID, t1, t2); only x and path are needed here.
        _, _, x, _, path, _, _, _ = data

        #Move the model input to the same device as the model.
        x = x.to(device)

        #Forward pass only: this is inference, nothing is optimised.
        outputs = cnn(x.float())

        #Softmax over the two outputs; keep the probability of class index 1
        #(presumably the positive / wake-word class; confirm against the training labels).
        p = torch.softmax(outputs, dim=-1)

        ps.extend(p[:, 1].tolist())
        paths.extend(path)

#Results are stored as [paths, ps], with the two lists aligned by index.
with open('/work3/s164419/01005WakeWordData/results/CNN_1_to_5_1s_hoplength40_val_performance_V2.p', 'wb') as f:
    pickle.dump([paths, ps], f)

Validation:   0%|          | 0/2091 [00:36<?, ?it/s]


KeyboardInterrupt: 