In [1]:
## TODO: Identification
## TODO: Verification

In [2]:
import os

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

# from sklearn.metrics import 

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [3]:
class IdentificationDataset(Dataset):
    """Identification split of the dataset, served as (label, spectrogram) pairs.

    The track list is read from ``<path>/iden_split.txt`` (space separated
    ``phase track_path`` rows, no header) and the audio itself from
    ``<path>/audio/<track_path>``.

    Args:
        path: dataset root directory.
        train: if truthy, tracks of phases 1 and 2 are used and every item is
            a random 3 second crop; otherwise tracks of phase 3 are used and
            the whole track is returned.
        transform: optional callable applied to the (Freq, Time) spectrogram
            before it is returned.
    """
    
    def __init__(self, path, train, transform=None):
        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        
        if train:
            phases = [1, 2]
        
        else:
            phases = [3]
            
        mask = split['phase'].isin(phases)
        # reset the index so that __getitem__ can address tracks by 0..len-1
        self.dataset = split['path'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform
    
    def __len__(self):
        """Number of tracks in the selected phases."""
        return len(self.dataset)
    
    def __getitem__(self, idx):
        """Read track ``idx`` and return ``(label, spectrogram)``.

        Raises:
            ValueError: in train mode, if the track is shorter than the
                3 second training segment.
        """
        # path
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'audio', track_path)

        # read
        rate, samples = wavfile.read(audio_path)
        # extract label from path like id10003/L9_sh8msGV59/00001.wav
        # ('id10003' -> '0003' -> 3 -> zero-based label 2)
        label = int(track_path.split('/')[0].replace('id1', '')) - 1

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlapped duration (samples): consecutive frames share Tw - Ts ms,
        # which makes the hop between frames equal to Ts
        Noverlap = int(rate * (Tw - Ts) * 1e-3)
        # 2 ** to the next pow of 2 of (Nw - 1)
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97
        
        # preemphasis filter (the first sample is passed through unchanged)
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        
        if self.train:
            # segment selection
            segment_len = 3 # sec
            segment_samples = segment_len * rate
            upper_bound = len(samples) - segment_samples
            if upper_bound < 0:
                raise ValueError(
                    'track {!r} has {} samples, fewer than the {} needed for a '
                    '{} s training segment'.format(
                        track_path, len(samples), segment_samples, segment_len))
            # randint excludes the upper limit, hence + 1: the last valid start
            # is upper_bound itself and a track of exactly segment_len seconds
            # (upper_bound == 0) must yield start == 0 instead of raising
            start = np.random.randint(0, upper_bound + 1)
            end = start + segment_samples
            samples = samples[start:end]
        
        # spectrogram; two-sided, so the frequency axis has nfft bins
        _, _, spec = signal.spectrogram(samples, fs=rate, window=window,
                                        nperseg=Nw, noverlap=Noverlap, nfft=nfft,
                                        mode='magnitude', return_onesided=False)
        
        # empirical scale factor (rate / 10 + 13, i.e. roughly 1600 at 16 kHz)
        # that makes spectrograms in paper and here "the same"
        spec *= rate / 10 + 13
        
        if self.transform:
            spec = self.transform(spec)

        return label, spec

In [4]:
class Normalize(object):
    """Normalizes voice spectrogram (mean-variance), one frequency row at a time."""
    
    def __call__(self, spec):
        """Return ``spec`` with every row shifted to zero mean and scaled to unit std.

        Args:
            spec: 2D array laid out as (Freq, Time).

        Returns:
            Array of the same shape. Rows that are constant over time come
            out as all zeros.
        """
        # (Freq, Time)
        # mean-variance normalization for every spectrogram (not batch-wise);
        # keepdims gives (Freq, 1) statistics for any number of frequency bins
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        # a constant row has zero deviation: divide it by 1 so it becomes 0
        # rather than NaN (0 / 0)
        sigma[sigma == 0] = 1
        spec = (spec - mu) / sigma

        return spec

class ToTensor(object):
    """Turn a 2D spectrogram array into a single-channel float32 Tensor."""
    
    def __call__(self, spec):
        # unpacking insists on a 2D (Freq, Time) input
        n_freq, n_time = spec.shape
        
        # prepend the channel axis: (Freq, Time) -> (1, Freq, Time)
        with_channel = spec[np.newaxis, :, :]
        
        # cast to float32 (the incoming array was float64)
        as_float32 = with_channel.astype(np.float32)
        
        return torch.from_numpy(as_float32)

In [None]:
## ADD BN on FC7

In [None]:
class VoiceNet(nn.Module):
    """Convolutional network mapping a spectrogram to per-class scores.

    Input is a (batch, 1, Freq, Time) tensor. The shape comments in
    ``forward`` are for Freq = 512 and Time = 298; the (9, 1) kernel of
    ``fc6`` collapses the frequency axis to 1 for that Freq, and the time axis
    is averaged away, so Time may vary.

    Args:
        num_classes: size of the output layer.
    """

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)
        
        self.relu = nn.ReLU()
        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))
        
        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)
        
        # Not applied in forward(); kept so that callers can turn the returned
        # logits into probabilities: net.softmax(net(x)).
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        """Return raw class scores (logits) of shape (batch, num_classes).

        Softmax is deliberately NOT applied: nn.CrossEntropyLoss applies
        log-softmax itself, so feeding it probabilities would normalize twice.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        # 96, 254, 147
        x = self.mpool1(x)
        # 96, 126, 73
        x = self.relu(self.bn2(self.conv2(x)))
        # 256, 62, 36        
        x = self.mpool2(x)
        # 256, 30, 17
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        # 256, 9, 8
        
        x = self.relu(self.bn6(self.fc6(x)))
        # 4096, 1, 8
        # average over whatever time length is left; the pooling layer is a
        # local because its kernel depends on the input and it must not be
        # re-registered as a submodule on every call
        apool6 = nn.AvgPool2d(kernel_size=(1, x.size(3)))
        x = apool6(x)
        # 4096, 1, 1
        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))
        
        return self.fc8(x)

In [None]:
# class VoiceNet(nn.Module):
#     "Implementation Ref: https://github.com/kuangliu/pytorch-cifar"
#     def __init__(self, num_classes=None):
#         super(VoiceNet, self).__init__()
#         VGG16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 
#                  512, 512, 512, 'M', 512, 512, 512, 'M']
#         self.features = self._make_layers(VGG16)
#         self.classifier = nn.Linear(512, num_classes)

#     def forward(self, x):
#         out = self.features(x)
#         out = out.view(out.size(0), -1)
#         out = self.classifier(out)
#         return out

#     def _make_layers(self, cfg):
#         layers = []
#         in_channels = 3
#         for x in cfg:
#             if x == 'M':
#                 layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
#             else:
#                 layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
#                            nn.BatchNorm2d(x),
#                            nn.ReLU(inplace=True)]
#                 in_channels = x
#         layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
#         return nn.Sequential(*layers)

In [None]:
# locations
DATASET_PATH = '/home/nvme/data/vc1/'
LOG_PATH = '/home/nvme/logs/VoxCeleb/first_run'

# optimisation
EPOCH_NUM = 50
B = 96  # batch size of the training loader
WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter: per-epoch multiplicative decay chosen so that
# LR_INIT * gamma ** (EPOCH_NUM - 1) == LR_LAST
# (max(..., 1) avoids a division by zero for a single-epoch run)
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / max(EPOCH_NUM - 1, 1))
MOMENTUM = 0.9

# runtime; fall back to the CPU when no CUDA device is available
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
NUM_WORKERS = 4
EVAL_THRESHOLD = 0.5  # NOTE(review): not referenced by the cells visible here
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [None]:
# per-spectrogram preprocessing: normalise, then convert to a (1, F, T) tensor
transforms = Compose([
    Normalize(),
    ToTensor()
])

# shuffle=True: without it the batches follow the order of the split file
# (presumably grouped by speaker -- TODO confirm), which is a poor fit for SGD
trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, shuffle=True,
                                             num_workers=NUM_WORKERS)

# test items are whole tracks of differing length, hence batch_size=1
testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1)

net = VoiceNet(num_classes=1251)
# net = VoiceNet(num_classes=2)
net.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LR_INIT, momentum=MOMENTUM,
                      weight_decay=WEIGHT_DECAY)
# step_size=1: the learning rate is multiplied by gamma at every scheduler step
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

In [None]:
import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings (known bug)

In [None]:
# # SIZE0 = (100, 3, 32, 32)
# # SIZE1 = (100, 3, 32, 32)
# SIZE0 = (100, 1, 512, 298)
# SIZE1 = (100, 1, 512, 298)
# class0_data = torch.rand(SIZE0) - 1
# class1_data = torch.rand(SIZE1) + 1

# labels0 = torch.zeros(100).type(torch.LongTensor)
# labels1 = torch.ones(100).type(torch.LongTensor)

# dataset = torch.cat([class0_data, class1_data])
# datalabels = torch.cat([labels0, labels1])

# shuffling_idxs = torch.randperm(200)
# dataset = dataset[shuffling_idxs]
# datalabels = datalabels[shuffling_idxs]

# # optimizer = optim.SGD(net.parameters(), 1e-5, MOMENTUM, weight_decay=WEIGHT_DECAY)
# optimizer = optim.SGD(net.parameters(), 1e-4, MOMENTUM, weight_decay=WEIGHT_DECAY)

# for epoch_num in range(EPOCH_NUM):
# #     lr_scheduler.step()
    
#     # train
#     net.train()
    
#     for iter_num, specs in tqdm(enumerate(dataset)):
#         labels, specs = datalabels[iter_num].view(1), specs.view(1, 1, 512, 298)
# #         labels, specs = datalabels[iter_num].view(1), specs.view(1, 3, 32, 32)
#         optimizer.zero_grad()
#         labels, specs = labels.to(DEVICE), specs.to(DEVICE)
#         probs = net(specs)
#         loss = criterion(probs, labels)
#         loss.backward()
#         optimizer.step()

#         # TBoard
#         step_num = epoch_num * len(dataset) + iter_num
#         TBoard.add_scalar('TrainLoss', loss.item(), step_num)

# #         TBoard.add_scalar('weights/conv1', net.conv1.weight.mean(), step_num)
# #         TBoard.add_scalar('weights/conv5', net.conv5.weight.mean(), step_num)
# #         TBoard.add_scalar('weights/fc6', net.fc6.weight.mean(), step_num)
# #         TBoard.add_scalar('weights/fc7', net.fc7.weight.mean(), step_num)
# #         TBoard.add_scalar('weights/fc8', net.fc8.weight.mean(), step_num)
# #         TBoard.add_scalar('grads/conv1', net.conv1.weight.grad.mean(), step_num)
# #         TBoard.add_scalar('grads/conv5', net.conv5.weight.grad.mean(), step_num)
# #         TBoard.add_scalar('grads/fc6', net.fc6.weight.grad.mean(), step_num)
# #         TBoard.add_scalar('grads/fc7', net.fc7.weight.grad.mean(), step_num)
# #         TBoard.add_scalar('grads/fc8', net.fc8.weight.grad.mean(), step_num)

In [None]:
# layers whose mean weight and mean weight-gradient are traced in TensorBoard
LOGGED_LAYERS = ('conv1', 'conv5', 'fc6', 'fc7', 'fc8')

for epoch_num in range(EPOCH_NUM):
    # train
    net.train()
    
    # wrap the loader (not enumerate) so that tqdm knows the number of batches
    progress = tqdm(trainsetloader, desc='epoch {}'.format(epoch_num))
    for iter_num, (labels, specs) in enumerate(progress):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        outputs = net(specs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('TrainLoss', loss.item(), step_num)
        
        for layer_name in LOGGED_LAYERS:
            weight = getattr(net, layer_name).weight
            TBoard.add_scalar('weights/' + layer_name, weight.mean().item(), step_num)
            TBoard.add_scalar('grads/' + layer_name, weight.grad.mean().item(), step_num)
    
    # decay the learning rate once the epoch's optimizer steps are done;
    # since PyTorch 1.1 stepping the scheduler before the optimizer skips the
    # first learning-rate value
    lr_scheduler.step()
        
        
#     # test
#     net.eval()
    
#     for iter_num, (label, spec) in tqdm(enumerate(testsetloader)):
#         optimizer.zero_grad()
#         labels, specs = label.to(DEVICE), spec.to(DEVICE)
#         prob = net(spec)
#         loss = criterion(prob, label)
        
#         # TBoard
#         TBoard.add_scalar('TestLoss', loss.item(), epoch_num+1)

398it [01:31,  4.42it/s]