In [3]:
import os
import time

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [4]:
import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

In [5]:
class IdentificationDatasetTrain(Dataset):
    """Utterances of the training speakers, served as random 3-second spectrograms.

    Reads `<path>/iden_split.txt` (space separated `phase path` rows) and keeps
    every row whose speaker is not one of the 40 held-out speakers
    (zero-based indices 269..308, i.e. folders id10270..id10309).

    Each item is a `(label, spec)` pair where `label` is an int in [0, 1210]
    and `spec` is a (Freq, Time) magnitude spectrogram of a random 3-second
    crop (optionally passed through `transform`).

    Note: cropping and dithering draw from the global NumPy RNG. When the
    dataset is consumed by a multi-worker DataLoader, NumPy should be seeded
    per worker (e.g. through `worker_init_fn`), otherwise forked workers start
    from the same NumPy RNG state.
    """

    def __init__(self, path, transform=None):
        """
        Args:
            path: dataset root containing `iden_split.txt` and an `audio/` folder.
            transform: optional callable applied to every spectrogram.
        """
        self.path = path
        self.transform = transform

        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        split['label'] = split['path'].apply(self._speaker_index)

        # train/test split over speakers, expressed with zero-based indices
        # (folder numbering starts at 1, hence folder N <-> index N - 1)
        fullid_arr = np.arange(1251) # indices 0..1250 <-> folders 1..1251
        testid_arr = np.arange(269, 309) # indices 269..308 <-> folders 270..309
        trainid_arr = np.setdiff1d(fullid_arr, testid_arr)
        # keep only the utterances of training speakers
        mask = split['label'].isin(trainid_arr)
        self.dataset = split['path'][mask].reset_index(drop=True)

    @staticmethod
    def _speaker_index(track_path):
        """Zero-based speaker index parsed from a path like id10003/L9_sh8msGV59/00001.wav."""
        return int(track_path.split('/')[0].replace('id1', '')) - 1

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Load utterance `idx` and return `(label, spectrogram)`.

        Raises:
            ValueError: if the utterance is shorter than the 3-second segment.
        """
        # path
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'audio', track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)

        # Training labels must be contiguous for the loss: the raw indices are
        # [0..268, 309..1250]; shifting the upper part down by 40 (the number
        # of held-out speakers) gives [0..1210], i.e. 1211 classes.
        label = self._speaker_index(track_path)
        if label >= 309:
            label -= 40

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap between consecutive frames (samples): width minus step
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # FFT size: the next power of 2 that is >= Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97
        segment_len = 3 # sec

        segment_samples = segment_len * rate
        if len(samples) < segment_samples:
            raise ValueError(
                '{} has {} samples but a {}-second segment needs {}'.format(
                    audio_path, len(samples), segment_len, segment_samples))

        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

        # removes DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither

        # segment selection: randint excludes its upper bound, so +1 makes the
        # last valid start reachable and lets clips of exactly 3 seconds through
        start = np.random.randint(0, len(samples) - segment_samples + 1)
        end = start + segment_samples
        samples = samples[start:end]

        # spectrogram (two-sided, so the frequency axis has `nfft` bins)
        _, _, spec = signal.spectrogram(samples, rate, window, nperseg=Nw, noverlap=Ns,
                                        nfft=nfft, mode='magnitude', return_onesided=False)

        # just multiplying it by rate / 10 (1600 at 16 kHz) makes spectrograms
        # in the paper and here "the same"
        spec *= rate / 10

        if self.transform:
            spec = self.transform(spec)

        return label, spec

In [7]:
class Normalize(object):
    """Normalizes a voice spectrogram (mean-variance) along the time axis."""

    def __call__(self, spec):
        """Return `spec` with every frequency row shifted to zero mean and unit std.

        Args:
            spec: 2-D ndarray laid out as (Freq, Time); any number of frequency
                bins is accepted.

        Returns:
            ndarray of the same shape. Rows that are constant over time (zero
            std) come back as zeros instead of NaN/inf.
        """
        # per-spectrogram statistics (not batch-wise); keepdims keeps the
        # (Freq, 1) shape needed for broadcasting without hard-coding 512 bins
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        # a constant row has zero std; divide by 1 so it maps to zeros
        sigma[sigma == 0] = 1
        spec = (spec - mu) / sigma

        return spec

class ToTensor(object):
    """Turn a 2-D (Freq, Time) spectrogram ndarray into a float32 tensor with a channel axis."""

    def __call__(self, spec):
        n_freq, n_time = spec.shape

        # prepend the channel dimension: (Freq, Time) -> (1, Freq, Time),
        # then cast to float32
        with_channel = spec.reshape(1, n_freq, n_time).astype(np.float32)

        return torch.from_numpy(with_channel)

In [8]:
class VoiceNet(nn.Module):
    """CNN mapping a (B, 1, Freq, Time) spectrogram batch to class scores or embeddings.

    In training mode `forward_once` returns the `fc8` output (unnormalised
    scores, `num_classes` wide). In eval mode it returns the L2-normalised
    `fc7` output (1024-d embedding). The time axis is averaged away after
    `fc6`, so inputs of different widths are accepted.
    """

    def __init__(self, num_classes=2):
        """
        Args:
            num_classes: width of the final `fc8` layer.
        """
        super(VoiceNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)

        self.relu = nn.ReLU()

        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))

        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)

    def forward_once(self, x):
        """Run one spectrogram batch through the network.

        Args:
            x: tensor of shape (B, 1, Freq, Time). With 512 frequency bins the
                frequency axis is exactly 1 after `fc6`.

        Returns:
            (B, num_classes) scores in training mode, (B, 1024) unit-norm
            embeddings in eval mode.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.relu(self.bn6(self.fc6(x)))

        # average over the whole (input dependent) time axis; the functional
        # form avoids registering a new sub-module on every forward pass
        x = F.avg_pool2d(x, kernel_size=(1, x.size(3)))

        x = x.view(x.size(0), -1)

        if self.training:
            x = self.relu(self.bn7(self.fc7(x)))
            x = self.fc8(x)

        # we use the fc7 output for Hard Negative Mining (inference)
        else:
            x = self.fc7(x)
            x = F.normalize(x)

        # during training, there's no need for SoftMax because CELoss calculates it
        return x

    def forward(self, voice1, voice2=None, phase='train_iden'):
        """Dispatch on `phase`.

        Args:
            voice1: first spectrogram batch.
            voice2: second spectrogram batch, required for the pair phases.
            phase: 'train_iden' or 'eval_mining' process `voice1` only;
                'train_veri' or 'eval_veri' process both inputs with shared weights.

        Returns:
            One tensor for the single-input phases, a pair of tensors otherwise.

        Raises:
            ValueError: on an unknown phase, or a pair phase without `voice2`.
        """
        if phase in ('train_iden', 'eval_mining'):
            return self.forward_once(voice1)

        if phase in ('train_veri', 'eval_veri'):
            if voice2 is None:
                raise ValueError("phase {!r} requires voice2".format(phase))
            voice1 = self.forward_once(voice1)
            voice2 = self.forward_once(voice2)
            return voice1, voice2

        # previously an unknown phase fell through and returned None
        raise ValueError("unknown phase: {!r}".format(phase))

In [15]:
DATASET_PATH = '/home/nvme/data/vc1/'
LOG_PATH = '/home/nvme/logs/VoxCeleb/verif_class'
EPOCH_NUM = 30

# Batch size. The shared code uses B = 100, but without the cudnn flag below
# PyTorch threw CUDA out of memory at B = 97 even though B = 96 took only
# 90.6% of the GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# With deterministic cuDNN enabled, B = 100 fits.
torch.backends.cudnn.deterministic = True
B = 100

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter: per-epoch multiplicative decay such that
# LR_INIT * gamma ** (EPOCH_NUM - 1) == LR_LAST
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
# fall back to the CPU so the notebook still runs on a machine without CUDA
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
NUM_WORKERS = 4
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [16]:
def _seed_numpy_worker(worker_id):
    """Give every DataLoader worker its own NumPy RNG stream.

    The dataset draws crops and dither from `np.random`. Forked workers
    inherit identical NumPy state, so without this they would all produce the
    same "random" numbers. `torch.initial_seed()` differs per worker; it is
    folded into the 32-bit range that `np.random.seed` accepts.
    """
    np.random.seed(torch.initial_seed() % 2 ** 32)


net = VoiceNet(num_classes=1211)
net.to(DEVICE)

transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = IdentificationDatasetTrain(DATASET_PATH, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B,
                                             num_workers=NUM_WORKERS, shuffle=True,
                                             worker_init_fn=_seed_numpy_worker)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LR_INIT, momentum=MOMENTUM,
                      weight_decay=WEIGHT_DECAY)
# multiply the lr by `gamma` once per epoch
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

In [11]:
for epoch_num in range(EPOCH_NUM):
    # NOTE(review): stepping the scheduler at the start of the epoch matches
    # the convention of older PyTorch releases; newer ones expect it after
    # optimizer.step() -- confirm against the installed version.
    lr_scheduler.step()

    # train
    net.train()

    # wrap the loader (not the enumerate object) so tqdm knows the total
    # number of batches and can show a real progress bar
    progress = tqdm(trainsetloader, desc='epoch {}'.format(epoch_num))
    for iter_num, (labels, specs) in enumerate(progress):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()

        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('Metrics/train_loss', loss.item(), step_num)
        # log the lr the optimizer actually uses rather than asking the
        # scheduler to recompute it
        TBoard.add_scalar('Metrics/lr', optimizer.param_groups[0]['lr'], step_num)

# when the training is finished save the model
torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.txt'.format(time.time())))
TBoard.close()

1487it [07:20,  4.08it/s]
1487it [07:25,  4.10it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.08it/s]
1487it [07:25,  4.08it/s]
1487it [07:25,  4.02it/s]
1487it [07:25,  4.10it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.03it/s]
1487it [07:25,  4.08it/s]
1487it [07:24,  4.09it/s]
1487it [07:25,  4.10it/s]
18it [00:06,  3.33it/s]Process Process-59:
Process Process-57:
Process Process-60:
Process Process-58:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vladimir/venv/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  F

KeyboardInterrupt: 

In [14]:
# manual snapshot of the current weights, named after the current timestamp
snapshot_file = 'model_snapshot_{}.txt'.format(time.time())
torch.save(net.state_dict(), os.path.join(LOG_PATH, snapshot_file))
TBoard.close()

In [21]:
# map_location moves the stored tensors onto DEVICE, so the snapshot also
# loads on a machine where the device it was saved from is unavailable
pretrained_dict = torch.load(os.path.join(LOG_PATH, 'model_snapshot_1542979501.519298.txt'),
                             map_location=DEVICE)

net = VoiceNet(num_classes=1211)
net.to(DEVICE)

model_dict = net.state_dict()

# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
net.load_state_dict(model_dict)

In [22]:
B = 10


def _random_specs(batch, frames):
    """Random stand-in for `batch` spectrograms of shape (1, 512, frames) on DEVICE."""
    return torch.rand((batch, 1, 512, frames)).to(DEVICE)


# identification training: class scores
net.train()
v1 = _random_specs(B, 298)
print(net(v1, phase='train_iden').shape) # B, 1211

# mining (eval mode): embeddings
net.eval()
v1 = _random_specs(B, 298)
v2 = _random_specs(B, 298)
print(net(v1, phase='eval_mining').shape) # B, 1024
print(net(v2, phase='eval_mining').shape) # B, 1024

# verification training: freeze the existing parameters, swap in a fresh fc8
net.train()
for frozen in net.parameters():
    frozen.requires_grad = False
net.fc8 = nn.Linear(net.fc8.in_features, 1024).to(DEVICE)
v1 = _random_specs(B, 298)
v2 = _random_specs(B, 298)
print(net(v1, v2, phase='train_veri')[1].shape) # B, 1024

# verification test: a single pair with different numbers of frames
net.eval()
v1 = _random_specs(1, 2324)
v2 = _random_specs(1, 3245)
print(net(v1, v2, phase='eval_veri')[1].shape) # 1, 1024

torch.Size([10, 1211])
torch.Size([10, 1024])
torch.Size([10, 1024])
torch.Size([10, 1024])
torch.Size([1, 1024])


In [25]:
# zero-based speaker indices: 0..1250, with 269..308 held out
fullid_arr = np.arange(1251)
testid_arr = np.arange(269, 309)
# the held-out ids form one contiguous run, so keep everything outside of it
trainid_arr = fullid_arr[(fullid_arr < testid_arr[0]) | (fullid_arr > testid_arr[-1])]
print(trainid_arr.size)
print(trainid_arr[1200])

1211
1240


In [None]:
class HardNegativeMining(Dataset):
    """Work-in-progress dataset for hard negative mining over the training speakers.

    Items are indexed by training speaker (zero-based indices 0..1250 without
    269..308). The batch construction itself is not written yet.
    """

    def __init__(self, path, model, transform=None):
        """
        Args:
            path: dataset root (stored, not read here).
            model: network intended for mining (stored, not used yet).
            transform: optional spectrogram transform (stored, not used yet).
        """
        self.path = path
        self.model = model
        self.transform = transform

        fullid_arr = np.arange(1251) # indices 0..1250 <-> folders 1..1251
        testid_arr = np.arange(269, 309) # indices 269..308 <-> folders 270..309
        trainid_arr = np.setdiff1d(fullid_arr, testid_arr)
        self.dataset = trainid_arr

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Look up speaker `idx`; the sampling is still to be implemented.

        Raises:
            IndexError: if `idx` is out of range.
            NotImplementedError: always, until the sampling is written.
        """
        # index with `idx` (the stub used the whole array) so an out-of-range
        # index fails with the usual IndexError
        label = self.dataset[idx]
        # TODO: return (<B positives + B/2 random negatives + B/2 10% hardest> x 1 x 512 x 298)
        # fail loudly instead of silently handing None to the DataLoader
        raise NotImplementedError(
            'HardNegativeMining sampling is not implemented (speaker index {})'.format(label))

In [73]:
# one-based speaker ids: 1..1251, with 270..309 held out
fullid_arr = np.arange(1251) + 1
testid_arr = np.arange(270, 310)
# the held-out ids form one contiguous run, so keep everything outside of it
trainid_arr = fullid_arr[(fullid_arr < testid_arr[0]) | (fullid_arr > testid_arr[-1])]
# random order of all 1211 training ids, each taken exactly once
a = np.random.choice(trainid_arr, size=1211, replace=False)

In [74]:
# 1211 // 100 == 12 sections of nearly equal size
np.array_split(a, indices_or_sections=1211 // 100)

[array([  26,  568,  527,  740,  452,  785,  763, 1091, 1185,  858, 1077,
        1221,  537,  342, 1202,  810, 1149,   40,   66,  580,  442,  691,
         542,   15,  438,  877,  696,  500,  615,  777,  243,  635,  686,
         140, 1208,  232, 1003,  775, 1206,  656,  311,  581,  987,   90,
        1081,  796,  772, 1141,   36,  398,  862,  550,  385,  824,  974,
         636,  382, 1095,  773,  392,  173,  268, 1119,  141,  692, 1164,
         571,   68,  715,  351,  454,  754,  618,  988,  531,  989,  214,
        1043, 1161, 1192, 1154,  887,  888,  118, 1006,  724,  943,  972,
         595,  344, 1201,  757,  803, 1147,  933,  448,  826, 1009,   46,
         986,  739]),
 array([ 793,  813, 1170,  870, 1108,  319, 1090,  245,  624,  714,  146,
         162, 1059, 1159, 1085,   74, 1068,  113,  902, 1215,  980,  421,
         657,  850,  926,  658,  579, 1033,  661,  119,  868,  126, 1246,
          86,  770,  953,  776,  318,  328,  137, 1184,  347,  465,  735,
         979,  6

In [None]:
class VerificationDatasetTrain(Dataset):
    
    def __init__(self, path, model, batch_size, transform=None):
        self.path = path
        self.model = model
        self.transform = transform
        
        fullid_arr = np.arange(1, 1252) # 1--1251
        testid_arr = np.arange(270, 310) # 270--309
        trainid_arr = np.setdiff1d(fullid_arr, testid_arr) # 1--1251 \ 270--309
        
        # split the set of ids into `len(trainid_arr) // batch_size` subsets
        splits = np.array_split(trainid_arr, len(trainid_arr) // batch_size)