In [11]:
import os
import time

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [12]:
import warnings
# Installs a blanket 'ignore' filter: with no category/module argument this
# suppresses every warning for the rest of the kernel session, not only the
# scipy FFT FutureWarning that motivated it.
# NOTE(review): consider narrowing the filter once the scipy issue is fixed.
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

In [13]:
def wav2spectrogram(path, segment_len=3, window='hamming', Tw=25, Ts=10,
                    pre_emphasis=0.97, alpha=0.99, return_onesided=False):
    """Read a .wav file and return the magnitude spectrogram of a random segment.

    Parameters
    ----------
    path : str
        Path to the .wav file (read with ``scipy.io.wavfile.read``).
    segment_len : int or float
        Length, in seconds, of the randomly positioned segment to keep.
    window : str
        Window name forwarded to ``scipy.signal.spectrogram``.
    Tw : number
        Frame (window) duration in milliseconds.
    Ts : number
        Frame shift in milliseconds; consecutive frames overlap by ``Tw - Ts``.
    pre_emphasis : float
        Coefficient of the first-order pre-emphasis filter.
    alpha : float
        Pole of the DC-removal filter ``(1 - z^-1) / (1 - alpha * z^-1)``.
    return_onesided : bool
        Forwarded to ``scipy.signal.spectrogram``.

    Returns
    -------
    numpy.ndarray
        Magnitude spectrogram laid out as (frequency, time), scaled by
        ``rate / 10``.

    Raises
    ------
    ValueError
        If the recording is shorter than ``segment_len`` seconds.
    """
    # read .wav file
    rate, samples = wavfile.read(path)

    ## parameters
    # frame duration (samples)
    Nw = int(rate * Tw * 1e-3)
    # overlapped duration (samples)
    Ns = int(rate * (Tw - Ts) * 1e-3)
    # next power of two >= Nw, i.e. 2 ** bit_length(Nw - 1)
    nfft = 2 ** (Nw - 1).bit_length()

    # Segment length in whole samples; int() keeps slicing valid when
    # segment_len is given as a float.
    segment_samples = int(segment_len * rate)
    # Largest valid start index (inclusive).
    upper_bound = len(samples) - segment_samples
    if upper_bound < 0:
        raise ValueError(
            '{} holds {} samples, fewer than the {} required for a {} s segment'
            .format(path, len(samples), segment_samples, segment_len))

    # preemphasis filter
    samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

    # removes DC component of the signal and add a small dither
    samples = signal.lfilter([1, -1], [1, -alpha], samples)
    dither = np.random.uniform(-1, 1, samples.shape)
    spow = np.std(samples)
    samples = samples + 1e-6 * spow * dither

    # segment selection: randint's upper limit is exclusive, hence the +1 so a
    # recording that is exactly `segment_len` long yields start == 0 instead of
    # failing, and the last possible window can be drawn too.
    start = np.random.randint(0, upper_bound + 1)
    end = start + segment_samples
    samples = samples[start:end]

    # spectrogram
    _, _, spec = signal.spectrogram(samples, fs=rate, window=window,
                                    nperseg=Nw, noverlap=Ns, nfft=nfft,
                                    mode='magnitude',
                                    return_onesided=return_onesided)

    # empirical scale (rate / 10, i.e. 1600 at 16 kHz) that makes these
    # spectrograms look "the same" as the ones in the paper
    spec *= rate / 10

    return spec

In [14]:
class IdentificationDatasetTrain(Dataset):
    """Speaker-identification training set built from ``iden_split.txt``.

    Every listed utterance whose speaker is not one of the 40 held-out
    speakers (ids 270--309 in the paths) is kept. Items are returned as
    ``(label, spectrogram)`` with labels remapped to the contiguous range
    0..1210.
    """

    def __init__(self, path, transform=None):
        """Index the utterances under ``path``.

        ``path`` must contain ``iden_split.txt`` (space-separated
        ``phase path`` rows); audio is later read from ``path/audio/``.
        ``transform`` is an optional callable applied to each spectrogram.
        """
        self.path = path
        self.transform = transform

        def zero_based_label(track_path):
            # 'id10003/...' -> '0003' -> 3 -> 2
            speaker_dir = track_path.split('/')[0]
            return int(speaker_dir.replace('id1', '')) - 1

        split = pd.read_table(os.path.join(path, 'iden_split.txt'), sep=' ',
                              header=None, names=['phase', 'path'])
        split['label'] = split['path'].apply(zero_based_label)

        # zero-based labels: all speakers are 0..1250, held-out ones 269..308
        all_labels = np.arange(1251)
        held_out_labels = np.arange(269, 309)
        train_labels = np.setdiff1d(all_labels, held_out_labels)

        # keep only the paths that belong to training speakers
        keep = split['label'].isin(train_labels)
        self.dataset = split['path'][keep].reset_index(drop=True)

    def __len__(self):
        """Number of training utterances."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(label, spec)`` for the ``idx``-th utterance."""
        track_path = self.dataset[idx]

        # Paths look like id10003/L9_sh8msGV59/00001.txt; the speaker number
        # is shifted by one so that labels start at zero.
        speaker_dir = track_path.split('/')[0]
        raw_label = int(speaker_dir.replace('id1', '')) - 1
        # Training labels are 0..268 and 309..1250; closing the gap left by
        # the 40 held-out speakers gives the contiguous range 0..1210 that a
        # 1211-way classifier expects.
        label = raw_label - 40 if raw_label >= 309 else raw_label

        # spectrogram of a random segment of the utterance
        spec = wav2spectrogram(os.path.join(self.path, 'audio', track_path))
        if self.transform:
            spec = self.transform(spec)

        return label, spec

In [15]:
class Normalize(object):
    """Normalizes a voice spectrogram (mean-variance) per frequency bin."""

    def __call__(self, spec):
        """Return ``spec`` standardized along the time axis.

        ``spec`` is a 2-D ``numpy.ndarray`` laid out as (Freq, Time). Each
        row is shifted to zero mean and scaled to unit standard deviation,
        independently for every spectrogram (not batch-wise).
        """
        # keepdims keeps the statistics as (Freq, 1) columns so they broadcast
        # for any number of frequency bins, not only 512.
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        # A constant row has zero deviation; dividing by 1 there yields zeros
        # instead of the NaNs produced by 0 / 0.
        sigma = np.where(sigma == 0, 1.0, sigma)

        return (spec - mu) / sigma

class ToTensor(object):
    """Turn a 2-D spectrogram array into a float32 tensor with a channel axis."""

    def __call__(self, spec):
        """Map a (Freq, Time) ndarray to a ``torch.float32`` tensor of shape (1, Freq, Time)."""
        n_freq, n_time = spec.shape

        # the network wants a leading channel dimension: (Freq, Time) -> (1, Freq, Time)
        with_channel = np.reshape(spec, (1, n_freq, n_time))

        # spectrograms arrive as float64; cast to float32 before wrapping as a tensor
        return torch.from_numpy(with_channel.astype(np.float32))

In [16]:
class VoiceNet(nn.Module):
    """Convolutional network over voice spectrograms.

    The input is a batch of single-channel spectrograms ``(B, 1, Freq, Time)``.
    In training mode the network outputs ``num_classes`` unnormalized class
    scores; in eval mode it outputs the L2-normalized 1024-d ``fc7``
    embedding. ``fc6`` has a ``(9, 1)`` kernel, so the frequency axis must
    have shrunk to 9 at that point for the flattened feature to be
    4096-dimensional (this holds for the 512-bin inputs used in this
    notebook). The time axis may have any length; it is averaged out.
    """

    def __init__(self, num_classes=2):
        """Build the layers; ``num_classes`` sizes the final ``fc8`` layer."""
        super(VoiceNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)

        self.relu = nn.ReLU()

        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))

        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)

    def forward_once(self, x):
        """Run one batch of spectrograms through the network.

        Returns class scores of shape ``(B, num_classes)`` when
        ``self.training`` is true, otherwise L2-normalized ``fc7`` embeddings
        of shape ``(B, 1024)``.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.relu(self.bn6(self.fc6(x)))

        # Average over the whole remaining time axis, whatever its width.
        # The functional form avoids registering a new nn.AvgPool2d submodule
        # on every forward pass.
        x = F.avg_pool2d(x, kernel_size=(1, x.size(3)))

        x = x.view(x.size(0), -1)

        if self.training:
            x = self.relu(self.bn7(self.fc7(x)))
            x = self.fc8(x)

        # we use the fc7 output for Hard Negative Mining (inference)
        else:
            x = self.fc7(x)
            x = F.normalize(x)

        # during training, there's no need for SoftMax because CELoss calculates it
        return x

    # phase is one of:
    #   'train_iden'  - identification training (single input)
    #   'eval_mining' - embedding inference for hard negative mining (single input)
    #   'train_veri'  - siamese verification training (pair of inputs)
    #   'eval_veri'   - verification testing (pair of inputs)
    def forward(self, voice1, voice2=None, phase='train_iden'):
        """Dispatch on ``phase``.

        Single-input phases return ``forward_once(voice1)``; pair phases
        return the tuple ``(forward_once(voice1), forward_once(voice2))``.

        Raises
        ------
        ValueError
            If ``phase`` is not one of the four known phases, or a pair phase
            is requested without ``voice2``.
        """
        if phase in ('train_iden', 'eval_mining'):
            return self.forward_once(voice1)

        if phase in ('train_veri', 'eval_veri'):
            if voice2 is None:
                raise ValueError("phase '{}' requires voice2".format(phase))
            voice1 = self.forward_once(voice1)
            voice2 = self.forward_once(voice2)
            return voice1, voice2

        # An unrecognised phase used to fall through and return None.
        raise ValueError("unknown phase: {!r}".format(phase))

In [17]:
# Filesystem locations: dataset root (expected to hold iden_split.txt and an
# audio/ directory) and the directory used for TensorBoard logs and snapshots.
# NOTE(review): both are absolute, machine-specific paths.
DATASET_PATH = '/home/nvme/data/vc1/'
LOG_PATH = '/home/vladimir/nvme/logs/VoxCeleb/verif_class'
EPOCH_NUM = 30

# Batch size. The shared code uses B = 100, but PyTorch threw CUDA out of
# memory at B = 97 even though B = 96 took only 90.6% of the GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# so B = 96 was the fallback.
# With cuDNN forced into deterministic mode, however,
torch.backends.cudnn.deterministic = True
# B = 100 fits.
B = 100

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# Per-epoch multiplicative LR decay chosen so that
# LR_INIT * gamma ** (EPOCH_NUM - 1) == LR_LAST.
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
DEVICE = 'cuda:0'
NUM_WORKERS = 4
# TensorBoard writer shared by the training cells.
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [16]:
# 1211-way identification network (1251 speakers minus the 40 held-out ones)
net = VoiceNet(num_classes=1211).to(DEVICE)

# per-spectrogram normalization followed by conversion to a (1, F, T) tensor
transforms = Compose([Normalize(), ToTensor()])

trainset = IdentificationDatasetTrain(DATASET_PATH, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=B,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=LR_INIT,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)
# multiply the learning rate by `gamma` after every epoch
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

In [11]:
# Identification training. The snapshot is written in a `finally` block so
# that an interrupted run (e.g. KeyboardInterrupt) still leaves a usable model
# on disk and the TensorBoard writer is closed.
# NOTE(review): the scheduler is stepped at the *start* of each epoch, which
# matches older PyTorch releases; PyTorch >= 1.1 expects scheduler.step()
# after optimizer.step() - confirm against the installed version.
try:
    for epoch_num in range(EPOCH_NUM):
        lr_scheduler.step()

        # train
        net.train()

        # wrapping the loader (not the enumerate) lets tqdm see len() and
        # show a real progress bar with a total
        for iter_num, (labels, specs) in enumerate(tqdm(trainsetloader)):
            optimizer.zero_grad()
            labels, specs = labels.to(DEVICE), specs.to(DEVICE)
            scores = net(specs)
            loss = criterion(scores, labels)
            loss.backward()
            optimizer.step()

            # TBoard
            step_num = epoch_num * len(trainsetloader) + iter_num
            TBoard.add_scalar('Metrics/train_loss', loss.item(), step_num)
            # log the learning rate the optimizer is actually using
            TBoard.add_scalar('Metrics/lr', optimizer.param_groups[0]['lr'], step_num)
finally:
    # save the model when training finishes or is interrupted
    torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.txt'.format(time.time())))
    TBoard.close()

1487it [07:20,  4.08it/s]
1487it [07:25,  4.10it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.08it/s]
1487it [07:25,  4.08it/s]
1487it [07:25,  4.02it/s]
1487it [07:25,  4.10it/s]
1487it [07:25,  4.09it/s]
1487it [07:25,  4.03it/s]
1487it [07:25,  4.08it/s]
1487it [07:24,  4.09it/s]
1487it [07:25,  4.10it/s]
18it [00:06,  3.33it/s]Process Process-59:
Process Process-57:
Process Process-60:
Process Process-58:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vladimir/venv/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  F

KeyboardInterrupt: 

In [14]:
# Write the current weights to a timestamped snapshot and close the TensorBoard writer.
snapshot_name = 'model_snapshot_{}.txt'.format(time.time())
torch.save(net.state_dict(), os.path.join(LOG_PATH, snapshot_name))
TBoard.close()

In [18]:
# Restore identification weights from a saved snapshot into a fresh network.
snapshot_path = os.path.join(LOG_PATH, 'model_snapshot_1542979501.519298.txt')
pretrained_dict = torch.load(snapshot_path)

net = VoiceNet(num_classes=1211).to(DEVICE)
model_dict = net.state_dict()

# keep only the checkpoint entries whose key also exists in the new model
pretrained_dict = {
    name: tensor
    for name, tensor in pretrained_dict.items()
    if name in model_dict
}
# overlay them on the freshly initialised state and load the merged result
model_dict.update(pretrained_dict)
net.load_state_dict(model_dict)

FileNotFoundError: [Errno 2] No such file or directory: '/home/vladimir/nvme/logs/VoxCeleb/verif_class/model_snapshot_1542979501.519298.txt'

In [22]:
# Shape smoke test for every phase of VoiceNet.forward, using random inputs.
# NOTE(review): this rebinds the global B (batch size from the config cell)
# and permanently modifies `net` (frozen parameters, fc8 replaced), so the
# statement order matters and later cells see the modified objects.
B = 10

# identification training: class scores
net.train()
v1 = torch.rand((B, 1, 512, 298)).to(DEVICE)
print(net(v1, phase='train_iden').shape) # B, 1211

# embedding inference for hard negative mining: fc7 output
net.eval()
v1 = torch.rand((B, 1, 512, 298)).to(DEVICE)
v2 = torch.rand((B, 1, 512, 298)).to(DEVICE)
print(net(v1, phase='eval_mining').shape) # B, 1024
print(net(v2, phase='eval_mining').shape) # B, 1024

# verification (siamese) training: freeze the existing parameters first, then
# swap in a new 1024-d fc8, which is created afterwards and therefore trainable
net.train()
for param in net.parameters():
    param.requires_grad = False
net.fc8 = nn.Linear(net.fc8.in_features, 1024).to(DEVICE)
v1 = torch.rand((B, 1, 512, 298)).to(DEVICE)
v2 = torch.rand((B, 1, 512, 298)).to(DEVICE)
print(net(v1, v2, phase='train_veri')[1].shape) # B, 1024

# verification testing: single pair whose two inputs have different widths
net.eval()
v1 = torch.rand((1, 1, 512, 2324)).to(DEVICE)
v2 = torch.rand((1, 1, 512, 3245)).to(DEVICE)
print(net(v1, v2, phase='eval_veri')[1].shape) # 1, 1024

torch.Size([10, 1211])
torch.Size([10, 1024])
torch.Size([10, 1024])
torch.Size([10, 1024])
torch.Size([1, 1024])


In [25]:
# Sanity check of the zero-based train/test label split.
fullid_arr = np.arange(1251) # zero-based labels 0--1250 (speaker ids 1--1251 minus one)
testid_arr = np.arange(269, 309) # labels 269--308 (speaker ids 270--309)
trainid_arr = np.setdiff1d(fullid_arr, testid_arr)
# number of training speakers
print(len(trainid_arr))
# labels past the held-out block are offset by 40: position 1200 holds label 1240
print(trainid_arr[1200])

1211
1240


In [None]:
class HardNegativeMining(Dataset):
    """Unfinished placeholder for a hard-negative-mining dataset.

    It only stores its arguments and the array of zero-based training
    speaker labels; ``__getitem__`` does not produce a sample yet.
    """

    def __init__(self, path, model, transform=None):
        """Keep the dataset root, the embedding model and the optional transform."""
        self.path, self.model, self.transform = path, model, transform

        # zero-based labels 0..1250 without the held-out block 269..308
        held_out_labels = np.arange(269, 309)
        self.dataset = np.setdiff1d(np.arange(1251), held_out_labels)

    def __len__(self):
        """Number of training speakers."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Not implemented yet; currently evaluates to ``None``."""
        # TODO: return (<B positives + B/2 random negatives + B/2 10% hardest> x 1 x 512 x 298)
        return None

In [209]:
# Same train/test split as before, but expressed in one-based speaker ids
# (the numbers that appear in the idXXXXX directory names).
fullid_arr = np.arange(1, 1252) # 1--1251
testid_arr = np.arange(270, 310) # 270--309
trainid_arr = np.setdiff1d(fullid_arr, testid_arr) # 1--1251 \ 270--309
# a = np.random.choice(trainid_arr, 1211, replace=False)

In [211]:
# Split the training ids into 1211 // 100 == 12 nearly equal chunks
# (np.array_split allows uneven chunk sizes).
# NOTE(review): displaying the full result dumps every id into the output.
np.array_split(trainid_arr, 1211 // 100)

[array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101]),
 array([102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
        141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
        167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 1

In [19]:
class VerificationDatasetTrain(Dataset):
    """Work-in-progress dataset for verification training with hard negative mining.

    The training speaker ids (1--1251 without 270--309) are split into
    groups of roughly ``batch_size`` ids. One item corresponds to one group:
    for every speaker an anchor/positive pair of utterances is embedded with
    ``model`` and the anchor-positive distance matrix is computed.

    NOTE: negative selection (hard-negative and random parts) is not
    implemented yet, so ``__getitem__`` currently returns ``None``.
    """

    def __init__(self, path, model, batch_size, device, transform=None):
        """Store the configuration and build the id groups.

        path       -- dataset root containing the ``audio`` directory
        model      -- network called as ``model(x, phase='eval_mining')``
        batch_size -- approximate number of speaker ids per group
        device     -- device the spectrogram batches are moved to
        transform  -- optional callable applied to every spectrogram
        """
        self.path = path
        self.model = model
        self.transform = transform
        self.device = device

        fullid_arr = np.arange(1, 1252) # 1--1251
        testid_arr = np.arange(270, 310) # 270--309
        trainid_arr = np.setdiff1d(fullid_arr, testid_arr) # 1--1251 \ 270--309

        # split the set of ids into `len(trainid_arr) // batch_size` subsets;
        # at least one, because np.array_split rejects zero sections when
        # batch_size exceeds the number of ids
        n_splits = max(1, len(trainid_arr) // batch_size)
        self.splits = np.array_split(trainid_arr, n_splits)

    def __len__(self):
        """Number of id groups."""
        return len(self.splits)

    def _list_tracks(self, full_id):
        """Return the sorted paths of all .wav files below ``audio/<full_id>``."""
        # Walk recursively: utterances live in per-video sub-directories
        # (id10003/L9_sh8msGV59/00001...), so listing only the speaker
        # directory would return folders rather than audio files.
        id_dir = os.path.join(self.path, 'audio', full_id)
        tracks = []
        for root, _, files in os.walk(id_dir):
            for fname in files:
                if fname.lower().endswith('.wav'):
                    tracks.append(os.path.join(root, fname))
        return sorted(tracks)

    def _load_voice(self, wav_path):
        """Return the spectrogram of ``wav_path`` as a (1, Freq, Time) tensor."""
        spec = wav2spectrogram(wav_path)
        if self.transform:
            spec = self.transform(spec)
        # without a tensor-producing transform the spectrogram is still an ndarray
        if not torch.is_tensor(spec):
            spec = torch.from_numpy(np.asarray(spec, dtype=np.float32))
        if spec.dim() == 2:
            spec = spec.unsqueeze(0)
        return spec

    def __getitem__(self, idx):
        """Embed anchor/positive pairs for group ``idx`` (negatives still TODO)."""

        ## POSITIVE PART
        ids = self.splits[idx]
        anchors = []
        positives = []

        for speaker_id in ids:
            # 265 -> id10265
            full_id = 'id1{:04d}'.format(speaker_id)
            # list all tracks for that id
            track_list = self._list_tracks(full_id)
            # randomly select two tracks without replacement
            voice1_path, voice2_path = np.random.choice(track_list, 2, replace=False)
            # create spectrograms for selected .wav files
            anchors.append(self._load_voice(voice1_path))
            positives.append(self._load_voice(voice2_path))

        # stack (not cat) so the batch axis is added: (B, 1, Freq, Time)
        anchors = torch.stack(anchors).to(self.device)
        positives = torch.stack(positives).to(self.device)

        # Embed in eval mode without building an autograd graph, then restore
        # whichever mode the model was in.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                # `phase` must be passed by keyword: the second positional
                # parameter of the model's forward is the second voice batch
                anchors = self.model(anchors, phase='eval_mining') # B, 1024
                positives = self.model(positives, phase='eval_mining') # --//---
        finally:
            self.model.train(was_training)

        # pairwise Euclidean distances: ||a||^2 + ||p||^2 - 2 a.p
        dists = (anchors ** 2).sum(dim=1).view(-1, 1) + \
                (positives ** 2).sum(dim=1) - \
                2 * anchors.matmul(positives.t())
        # rounding can push tiny squared distances below zero -> NaN in sqrt
        dists = torch.sqrt(dists.clamp(min=0))

        # divide the current set of ids into two parts
        negatives_hnm, negatives_rand = np.array_split(ids, 2)

        ## HARD NEGATIVE MINING PART
        # TODO: not implemented yet

        ##  RANDOM PART
        # TODO: not implemented yet

In [223]:
# Shuffle the first id group in place (np.random.shuffle returns None).
# NOTE(review): `self_splits` is only assigned in a later cell, so this line
# relies on out-of-order execution and fails on a fresh kernel.
np.random.shuffle(self_splits[0], )

In [240]:
# Prototype of the id-group handling for verification training, using
# one-based speaker ids.
fullid_arr = np.arange(1, 1252) # 1--1251
testid_arr = np.arange(270, 310) # 270--309
trainid_arr = np.setdiff1d(fullid_arr, testid_arr) # 1--1251 \ 270--309

# split the set of ids into 1211 // 100 == 12 subsets
self_splits = np.array_split(trainid_arr, 1211 // 100)
# shuffled copy of the first subset (np.random.permutation leaves the original untouched)
ids = np.random.permutation(self_splits[0])
print(ids)
# first half -> random negatives, second half -> hard-negative-mining candidates
negatives_rand, negatives_hnm = np.array_split(ids, 2)
print(negatives_hnm)

[ 79  80  85  93  12  23  39  37  52  58  82  27  17  42  56  44   3  19
  70  45  28  36   1  66  49  54  31  71  61  22  90  10  95  46  84  91
  75  53   9   5   6  87  88  78  92  77  62  73  99  41  55  51  30  26
  63  59  47   2  68  43  35  96  33  48  25  65  89  11  57  50  38  32
  24 101  13  64  83  15  14  76  29  74   4  40  81  86 100  20   8  98
  21  72  34  67  18  97  69  94  16   7  60]
[ 51  30  26  63  59  47   2  68  43  35  96  33  48  25  65  89  11  57
  50  38  32  24 101  13  64  83  15  14  76  29  74   4  40  81  86 100
  20   8  98  21  72  34  67  18  97  69  94  16   7  60]


In [242]:
# Toy batch of B random 10-d "embeddings" for prototyping the distance and
# negative-selection logic.
# NOTE(review): this rebinds the global B that the config cell uses as the
# training batch size.
B = 3
anchors = torch.randint(high=10, size=(B, 10))
positives = torch.randint(high=10, size=(B, 10))

In [243]:
# Position (as a fraction of the B-1 negatives, sorted from closest to
# farthest) of the negative to select. Defined here so the cell does not
# depend on a value left over in the kernel.
tau = 0.1

print(anchors)
print()
print(positives)
# pairwise Euclidean distances: ||a||^2 + ||p||^2 - 2 a.p
dists = (anchors**2).sum(dim=1).view(-1, 1) + (positives**2).sum(dim=1) - 2*anchors.matmul(positives.t())
# cast to float (randint may yield an integer tensor) and clamp so rounding
# noise can never feed a negative value to sqrt
dists = torch.sqrt(dists.float().clamp(min=0))
print(dists)
dists_sorted, dists_sorted_idx = dists.sort(dim=1)
print(dists_sorted, dists_sorted_idx)
idx_threshold = round(tau * (B-1))
# Given a distance matrix Dij, if i=j a value corresponds to a distance between
# positive pairs -> we need to prevent them from getting to the negative samples
# First, we need to remove i=j elements: compare every sorted column index with
# its own row index (a (B, 1) column that broadcasts across the row).
row_idx = torch.arange(B, dtype=torch.long).view(-1, 1)
mask = (dists_sorted_idx != row_idx)
dists_sorted_idx_rm = dists_sorted_idx[mask].view(B, B-1)
print(dists_sorted_idx_rm)
# for every anchor, the index of the negative at the chosen sorted position
selected_idx = dists_sorted_idx_rm[:, idx_threshold]
print(selected_idx)

tensor([[4., 7., 5., 1., 0., 1., 6., 7., 7., 5.],
        [6., 9., 5., 7., 0., 0., 6., 2., 3., 7.],
        [8., 2., 2., 6., 0., 0., 5., 8., 7., 0.]])

tensor([[9., 1., 0., 5., 7., 7., 5., 5., 7., 7.],
        [7., 3., 6., 7., 8., 5., 6., 0., 5., 0.],
        [0., 2., 4., 3., 6., 0., 9., 1., 5., 2.]])
tensor([[14.0000, 14.8324, 11.8743],
        [15.0333, 13.5647, 13.3041],
        [12.7671, 13.3041, 13.6382]])
tensor([[11.8743, 14.0000, 14.8324],
        [13.3041, 13.5647, 15.0333],
        [12.7671, 13.3041, 13.6382]]) tensor([[2, 0, 1],
        [2, 1, 0],
        [0, 1, 2]])
tensor([[2, 1],
        [2, 0],
        [0, 1]])
tensor([2, 2, 0])
