In [1]:
import os

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [None]:
class VerificationDatasetTrain(Dataset):
    """Training utterances for speaker verification, served as random 3-second crops.

    The utterance list is read from ``<path>/iden_split.txt`` (space separated
    columns: phase, relative audio path).  Utterances whose zero-based speaker
    label is in 269..308 (speaker ids 270..309) are dropped, which leaves
    1251 - 40 = 1211 speakers.

    Args:
        path: dataset root; audio is read from ``<path>/audio/<relative path>``.
        transform: optional callable applied to the spectrogram before it is
            returned.
        contiguous_labels: when True, labels are renumbered to 0..1210 in
            increasing order of the raw label.  Defaults to False, which keeps
            the raw ``speaker id - 1`` labels (0..1250 with a gap at 269..308).

    Each item is ``(label, spec)`` where ``spec`` is a two-sided magnitude
    spectrogram of shape ``(nfft, frames)``.

    NOTE(review): with the default raw labels the largest label is 1250, which
    does not fit a classifier with 1211 outputs.  Pass ``contiguous_labels=True``
    unless the labels are remapped somewhere else before the loss.
    """

    def __init__(self, path, transform=None, contiguous_labels=False):
        self.path = path
        self.transform = transform

        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        split['label'] = split['path'].apply(self._label_from_path)

        # train/test speaker split on zero-based labels (ids in the paths start at 1)
        fullid_arr = np.arange(1251)        # labels 0..1250   <-> ids 1..1251
        testid_arr = np.arange(269, 309)    # labels 269..308  <-> ids 270..309
        trainid_arr = np.setdiff1d(fullid_arr, testid_arr)
        # keep only utterances of the training speakers
        mask = split['label'].isin(trainid_arr)
        self.dataset = split['path'][mask].reset_index(drop=True)

        # optional raw label -> dense class index lookup
        if contiguous_labels:
            self.label_map = {int(raw): dense for dense, raw in enumerate(trainid_arr)}
        else:
            self.label_map = None

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'audio', track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)

        label = self._label_from_path(track_path)
        if self.label_map is not None:
            label = self.label_map[label]

        samples = self._preprocess(samples)
        samples = self._random_segment(samples, rate, segment_len=3)
        spec = self._spectrogram(samples, rate)

        if self.transform:
            spec = self.transform(spec)

        return label, spec

    @staticmethod
    def _label_from_path(track_path):
        """Zero-based speaker label from a path like ``id10003/L9_sh8msGV59/00001.wav``."""
        return int(track_path.split('/')[0].replace('id1', '')) - 1

    @staticmethod
    def _preprocess(samples):
        """Apply pre-emphasis, remove the DC component and add a small dither."""
        pre_emphasis = 0.97
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        return samples + 1e-6 * spow * dither

    @staticmethod
    def _random_segment(samples, rate, segment_len):
        """Return a random crop of ``segment_len`` seconds.

        A clip shorter than the crop is repeated cyclically up to the crop
        length, so every item has the same number of samples and can be batched.
        """
        segment_size = segment_len * rate
        upper_bound = len(samples) - segment_size
        if upper_bound < 0:
            samples = np.resize(samples, segment_size)
            upper_bound = 0
        # randint's upper limit is exclusive; +1 makes the last valid start
        # reachable and keeps a clip of exactly `segment_size` samples working
        start = np.random.randint(0, upper_bound + 1)
        return samples[start:start + segment_size]

    @staticmethod
    def _spectrogram(samples, rate):
        """Two-sided magnitude spectrogram: 25 ms Hamming window, 10 ms step."""
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap between consecutive frames (samples); this is scipy's `noverlap`
        noverlap = int(rate * (Tw - Ts) * 1e-3)
        # smallest power of two that is >= Nw
        nfft = 2 ** (Nw - 1).bit_length()

        _, _, spec = signal.spectrogram(samples, fs=rate, window=window, nperseg=Nw,
                                        noverlap=noverlap, nfft=nfft,
                                        mode='magnitude', return_onesided=False)

        # scaling by rate / 10 (1600 at 16 kHz) makes the spectrograms here
        # and in the paper "the same"
        spec *= rate / 10
        return spec

In [None]:
class VerificationDatasetTest(Dataset):
    """Verification trial pairs listed in ``<path>/veri_test.txt``.

    Every line of the list holds ``label voice1 voice2`` (space separated),
    where the two voice columns are audio paths relative to ``<path>/audio``.

    Args:
        path: dataset root.
        transform: optional callable applied to each spectrogram.

    Each item is ``(label, spec1, spec2)``; the spectrograms cover the whole
    recordings (no cropping), so their number of frames varies between items.
    """

    def __init__(self, path, transform=None):
        self.path = path
        self.transform = transform
        veri_split_path = os.path.join(path, 'veri_test.txt')
        self.dataset = pd.read_table(veri_split_path, sep=' ', header=None,
                                     names=['label', 'voice1', 'voice2'])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        label, voice1_path, voice2_path = self.dataset.iloc[idx]

        spec1 = self._load_spectrogram(os.path.join(self.path, 'audio', voice1_path))
        spec2 = self._load_spectrogram(os.path.join(self.path, 'audio', voice2_path))

        if self.transform:
            spec1 = self.transform(spec1)
            spec2 = self.transform(spec2)

        return label, spec1, spec2

    @staticmethod
    def _load_spectrogram(audio_path):
        """Read one .wav file and return its two-sided magnitude spectrogram.

        All frame parameters are derived from the sample rate of this very
        file, so two recordings with different rates are each framed correctly.
        """
        rate, voice = wavfile.read(audio_path)

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap between consecutive frames (samples); this is scipy's `noverlap`
        noverlap = int(rate * (Tw - Ts) * 1e-3)
        # smallest power of two that is >= Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97

        # preemphasis filter
        voice = np.append(voice[0], voice[1:] - pre_emphasis * voice[:-1])

        # removes DC component of the signal and add a small dither
        voice = signal.lfilter([1, -1], [1, -0.99], voice)
        dither = np.random.uniform(-1, 1, voice.shape)
        spow = np.std(voice)
        voice = voice + 1e-6 * spow * dither

        # spectrogram
        _, _, spec = signal.spectrogram(voice, fs=rate, window=window, nperseg=Nw,
                                        noverlap=noverlap, nfft=nfft,
                                        mode='magnitude', return_onesided=False)

        # scaling by rate / 10 (1600 at 16 kHz) makes the spectrograms here
        # and in the paper "the same"
        spec *= rate / 10
        return spec

In [None]:
class Normalize(object):
    """Mean-variance normalization of a voice spectrogram, per frequency bin.

    The input is a 2D array laid out as (Freq, Time).  Every row is shifted to
    zero mean and scaled to unit standard deviation using statistics of that
    spectrogram only (not batch-wise).  Any number of frequency bins is accepted.
    """

    def __call__(self, spec):
        # keepdims keeps the statistics as (Freq, 1) columns so they broadcast
        # over time whatever the number of frequency bins is
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        # a constant row has zero deviation; divide by 1 there so the row
        # becomes all zeros instead of NaN/inf
        sigma = np.where(sigma == 0, 1.0, sigma)

        return (spec - mu) / sigma

class ToTensor(object):
    """Turn a 2D (Freq, Time) spectrogram array into a float32 tensor of shape (1, Freq, Time)."""

    def __call__(self, spec):
        # unpacking doubles as a check that the input is two-dimensional
        n_freq, n_time = spec.shape

        # prepend the channel dimension, then cast down to float32
        # (the incoming spectrograms were float64)
        with_channel = spec.reshape(1, n_freq, n_time)
        return torch.from_numpy(with_channel.astype(np.float32))

In [None]:
class VoiceNet(nn.Module):
    """Convolutional network mapping a single-channel spectrogram to class scores.

    Input is a batch of shape (B, 1, Freq, Time).  ``fc6`` is a convolution
    with a (9, 1) kernel, i.e. it spans 9 rows of the feature map coming out of
    ``mpool5``; a 512-row input yields exactly 9 rows there.  The time axis is
    then averaged away, so recordings of different duration are supported.

    Args:
        num_classes: number of output scores produced by the last layer.

    ``forward`` returns raw scores (logits) of shape (B, num_classes).
    """

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)

        self.relu = nn.ReLU()
        # explicit class dimension; not applied in forward (see the note there)
        self.softmax = nn.Softmax(dim=1)

        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))

        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.relu(self.bn6(self.fc6(x)))

        # average over the whole (input dependent) time axis; the functional
        # form keeps forward free of side effects on the module
        x = F.avg_pool2d(x, kernel_size=(1, x.size(3)))

        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))
        x = self.fc8(x)

        # during training, there's no need for SoftMax because CELoss calculates it
        return x

In [None]:
# Data and log locations; the defaults can be overridden through environment
# variables so the notebook also runs on machines with a different layout.
DATASET_PATH = os.environ.get('VOXCELEB_DATASET_PATH', '/home/nvme/data/vc1/')
LOG_PATH = os.environ.get('VOXCELEB_LOG_PATH', '/home/nvme/logs/VoxCeleb/verif1')
EPOCH_NUM = 30

# Seed the generators used for dithering, cropping, weight init and shuffling
# so that a restarted kernel reproduces the same run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Batch size. The shared code uses B = 100. Without the cuDNN flag below
# PyTorch threw CUDA out of memory at B = 97 although B = 96 took only 90.6%
# of the GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# With deterministic cuDNN algorithms enabled B = 100 fits.
torch.backends.cudnn.deterministic = True
B = 100

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter: per-epoch decay factor that takes the learning rate
# from LR_INIT to LR_LAST in EPOCH_NUM - 1 steps (max() guards EPOCH_NUM == 1)
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / max(EPOCH_NUM - 1, 1))
MOMENTUM = 0.9
# fall back to the CPU when no GPU is visible
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
NUM_WORKERS = 4
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [None]:
def _seed_numpy_in_worker(worker_id):
    """Give every DataLoader worker its own NumPy random state.

    The training dataset draws its dither and crop positions from
    ``np.random``.  Worker processes can start with identical copies of that
    state and would then produce the same "random" values; inside a worker
    ``torch.initial_seed()`` is already distinct, so NumPy is reseeded from it.
    """
    np.random.seed(torch.initial_seed() % 2 ** 32)


# NOTE(review): 1211 = 1251 speakers - 40 held out, but the training dataset
# derives labels as `speaker id - 1`, which reaches 1250 - confirm that labels
# are mapped to 0..1210 before they reach the loss.
net = VoiceNet(num_classes=1211)
net.to(DEVICE)

transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = VerificationDatasetTrain(DATASET_PATH, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B,
                                             num_workers=NUM_WORKERS, shuffle=True,
                                             worker_init_fn=_seed_numpy_in_worker)