# Import libraries & parse arguments

In [16]:
import torch
import numpy as np
import IPython.display
import torch
import torch.nn as nn
from tqdm import tqdm

from model.embedder import SpeechEmbedder
from datasets.ZaloAIDataset import create_dataset
from utils.hparams import HParam
from utils.eer import EER

# Prepare

Get all folder paths (speaker-based). The format will be a single list of folder paths.

In [3]:
# Load the hyperparameters from config.yaml; `hp` is later passed to
# create_dataset(...) and SpeechEmbedder(...).
hp = HParam("config.yaml")

  for doc in docs:


In [4]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def test_cuda_collate_fn(batch):
    w1_mel_list = list()
    w2_mel_list = list()
    label_list = list()
    
    for _, _, _, _, w1_mel, w2_mel, label, *_ in batch:
        w1_mel_list.append(w1_mel)
        w2_mel_list.append(w2_mel)
        label_list.append(label)
    w1_mel_list = pad_sequence(w1_mel_list, batch_first=True)
    w2_mel_list = pad_sequence(w2_mel_list, batch_first=True)
    label_list = torch.stack(label_list, dim=0)

    return w1_mel_list, w2_mel_list, label_list

In [5]:
# Build the "test" split and a loader over it. Samples are served in their
# original order (shuffle=False) from the main process (num_workers=0).
dataset = create_dataset(hp, "test")
dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
    collate_fn=test_cuda_collate_fn,
    pin_memory=True,
    # Evaluation must score every pair: dropping the last incomplete batch
    # would silently skip samples whenever len(dataset) is not a multiple
    # of batch_size.
    drop_last=False,
    sampler=None,
)

# Main

In [6]:
# Instantiate the model, restore its weights from the checkpoint (loaded on
# CPU first), then move it to the GPU.
embedder = SpeechEmbedder(hp)
embedder_pt = torch.load('embedder.pt', map_location="cpu")
embedder.load_state_dict(embedder_pt)
embedder = embedder.cuda()

# Switch to evaluation mode; as the last expression this also displays the
# module structure.
embedder.eval()

SpeechEmbedder(
  (lstm): LSTM(40, 768, num_layers=3, batch_first=True)
  (proj): LinearNorm(
    (linear_layer): Linear(in_features=768, out_features=256, bias=True)
  )
)

In [18]:
# Inspect the first test pair: two file paths, two waveforms, two mel
# tensors and the label.
first_sample = dataset[0]
s1_path, s2_path, w1, w2, m1, m2, label = first_sample
(s1_path, s2_path, w1, w2, m1, m2, label)

('datasets/ZaloAI2020/private-test/0/0/q0X0yY4gYC6jt0rYOcwY.wav',
 'datasets/ZaloAI2020/private-test/0/0/kjeWQBJ70qDLk7R7M7Ih.wav',
 array([ 3.0029297e-02,  6.2164307e-02,  7.3486328e-02, ...,
        -6.1035156e-03, -5.1574707e-03, -3.0517578e-05], dtype=float32),
 array([ 0.01809692,  0.02774048,  0.02139282, ...,  0.03521729,
         0.01431274, -0.00427246], dtype=float32),
 tensor([[-0.6013, -2.2244, -2.2547,  ..., -2.1950, -2.8161, -2.4075],
         [ 0.0330, -0.6561, -1.1946,  ..., -0.9638, -1.1659, -1.5474],
         [ 0.4425,  0.3706, -0.3333,  ..., -0.4274, -0.6720, -1.1104],
         ...,
         [-4.7263, -5.8529, -5.6487,  ..., -5.7564, -5.6639, -5.6716],
         [-4.5932, -5.8565, -5.8931,  ..., -5.7741, -5.7262, -5.6187],
         [-4.4717, -5.7999, -5.8866,  ..., -5.7750, -5.7579, -5.8008]]),
 tensor([[-1.6268, -1.7113, -2.4918,  ..., -1.8787, -1.8889, -0.1788],
         [-0.8937, -1.8994, -1.8591,  ..., -0.5464, -0.4369,  0.5396],
         [-0.2642, -0.1607, -0.025

In [19]:
%%time
with torch.no_grad():
    e1 = embedder(m1.cuda(non_blocking=True))
    e2 = embedder(m2.cuda(non_blocking=True))

CPU times: user 4.31 ms, sys: 6.81 ms, total: 11.1 ms
Wall time: 14.4 ms


In [20]:
# Cosine similarity reduced over dimension 0 of the two embeddings.
cos = nn.CosineSimilarity(eps=1e-6, dim=0)
similarity = cos(e1, e2)
similarity

tensor(0.7805, device='cuda:0')

In [41]:
# Metric object that the evaluation loop feeds one (score, label) pair at a
# time; the aggregate is read later via eer.compute().
# NOTE(review): compute_on_step=False presumably defers all computation to
# .compute() — confirm against utils.eer.
eer = EER(compute_on_step=False)



In [42]:
# Score every test pair one at a time: embed both mels, take their cosine
# similarity, record it in `pred` and feed it to the EER metric.
pred = []
for i in tqdm(range(len(dataset))):
    _, _, _, _, m1, m2, label = dataset[i]

    with torch.no_grad():
        e1, e2 = (embedder(mel.cuda(non_blocking=True)) for mel in (m1, m2))

    sim = cos(e1, e2).cpu()
    score = sim.item()
    pred.append(score)

    # The metric is updated with (1, 1)-shaped prediction and target tensors.
    target = torch.tensor(label).reshape((1, 1))
    eer(sim.reshape((1, 1)), target)

100%|██████████| 50000/50000 [15:42<00:00, 53.03it/s]


In [53]:
# Turn the list of per-pair similarity scores into a NumPy array so it can be
# thresholded element-wise further down.
pred = np.array(pred)

In [43]:
# Aggregate the metric over all updates from the evaluation loop.
# NOTE(review): the recorded output of this cell is (0.0, nan) together with
# interpolation warnings, so this value is not usable; the EER is recomputed
# with scikit-learn/scipy in the following cells.
eer.compute()

  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo


(0.0, array(nan))

In [45]:
from scipy.optimize import brentq
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve

In [47]:
# ROC curve of the similarity scores against the ground-truth labels.
fpr, tpr, thresholds = roc_curve(dataset.data["label"], pred, pos_label=1)

# The equal error rate is the false-positive rate x at which it equals the
# false-negative rate 1 - TPR(x), i.e. the root of 1 - x - TPR(x) on [0, 1].
tpr_at_fpr = interp1d(fpr, tpr)
eer = brentq(lambda x: 1. - x - tpr_at_fpr(x), 0., 1.)

# Decision threshold interpolated at that operating point.
threshold_at_fpr = interp1d(fpr, thresholds)
thresh = threshold_at_fpr(eer)

In [51]:
# Show the equal error rate and the similarity threshold interpolated at it.
# Note: `eer` is now a plain number and no longer the EER metric object
# created earlier in the notebook.
eer, thresh

(0.16911323547058113, array(0.51920239))

In [63]:
# Accuracy when a pair is predicted positive iff its similarity exceeds the
# EER threshold. Uses the computed `thresh` instead of a hand-copied constant
# so the figure stays in sync with the scores.
np.mean(dataset.data["label"] == (pred > thresh).astype(int))

0.83072