In [2]:
from google.colab import drive
import os
import sys
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
os.chdir('/content/drive/MyDrive/SpeakerVerification/SpeakerRecognition_tutorial/')

In [None]:
# !python train.py


Training set 21600 utts (90.0%)
Validation set 2400 utts (10.0%)
Total 24000 utts

Number of classes (speakers):
240

  * Validation: Loss 2.3673	Acc 36.3934
  * Validation: Loss 0.8645	Acc 73.2884
  * Validation: Loss 0.5708	Acc 82.0985
  * Validation: Loss 0.6255	Acc 80.4631
  * Validation: Loss 0.4546	Acc 86.4945
  * Validation: Loss 0.4162	Acc 86.8870
  * Validation: Loss 0.4481	Acc 86.5533
  * Validation: Loss 0.3897	Acc 88.2804
  * Validation: Loss 0.3224	Acc 89.9813
  * Validation: Loss 0.4051	Acc 87.4203
  * Validation: Loss 0.3952	Acc 88.5338
  * Validation: Loss 0.3070	Acc 91.2508
  * Validation: Loss 0.3004	Acc 90.7571
  * Validation: Loss 0.3229	Acc 89.6389
  * Validation: Loss 0.2541	Acc 91.4436
  * Validation: Loss 0.2880	Acc 90.7977
  * Validation: Loss 0.2347	Acc 93.3233
  * Validation: Loss 0.2609	Acc 92.9970
  * Validation: Loss 0.2191	Acc 93.5540
  * Validation: Loss 0.2665	Acc 91.5391
  * Validation: Loss 0.2662	Acc 91.9481
  * Validation: Loss 0.3233	Acc 91.5158
E

In [None]:
from verification import load_model, l2_norm
import torch
import torch.nn.functional as F
import math
from SR_Dataset import ToTensorTestInput

In [None]:
log_dir = 'model_saved' # Where the checkpoints are saved

# Settings
use_cuda = True # Use cuda or not
embedding_size = 128 # Dimension of speaker embeddings
cp_num = 22 # Which checkpoint to use?
n_classes = 240 # How many speakers in training data?

# Load model from checkpoint
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

=> loading checkpoint


In [None]:
def get_embedding_from_feature(use_cuda, input, model, test_frames):
    
    tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' 
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i*test_frames:i*test_frames+test_frames]
            
            TT = ToTensorTestInput()
            temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
    
            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation,_ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)
    
    activation = l2_norm(activation, 1)

    return activation

In [None]:
!pip install git+https://github.com/jameslyons/python_speech_features.git
import librosa
import numpy as np
from python_speech_features import fbank

def normalize_frames(m,Scale=True):
    if Scale:
        return (m - np.mean(m, axis=0)) / (np.std(m, axis=0) + 2e-12)
    else:
        return (m - np.mean(m, axis=0))

def get_feature_from_wav(filename, sample_rate=16000):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    filter_banks, energies = fbank(audio, samplerate=sample_rate, nfilt=40, winlen=0.025)
    filter_banks = 20 * np.log10(np.maximum(filter_banks,1e-5))
    feature = normalize_frames(filter_banks, Scale=False)
    return feature


Collecting git+https://github.com/jameslyons/python_speech_features.git
  Cloning https://github.com/jameslyons/python_speech_features.git to /tmp/pip-req-build-nz22l4c3
  Running command git clone -q https://github.com/jameslyons/python_speech_features.git /tmp/pip-req-build-nz22l4c3


In [None]:
wav_dir = '/content/drive/MyDrive/SpeakerVerification/audio_wav'
test_frames = 100

feature = {}
embedding = {}

for f in sorted(os.listdir(wav_dir)):
  if f.endswith('.wav'):
    spk = f.replace('.wav','')
    wav_path = os.path.join(wav_dir, f)
    feature[spk] = get_feature_from_wav(wav_path)
    embedding[spk] = get_embedding_from_feature(use_cuda, feature[spk], model, test_frames)


In [None]:
feature[yg_en]

In [None]:
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))
np.set_printoptions(precision=3, threshold=sys.maxsize)

In [None]:
score_matrix = np.zeros((len(embedding),len(embedding)))
truth_matrix = np.zeros((len(embedding),len(embedding)))

for i, spk_i in enumerate(embedding):
  for j, spk_j in enumerate(embedding):
    score = F.cosine_similarity(embedding[spk_i], embedding[spk_j])
    score = score.data.cpu().numpy()
    score_matrix[i,j] = score
    if spk_i.split("_")[0] == spk_j.split("_")[0]:
      truth_matrix[i,j] = 1
    else:
      truth_matrix[i,j]=0

print(embedding.keys())
print(score_matrix)
print(truth_matrix)

dict_keys(['gh_1', 'gh_2', 'gh_en_1', 'hb_1', 'hb_2', 'hb_en_angry', 'hb_en_fear', 'hb_en_joy', 'hb_en_neutral', 'hb_en_sad', 'yg_1', 'yg_2', 'yg_en_1', 'yg_en_2', 'yg_en_3', 'yg_en_4', 'yg_en_5', 'yg_en_6', 'yg_en_angry', 'yg_en_sad'])
[[1.    0.989 0.956 0.923 0.93  0.952 0.953 0.943 0.881 0.853 0.918 0.92  0.903 0.924 0.928 0.94  0.961 0.856 0.955 0.898]
 [0.989 1.    0.952 0.916 0.928 0.947 0.95  0.922 0.879 0.858 0.914 0.906 0.903 0.917 0.918 0.923 0.938 0.849 0.93  0.896]
 [0.956 0.952 1.    0.851 0.853 0.955 0.969 0.952 0.86  0.828 0.927 0.916 0.929 0.91  0.948 0.914 0.925 0.853 0.956 0.909]
 [0.923 0.916 0.851 1.    0.982 0.889 0.889 0.886 0.876 0.907 0.817 0.838 0.807 0.827 0.842 0.813 0.923 0.731 0.859 0.759]
 [0.93  0.928 0.853 0.982 1.    0.907 0.901 0.899 0.908 0.901 0.824 0.836 0.835 0.845 0.836 0.844 0.905 0.754 0.864 0.779]
 [0.952 0.947 0.955 0.889 0.907 1.    0.987 0.965 0.942 0.862 0.925 0.924 0.946 0.937 0.919 0.906 0.923 0.838 0.944 0.897]
 [0.953 0.95  0.969 0.889

In [None]:
from sklearn.metrics import roc_curve

y_pred = score_matrix.flatten()
y = truth_matrix.flatten()

fpr, tpr, threshold = roc_curve(y, y_pred, pos_label=1)
fnr = 1 - tpr
eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
EER = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
print(eer_threshold)
print(EER)

0.9125440120697021
0.5352112676056338


In [None]:
# !y | ffmpeg -i yg_kr.m4a -acodec pcm_s16le -ac 1 -ar 16000 yg_kr.wav

# !for i in *.m4a; do ffmpeg -i "$i" -acodec pcm_s16le -ac 1 -ar 16000 "${i%.*}.wav"; done

/bin/bash: y: command not found
ffmpeg version 3.4.8-0ubuntu0.2 Copyright (c) 2000-2020 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxm