### filter

In [73]:
import os
import librosa
import soundfile as sf
from glob import glob
from tqdm import tqdm

In [74]:
input_path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/speaker_diarization/outputs/segments"
output_path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization"

speakers = glob(f'{input_path}/speaker_*')
for speaker in tqdm(speakers):
    speaker_name = speaker.split("/")[-1]
    wavs = glob(f'{speaker}/*.wav')

    out_path = f'{output_path}/{speaker_name}'
    if not os.path.exists(out_path):
        os.mkdir(out_path)

    for wav_path in wavs:
        wav, sr = librosa.load(wav_path, sr=16000)

        if wav.shape[0]/sr < 1:
            continue
        wav_name = wav_path.split("/")[-1]
        wav_path = f'{out_path}/{wav_name}'
        wav = wav / abs(wav.max())
        sf.write(wav_path, wav, samplerate=sr)


100%|██████████| 5/5 [00:00<00:00,  6.66it/s]


### speaker verification

In [75]:
import os
import nemo
import re
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
from glob import glob
import random
from tqdm import tqdm
import shutil
import json

In [76]:
# load data
path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization"
files = glob(f'{path}/*/*.wav')

speaker2wavs = {}
for _file in files:
    path = _file
    temp = _file.split("/")[-1]
    wav_id = re.sub("_\d+\.wav","", temp)
    
    if wav_id not in speaker2wavs:
        speaker2wavs[wav_id] = [path,]
    else:
        speaker2wavs[wav_id].append(path)

In [None]:
!bash prepare_data_for_inference.sh 

In [None]:
MODEL_CONFIG = 'config/titanet-large.yaml'
config = OmegaConf.load(MODEL_CONFIG)

verification_model = nemo_asr.models.EncDecSpeakerLabelModel.load_from_checkpoint(
    "outputs/ckpts/TitaNet-L--val_loss=1.0419-epoch=45.ckpt",
    map_location="cpu",
    hparams_file="config/hparams.yml")

In [48]:
def load_inference_data(path):
    datas = []
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

        for line in lines:
            data = json.loads(line.strip())
            datas.append(data)
    return datas

path = "inference.json"
datas =load_inference_data(path)
datas[0:5]

[{'audio_filepath': '/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_0.wav',
  'offset': 0,
  'duration': 1.42,
  'label': 'speaker_0'},
 {'audio_filepath': '/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_1.wav',
  'offset': 0,
  'duration': 1.18,
  'label': 'speaker_0'},
 {'audio_filepath': '/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_16.wav',
  'offset': 0,
  'duration': 1.5,
  'label': 'speaker_0'},
 {'audio_filepath': '/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_18.wav',
  'offset': 0,
  'duration': 2.299937,
  'label': 'speaker_0'},
 {'audio_filepath': '/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_19.wav',
  'offset': 0,
  'duration': 1.18,
  'label': 'sp

In [None]:
def infer_verification_v1(paths_1, paths_2):
    embedding_1 = []
    for path_1 in paths_1:
        embedding = verification_model.get_embedding(path_1)
        embedding_1.append(embedding)
    
    for path_2 in paths_2:
        res = verification_model.verify_speakers(path_1, path_2, threshold=0.7)
        embedding = verification_model.get_embedding(path_1)
        embedding_1.append(embedding)
    return True

In [68]:
manifest_filepath = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/speaker_verification/inference.json"
embs, logits, gt_labels, _ = verification_model.batch_inference(manifest_filepath, batch_size=4, sample_rate=16000, device='cpu')

[NeMo I 2023-05-17 11:54:29 collections:298] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-05-17 11:54:29 collections:299] Dataset loaded with 195 items, total duration of  0.08 hours.
[NeMo I 2023-05-17 11:54:29 collections:301] # 195 files loaded accounting to # 5 labels


100%|██████████| 49/49 [00:23<00:00,  2.05it/s]


In [69]:
import torch
from torchmetrics.functional import pairwise_cosine_similarity
x = torch.from_numpy(embs)
y = torch.from_numpy(embs)
cos_sim = pairwise_cosine_similarity(x, y)
similarity_score = (cos_sim + 1) / 2
similarity_score

tensor([[1.0000, 0.6850, 0.6269,  ..., 0.4720, 0.5814, 0.5839],
        [0.6850, 1.0000, 0.5817,  ..., 0.5754, 0.6009, 0.5841],
        [0.6269, 0.5817, 1.0000,  ..., 0.5484, 0.5463, 0.6000],
        ...,
        [0.4720, 0.5754, 0.5484,  ..., 1.0000, 0.6117, 0.5661],
        [0.5814, 0.6009, 0.5463,  ..., 0.6117, 1.0000, 0.7036],
        [0.5839, 0.5841, 0.6000,  ..., 0.5661, 0.7036, 1.0000]])

In [70]:
from numpy import dot
from numpy.linalg import norm
import numpy as np

# a = np.array([2, 3])
# b = np.array([1, 0])
a = embs[0]
b = embs[1]
cos_sim = dot(a, b)/(norm(a)*norm(b))

similarity_score = (cos_sim + 1) / 2
similarity_score

0.6849957406520844

In [129]:
import torch
from torchmetrics.functional import pairwise_cosine_similarity
embs = torch.from_numpy(embs)
cosin_sim = pairwise_cosine_similarity(embs, embs)
similarity_score = (cosin_sim + 1) / 2
print(similarity_score)

tensor([[1.0000, 0.6850, 0.6269,  ..., 0.4720, 0.5814, 0.5839],
        [0.6850, 1.0000, 0.5817,  ..., 0.5754, 0.6009, 0.5841],
        [0.6269, 0.5817, 1.0000,  ..., 0.5484, 0.5463, 0.6000],
        ...,
        [0.4720, 0.5754, 0.5484,  ..., 1.0000, 0.6117, 0.5661],
        [0.5814, 0.6009, 0.5463,  ..., 0.6117, 1.0000, 0.7036],
        [0.5839, 0.5841, 0.6000,  ..., 0.5661, 0.7036, 1.0000]])


In [133]:
similarity_score[190:, 190:]

tensor([[1.0000, 0.6438, 0.5568, 0.6101, 0.5715],
        [0.6438, 1.0000, 0.5374, 0.6085, 0.5216],
        [0.5568, 0.5374, 1.0000, 0.6117, 0.5661],
        [0.6101, 0.6085, 0.6117, 1.0000, 0.7036],
        [0.5715, 0.5216, 0.5661, 0.7036, 1.0000]])

In [71]:
_embs1 = verification_model.get_embedding(
    "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_0.wav")

_embs2 = verification_model.get_embedding(
    "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_0.wav")

In [72]:
_embs1 - embs[0]

tensor([[ 1.7670e-02, -3.4352e-01,  8.6713e-02, -2.8117e-02,  7.8431e-02,
         -1.6679e-01, -1.0103e-01,  1.5043e-01,  2.6947e-01,  2.8312e-01,
         -5.3565e-01, -1.6358e-01,  1.9985e-01, -3.7958e-01,  6.2543e-01,
         -4.0281e-01,  6.3645e-01, -2.6868e-02, -3.4456e-02, -5.6311e-02,
          2.0020e-01, -2.5885e-01, -2.3779e-02, -6.7199e-01,  4.3342e-01,
          2.6027e-01, -6.3479e-01,  1.1432e-01,  2.2223e-01, -4.0906e-01,
          1.1657e-02,  3.9485e-01, -1.8807e-02, -7.5467e-02, -3.3649e-01,
         -5.3120e-01,  4.9089e-01, -4.6499e-01, -3.5212e-01,  3.6194e-01,
         -3.8006e-01, -6.4640e-02, -1.6535e-01, -1.8004e-01, -2.1679e-01,
          2.2369e-02, -2.6370e-01,  4.1147e-01,  1.6129e-01,  5.4755e-02,
          2.2633e-01,  1.6895e-01,  3.7133e-01,  3.4853e-02, -2.0270e-02,
         -2.7125e-01, -1.1679e-01,  3.5852e-01,  1.4446e-01,  2.0770e-01,
         -7.1427e-02,  1.4939e-01, -1.7122e-01, -5.6010e-02,  1.5233e-01,
         -3.5558e-01, -2.9836e-01,  5.

In [55]:
for sample in tqdm(datas):
    path = sample["audio_filepath"]
    _embs1 = verification_model.get_embedding(path)

    if (embs[0] - _embs1.numpy()).sum() == 0:
        print(path)

100%|██████████| 195/195 [00:29<00:00,  6.67it/s]


In [61]:
embs[0].shape

(192,)

In [63]:
_embs1.numpy()[0].shape

(192,)

In [34]:
path_1 = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_0.wav"
path_2 = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_0/wav_0.rttm_speaker_0_1.wav"

verification_model.verify_speakers(path_1, path_2, threshold=0.6)


tensor(0.6923)
[NeMo I 2023-05-17 10:58:31 label_models:507]  two audio files are from same speaker


True

In [31]:
embs1 = _embs1.squeeze()
embs2 = _embs2.squeeze()

X = embs1 / torch.linalg.norm(embs1)
Y = embs2 / torch.linalg.norm(embs2)
# Score
similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
similarity_score = (similarity_score + 1) / 2
similarity_score


tensor(0.6923)

In [125]:
_embs1 =verification_model.get_embedding(
    "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_4/wav_2.rttm_speaker_4_8.wav")

_embs2 =verification_model.get_embedding(
    "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/speaker_4/wav_2.rttm_speaker_4_9.wav")

In [126]:
embs1 = _embs1
embs2 = _embs2

X = embs1 / torch.linalg.norm(embs1, dim=1)
Y = embs2 / torch.linalg.norm(embs2, dim=1)

In [93]:
torch.matmul(X, Y.T) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)

RuntimeError: 1D tensors expected, but got 2D and 2D tensors

In [None]:
similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
similarity_score = (similarity_score + 1) / 2


In [94]:
speaker_map = {}

for speaker_1, wavs_1 in tqdm(speaker2wavs.items()):
    wavs_1 = random.sample(wavs_1, 2)
    for speaker_2, wavs_2 in speaker2wavs.items():
        if speaker_1 == speaker_2:
            continue
        if speaker_1 in speaker_map and speaker_2 in speaker_map:
            continue
        wavs_2 = random.sample(wavs_2, 2)
        res = infer_verification(wavs_1, wavs_2)

        if res == True:
            if speaker_1 not in speaker_map:
                speaker_map[speaker_1] = [speaker_2,]
            else:
                speaker_map[speaker_1].append(speaker_2)

            if speaker_2 not in speaker_map:
                speaker_map[speaker_2] = [speaker_1,]
            else:
                speaker_map[speaker_2].append(speaker_1)

  0%|          | 0/10 [00:00<?, ?it/s]

tensor(0.6030)
[NeMo I 2023-05-17 00:39:19 label_models:510]  two audio files are from different speakers
tensor(0.6419)
[NeMo I 2023-05-17 00:39:20 label_models:510]  two audio files are from different speakers
tensor(0.6229)
[NeMo I 2023-05-17 00:39:20 label_models:510]  two audio files are from different speakers
tensor(0.6024)
[NeMo I 2023-05-17 00:39:21 label_models:510]  two audio files are from different speakers
tensor(0.6220)
[NeMo I 2023-05-17 00:39:21 label_models:510]  two audio files are from different speakers
tensor(0.7134)
[NeMo I 2023-05-17 00:39:21 label_models:507]  two audio files are from same speaker
tensor(0.6015)
[NeMo I 2023-05-17 00:39:22 label_models:510]  two audio files are from different speakers
tensor(0.7746)
[NeMo I 2023-05-17 00:39:22 label_models:507]  two audio files are from same speaker
tensor(0.6839)
[NeMo I 2023-05-17 00:39:22 label_models:510]  two audio files are from different speakers
tensor(0.6398)
[NeMo I 2023-05-17 00:39:23 label_models:51

 10%|█         | 1/10 [00:11<01:45, 11.74s/it]

tensor(0.5774)
[NeMo I 2023-05-17 00:39:31 label_models:510]  two audio files are from different speakers
tensor(0.5574)
[NeMo I 2023-05-17 00:39:31 label_models:510]  two audio files are from different speakers
tensor(0.5956)
[NeMo I 2023-05-17 00:39:31 label_models:510]  two audio files are from different speakers
tensor(0.5941)
[NeMo I 2023-05-17 00:39:31 label_models:510]  two audio files are from different speakers
tensor(0.5520)
[NeMo I 2023-05-17 00:39:32 label_models:510]  two audio files are from different speakers
tensor(0.5192)
[NeMo I 2023-05-17 00:39:32 label_models:510]  two audio files are from different speakers
tensor(0.5341)
[NeMo I 2023-05-17 00:39:32 label_models:510]  two audio files are from different speakers
tensor(0.5795)
[NeMo I 2023-05-17 00:39:32 label_models:510]  two audio files are from different speakers
tensor(0.5858)
[NeMo I 2023-05-17 00:39:33 label_models:510]  two audio files are from different speakers
tensor(0.5693)
[NeMo I 2023-05-17 00:39:33 lab

 20%|██        | 2/10 [00:21<01:22, 10.36s/it]

tensor(0.5236)
[NeMo I 2023-05-17 00:39:40 label_models:510]  two audio files are from different speakers
tensor(0.5494)
[NeMo I 2023-05-17 00:39:40 label_models:510]  two audio files are from different speakers
tensor(0.6229)
[NeMo I 2023-05-17 00:39:40 label_models:510]  two audio files are from different speakers
tensor(0.5312)
[NeMo I 2023-05-17 00:39:41 label_models:510]  two audio files are from different speakers
tensor(0.5536)
[NeMo I 2023-05-17 00:39:41 label_models:510]  two audio files are from different speakers
tensor(0.5971)
[NeMo I 2023-05-17 00:39:41 label_models:510]  two audio files are from different speakers
tensor(0.5874)
[NeMo I 2023-05-17 00:39:42 label_models:510]  two audio files are from different speakers
tensor(0.7070)
[NeMo I 2023-05-17 00:39:42 label_models:507]  two audio files are from same speaker
tensor(0.6031)
[NeMo I 2023-05-17 00:39:42 label_models:510]  two audio files are from different speakers
tensor(0.5580)
[NeMo I 2023-05-17 00:39:42 label_mod

 30%|███       | 3/10 [00:28<01:02,  8.87s/it]

tensor(0.6751)
[NeMo I 2023-05-17 00:39:47 label_models:510]  two audio files are from different speakers
tensor(0.7216)
[NeMo I 2023-05-17 00:39:47 label_models:507]  two audio files are from same speaker
tensor(0.5858)
[NeMo I 2023-05-17 00:39:48 label_models:510]  two audio files are from different speakers
tensor(0.5956)
[NeMo I 2023-05-17 00:39:48 label_models:510]  two audio files are from different speakers
tensor(0.5667)
[NeMo I 2023-05-17 00:39:48 label_models:510]  two audio files are from different speakers
tensor(0.5776)
[NeMo I 2023-05-17 00:39:48 label_models:510]  two audio files are from different speakers
tensor(0.5483)
[NeMo I 2023-05-17 00:39:49 label_models:510]  two audio files are from different speakers
tensor(0.6002)
[NeMo I 2023-05-17 00:39:49 label_models:510]  two audio files are from different speakers
tensor(0.6776)
[NeMo I 2023-05-17 00:39:49 label_models:510]  two audio files are from different speakers
tensor(0.6032)
[NeMo I 2023-05-17 00:39:49 label_mod

 40%|████      | 4/10 [00:39<00:58,  9.67s/it]

tensor(0.5573)
[NeMo I 2023-05-17 00:39:58 label_models:510]  two audio files are from different speakers
tensor(0.5341)
[NeMo I 2023-05-17 00:39:58 label_models:510]  two audio files are from different speakers
tensor(0.5931)
[NeMo I 2023-05-17 00:39:58 label_models:510]  two audio files are from different speakers
tensor(0.5678)
[NeMo I 2023-05-17 00:39:59 label_models:510]  two audio files are from different speakers
tensor(0.5361)
[NeMo I 2023-05-17 00:39:59 label_models:510]  two audio files are from different speakers
tensor(0.5397)
[NeMo I 2023-05-17 00:39:59 label_models:510]  two audio files are from different speakers
tensor(0.5226)
[NeMo I 2023-05-17 00:39:59 label_models:510]  two audio files are from different speakers
tensor(0.6285)
[NeMo I 2023-05-17 00:40:00 label_models:510]  two audio files are from different speakers
tensor(0.5355)
[NeMo I 2023-05-17 00:40:00 label_models:510]  two audio files are from different speakers
tensor(0.5503)
[NeMo I 2023-05-17 00:40:00 lab

 50%|█████     | 5/10 [00:48<00:48,  9.66s/it]

tensor(0.6169)
[NeMo I 2023-05-17 00:40:08 label_models:510]  two audio files are from different speakers
tensor(0.5423)
[NeMo I 2023-05-17 00:40:08 label_models:510]  two audio files are from different speakers
tensor(0.5921)
[NeMo I 2023-05-17 00:40:08 label_models:510]  two audio files are from different speakers
tensor(0.5805)
[NeMo I 2023-05-17 00:40:08 label_models:510]  two audio files are from different speakers
tensor(0.5230)
[NeMo I 2023-05-17 00:40:09 label_models:510]  two audio files are from different speakers
tensor(0.5489)
[NeMo I 2023-05-17 00:40:09 label_models:510]  two audio files are from different speakers
tensor(0.5633)
[NeMo I 2023-05-17 00:40:09 label_models:510]  two audio files are from different speakers
tensor(0.6437)
[NeMo I 2023-05-17 00:40:09 label_models:510]  two audio files are from different speakers
tensor(0.5954)
[NeMo I 2023-05-17 00:40:10 label_models:510]  two audio files are from different speakers
tensor(0.5905)
[NeMo I 2023-05-17 00:40:10 lab

 60%|██████    | 6/10 [00:58<00:38,  9.55s/it]

tensor(0.5169)
[NeMo I 2023-05-17 00:40:17 label_models:510]  two audio files are from different speakers
tensor(0.5685)
[NeMo I 2023-05-17 00:40:17 label_models:510]  two audio files are from different speakers
tensor(0.4944)
[NeMo I 2023-05-17 00:40:17 label_models:510]  two audio files are from different speakers
tensor(0.6203)
[NeMo I 2023-05-17 00:40:18 label_models:510]  two audio files are from different speakers
tensor(0.6327)
[NeMo I 2023-05-17 00:40:18 label_models:510]  two audio files are from different speakers
tensor(0.5960)
[NeMo I 2023-05-17 00:40:18 label_models:510]  two audio files are from different speakers
tensor(0.7003)
[NeMo I 2023-05-17 00:40:19 label_models:507]  two audio files are from same speaker
tensor(0.6448)
[NeMo I 2023-05-17 00:40:19 label_models:510]  two audio files are from different speakers
tensor(0.6096)
[NeMo I 2023-05-17 00:40:19 label_models:510]  two audio files are from different speakers
tensor(0.6131)
[NeMo I 2023-05-17 00:40:19 label_mod

 70%|███████   | 7/10 [01:05<00:26,  8.93s/it]

tensor(0.4966)
[NeMo I 2023-05-17 00:40:25 label_models:510]  two audio files are from different speakers
tensor(0.5506)
[NeMo I 2023-05-17 00:40:25 label_models:510]  two audio files are from different speakers
tensor(0.5200)
[NeMo I 2023-05-17 00:40:25 label_models:510]  two audio files are from different speakers
tensor(0.5213)
[NeMo I 2023-05-17 00:40:25 label_models:510]  two audio files are from different speakers
tensor(0.4994)
[NeMo I 2023-05-17 00:40:26 label_models:510]  two audio files are from different speakers
tensor(0.5188)
[NeMo I 2023-05-17 00:40:26 label_models:510]  two audio files are from different speakers
tensor(0.6094)
[NeMo I 2023-05-17 00:40:26 label_models:510]  two audio files are from different speakers
tensor(0.6279)
[NeMo I 2023-05-17 00:40:26 label_models:510]  two audio files are from different speakers
tensor(0.5200)
[NeMo I 2023-05-17 00:40:27 label_models:510]  two audio files are from different speakers
tensor(0.5583)
[NeMo I 2023-05-17 00:40:27 lab

 80%|████████  | 8/10 [01:15<00:18,  9.33s/it]

tensor(0.6111)
[NeMo I 2023-05-17 00:40:35 label_models:510]  two audio files are from different speakers
tensor(0.5447)
[NeMo I 2023-05-17 00:40:35 label_models:510]  two audio files are from different speakers
tensor(0.5288)
[NeMo I 2023-05-17 00:40:35 label_models:510]  two audio files are from different speakers
tensor(0.5552)
[NeMo I 2023-05-17 00:40:36 label_models:510]  two audio files are from different speakers
tensor(0.6137)
[NeMo I 2023-05-17 00:40:36 label_models:510]  two audio files are from different speakers
tensor(0.5816)
[NeMo I 2023-05-17 00:40:36 label_models:510]  two audio files are from different speakers
tensor(0.4668)
[NeMo I 2023-05-17 00:40:36 label_models:510]  two audio files are from different speakers
tensor(0.5852)
[NeMo I 2023-05-17 00:40:37 label_models:510]  two audio files are from different speakers
tensor(0.5348)
[NeMo I 2023-05-17 00:40:37 label_models:510]  two audio files are from different speakers
tensor(0.5667)
[NeMo I 2023-05-17 00:40:37 lab

 90%|█████████ | 9/10 [01:25<00:09,  9.26s/it]

tensor(0.5078)
[NeMo I 2023-05-17 00:40:44 label_models:510]  two audio files are from different speakers
tensor(0.5596)
[NeMo I 2023-05-17 00:40:44 label_models:510]  two audio files are from different speakers
tensor(0.5747)
[NeMo I 2023-05-17 00:40:45 label_models:510]  two audio files are from different speakers
tensor(0.5575)
[NeMo I 2023-05-17 00:40:45 label_models:510]  two audio files are from different speakers
tensor(0.5019)
[NeMo I 2023-05-17 00:40:45 label_models:510]  two audio files are from different speakers
tensor(0.5875)
[NeMo I 2023-05-17 00:40:45 label_models:510]  two audio files are from different speakers
tensor(0.4941)
[NeMo I 2023-05-17 00:40:46 label_models:510]  two audio files are from different speakers
tensor(0.6024)
[NeMo I 2023-05-17 00:40:46 label_models:510]  two audio files are from different speakers
tensor(0.5674)
[NeMo I 2023-05-17 00:40:46 label_models:510]  two audio files are from different speakers
tensor(0.5415)
[NeMo I 2023-05-17 00:40:46 lab

100%|██████████| 10/10 [01:34<00:00,  9.48s/it]


In [10]:
speaker_map = {'wav_2.rttm_speaker_3': ['wav_3.rttm_speaker_1', 'wav_0.rttm_speaker_0'],
 'wav_3.rttm_speaker_1': ['wav_2.rttm_speaker_3'],
 'wav_0.rttm_speaker_0': ['wav_2.rttm_speaker_3']}

for key, value in speaker2wavs.items():
    if key not in speaker_map:
        speaker_map[key] = [key,]

In [18]:
speaker2id = {}
count = 0
for key, value in speaker_map.items():
    if key not in speaker2id:
        speaker2id[key] = f"speaker_{count}"
        count += 1
        for mapped_spk in value:
            speaker2id[mapped_spk] = speaker2id[key]


In [19]:
speaker2id

{'wav_2.rttm_speaker_3': 'speaker_0',
 'wav_3.rttm_speaker_1': 'speaker_0',
 'wav_0.rttm_speaker_0': 'speaker_0',
 'wav_0.rttm_speaker_1': 'speaker_1',
 'wav_2.rttm_speaker_1': 'speaker_2',
 'wav_1.rttm_speaker_0': 'speaker_3',
 'wav_2.rttm_speaker_0': 'speaker_4',
 'wav_3.rttm_speaker_0': 'speaker_5',
 'wav_2.rttm_speaker_4': 'speaker_6',
 'wav_2.rttm_speaker_2': 'speaker_7'}

In [27]:
import shutil

In [30]:
path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization/*/*.wav"
output_path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/datas"
files = glob(path)
speaker2num_utt = dict.fromkeys(speaker2id.keys(), 0)

for _file in files:
    _id = _file.split("/")[-1]
    _id = "_".join(_id.split("_")[:-1])
    output_dir = f'{output_path}/{speaker2id[_id]}'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    shutil.copy(_file, f'{output_dir}/{speaker2num_utt[_id]}.wav')
    speaker2num_utt[_id] += 1

In [60]:
# load data
path = "/home/tuyendv/Desktop/codes/ess_data_crawler_pipline/outputs/speaker_diarization"
files = glob(f'{path}/*/*.wav')

speakers = []
embeddings = []
for _file in tqdm(files):
    embedding = verification_model.get_embedding(_file)
    speakers.append(_file.split("/")[-1])
    embeddings.append(embedding)

100%|██████████| 195/195 [00:25<00:00,  7.60it/s]
