In [1]:
import sys
from pathlib import Path
import random
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from wespeaker.cli.speaker import load_model
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F


import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Resolve the repository root as the parent of the notebook's working directory
# and make it importable. The membership check keeps sys.path free of duplicate
# entries when this cell is re-run in the same kernel.
PROJECT_ROOT = Path.cwd().parents[0]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
print("PROJECT_ROOT =", PROJECT_ROOT)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# Load the speaker-embedding wrapper from the local checkpoint directory.
speaker = load_model(PROJECT_ROOT / "wespeaker-voxceleb-resnet293-LM")

# Move the model through the wrapper's own device setter when it exists, so the
# wrapper and its network agree on where tensors live; moving only the bare
# nn.Module leaves any device bookkeeping inside the wrapper untouched.
# NOTE(review): set_device is assumed to update both the wrapper's device and
# speaker.model -- confirm against the installed wespeaker version.
if hasattr(speaker, "set_device"):
    speaker.set_device(DEVICE)
else:
    speaker.model.to(DEVICE)
net = speaker.model

print("ResNet-293 loaded from HF")

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("sox_io")
ESPnet is not installed, cannot use espnet_hubert upstream


PROJECT_ROOT = /home/SpeakerRec/BioVoice
Using device: cuda
{'data_type': 'shard', 'dataloader_args': {'batch_size': 32, 'drop_last': True, 'num_workers': 16, 'pin_memory': False, 'prefetch_factor': 8}, 'dataset_args': {'aug_prob': 0.6, 'fbank_args': {'dither': 1.0, 'frame_length': 25, 'frame_shift': 10, 'num_mel_bins': 80}, 'num_frms': 200, 'shuffle': True, 'shuffle_args': {'shuffle_size': 2500}, 'spec_aug': False, 'spec_aug_args': {'max_f': 8, 'max_t': 10, 'num_f_mask': 1, 'num_t_mask': 1, 'prob': 0.6}, 'speed_perturb': True}, 'exp_dir': 'exp/ResNet293-TSTP-emb256-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch150', 'gpus': [0, 1], 'log_batch_interval': 100, 'loss': 'CrossEntropyLoss', 'loss_args': {}, 'margin_scheduler': 'MarginScheduler', 'margin_update': {'epoch_iter': 17062, 'final_margin': 0.2, 'fix_start_epoch': 40, 'increase_start_epoch': 20, 'increase_type': 'exp', 'initial_margin': 0.0, 'update_margin': True}, 'model': 'ResNet293', 'model_args': {'embed_dim': 2

  checkpoint = torch.load(path, map_location="cpu")


ResNet-293 loaded from HF


In [5]:
# %%
# Speaker labels; each recording's file name is expected to start with "<label>_".
SPEAKERS = ["eden", "idan", "yoav"]

# Directory that is searched for the recordings.
AUDIO_ROOT = PROJECT_ROOT.joinpath("data", "wavs")

def speaker_from_name(p: Path, speakers=None):
    """Return the speaker label encoded in a file name, or None if there is none.

    A file is attributed to a speaker when its stem starts with
    ``<label>_``. The comparison ignores case on both the stem and the
    label, so a label written with capital letters still matches.

    Args:
        p: Path whose stem (file name without extension) is inspected.
        speakers: Optional iterable of candidate labels. When omitted, the
            module-level ``SPEAKERS`` list is used.

    Returns:
        The first matching label, exactly as it appears in ``speakers``,
        or ``None`` when no label matches.
    """
    # Resolve the default lazily so callers can supply their own label set.
    candidates = SPEAKERS if speakers is None else speakers
    name = p.stem.lower()
    for s in candidates:
        # Require the trailing underscore so "eden" does not claim "edenx_01".
        if name.startswith(s.lower() + "_"):
            return s
    return None

In [3]:
def embed_with_resnet_293(wav_path, l2_norm: bool = True):
    """Compute a speaker embedding for one audio file.

    The file is loaded with torchaudio, resampled to 16 kHz when its native
    rate differs, moved to ``DEVICE`` and passed to
    ``speaker.extract_embedding_from_pcm``.

    Args:
        wav_path: Path of the audio file to embed.
        l2_norm: When True, divide the embedding by its L2 norm (plus a small
            epsilon to avoid division by zero).

    Returns:
        The embedding with singleton dimensions squeezed out, or ``None`` when
        the extractor yields no embedding.
    """
    target_sr = 16000

    waveform, sr = torchaudio.load(wav_path)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)

    # Move waveform to the same device as the model
    waveform = waveform.to(DEVICE)

    # Use the speaker object's extract_embedding_from_pcm method
    embedding = speaker.extract_embedding_from_pcm(waveform, target_sr)

    # Pass a missing embedding through instead of failing on the division
    # below; callers already test the result against None.
    # NOTE(review): presumably the extractor can return None (e.g. when it
    # finds no usable audio) -- confirm against the installed wespeaker version.
    if embedding is None:
        return None

    if l2_norm:
        embedding = embedding / (embedding.norm(p=2) + 1e-12)

    return embedding.squeeze()

In [9]:
# %%
rows = []

# Gather .wav and .m4a files from every sub-directory, in sorted path order.
audio_paths = sorted(list(AUDIO_ROOT.rglob("*.wav")) + list(AUDIO_ROOT.rglob("*.m4a")))

print(f"Found {len(audio_paths)} audio files total")

# Caps that bound how many speakers / recordings per speaker get embedded.
MAX_SPEAKERS = 100
MAX_SAMPLES_PER_SPEAKER = 200

speaker_counts = {}
used_speakers = set()
skipped_unknown = []  # files whose name matches none of the known speakers

for p in tqdm(audio_paths):
    spk = speaker_from_name(p)

    # Without this guard an unmatched file would be registered as a speaker
    # called None, take one of the MAX_SPEAKERS slots and add rows that the
    # per-speaker steps further down cannot select by label.
    if spk is None:
        skipped_unknown.append(p)
        continue

    # limit number of speakers
    if spk not in used_speakers:
        if len(used_speakers) >= MAX_SPEAKERS:
            continue
        used_speakers.add(spk)
        speaker_counts[spk] = 0

    # limit samples per speaker
    if speaker_counts[spk] >= MAX_SAMPLES_PER_SPEAKER:
        continue

    emb = embed_with_resnet_293(p)
    if emb is None:
        continue

    # Count only the recordings that actually produced an embedding.
    speaker_counts[spk] += 1

    rows.append({"path": str(p), "file": p.name, "speaker": spk, "embedding": emb})

# Fix the schema explicitly so the frame has its columns even if no file was embedded.
df = pd.DataFrame(rows, columns=["path", "file", "speaker", "embedding"])

if skipped_unknown:
    print(f"Skipped {len(skipped_unknown)} files with no known speaker prefix")

print("Total samples:", len(df))
print("Samples per speaker:")
print(pd.Series(speaker_counts))
df.head()

Found 90 audio files total


100%|██████████| 90/90 [00:13<00:00,  6.81it/s]


Total samples: 90
Samples per speaker:
eden    30
idan    30
yoav    30
dtype: int64


Unnamed: 0,path,file,speaker,embedding
0,/home/SpeakerRec/BioVoice/data/wavs/eden_001.wav,eden_001.wav,eden,"[tensor(0.0569), tensor(0.0265), tensor(0.0244..."
1,/home/SpeakerRec/BioVoice/data/wavs/eden_002.wav,eden_002.wav,eden,"[tensor(0.0710), tensor(-0.0597), tensor(-0.02..."
2,/home/SpeakerRec/BioVoice/data/wavs/eden_003.wav,eden_003.wav,eden,"[tensor(0.0058), tensor(-0.0486), tensor(-0.05..."
3,/home/SpeakerRec/BioVoice/data/wavs/eden_004.wav,eden_004.wav,eden,"[tensor(0.0300), tensor(-0.0372), tensor(0.038..."
4,/home/SpeakerRec/BioVoice/data/wavs/eden_005.wav,eden_005.wav,eden,"[tensor(0.0802), tensor(-0.0524), tensor(0.052..."


In [10]:
# %%
# One centroid per speaker: the mean of that speaker's embeddings, rescaled
# to unit L2 norm.
centroids = {}

for spk in df["speaker"].unique():
    member_embeddings = df.loc[df["speaker"] == spk, "embedding"].tolist()
    mean_embedding = torch.stack(member_embeddings).mean(dim=0)
    centroids[spk] = F.normalize(mean_embedding, dim=0)

print("Computed centroids for:", list(centroids))

Computed centroids for: ['eden', 'idan', 'yoav']


In [11]:
# %%
def cosine_to_centroid(emb, centroid):
    """Return the cosine similarity between an embedding and a centroid.

    The similarity is computed with explicit normalisation, so the result is
    a true cosine even when one of the inputs is not unit-length. A plain dot
    product would only match it for unit-norm inputs.

    Args:
        emb: 1-D embedding tensor.
        centroid: 1-D centroid tensor of the same length.

    Returns:
        The cosine similarity as a Python float.
    """
    return float(F.cosine_similarity(emb, centroid, dim=0))

# Score every sample against the centroid of its own speaker.
df["cosine_to_centroid"] = [
    cosine_to_centroid(sample_emb, centroids[spk_label])
    for sample_emb, spk_label in zip(df["embedding"], df["speaker"])
]


In [12]:
# %%
# Order the samples speaker by speaker, closest to the centroid first, and
# renumber the rows 0..n-1.
df_sorted = df.sort_values(
    by=["speaker", "cosine_to_centroid"],
    ascending=[True, False],
    ignore_index=True,
)

df_sorted.head(10)


Unnamed: 0,path,file,speaker,embedding,cosine_to_centroid
0,/home/SpeakerRec/BioVoice/data/wavs/eden_029.wav,eden_029.wav,eden,"[tensor(0.0498), tensor(-0.0399), tensor(0.001...",0.846436
1,/home/SpeakerRec/BioVoice/data/wavs/eden_022.wav,eden_022.wav,eden,"[tensor(0.0312), tensor(-0.0345), tensor(0.051...",0.842202
2,/home/SpeakerRec/BioVoice/data/wavs/eden_014.wav,eden_014.wav,eden,"[tensor(0.0824), tensor(-0.0050), tensor(0.031...",0.837751
3,/home/SpeakerRec/BioVoice/data/wavs/eden_020.wav,eden_020.wav,eden,"[tensor(0.0980), tensor(-0.0257), tensor(-0.05...",0.830325
4,/home/SpeakerRec/BioVoice/data/wavs/eden_015.wav,eden_015.wav,eden,"[tensor(0.0145), tensor(-0.0732), tensor(0.014...",0.829727
5,/home/SpeakerRec/BioVoice/data/wavs/eden_027.wav,eden_027.wav,eden,"[tensor(0.0632), tensor(-0.0183), tensor(0.035...",0.821764
6,/home/SpeakerRec/BioVoice/data/wavs/eden_005.wav,eden_005.wav,eden,"[tensor(0.0802), tensor(-0.0524), tensor(0.052...",0.816005
7,/home/SpeakerRec/BioVoice/data/wavs/eden_011.wav,eden_011.wav,eden,"[tensor(0.0543), tensor(-0.0340), tensor(0.072...",0.813013
8,/home/SpeakerRec/BioVoice/data/wavs/eden_001.wav,eden_001.wav,eden,"[tensor(0.0569), tensor(0.0265), tensor(0.0244...",0.811855
9,/home/SpeakerRec/BioVoice/data/wavs/eden_025.wav,eden_025.wav,eden,"[tensor(0.0589), tensor(-0.0163), tensor(0.016...",0.801858


In [13]:
# %%
def _similarity_summary(label, sims):
    """Summarise one speaker's centroid similarities as a record dict."""
    lowest, highest = sims.min(), sims.max()
    return {
        "speaker": label,
        "min": lowest,
        "max": highest,
        "mean": sims.mean(),
        "std": sims.std(),
        "range": highest - lowest,
    }


# One summary record per speaker, in the order the speakers appear in df_sorted.
stats = [
    _similarity_summary(
        spk,
        df_sorted.loc[df_sorted["speaker"] == spk, "cosine_to_centroid"].values,
    )
    for spk in df_sorted["speaker"].unique()
]

stats_df = pd.DataFrame(stats)
display(stats_df)

Unnamed: 0,speaker,min,max,mean,std,range
0,eden,0.69418,0.846436,0.778943,0.040682,0.152255
1,idan,0.814452,0.927562,0.87199,0.025957,0.11311
2,yoav,0.692299,0.888076,0.802715,0.047097,0.195777


In [15]:
# %%
# Write the ranking to CSV without the tensor-valued embedding column.
out_path = "./speaker_similarity_ranking_team.csv"
df_sorted_out = df_sorted.drop("embedding", axis="columns")
df_sorted_out.to_csv(out_path, index=False)

print("Saved ranking to:", out_path)


Saved ranking to: ./speaker_similarity_ranking_team.csv
