In [1]:
# %%
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from tqdm import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# %%
PROJECT_ROOT = Path.cwd().parents[2]
sys.path.append(str(PROJECT_ROOT))

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DEVICE:", DEVICE)


PROJECT_ROOT: /home/SpeakerRec/BioVoice
DEVICE: cuda


In [3]:
# %%
redim_model = torch.hub.load(
    "IDRnD/ReDimNet",
    "ReDimNet",
    model_name="b5",
    train_type="ptn",
    dataset="vox2",
).to(DEVICE).eval()

for p in redim_model.parameters():
    p.requires_grad = False

print("Loaded ReDimNet (frozen).")


Using cache found in /home/SpeakerRec/.cache/torch/hub/IDRnD_ReDimNet_master


Loaded ReDimNet (frozen).


In [4]:
# # %%
# WAV_DIR = PROJECT_ROOT / "data" / "wavs"

# SPEAKERS = ["eden", "idan", "yoav"]

# def speaker_from_name(p: Path):
#     name = p.stem.lower()
#     for s in SPEAKERS:
#         if name.startswith(s + "_"):
#             return s
#     return None


In [5]:
# %%
# VoxCeleb2 configuration

AUDIO_ROOT = (
    PROJECT_ROOT
    / "data"
    / "datasets"
    / "voxceleb2"
    / "voxceleb2_hf"
    / "extracted_aac"
    / "aac"
)

assert AUDIO_ROOT.exists(), f"Audio root not found: {AUDIO_ROOT}"

# Each idXXXXX directory is a speaker
SPEAKERS = sorted([p.name for p in AUDIO_ROOT.iterdir() if p.is_dir()])

print(f"Found {len(SPEAKERS)} speakers")


def speaker_from_path(p: Path) -> str:
    """
    Given an audio file path, return speaker ID (idXXXXX).
    """
    return p.parents[1].name

Found 5558 speakers


In [6]:
# %%
TARGET_SR = 16000

@torch.no_grad()
def embed_with_redim(wav_path: Path):
    wav, sr = torchaudio.load(wav_path)

    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)

    wav = wav[:1].to(DEVICE)  # mono
    emb = redim_model(wav).squeeze(0)  # [192]

    emb = F.normalize(emb, dim=0)  # L2 norm
    return emb.cpu()


In [7]:
# %%
rows = []

# collect BOTH wav and m4a recursively
audio_paths = sorted(list(AUDIO_ROOT.rglob("*.wav")) + list(AUDIO_ROOT.rglob("*.m4a")))

print(f"Found {len(audio_paths)} audio files total")

# optional limits (VERY IMPORTANT)
MAX_SPEAKERS = 10  
MAX_SAMPLES_PER_SPEAKER = 20

speaker_counts = {}
used_speakers = set()

for p in tqdm(audio_paths):
    spk = speaker_from_path(p)

    # limit number of speakers
    if spk not in used_speakers:
        if len(used_speakers) >= MAX_SPEAKERS:
            continue
        used_speakers.add(spk)
        speaker_counts[spk] = 0

    # limit samples per speaker
    if speaker_counts[spk] >= MAX_SAMPLES_PER_SPEAKER:
        continue

    emb = embed_with_redim(p)
    if emb is None:
        continue

    speaker_counts[spk] += 1

    rows.append({"path": str(p), "file": p.name, "speaker": spk, "embedding": emb})

df = pd.DataFrame(rows)

print("Total samples:", len(df))
print("Samples per speaker:")
print(pd.Series(speaker_counts))
df.head()

Found 824378 audio files total


100%|██████████| 824378/824378 [00:20<00:00, 40010.55it/s] 


Total samples: 200
Samples per speaker:
id00012    20
id00016    20
id00018    20
id00019    20
id00020    20
id00021    20
id00022    20
id00024    20
id00025    20
id00026    20
dtype: int64


Unnamed: 0,path,file,speaker,embedding
0,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00001.m4a,id00012,"[tensor(-0.0307), tensor(-0.0165), tensor(-0.0..."
1,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00002.m4a,id00012,"[tensor(-0.0431), tensor(-0.0290), tensor(-0.0..."
2,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00003.m4a,id00012,"[tensor(-0.0156), tensor(0.0019), tensor(0.014..."
3,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00004.m4a,id00012,"[tensor(0.0089), tensor(-0.0417), tensor(-0.06..."
4,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00005.m4a,id00012,"[tensor(-0.0215), tensor(-0.0082), tensor(0.01..."


In [8]:
# %%
centroids = {}

for spk in df["speaker"].unique():
    embs = torch.stack(df[df["speaker"] == spk]["embedding"].tolist())
    centroid = embs.mean(dim=0)
    centroid = F.normalize(centroid, dim=0)
    centroids[spk] = centroid

print("Computed centroids for:", list(centroids.keys()))

Computed centroids for: ['id00012', 'id00016', 'id00018', 'id00019', 'id00020', 'id00021', 'id00022', 'id00024', 'id00025', 'id00026']


In [9]:
# %%
def cosine_to_centroid(emb, centroid):
    return float(torch.dot(emb, centroid))

df["cosine_to_centroid"] = [
    cosine_to_centroid(row.embedding, centroids[row.speaker])
    for row in df.itertuples()
]


In [10]:
# %%
df_sorted = (
    df.sort_values(["speaker", "cosine_to_centroid"], ascending=[True, False])
    .reset_index(drop=True)
)

df_sorted.head(10)


Unnamed: 0,path,file,speaker,embedding,cosine_to_centroid
0,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00002.m4a,id00012,"[tensor(-0.0431), tensor(-0.0290), tensor(-0.0...",0.966178
1,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00007.m4a,id00012,"[tensor(-0.0239), tensor(-0.0370), tensor(-0.0...",0.953634
2,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00003.m4a,id00012,"[tensor(-0.0156), tensor(0.0019), tensor(0.014...",0.947912
3,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00001.m4a,id00012,"[tensor(-0.0307), tensor(-0.0165), tensor(-0.0...",0.942899
4,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00009.m4a,id00012,"[tensor(-0.0375), tensor(-0.0242), tensor(-0.0...",0.939073
5,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00013.m4a,id00012,"[tensor(-0.0367), tensor(-0.0193), tensor(0.00...",0.92732
6,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00006.m4a,id00012,"[tensor(-0.0555), tensor(-0.0098), tensor(0.01...",0.921111
7,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00014.m4a,id00012,"[tensor(-0.0273), tensor(-0.0062), tensor(-0.0...",0.920928
8,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00011.m4a,id00012,"[tensor(-0.0360), tensor(-0.0176), tensor(0.00...",0.920679
9,/home/SpeakerRec/BioVoice/data/datasets/voxcel...,00005.m4a,id00012,"[tensor(-0.0215), tensor(-0.0082), tensor(0.01...",0.916402


In [11]:
# %%
stats = []

for spk in df_sorted["speaker"].unique():
    sims = df_sorted[df_sorted["speaker"] == spk]["cosine_to_centroid"].values

    stats.append(
        {
            "speaker": spk,
            "min": sims.min(),
            "max": sims.max(),
            "mean": sims.mean(),
            "std": sims.std(),
            "range": sims.max() - sims.min(),
        }
    )

stats_df = pd.DataFrame(stats)
display(stats_df)

Unnamed: 0,speaker,min,max,mean,std,range
0,id00012,0.685425,0.966178,0.891472,0.074903,0.280753
1,id00016,0.709043,0.898945,0.847244,0.055866,0.189902
2,id00018,0.815134,0.94058,0.897233,0.029706,0.125446
3,id00019,0.647668,0.872321,0.795139,0.052021,0.224653
4,id00020,0.739673,0.92284,0.836646,0.051447,0.183167
5,id00021,0.658259,0.902765,0.812074,0.050902,0.244506
6,id00022,0.791239,0.980089,0.933474,0.049595,0.18885
7,id00024,0.737259,0.954297,0.882986,0.067766,0.217038
8,id00025,0.823309,0.949828,0.913185,0.030851,0.126519
9,id00026,0.360528,0.9071,0.696599,0.166785,0.546572


In [12]:
# %%
out_path =  "output/speaker_similarity_ranking_vox2_10_20_ids.csv"
df_sorted_out = df_sorted.drop(columns=["embedding"])
df_sorted_out.to_csv(out_path, index=False)

print("Saved ranking to:", out_path)


Saved ranking to: output/speaker_similarity_ranking_vox2_10_20_ids.csv
