In [None]:

from pathlib import Path
import torchaudio
import torch
import torch.nn.functional as F
import pandas as pd
from speechbrain.pretrained import EncoderClassifier
import itertools
import ast
import json
import numpy as np
from sklearn.metrics import roc_curve
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained ECAPA speaker-embedding model.
# The device is passed through `run_opts` instead of calling `.to(device)` on the
# returned object: the SpeechBrain wrapper keeps its own `device` attribute
# (default "cpu") and moves inputs to it inside `encode_batch`, so `.to()` alone
# would leave the weights on the GPU and the inputs on the CPU.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="ecapa_pretrained",
    run_opts={"device": device},
)

def get_embedding(wav_path):
    """Compute an L2-normalized ECAPA embedding for a single audio file.

    Args:
        wav_path: Path (str or Path) of the audio file to load with torchaudio.

    Returns:
        A 1-D CPU tensor of shape [emb_dim] with unit L2 norm.
    """
    signal, fs = torchaudio.load(str(wav_path))   # signal: [channels, time]

    # encode_batch treats dim 0 as the batch axis, so a multi-channel file would
    # be embedded as several separate utterances and the squeezes below would no
    # longer yield a single vector. Downmix to one channel first.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)
    signal = signal.to(device)

    with torch.no_grad():
        emb = classifier.encode_batch(signal)   # shape [1, 1, emb_dim]

    emb = emb.squeeze(0).squeeze(0)
    emb = F.normalize(emb, p=2, dim=0)  # L2 normalization
    return emb.cpu()

# Status message emitted once the model and the embedding helper have been set up.
print("Model loaded successfully.")


Model loaded successfully.


  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


In [11]:

# Compute an ECAPA embedding for every .wav file found in data_dir

data_dir = Path("./data")
wav_paths = sorted(data_dir.glob("*.wav"))

emb_dict = {}   # file path (str) -> embedding tensor
rows = []       # one record per file, turned into a DataFrame below
for wav_file in wav_paths:
    path_str = str(wav_file)
    # the speaker label is the part of the file name before the first underscore
    speaker_name = wav_file.stem.partition("_")[0]
    vector = get_embedding(wav_file)

    emb_dict[path_str] = vector
    record = {
        "path": path_str,
        "true_label": speaker_name,
        "embedding_vector": vector.numpy().tolist(),
    }
    rows.append(record)

df_emb = pd.DataFrame(rows)
df_emb.to_csv("users_embeddings.csv", index=False)

df_emb.head()


Unnamed: 0,path,true_label,embedding_vector
0,data\eden_001.wav,eden,"[0.0511067770421505, 0.03551109880208969, 0.02..."
1,data\eden_002.wav,eden,"[0.18112494051456451, 0.013891077600419521, -0..."
2,data\eden_003.wav,eden,"[0.09115337580442429, -0.10557050257921219, 0...."
3,data\eden_004.wav,eden,"[0.1183476597070694, -0.037616558372974396, 0...."
4,data\eden_005.wav,eden,"[0.11517681926488876, -0.0064055174589157104, ..."


In [12]:
# Compute pairwise cosine similarity between embeddings of the SAME speaker
df = df_emb.copy()
# NOTE: if `df` is instead reloaded from "users_embeddings.csv", the
# "embedding_vector" column presumably comes back as text and would need to be
# parsed (e.g. with json.loads) before it can be converted to tensors.

speakers = sorted(df["true_label"].unique())
similarity_columns = ["speaker", "wav1", "wav2", "cosine_similarity"]
similarity_rows = []

for spk in speakers:
    df_spk = df[df["true_label"] == spk]

    # Convert each embedding to a tensor once per file rather than once per pair.
    spk_tensors = {
        idx: torch.tensor(vec) for idx, vec in df_spk["embedding_vector"].items()
    }

    for i, j in itertools.combinations(df_spk.index, 2):
        sim = F.cosine_similarity(spk_tensors[i], spk_tensors[j], dim=0).item()

        similarity_rows.append({
            "speaker": spk,
            "wav1": df_spk.loc[i, "path"],
            "wav2": df_spk.loc[j, "path"],
            "cosine_similarity": sim
        })

# Explicit columns keep the schema intact even when no speaker has two or more
# files; an empty list would otherwise produce a frame without any columns.
df_sim = pd.DataFrame(similarity_rows, columns=similarity_columns)
df_sim.to_csv("user_similarity_pairs.csv", index=False)


df_sim.head()

Unnamed: 0,speaker,wav1,wav2,cosine_similarity
0,eden,data\eden_001.wav,data\eden_002.wav,0.474616
1,eden,data\eden_001.wav,data\eden_003.wav,0.436774
2,eden,data\eden_001.wav,data\eden_004.wav,0.558711
3,eden,data\eden_001.wav,data\eden_005.wav,0.604544
4,eden,data\eden_001.wav,data\eden_006.wav,0.58778


In [13]:

# For each speaker, pick the file with the highest mean similarity to that speaker's other files

template_rows = []

for spk in speakers:
    spk_paths = df.loc[df["true_label"] == spk, "path"]
    same_speaker = df_sim["speaker"] == spk

    avg_scores = []
    for file_path in spk_paths:
        # pairs where this file is listed first, then pairs where it is listed second
        as_first = df_sim.loc[same_speaker & (df_sim["wav1"] == file_path), "cosine_similarity"]
        as_second = df_sim.loc[same_speaker & (df_sim["wav2"] == file_path), "cosine_similarity"]
        sims = list(as_first) + list(as_second)

        # a file that takes part in no pair gets an average of 0.0
        avg_sim = sum(sims) / len(sims) if len(sims) > 0 else 0.0
        avg_scores.append((file_path, avg_sim))

    # highest average wins; on a tie the earliest file is kept
    best_file, best_score = max(avg_scores, key=lambda item: item[1])

    template_rows.append(
        dict(speaker=spk, template_path=best_file, average_similarity=best_score)
    )

df_templates = pd.DataFrame(template_rows)
df_templates.to_csv("user_best_templates.csv", index=False)

df_templates


Unnamed: 0,speaker,template_path,average_similarity
0,eden,data\eden_013.wav,0.61016
1,idan,data\idan_012.wav,0.698508
2,yoav,data\yoav_022.wav,0.65312


In [14]:
# Build {speaker: embedding tensor} from the template file selected for each speaker

template_embeddings = {}

for spk, template_file in zip(df_templates["speaker"], df_templates["template_path"]):
    # embedding stored for the first row whose path equals the template file
    matching_vectors = df.loc[df["path"] == template_file, "embedding_vector"]
    template_embeddings[spk] = torch.tensor(matching_vectors.iloc[0])

template_embeddings


{'eden': tensor([ 0.1227, -0.0100,  0.0723,  0.0259,  0.0044,  0.0512, -0.0072,  0.0143,
          0.0322, -0.0428,  0.0601, -0.0291,  0.0654,  0.0840, -0.0368, -0.0069,
          0.0712,  0.1527, -0.1150,  0.1289, -0.0439, -0.1348,  0.0709, -0.0237,
         -0.0191, -0.0747,  0.1285,  0.0732, -0.0189, -0.1513,  0.0539, -0.1086,
          0.0813, -0.0434,  0.0332, -0.1620, -0.0673, -0.0817, -0.0422, -0.0610,
         -0.0061,  0.0252, -0.1141, -0.0594,  0.0839, -0.1235,  0.0266, -0.1059,
          0.1042, -0.0180, -0.0267, -0.0257, -0.0073,  0.0246, -0.0499, -0.0948,
         -0.0570, -0.0782, -0.0806,  0.0729, -0.0061,  0.0275, -0.0348,  0.1019,
          0.0543, -0.1483, -0.0114,  0.0424, -0.0461, -0.0859,  0.0203,  0.0665,
          0.0808, -0.0109,  0.0240,  0.0030,  0.0432, -0.1256, -0.0168,  0.0117,
          0.0896,  0.0822, -0.0198, -0.1135,  0.0634,  0.0273, -0.0469,  0.1008,
         -0.0963, -0.0890,  0.0446,  0.0438, -0.0259,  0.1812, -0.1122, -0.0120,
         -0.0004,  0

In [15]:
# Score every file against each speaker template:
#   cosine(test_embedding, template_spk) -> softmax over speakers -> probabilities

temperature = 0.1   # similarities are divided by this value before the softmax
final_rows = []


for wav in wav_paths:

    wav_key = str(wav)

    # emb_dict already holds tensors, so use the stored one as-is: wrapping a
    # tensor in torch.tensor() only copies it and triggers a UserWarning.
    emb = emb_dict[wav_key]
    true_label = wav.stem.split("_")[0]

    # one cosine similarity per speaker template, in the order of `speakers`
    sims = []
    for spk in speakers:
        sim = F.cosine_similarity(emb, template_embeddings[spk], dim=0).item()
        sims.append(sim)

    sims_tensor = torch.tensor(sims)
    probs = F.softmax(sims_tensor / temperature, dim=0)

    row = {
        "path": wav_key,
        "true_label": true_label,
    }

    # probability columns
    for i, spk in enumerate(speakers):
        row[f"prob_{spk}"] = probs[i].item()

    # predicted speaker = template with the highest probability
    best_idx = torch.argmax(probs).item()
    row["predicted_label"] = speakers[best_idx]
    row["predicted_probability"] = probs[best_idx].item()

    final_rows.append(row)

df_final = pd.DataFrame(final_rows)
df_final.to_csv("user_final_predictions.csv", index=False)



df_final.head()


  emb = torch.tensor(emb_dict[wav_key])


Unnamed: 0,path,true_label,prob_eden,prob_idan,prob_yoav,predicted_label,predicted_probability
0,data\eden_001.wav,eden,0.992331,0.005956,0.001713,eden,0.992331
1,data\eden_002.wav,eden,0.91855,0.064232,0.017218,eden,0.91855
2,data\eden_003.wav,eden,0.954194,0.037506,0.008299,eden,0.954194
3,data\eden_004.wav,eden,0.983065,0.015297,0.001638,eden,0.983065
4,data\eden_005.wav,eden,0.965382,0.032417,0.002201,eden,0.965382


In [16]:
# Assemble the evaluation inputs: a float32 embedding matrix and one label per row
df = df_emb.copy()

emb_list = list(df["embedding_vector"])
embeddings = np.array(emb_list, dtype=np.float32)
labels = df["true_label"].values

print("Embeddings shape:", embeddings.shape)
print("Num samples:", len(labels))
print("Unique speakers:", np.unique(labels))


Embeddings shape: (90, 192)
Num samples: 90
Unique speakers: ['eden' 'idan' 'yoav']


In [None]:


def cosine_similarity_matrix(X: np.ndarray) -> np.ndarray:
    """Return the pairwise cosine-similarity matrix of the rows of ``X``.

    Args:
        X: 2-D array of shape (n_samples, n_features).

    Returns:
        Array of shape (n_samples, n_samples) whose entry (i, j) is the cosine
        similarity between rows i and j. An all-zero row gets similarity 0 to
        every row instead of NaN.
    """
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Clamp the norms so an all-zero row does not divide by zero. A Python float
    # is used for the floor so the array dtype (e.g. float32) is preserved.
    X_norm = X / np.maximum(norms, 1e-12)
    return X_norm @ X_norm.T

def compute_eer_from_embeddings(embeddings: np.ndarray, labels: np.ndarray):
    """Estimate the equal error rate (EER) over all unordered pairs of embeddings.

    Every pair (i, j) with i < j is scored with cosine similarity and marked as a
    same-label (1) or different-label (0) trial; the ROC curve over those trials
    is then searched for the point where FNR and FPR are closest.

    Args:
        embeddings: Array of shape (n_samples, emb_dim).
        labels: Sequence of n_samples labels, one per embedding row.

    Returns:
        Tuple ``(eer, thr_eer, fpr, fnr)``: the EER as a fraction, the score
        threshold at that operating point, and the FPR / FNR arrays of the ROC
        curve.

    Raises:
        ValueError: If ``embeddings`` and ``labels`` differ in length, or if the
            pairs do not contain both same-label and different-label trials
            (the EER is undefined in that case).
    """
    labels = np.asarray(labels)
    N = len(labels)
    if len(embeddings) != N:
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but labels has {N} entries"
        )

    # All unordered pairs i < j, in the same row-major order a nested loop yields.
    row_idx, col_idx = np.triu_indices(N, k=1)
    gt = (labels[row_idx] == labels[col_idx]).astype(int)

    # Without both trial types the ROC curve is degenerate and the EER would be NaN.
    if gt.size == 0 or gt.min() == gt.max():
        raise ValueError(
            "EER needs at least one same-label pair and one different-label pair"
        )

    sim_mat = cosine_similarity_matrix(embeddings)
    scores = sim_mat[row_idx, col_idx]

    # ROC curve
    fpr, tpr, thresh = roc_curve(gt, scores)
    fnr = 1 - tpr

    # EER: ROC point where FNR and FPR are closest; report the mean of the two.
    idx = np.argmin(np.abs(fnr - fpr))
    eer = (fnr[idx] + fpr[idx]) / 2
    thr_eer = thresh[idx]

    return eer, thr_eer, fpr, fnr


In [None]:

# Run the pairwise EER evaluation on the embedding matrix and labels built earlier.
eer, thr_eer, fpr, fnr = compute_eer_from_embeddings(embeddings, labels)

# eer is a fraction, so scale by 100 to report it as a percentage.
print(f"EER = {eer * 100:.2f}%")
print(f"Threshold at EER (cosine similarity): {thr_eer:.4f}")


EER = 0.45%
Threshold at EER (cosine similarity): 0.3629
