# Domain Mismatch Diagnostics

This notebook helps compare Zoom-trained speaker embeddings against iPhone (or other domain) clips. It loads the trained classifier bundle, extracts embeddings for both Zoom clips and test clips, and runs a cosine-similarity sanity check alongside the existing sklearn head.

In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
from collections import defaultdict
import json
import numpy as np
import pandas as pd
import librosa
import joblib
from tqdm.notebook import tqdm

from train_speaker_classifier import FeatureExtractor, resolve_hf_token

## Configure paths

Update the paths below:
- `MODEL_PATH`: trained speaker model bundle (`speaker_classifier.joblib`).
- `ZOOM_MANIFEST`: manifest JSONL produced by `generate_speaker_corpus.py` (Zoom domain clips).
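- `IPHONE_DIR`: directory of test clips (e.g., iPhone chunks), searched recursively for `*.wav` files. Organize it as `speaker_name/*.wav` so that each clip's parent folder name serves as its speaker label; a flat bag of files also works, but every clip is then labeled with the name of the directory itself.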

In [None]:
# Locations of the trained bundle, the Zoom-domain manifest, and the
# target-domain clips. Edit these before running the rest of the notebook.
MODEL_PATH = Path("models/speaker_ecapa/speaker_classifier.joblib")
ZOOM_MANIFEST = Path("/path/to/zoom/manifest.jsonl")
IPHONE_DIR = Path("/path/to/iphone/clips")

# Optional override if the model requires a gated HF token (pyannote/embedding)
# NOTE(review): resolve_hf_token(None) presumably falls back to an
# environment/cached token -- confirm in train_speaker_classifier.
HF_TOKEN = resolve_hf_token(None)

# Fail fast, showing the offending path, before any expensive work starts.
assert MODEL_PATH.exists(), MODEL_PATH
assert ZOOM_MANIFEST.exists(), ZOOM_MANIFEST
assert IPHONE_DIR.exists(), IPHONE_DIR

## Load model + feature extractor

In [None]:
# Unpack the serialized bundle: the classifier head, its label encoder, and
# the feature settings stored alongside them (empty dict when absent).
bundle = joblib.load(MODEL_PATH)
model = bundle["model"]
label_encoder = bundle["label_encoder"]
feature_params = bundle.get("feature_params", {})

# Numeric settings are coerced to int; defaults apply when a key is missing.
sample_rate = int(feature_params.get("sample_rate", 16_000))
n_mfcc = int(feature_params.get("n_mfcc", 40))

# Feature type and backbone identifiers are passed through as stored
# (None when the bundle does not name a given backbone).
feature_type = feature_params.get("feature_type", "mfcc")
pyannote_model = feature_params.get("pyannote_model")
ecapa_model = feature_params.get("ecapa_model")
wav2vec2_model = feature_params.get("wav2vec2_model")

# Rebuild the extractor from the bundle's settings; inference is pinned to CPU.
extractor_config = dict(
    feature_type=feature_type,
    sample_rate=sample_rate,
    n_mfcc=n_mfcc,
    wav2vec2_model=wav2vec2_model,
    ecapa_model=ecapa_model,
    pyannote_model=pyannote_model,
    hf_token=HF_TOKEN,
    device="cpu",
)
extractor = FeatureExtractor(**extractor_config)
feature_params

## Helper functions

In [None]:
def load_manifest_entries(manifest_path: Path) -> list[dict]:
    """Read a JSONL manifest and return its records in file order.

    Every non-blank line is decoded with ``json.loads``; empty or
    whitespace-only lines are ignored.

    Args:
        manifest_path: Location of the UTF-8 encoded JSONL manifest.

    Returns:
        The decoded values, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with manifest_path.open("r", encoding="utf-8") as manifest_file:
        return [json.loads(raw_line) for raw_line in manifest_file if raw_line.strip()]

def compute_embedding(path: Path) -> np.ndarray:
    """Load one clip and run it through the notebook's feature extractor.

    The audio is decoded by librosa as mono at the notebook-level
    ``sample_rate`` and then handed to ``extractor.compute_from_waveform``.

    Args:
        path: Audio file to embed.

    Returns:
        Whatever ``extractor.compute_from_waveform`` produces for the clip.
    """
    waveform = librosa.load(path, sr=sample_rate, mono=True)[0]
    embedding = extractor.compute_from_waveform(waveform, sample_rate)
    return embedding

def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Scale ``vec`` by its L2 norm.

    A small constant (1e-9) is added to the norm so that an all-zero vector
    divides cleanly and comes back as zeros.

    Args:
        vec: Vector to rescale.

    Returns:
        ``vec`` divided by ``norm(vec) + 1e-9``.
    """
    return vec / (np.linalg.norm(vec) + 1e-9)

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the dot product of ``a`` and ``b`` as a Python float.

    No normalization happens here, so the result equals the cosine
    similarity only when both inputs are already unit length.
    """
    similarity = np.dot(a, b)
    return float(similarity)

## Compute Zoom centroids

In [None]:
# Embed every Zoom clip listed in the manifest and group the embeddings by
# the manifest's "speaker" field.
zoom_entries = load_manifest_entries(ZOOM_MANIFEST)
zoom_embeds: dict[str, list[np.ndarray]] = defaultdict(list)
missing_zoom_clips = 0
for entry in tqdm(zoom_entries, desc="Zoom embeddings"):
    clip_path = Path(entry["clip_path"]).expanduser()
    if not clip_path.exists():
        # Counted rather than printed per clip, so a stale manifest is
        # reported once below instead of flooding the cell output.
        missing_zoom_clips += 1
        continue
    try:
        emb = compute_embedding(clip_path)
    except Exception as exc:
        # Best-effort: one unreadable clip should not abort the whole pass.
        print(f"[warn] {clip_path}: {exc}")
        continue
    zoom_embeds[entry["speaker"]].append(emb)

if missing_zoom_clips:
    print(
        f"[warn] skipped {missing_zoom_clips} of {len(zoom_entries)} "
        "manifest clips: file not found"
    )

# One unit-length centroid per speaker: the mean embedding, L2-normalized so
# that a dot product against a normalized query is a cosine similarity.
zoom_centroids = {
    speaker: l2_normalize(np.mean(np.stack(embs), axis=0))
    for speaker, embs in zoom_embeds.items()
    if embs
}
if not zoom_centroids:
    print("[warn] no Zoom centroids were built; centroid predictions will be empty")
list(zoom_centroids.keys())

## Embed iPhone / target-domain clips

In [None]:
# Embed every *.wav found (recursively) under IPHONE_DIR, in sorted path
# order. The immediate parent folder name is kept as the clip's speaker label.
iphone_records = []
for wav_path in sorted(IPHONE_DIR.rglob("*.wav")):
    try:
        clip_embedding = compute_embedding(wav_path)
    except Exception as exc:
        # Best-effort: report the failing clip and move on to the next one.
        print(f"[warn] {wav_path}: {exc}")
    else:
        iphone_records.append(
            dict(
                path=wav_path,
                speaker_guess=wav_path.parent.name,
                embedding=clip_embedding,
            )
        )
len(iphone_records)

## Cosine sanity check

In [None]:
def predict_by_centroid(embedding: np.ndarray) -> tuple[str, float, dict[str, float]]:
    """Pick the Zoom speaker whose centroid scores highest against ``embedding``.

    The embedding is L2-normalized and scored against every entry of the
    notebook-level ``zoom_centroids`` with ``cosine``.

    Args:
        embedding: Raw (un-normalized) embedding of one clip.

    Returns:
        ``(best_speaker, best_score, scores)`` where ``scores`` maps every
        centroid's speaker to its score. On ties the first speaker in
        ``zoom_centroids`` order wins. When no score is comparable (no
        centroids at all, or every score is NaN) the result is
        ``(None, -1.0, scores)``.
    """
    emb = l2_normalize(embedding)
    scores = {
        speaker: cosine(emb, centroid)
        for speaker, centroid in zoom_centroids.items()
    }

    # Start below every real score: seeding with -1.0 and a strict ">" made a
    # centroid scoring at (or, through rounding, just under) -1.0 unselectable.
    best_spk, best_score = None, float("-inf")
    for speaker, score in scores.items():
        if score > best_score:
            best_spk, best_score = speaker, score

    if best_spk is None:
        # Keep the historical "nothing matched" sentinel for callers.
        return None, -1.0, scores
    return best_spk, best_score, scores

# Score each target-domain clip against the Zoom centroids and tabulate the
# winning speaker and its score; the full per-speaker score map is not kept.
centroid_hits = (
    (rec, predict_by_centroid(rec["embedding"])) for rec in iphone_records
)
rows = [
    {
        "path": str(rec["path"]),
        "speaker_guess": rec["speaker_guess"],
        "centroid_prediction": hit[0],
        "centroid_score": hit[1],
    }
    for rec, hit in centroid_hits
]
cosine_df = pd.DataFrame(rows)
cosine_df.head()

### Inspect cosine ranking per clip

In [None]:
def show_top_matches(record, top_n=5):
    """Print a clip's path, its labeled speaker, and its best centroid scores.

    Args:
        record: Mapping with ``"path"``, ``"speaker_guess"`` and
            ``"embedding"`` entries, as stored in ``iphone_records``.
        top_n: Maximum number of speakers to list, highest score first.
    """
    scores = predict_by_centroid(record["embedding"])[2]
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    print(f"Clip: {record['path']}")
    print(f"  Labeled speaker: {record['speaker_guess']}")
    for name, value in ranked[:top_n]:
        print(f"  {name:20s}  score={value:.3f}")

# Example: print the centroid ranking for the first three embedded clips,
# with a dash line after each one as a separator.
preview_records = iphone_records[:3]
for preview in preview_records:
    show_top_matches(preview)
    print("-")

## Compare classifier predictions vs cosine baseline

In [None]:
# Run the trained head on each clip's raw embedding (reshaped to a single
# row), one clip at a time, keeping the encoded class it predicts.
encoded_predictions = [
    model.predict(rec["embedding"].reshape(1, -1))[0] for rec in iphone_records
]

# Decode each predicted class back to a speaker name via the label encoder.
classifier_rows = [
    {
        "path": str(rec["path"]),
        "speaker_guess": rec["speaker_guess"],
        "classifier_prediction": label_encoder.inverse_transform([code])[0],
    }
    for rec, code in zip(iphone_records, encoded_predictions)
]

# Line the classifier output up with the cosine baseline, clip by clip.
classifier_df = pd.DataFrame(classifier_rows)
comparison_df = cosine_df.merge(
    classifier_df,
    how="left",
    on=["path", "speaker_guess"],
)
comparison_df.head()

You can now:
- Filter `comparison_df` where centroid prediction matches the known speaker but classifier does not (diagnose scaler/head issues).
- Inspect raw cosine scores to see if the correct speaker is usually top-ranked.
- Plot distributions of cosine scores vs classifier confidence to understand the domain gap.