In [None]:
import torch, torchaudio, faiss
from faiss import StandardGpuResources, index_cpu_to_gpu

In [None]:
# 1) Embed batch of audio files
def embed_batch(filepaths, model, device='cuda'):
    """Embed a batch of audio files into L2-normalised vectors.

    Each file is loaded, resampled to 16 kHz, down-mixed to mono, and
    right-padded with zeros to the longest clip in the batch. The padded batch
    is converted to mel spectrograms and passed through ``model``.

    Args:
        filepaths: Iterable of audio file paths readable by ``torchaudio.load``.
        model: Callable mapping a ``[B, n_mels, frames]`` mel batch to
            ``[B, D]`` embeddings. It is assumed to already live on ``device``
            and to be in the desired train/eval mode; neither is changed here.
        device: Torch device used for the mel transform and the model call.

    Returns:
        ``numpy.ndarray`` of shape ``[B, D]`` whose rows have unit L2 norm.

    Raises:
        ValueError: If ``filepaths`` is empty.
    """
    filepaths = list(filepaths)
    if not filepaths:
        raise ValueError("embed_batch() needs at least one audio file path")

    target_sr = 16000
    resamplers = {}  # one Resample module per distinct source rate
    waves = []
    for path in filepaths:
        wave, sr = torchaudio.load(path)  # [channels, samples]
        if sr not in resamplers:
            resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
        wave = resamplers[sr](wave)
        # Down-mix to a 1-D mono signal. pad_sequence pads along dim 0, so it
        # must see [samples] tensors; [channels, samples] tensors would be
        # padded along the channel axis and fail for clips of unequal length.
        waves.append(wave.mean(dim=0))

    # Zero-pad to the longest clip, then move the whole batch in one transfer.
    batch = torch.nn.utils.rnn.pad_sequence(waves, batch_first=True).to(device)  # [B, T_max]

    # The transform owns a window buffer, so it has to sit on the same device
    # as its input.
    to_mel = torchaudio.transforms.MelSpectrogram(target_sr).to(device)
    with torch.no_grad():
        mels = to_mel(batch)  # [B, n_mels, frames]
        emb = model(mels)     # your model → [B, D]
    return torch.nn.functional.normalize(emb, dim=1).cpu().numpy()

In [None]:
# 2) Build FAISS GPU index
D = 128                                       # embedding dimensionality the index is built for

# Check the shape up front: a mismatch with D would otherwise only surface as
# an opaque failure inside FAISS when the vectors are added.
if all_embeddings.ndim != 2 or all_embeddings.shape[1] != D:
    raise ValueError(
        f"all_embeddings must have shape [N, {D}], got {tuple(all_embeddings.shape)}"
    )

res = StandardGpuResources()                  # GPU scratch memory / streams used by the index
cpu_index = faiss.IndexFlatIP(D)              # inner-product on normalized vectors = cosine
gpu_index = index_cpu_to_gpu(res, 0, cpu_index)   # copy of the (still empty) index on GPU 0
gpu_index.add(all_embeddings)                 # all_embeddings: np.float32[N×D]; cpu_index stays empty

In [None]:
# 3) Radius search
radius = 0.8                                # tune between 0.7–0.9

# FAISS GPU indexes do not implement range_search, so run the query on a CPU
# copy of the populated index. This is a brute-force N×N comparison on the CPU;
# for very large N, consider issuing the queries in chunks.
range_index = faiss.index_gpu_to_cpu(gpu_index)
lims, Dists, Ids = range_index.range_search(all_embeddings, radius)
# range_search returns: lims: (N+1) prefix-sum of counts, flat arrays Dists/Ids.
# With an inner-product index, Dists holds similarities and a hit is a vector
# whose similarity to the query exceeds `radius`; every vector also matches itself.

In [None]:
# 4) Build clusters (simple CPU union-find)
# Take the number of points from the search result itself (lims has one entry
# per query plus a trailing total) so the loops cannot drift out of sync with
# the range-search output.
n_points = len(lims) - 1
parent = list(range(n_points))


def find(x):
    """Return the root of the set containing x, halving the path on the way up."""
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # point x at its grandparent
        x = parent[x]
    return x


def unite(a, b):
    """Merge the sets containing a and b; the root of a's set becomes the root."""
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra


# Link every point with each neighbour returned for it by the radius search.
for i in range(n_points):
    start, stop = int(lims[i]), int(lims[i + 1])
    # .tolist() yields plain Python ints, keeping NumPy scalars out of `parent`
    # and out of the cluster keys below.
    for j in Ids[start:stop].tolist():
        if i != j:
            unite(i, j)

# Group point indices by the root of their set: {root index: [member indices]}.
clusters = {}
for i in range(n_points):
    clusters.setdefault(find(i), []).append(i)

In [None]:
# 5) Pick representatives & filter
# Keep one index per cluster: the member whose metadata entry reports the
# longest duration (on ties, the first such member in the cluster list wins).
filtered = [
    max(group, key=lambda member: metadata[member].duration)
    for group in clusters.values()
]