# Audio Clustering with ImageBind Embeddings
Template: extract audio embeddings with ImageBind and cluster them with k-means (scikit-learn). Run in Colab with a GPU runtime.

In [None]:
# Install ImageBind straight from its GitHub repository, plus scikit-learn
# (used below for KMeans). `-q` keeps pip's output short.
!pip install -q git+https://github.com/facebookresearch/ImageBind.git scikit-learn

In [None]:
import tempfile
import urllib.parse
import urllib.request
from pathlib import Path

import torch
from imagebind.data import load_and_transform_audio_data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from sklearn.cluster import KMeans

# Sample audio URLs (replace with your own short clips)
audio_urls = [
    'https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac',
    'https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/2.flac',
    'https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/3.flac',
]

# Download every clip to a local temp directory first. ImageBind's documented
# usage passes local file paths to the audio loader; whether a remote URL can
# be opened directly depends on the installed audio backend, so do not rely
# on it.
download_dir = Path(tempfile.mkdtemp(prefix='imagebind_audio_'))
audio_paths = []
for idx, url in enumerate(audio_urls):
    # Keep the original file name (and extension); the index prefix avoids
    # collisions when two URLs end in the same name.
    file_name = Path(urllib.parse.urlsplit(url).path).name
    local_path = download_dir / f'{idx}_{file_name}'
    urllib.request.urlretrieve(url, local_path)
    audio_paths.append(str(local_path))

# Use the GPU when one is available, otherwise fall back to CPU. The inputs
# and the model must live on the same device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
audio_data = load_and_transform_audio_data(audio_paths, device=device)

model = imagebind_model.imagebind_huge(pretrained=True)
# Switch to inference mode so training-only behaviour (e.g. dropout) does not
# perturb the embeddings between runs.
model.eval()
model.to(device)

with torch.no_grad():
    outputs = model({ModalityType.AUDIO: audio_data})
embeddings = outputs[ModalityType.AUDIO].cpu().numpy()

# KMeans cannot fit more clusters than there are samples, so cap the cluster
# count. NOTE: with only three sample clips and three clusters, each clip will
# typically end up in its own cluster; add more clips (or lower the cluster
# count) to see meaningful grouping.
n_clusters = min(3, len(embeddings))
km = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(embeddings)
print('Cluster labels:', km.labels_)


## Notes
- Use short audio clips; GPU strongly recommended.
- Optionally run PCA or t-SNE on the embeddings to visualize the clusters.