# Dataset Embedding Clustering

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from pathlib import Path
from matplotlib import pyplot as plt

from jaguar.datasets import get_data, EmbeddingDataset
from jaguar.embedding_models import MegaDescriptor, DINOv3
from sklearn.cluster import DBSCAN, KMeans

In [2]:
DATA_DIR = Path("../data/")
EMBEDDINGS_DIR = Path("../embeddings/")
VALIDATION_SPLIT_SIZE = 0.2
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_data, val_data, test_query, test_gallery, num_classes, label_encoder = get_data(
    DATA_DIR,
    validation_split_size=VALIDATION_SPLIT_SIZE,
    seed=SEED,
)
display(train_data.head())
num_identities = len(label_encoder.classes_)

Unnamed: 0,filename,ground_truth,label_encoded
1372,../data/train/train/train_1373.png,Ousado,20
775,../data/train/train/train_0776.png,Kwang,15
1094,../data/train/train/train_1095.png,Marcela,18
1784,../data/train/train/train_1785.png,Ti,29
112,../data/train/train/train_0113.png,Benita,6


['../data/train/train/train_1373.png' '../data/train/train/train_0776.png'
 '../data/train/train/train_1095.png' ...
 '../data/train/train/train_0975.png' '../data/train/train/train_0522.png'
 '../data/train/train/train_0906.png']


In [10]:
import os


embedding_model = DINOv3(device=device)
train_dataset = EmbeddingDataset(train_data, embedding_model=embedding_model, key="train", cache_folder=EMBEDDINGS_DIR)

embeddinbgs = train_dataset.get_embeddings()
embeddings = F.normalize(embeddinbgs, p=2, dim=1)
clustering = DBSCAN(eps=1.0, min_samples=5).fit(embeddinbgs)
labels = clustering.labels_

outliers = np.sum(labels == -1)
print(f"Number of outliers detected: {outliers}/{len(embeddinbgs)}")

Loading cached embeddings from ../embeddings/dinov3_train_embeddings.npz
Number of outliers detected: 1177/1516
