In [1]:
from hcmus.utils import data_utils

splits = data_utils.get_data_splits()
datasets = data_utils.get_image_datasets_v2(splits, random_margin=0)

[32m2025-07-07 10:40:06.861[0m | [1mINFO    [0m | [36mhcmus.core.appconfig[0m:[36m<module>[0m:[36m7[0m - [1mLoad DotEnv: True[0m
[32m2025-07-07 10:40:07.988[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 35[0m
Loading tasks: 100%|██████████| 35/35 [00:10<00:00,  3.24it/s]
Downloading images: 100%|██████████| 3443/3443 [00:05<00:00, 579.26it/s] 
[32m2025-07-07 10:40:24.873[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 5[0m
Loading tasks: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]
Downloading images: 100%|██████████| 435/435 [00:02<00:00, 204.09it/s]
[32m2025-07-07 10:40:30.241[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 4[0m
Loading tasks: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]
Downloading images: 100%|██████████|

In [6]:
from loguru import logger
from torchvision import transforms as T
from hcmus.models.backbone import CLIPBackbone
from hcmus.models.backbone import DinoBackbone

In [3]:
device = "mps"
backbone_list = [
    (CLIPBackbone, {"backbone_name": "ViT-B/32"}),
    (CLIPBackbone, {"backbone_name": "ViT-B/16"}),
    (DinoBackbone, {"model_id": "facebook/dinov2-small"}),
    (DinoBackbone, {"model_id": "facebook/dinov2-base"}),
    (DinoBackbone, {"model_id": "facebook/dino-vitb8"}),
    (DinoBackbone, {"model_id": "facebook/dino-vits8"}),
    (DinoBackbone, {"model_id": "facebook/dino-vits16"}),
    (DinoBackbone, {"model_id": "facebook/dino-vitb16"}),
]

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor()
])

In [14]:
import faiss
from tqdm import tqdm

def build_index(backbone_config):
    cls, params = backbone_config
    backbone = cls(**params)
    output_dim = backbone.output_dim
    logger.info(f"Backbone: {cls.__name__}, params={params}")
    logger.info(f"Output dim: {backbone.output_dim}")
    index = faiss.IndexHNSWFlat(output_dim, 256)
    labels = []
    for item in tqdm(datasets["train"], desc="Building index..."):
        image, label, metadata = item
        tensor = transform(image)
        feature = backbone.forward(tensor)
        feature = feature.detach().numpy().astype("float32")
        labels.append((label, metadata["label_str"]))
        index.add(feature)
    return backbone, index, labels

In [33]:
def evaluate(backbone, index, labels, split_name: str):
    precision_1 = 0
    precision_3 = 0
    precision_5 = 0

    total = len(datasets[split_name])
    for item in tqdm(datasets[split_name], desc="Evaluating dataset=val..."):
        image, label, metadata = item
        tensor = transform(image)
        feature = backbone.forward(tensor)
        feature = feature.detach().numpy().astype("float32")
        _, I = index.search(feature, k=5)
        preds = [labels[i][0] for i in I[0]]
        precision_1 += preds[0] == label
        precision_3 += label in preds[:2]
        precision_5 += label in preds[:4]

    logger.info(f"Precision@1 dataset={split_name}: {precision_1 / total}")
    logger.info(f"Precision@3 dataset={split_name}: {precision_3 / total}")
    logger.info(f"Precision@5 dataset={split_name}: {precision_5 / total}")


In [34]:
for config in backbone_list:
    backbone, index, labels = build_index(config)
    evaluate(backbone, index, labels, "val")
    evaluate(backbone, index, labels, "test")

[32m2025-07-07 11:13:50.336[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_index[0m:[36m8[0m - [1mBackbone: CLIPBackbone, params={'backbone_name': 'ViT-B/32'}[0m
[32m2025-07-07 11:13:50.337[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_index[0m:[36m9[0m - [1mOutput dim: 512[0m
Building index...: 100%|██████████| 2659/2659 [01:25<00:00, 31.27it/s]
Evaluating dataset=val...: 100%|██████████| 2824/2824 [02:03<00:00, 22.82it/s]
[32m2025-07-07 11:17:19.116[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m18[0m - [1mPrecision@1 dataset=val: 0.5722379603399433[0m
[32m2025-07-07 11:17:19.116[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m19[0m - [1mPrecision@3 dataset=val: 0.6388101983002833[0m
[32m2025-07-07 11:17:19.116[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m20[0m - [1mPrecision@5 dataset=val: 0.6855524079320113[0m
Evaluating dataset=val...: 100%|██████████| 5836/5836 [04:24<00:00, 22.10it/s

In [35]:
backbone, index, labels = build_index((DinoBackbone, {"model_id": "facebook/dinov2-large"}))
evaluate(backbone, index, labels, "val")
evaluate(backbone, index, labels, "test")

[32m2025-07-07 18:28:05.669[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_index[0m:[36m8[0m - [1mBackbone: DinoBackbone, params={'model_id': 'facebook/dinov2-large'}[0m
[32m2025-07-07 18:28:05.670[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_index[0m:[36m9[0m - [1mOutput dim: 1024[0m
Building index...: 100%|██████████| 2659/2659 [09:42<00:00,  4.57it/s]
Evaluating dataset=val...: 100%|██████████| 2824/2824 [11:03<00:00,  4.25it/s]
[32m2025-07-07 18:48:51.888[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m18[0m - [1mPrecision@1 dataset=val: 0.5995042492917847[0m
[32m2025-07-07 18:48:51.888[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m19[0m - [1mPrecision@3 dataset=val: 0.6724504249291785[0m
[32m2025-07-07 18:48:51.888[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m20[0m - [1mPrecision@5 dataset=val: 0.740084985835694[0m
Evaluating dataset=val...: 100%|██████████| 5836/5836 [21:44<00:00,  