1. Load and Preprocess Retrieval Data

In [1]:
# === IMPORTS ===
import json
import numpy as np
from tqdm import tqdm

# === LOAD RETRIEVAL RESULTS FROM JSON ===
# Decode explicitly as UTF-8 (the standard JSON text encoding) so parsing does
# not depend on the platform's default locale encoding.
with open("retrieval_results.json", "r", encoding="utf-8") as f:
    retrieval_data = json.load(f)

# Number of top-level entries in the loaded JSON document.
num_queries = len(retrieval_data)

# === HELPER FUNCTION: Extract label from filename ===
def label_from_filename(filename):
    """
    Parse the ground-truth label encoded at the start of a filename.

    The label is the text before the first underscore, converted to int
    (the whole string is used when it contains no underscore).
    E.g., "12_cat_001.jpg" → 12

    Raises:
        ValueError: If that leading text is not a valid integer literal.
    """
    prefix, _sep, _rest = filename.partition("_")
    return int(prefix)


2. Define Evaluation Metric

In [2]:
# === AVERAGE PRECISION @k FOR A SINGLE QUERY ===
def average_precision_at_k(true_label, retrieved_labels, k):
    """
    Computes Average Precision at rank k for a single query.

    Precision is sampled at every rank within the first k results whose label
    equals ``true_label``, and the samples are averaged. Note the
    normalisation: the sum is divided by the number of hits found inside the
    top-k (not by k, and not by the total number of relevant items), so a
    query whose only hit sits at rank 1 scores 1.0 for every k.

    Args:
        true_label (int): The ground truth label of the query.
        retrieved_labels (List[int]): The list of predicted labels, in rank
            order (index 0 is rank 1).
        k (int): Cutoff rank for evaluation. When fewer than k labels are
            available, all of them are evaluated; k <= 0 yields 0.0.

    Returns:
        float: Average Precision at k, always a built-in ``float``
        (0.0 when the top-k contains no hit).
    """
    precision_values = []
    # max(k, 0): a negative k must give an empty window, not a
    # "drop the last |k| items" slice.
    for rank, label in enumerate(retrieved_labels[:max(k, 0)], start=1):
        if label == true_label:
            # Hits so far (including this one) divided by the current rank.
            precision_values.append((len(precision_values) + 1) / rank)

    if not precision_values:
        return 0.0
    # np.mean returns np.float64; convert so both branches return the same type.
    return float(np.mean(precision_values))


3. Compute Evaluation Metrics

In [3]:
# === CUTOFF RANKS AT WHICH EVERY METRIC IS EVALUATED ===
k_values = [1, 2, 3, 4, 5, 7, 10]

# === PER-QUERY SCORE LISTS, KEYED BY CUTOFF k ===
map_scores    = {cutoff: list() for cutoff in k_values}   # Average Precision at k
topk_ratios   = {cutoff: list() for cutoff in k_values}   # Precision@k (hits in top-k / k)
topk_accuracy = {cutoff: list() for cutoff in k_values}   # 1 if top-k holds a hit, else 0

# === SCORE EVERY QUERY AT EVERY CUTOFF ===
for entry in tqdm(retrieval_data, desc="Computing metrics"):
    query_file = entry["filename"]
    retrieved_files = entry["samples"]

    # Labels are parsed from the filenames of the query and of each result.
    true_label = label_from_filename(query_file)
    retrieved_labels = list(map(label_from_filename, retrieved_files))

    for k in k_values:
        # Count matching labels among the first k results once; Precision@k
        # and Top-k accuracy are both derived from this count.
        top_k = retrieved_labels[:k]
        hit_count = top_k.count(true_label)

        ap = average_precision_at_k(true_label, retrieved_labels, k)
        hit = int(hit_count > 0)

        map_scores[k].append(ap)                 # AP@k for this query
        topk_ratios[k].append(hit_count / k)     # always divided by k itself
        topk_accuracy[k].append(hit)             # at least one correct in top-k


Computing metrics: 100%|██████████| 240/240 [00:00<00:00, 8846.57it/s]


4. Print and Aggregate Results

In [4]:
# === FINAL REPORT =========================================================
# Average each per-query score list once per cutoff (aligned with k_values),
# then print one table per metric.
mean_ap        = [np.mean(map_scores[k]) for k in k_values]
mean_precision = [np.mean(topk_ratios[k]) for k in k_values]
mean_accuracy  = [np.mean(topk_accuracy[k]) for k in k_values]

print(f"\n📊 Mean Average Precision (mAP@k) over {num_queries} queries:")
for k, value in zip(k_values, mean_ap):
    print(f"mAP@{k:<2}: {value:.4f}")

print(f"\n📈 Precision@k (hit count / k):")
for k, value in zip(k_values, mean_precision):
    print(f"P@{k:<2}:   {value:.4f}")

print(f"\n🎯 Top-k Accuracy (at least one correct match in top-k):")
for k, value in zip(k_values, mean_accuracy):
    print(f"Acc@{k:<2}: {value:.4f}")

# === OPTIONAL: UNWEIGHTED MEAN OF THE PER-CUTOFF mAP VALUES ===
overall_mean = np.mean(mean_ap)
print(f"\n📌 Average mAP across all k values: {overall_mean:.4f}")



📊 Mean Average Precision (mAP@k) over 240 queries:
mAP@1 : 0.9417
mAP@2 : 0.9583
mAP@3 : 0.9542
mAP@4 : 0.9528
mAP@5 : 0.9489
mAP@7 : 0.9410
mAP@10: 0.9352

📈 Precision@k (hit count / k):
P@1 :   0.9417
P@2 :   0.9292
P@3 :   0.9222
P@4 :   0.9156
P@5 :   0.9125
P@7 :   0.8994
P@10:   0.8871

🎯 Top-k Accuracy (at least one correct match in top-k):
Acc@1 : 0.9417
Acc@2 : 0.9750
Acc@3 : 0.9750
Acc@4 : 0.9833
Acc@5 : 0.9833
Acc@7 : 0.9875
Acc@10: 0.9958

📌 Average mAP across all k values: 0.9474
