In [2]:
!pip install open_clip_torch torch torchvision



In [3]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"\nGPU Name: {torch.cuda.get_device_name(0)}")


Using device: cuda

GPU Available: True

GPU Name: Tesla P100-PCIE-16GB


In [4]:
import open_clip

print("\nAvailable CLIP models:")
print(open_clip.list_models())




Available CLIP models:
['coca_base', 'coca_roberta-ViT-B-32', 'coca_ViT-B-32', 'coca_ViT-L-14', 'convnext_base', 'convnext_base_w', 'convnext_base_w_320', 'convnext_large', 'convnext_large_d', 'convnext_large_d_320', 'convnext_small', 'convnext_tiny', 'convnext_xlarge', 'convnext_xxlarge', 'convnext_xxlarge_320', 'EVA01-g-14', 'EVA01-g-14-plus', 'EVA02-B-16', 'EVA02-E-14', 'EVA02-E-14-plus', 'EVA02-L-14', 'EVA02-L-14-336', 'MobileCLIP2-B', 'MobileCLIP2-L-14', 'MobileCLIP2-S0', 'MobileCLIP2-S2', 'MobileCLIP2-S3', 'MobileCLIP2-S4', 'MobileCLIP-B', 'MobileCLIP-S1', 'MobileCLIP-S2', 'mt5-base-ViT-B-32', 'mt5-xl-ViT-H-14', 'nllb-clip-base', 'nllb-clip-base-siglip', 'nllb-clip-large', 'nllb-clip-large-siglip', 'PE-Core-B-16', 'PE-Core-bigG-14-448', 'PE-Core-L-14-336', 'PE-Core-S-16-384', 'PE-Core-T-16-384', 'RN50', 'RN50-quickgelu', 'RN50x4', 'RN50x4-quickgelu', 'RN50x16', 'RN50x16-quickgelu', 'RN50x64', 'RN50x64-quickgelu', 'RN101', 'RN101-quickgelu', 'roberta-ViT-B-32', 'swin_base_patch4_

In [5]:
import open_clip

model_name = "ViT-B-32"
print(f"\nLoading CLIP Model: {model_name}")

model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)
model = model.to(device)

print(f"\nModel Loaded Successfully")


Loading CLIP Model: ViT-B-32





Model Loaded Successfully


In [6]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image

class ImageTextDataset(Dataset):
    def __init__(self, csv_path, preprocess):
        df = pd.read_csv(csv_path)
        self.images = df['image_path'].tolist()
        self.texts = df['text'].astype(str).tolist()
        self.preprocess = preprocess

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img = Image.open(self.images[idx]).convert("RGB")
        img = self.preprocess(img)
        txt = self.texts[idx]
        return img, txt

In [7]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df = pd.DataFrame({"image_path": image_paths, "text": texts})
    df.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df = pd.read_csv(train_csv_path)


Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=1.50s)
creating index...
index created!


CSV File Already Exists: dataset_train.csv


In [8]:
df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [9]:
df['image_id'] = df['image_path'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))


In [10]:
df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)


###ICT-Q


In [11]:
# df_subset_ictq = df_subset.drop(columns=["neg_img_idx"])
df_subset_ictq = df_subset

In [12]:
!pip install torch torchvision transformers scikit-learn faiss-cpu pandas pillow




In [28]:
import os
import random
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity

# ----------------------------
# USER CONFIG
# ----------------------------
df = df_subset_ictq.reset_index(drop=True)
assert "image_path" in df.columns and "text" in df.columns

MODEL_NAME = "openai/clip-vit-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

EPOCHS = 7
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01
CLUSTER_EVERY_N_EPOCHS = 1
N_CLUSTERS = None            # Auto-compute if None (sqrt(n_samples))
N_HARD_NEG_PER_QUERY = 7
SEED = 67
SAVE_DIR = "./clip_ictq_model"

# Spectral clustering parameters
SPECTRAL_N_NEIGHBORS = 10    # Number of neighbors for affinity matrix
SPECTRAL_AFFINITY = 'nearest_neighbors'  # 'rbf', 'nearest_neighbors', or 'precomputed'
SPECTRAL_ASSIGN_LABELS = 'kmeans'  # 'kmeans' or 'discretize'
SPECTRAL_N_INIT = 10         # Number of k-means initializations

os.makedirs(SAVE_DIR, exist_ok=True)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ----------------------------
# MODEL + PROCESSOR
# ----------------------------
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.train()

# ----------------------------
# DATASET / DATALOADER
# ----------------------------
class CocoSubsetDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx),
        }

dataset = CocoSubsetDataset(df)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, 
                       num_workers=4, pin_memory=True)

# ----------------------------
# SPECTRAL CLUSTERING
# ----------------------------
def compute_n_clusters(n_samples):
    """Auto-compute optimal number of clusters"""
    return max(10, int(np.sqrt(n_samples)))

def build_clusters_spectral(embeddings, n_clusters=None, n_neighbors=10, 
                           affinity='nearest_neighbors', assign_labels='kmeans',
                           n_init=10, verbose=True, seed=SEED):
    """
    Spectral clustering - finds non-convex clusters using graph-based methods.
    Works well with similarity matrices and can capture complex cluster shapes.
    
    Args:
        embeddings: numpy array of shape (n_samples, embedding_dim)
        n_clusters: number of clusters (auto-computed if None)
        n_neighbors: number of neighbors for constructing affinity graph
        affinity: how to construct affinity matrix
                 - 'nearest_neighbors': sparse graph based on k-nearest neighbors
                 - 'rbf': Gaussian (RBF) kernel
                 - 'precomputed': provide similarity matrix directly
        assign_labels: strategy for assigning labels from eigenvectors
                      - 'kmeans': use k-means (more stable)
                      - 'discretize': discretization method (faster)
        n_init: number of k-means initializations (if assign_labels='kmeans')
        verbose: print clustering info
        seed: random seed for reproducibility
    
    Returns:
        cluster_ids: numpy array of cluster assignments
    """
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    # Auto-compute clusters if not specified
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running Spectral Clustering with {n_clusters} clusters...")
        print(f"  Dataset: {n_samples} samples, {d} dimensions")
        print(f"  Parameters: n_neighbors={n_neighbors}, affinity={affinity}, "
              f"assign_labels={assign_labels}")
    
    # Initialize Spectral Clustering
    spectral = SpectralClustering(
        n_clusters=n_clusters,
        n_neighbors=n_neighbors,
        affinity=affinity,
        assign_labels=assign_labels,
        n_init=n_init,
        random_state=seed,
        n_jobs=-1  # Use all CPU cores
    )
    
    # Fit and predict cluster assignments
    cluster_ids = spectral.fit_predict(embeddings)
    
    # Print cluster statistics
    if verbose:
        unique, counts = np.unique(cluster_ids, return_counts=True)
        print(f"  Created {len(unique)} clusters")
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, "
              f"Avg: {counts.mean():.1f}, Std: {counts.std():.1f}")
    
    return cluster_ids

def build_clusters_spectral_cosine(embeddings, n_clusters=None, n_neighbors=10,
                                   assign_labels='kmeans', n_init=10,
                                   verbose=True, seed=SEED):
    """
    Spectral clustering using cosine similarity affinity matrix.
    Best for normalized embeddings like CLIP where cosine similarity is meaningful.
    
    This precomputes the full cosine similarity matrix, which gives better
    results but is more memory-intensive. For large datasets, use
    build_clusters_spectral with affinity='nearest_neighbors' instead.
    """
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running Spectral Clustering (Cosine Affinity) with {n_clusters} clusters...")
        print(f"  Computing cosine similarity matrix ({n_samples}x{n_samples})...")
    
    # Normalize embeddings
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings_normalized = embeddings / (norms + 1e-8)
    
    # Compute cosine similarity matrix
    affinity_matrix = cosine_similarity(embeddings_normalized)
    
    # Convert to affinity (ensure non-negative and in [0,1])
    affinity_matrix = (affinity_matrix + 1) / 2  # Scale from [-1,1] to [0,1]
    
    if verbose:
        print(f"  Affinity matrix computed. Running spectral clustering...")
    
    # Initialize Spectral Clustering with precomputed affinity
    spectral = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        assign_labels=assign_labels,
        n_init=n_init,
        random_state=seed,
        n_jobs=-1
    )
    
    # Fit and predict
    cluster_ids = spectral.fit_predict(affinity_matrix)
    
    if verbose:
        unique, counts = np.unique(cluster_ids, return_counts=True)
        print(f"  Created {len(unique)} clusters")
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, "
              f"Avg: {counts.mean():.1f}, Std: {counts.std():.1f}")
    
    return cluster_ids

def build_clusters_spectral_sparse(embeddings, n_clusters=None, n_neighbors=15,
                                   similarity_threshold=0.5, verbose=True, seed=SEED):
    """
    Memory-efficient spectral clustering using sparse affinity matrix.
    Good for large datasets where full similarity matrix doesn't fit in memory.
    
    Creates a sparse graph by keeping only:
    1. k-nearest neighbors for each point
    2. Connections above similarity threshold
    """
    from scipy.sparse import csr_matrix
    from sklearn.neighbors import NearestNeighbors
    
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running Sparse Spectral Clustering with {n_clusters} clusters...")
        print(f"  Building sparse affinity graph...")
    
    # Normalize embeddings for cosine similarity
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings_normalized = embeddings / (norms + 1e-8)
    
    # Find k-nearest neighbors using cosine similarity
    nbrs = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine', 
                            n_jobs=-1).fit(embeddings_normalized)
    distances, indices = nbrs.kneighbors(embeddings_normalized)
    
    # Convert distances to similarities and build sparse matrix
    similarities = 1 - distances  # cosine distance to similarity
    
    # Build sparse affinity matrix
    row_indices = np.repeat(np.arange(n_samples), n_neighbors+1)
    col_indices = indices.flatten()
    data = similarities.flatten()
    
    # Filter by threshold
    mask = data >= similarity_threshold
    affinity_matrix = csr_matrix(
        (data[mask], (row_indices[mask], col_indices[mask])),
        shape=(n_samples, n_samples)
    )
    
    # Make symmetric
    affinity_matrix = (affinity_matrix + affinity_matrix.T) / 2
    
    if verbose:
        nnz = affinity_matrix.nnz
        density = nnz / (n_samples * n_samples) * 100
        print(f"  Sparse matrix: {nnz} non-zero entries ({density:.2f}% density)")
    
    # Spectral clustering
    spectral = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        assign_labels='kmeans',
        n_init=10,
        random_state=seed,
        n_jobs=-1
    )
    
    cluster_ids = spectral.fit_predict(affinity_matrix)
    
    if verbose:
        unique, counts = np.unique(cluster_ids, return_counts=True)
        print(f"  Created {len(unique)} clusters")
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, "
              f"Avg: {counts.mean():.1f}")
    
    return cluster_ids

# ----------------------------
# HELPERS
# ----------------------------
@torch.no_grad()
def compute_text_embeddings(model, processor, texts, batch_size=64, device=DEVICE):
    model.eval()
    embs = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = processor(text=batch_texts, return_tensors="pt", 
                          padding=True, truncation=True).to(device)
        text_feats = model.get_text_features(**inputs)
        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
        embs.append(text_feats.cpu().float())
    embs = torch.cat(embs, dim=0)
    model.train()
    return embs.numpy()

def build_cluster_map(cluster_ids):
    cluster_map = defaultdict(list)
    for i, c in enumerate(cluster_ids):
        cluster_map[int(c)].append(i)
    return cluster_map

def sample_hard_negatives(global_idx, cluster_ids, cluster_map, n_hard_neg, df_size):
    """Sample hard negatives with mixed strategy"""
    c = int(cluster_ids[global_idx])
    candidates = [x for x in cluster_map[c] if x != global_idx]
    
    if len(candidates) >= n_hard_neg:
        return random.sample(candidates, n_hard_neg)
    else:
        # Mixed: in-cluster + random
        sampled = candidates.copy()
        remaining = n_hard_neg - len(sampled)
        pool = list(range(df_size))
        pool = [x for x in pool if x != global_idx and x not in sampled]
        sampled.extend(random.sample(pool, min(remaining, len(pool))))
        return sampled

# ----------------------------
# INITIAL CLUSTERING
# ----------------------------
print("Computing initial text embeddings...")
all_texts = df["text"].astype(str).tolist()
text_embs = compute_text_embeddings(model, processor, all_texts, batch_size=64)

print("\nInitial clustering...")
# Choose clustering method:

# Option 1: Standard spectral clustering with nearest neighbors (recommended)
# Fast and memory-efficient, works well for large datasets
cluster_ids = build_clusters_spectral(
    text_embs, 
    n_clusters=N_CLUSTERS,
    n_neighbors=SPECTRAL_N_NEIGHBORS,
    affinity=SPECTRAL_AFFINITY,
    assign_labels=SPECTRAL_ASSIGN_LABELS,
    n_init=SPECTRAL_N_INIT,
    verbose=True
)

# Option 2: Spectral with full cosine similarity (best quality, memory-intensive)
# Use for smaller datasets (<10k samples)
# cluster_ids = build_clusters_spectral_cosine(
#     text_embs,
#     n_clusters=N_CLUSTERS,
#     n_neighbors=SPECTRAL_N_NEIGHBORS,
#     assign_labels=SPECTRAL_ASSIGN_LABELS,
#     verbose=True
# )

# Option 3: Sparse spectral clustering (most memory-efficient)
# Use for very large datasets (>50k samples)
# cluster_ids = build_clusters_spectral_sparse(
#     text_embs,
#     n_clusters=N_CLUSTERS,
#     n_neighbors=SPECTRAL_N_NEIGHBORS,
#     similarity_threshold=0.5,
#     verbose=True
# )

cluster_map = build_cluster_map(cluster_ids)

# ----------------------------
# TRAINING LOOP
# ----------------------------
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
temperature = 1.0

global_step = 0
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in pbar:
        image_paths = batch["image_path"]
        texts = batch["text"]
        idxs = batch["idx"].numpy()
        B = len(texts)

        # Sample hard negatives
        hard_neg_indices = []
        for global_idx in idxs:
            sampled = sample_hard_negatives(global_idx, cluster_ids, cluster_map,
                                          N_HARD_NEG_PER_QUERY, len(df))
            hard_neg_indices.extend(sampled)

        # Build text pool
        text_pool_texts = list(texts)
        hard_texts = [df.loc[i, "text"] for i in hard_neg_indices]
        text_pool_texts.extend(hard_texts)

        # Process inputs
        imgs = [Image.open(p).convert("RGB") for p in image_paths]
        inputs = processor(text=text_pool_texts, images=imgs, 
                          return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        # Get features
        outputs = model(**inputs)
        text_features = outputs.text_embeds
        image_features = outputs.image_embeds
        
        # Normalize
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Compute contrastive loss
        logits = image_features @ text_features.t() / temperature
        labels = torch.arange(B, device=DEVICE)
        
        loss_img_to_text = criterion(logits, labels)
        
        logits_t2i = logits.t()
        logits_pos_texts = logits_t2i[:B, :]
        loss_text_to_img = criterion(logits_pos_texts, labels)
        
        loss = (loss_img_to_text + loss_text_to_img) / 2.0

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        global_step += 1
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    avg_loss = epoch_loss / len(dataloader)
    print(f"\nEpoch {epoch+1} finished. Avg loss: {avg_loss:.4f}")

    # Re-cluster
    if (epoch + 1) % CLUSTER_EVERY_N_EPOCHS == 0:
        print(f"\nRecomputing text embeddings & reclustering (epoch {epoch+1})...")
        text_embs = compute_text_embeddings(model, processor, all_texts, batch_size=64)
        
        # Re-cluster with spectral clustering
        cluster_ids = build_clusters_spectral(
            text_embs,
            n_clusters=N_CLUSTERS,
            n_neighbors=SPECTRAL_N_NEIGHBORS,
            affinity=SPECTRAL_AFFINITY,
            assign_labels=SPECTRAL_ASSIGN_LABELS,
            n_init=SPECTRAL_N_INIT,
            verbose=True
        )
        cluster_map = build_cluster_map(cluster_ids)
        print("Re-clustering done.\n")

    # Save checkpoint
    ckpt_path = os.path.join(SAVE_DIR, f"checkpoint_epoch_{epoch+1}.pt")
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

# Final save
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("\nTraining complete. Model saved.")

Computing initial text embeddings...

Initial clustering...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans
  Created 70 clusters
  Cluster sizes - Min: 13, Max: 794, Avg: 71.4, Std: 93.2


Epoch 1/7: 100%|██████████| 313/313 [02:12<00:00,  2.37it/s, loss=2.5717]



Epoch 1 finished. Avg loss: 3.1399

Recomputing text embeddings & reclustering (epoch 1)...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans
  Created 70 clusters
  Cluster sizes - Min: 15, Max: 229, Avg: 71.4, Std: 45.2
Re-clustering done.

Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_1.pt


Epoch 4/7: 100%|██████████| 313/313 [02:12<00:00,  2.35it/s, loss=2.3704]



Epoch 4 finished. Avg loss: 2.9934

Recomputing text embeddings & reclustering (epoch 4)...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans
  Created 70 clusters
  Cluster sizes - Min: 12, Max: 213, Avg: 71.4, Std: 43.0
Re-clustering done.

Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_4.pt


Epoch 5/7: 100%|██████████| 313/313 [02:13<00:00,  2.34it/s, loss=2.3382]



Epoch 5 finished. Avg loss: 3.0016

Recomputing text embeddings & reclustering (epoch 5)...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans




  Created 70 clusters
  Cluster sizes - Min: 12, Max: 263, Avg: 71.4, Std: 45.8
Re-clustering done.

Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_5.pt


Epoch 6/7: 100%|██████████| 313/313 [02:11<00:00,  2.38it/s, loss=2.4527]



Epoch 6 finished. Avg loss: 2.9963

Recomputing text embeddings & reclustering (epoch 6)...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans
  Created 70 clusters
  Cluster sizes - Min: 12, Max: 216, Avg: 71.4, Std: 42.1
Re-clustering done.

Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_6.pt


Epoch 7/7: 100%|██████████| 313/313 [02:14<00:00,  2.33it/s, loss=2.4362]



Epoch 7 finished. Avg loss: 2.9940

Recomputing text embeddings & reclustering (epoch 7)...
  Running Spectral Clustering with 70 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: n_neighbors=10, affinity=nearest_neighbors, assign_labels=kmeans




  Created 70 clusters
  Cluster sizes - Min: 10, Max: 237, Avg: 71.4, Std: 46.4
Re-clustering done.

Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_7.pt

Training complete. Model saved.


In [29]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df_og = pd.DataFrame({"image_path": image_paths, "text": texts})
    df_og.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df_og = pd.read_csv(train_csv_path)


Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=0.97s)
creating index...
index created!


CSV File Already Exists: dataset_train.csv


In [30]:
df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [31]:
# Step 1: Remove the train subset from the full dataframe
df_remaining = df_og.drop(df_subset.index)

# Step 2: Sample exactly 1,000 rows for test set
df_test = df_remaining.sample(n=1000, random_state=42)

# Optional: Reset indices
df_test = df_test.reset_index(drop=True)
df_remaining = df_remaining.reset_index(drop=True)

print("Train size:", len(df_subset))
print("Test size:", len(df_test))
print("Remaining (unused):", len(df_remaining))


Train size: 5000
Test size: 1000
Remaining (unused): 113287


In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from tqdm import tqdm

# ----------------------------
# Config
# ----------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32

# Load your trained model
model_path = "./clip_ictq_model"
model = CLIPModel.from_pretrained(model_path).to(DEVICE)
processor = CLIPProcessor.from_pretrained(model_path)
model.eval()

# ----------------------------
# Dataset for testing
# ----------------------------
class TestDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx)
        }

test_dataset = TestDataset(df_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ----------------------------
# Compute embeddings
# ----------------------------
all_image_embeds = []
all_text_embeds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Computing embeddings"):
        # Images
        images = [Image.open(p).convert("RGB") for p in batch["image_path"]]
        texts = batch["text"]

        inputs = processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        outputs = model(**inputs)

        # Normalize embeddings
        img_embeds = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        txt_embeds = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)

        all_image_embeds.append(img_embeds.cpu())
        all_text_embeds.append(txt_embeds.cpu())

# Concatenate embeddings
all_image_embeds = torch.cat(all_image_embeds, dim=0)  # shape: (num_images, dim)
all_text_embeds = torch.cat(all_text_embeds, dim=0)    # shape: (num_texts, dim)

# ----------------------------
# Compute similarity: Text -> Image
# ----------------------------
similarity = all_text_embeds @ all_image_embeds.T  # (num_texts, num_images)
labels = torch.arange(len(df_test))  # ground-truth indices

# ----------------------------
# Compute Text->Image retrieval metrics
# ----------------------------
def compute_retrieval_metrics(similarity, labels):
    num_queries = similarity.size(0)
    ranks = []
    rr = []

    for i in range(num_queries):
        sim_row = similarity[i]  # similarity scores for text i
        sorted_indices = torch.argsort(sim_row, descending=True)
        rank = (sorted_indices == labels[i]).nonzero(as_tuple=True)[0].item()
        ranks.append(rank + 1)
        rr.append(1.0 / (rank + 1))

    ranks = np.array(ranks)
    rr = np.array(rr)

    r1 = np.mean(ranks <= 1)
    r5 = np.mean(ranks <= 5)
    r10 = np.mean(ranks <= 10)
    medr = np.median(ranks)
    mrr = np.mean(rr)

    return {"R@1": r1, "R@5": r5, "R@10": r10, "MedR": medr, "MRR": mrr}

metrics_t2i = compute_retrieval_metrics(similarity, labels)

print("Text -> Image Retrieval Metrics:")
for k, v in metrics_t2i.items():
    if k.startswith("R@"):
        print(f"{k}: {v*100:.2f}%")
    else:
        print(f"{k}: {v:.2f}")


Computing embeddings: 100%|██████████| 32/32 [00:12<00:00,  2.50it/s]

Text -> Image Retrieval Metrics:
R@1: 9.50%
R@5: 29.30%
R@10: 44.00%
MedR: 13.00
MRR: 0.20



