In [2]:
!pip install open_clip_torch torch torchvision



In [3]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"\nGPU Name: {torch.cuda.get_device_name(0)}")


Using device: cuda

GPU Available: True

GPU Name: Tesla P100-PCIE-16GB


In [4]:
import open_clip

print("\nAvailable CLIP models:")
print(open_clip.list_models())




Available CLIP models:
['coca_base', 'coca_roberta-ViT-B-32', 'coca_ViT-B-32', 'coca_ViT-L-14', 'convnext_base', 'convnext_base_w', 'convnext_base_w_320', 'convnext_large', 'convnext_large_d', 'convnext_large_d_320', 'convnext_small', 'convnext_tiny', 'convnext_xlarge', 'convnext_xxlarge', 'convnext_xxlarge_320', 'EVA01-g-14', 'EVA01-g-14-plus', 'EVA02-B-16', 'EVA02-E-14', 'EVA02-E-14-plus', 'EVA02-L-14', 'EVA02-L-14-336', 'MobileCLIP2-B', 'MobileCLIP2-L-14', 'MobileCLIP2-S0', 'MobileCLIP2-S2', 'MobileCLIP2-S3', 'MobileCLIP2-S4', 'MobileCLIP-B', 'MobileCLIP-S1', 'MobileCLIP-S2', 'mt5-base-ViT-B-32', 'mt5-xl-ViT-H-14', 'nllb-clip-base', 'nllb-clip-base-siglip', 'nllb-clip-large', 'nllb-clip-large-siglip', 'PE-Core-B-16', 'PE-Core-bigG-14-448', 'PE-Core-L-14-336', 'PE-Core-S-16-384', 'PE-Core-T-16-384', 'RN50', 'RN50-quickgelu', 'RN50x4', 'RN50x4-quickgelu', 'RN50x16', 'RN50x16-quickgelu', 'RN50x64', 'RN50x64-quickgelu', 'RN101', 'RN101-quickgelu', 'roberta-ViT-B-32', 'swin_base_patch4_

In [5]:
import open_clip

model_name = "ViT-B-32"
print(f"\nLoading CLIP Model: {model_name}")

model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)
model = model.to(device)

print(f"\nModel Loaded Successfully")


Loading CLIP Model: ViT-B-32





Model Loaded Successfully


In [6]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image

class ImageTextDataset(Dataset):
    def __init__(self, csv_path, preprocess):
        df = pd.read_csv(csv_path)
        self.images = df['image_path'].tolist()
        self.texts = df['text'].astype(str).tolist()
        self.preprocess = preprocess

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img = Image.open(self.images[idx]).convert("RGB")
        img = self.preprocess(img)
        txt = self.texts[idx]
        return img, txt

In [7]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df = pd.DataFrame({"image_path": image_paths, "text": texts})
    df.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df = pd.read_csv(train_csv_path)


Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=1.50s)
creating index...
index created!


CSV File Already Exists: dataset_train.csv


In [8]:
df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [9]:
df['image_id'] = df['image_path'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))


In [10]:
df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)


###ICT-Q


In [11]:
# df_subset_ictq = df_subset.drop(columns=["neg_img_idx"])
df_subset_ictq = df_subset

In [12]:
!pip install torch torchvision transformers scikit-learn faiss-cpu pandas pillow




In [23]:
import os
import random
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import pandas as pd
from collections import defaultdict
import faiss

# ----------------------------
# USER CONFIG
# ----------------------------
df = df_subset_ictq.reset_index(drop=True)
assert "image_path" in df.columns and "text" in df.columns

MODEL_NAME = "openai/clip-vit-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

EPOCHS = 7
BATCH_SIZE = 16
LR = 2e-5
WEIGHT_DECAY = 0.01
CLUSTER_EVERY_N_EPOCHS = 1
N_CLUSTERS = 40            # Auto-compute if None (sqrt(n_samples))
N_HARD_NEG_PER_QUERY = 7
SEED = 67
SAVE_DIR = "./clip_ictq_model"

# FAISS-specific parameters
FAISS_NITER = 20             # Number of K-means iterations
FAISS_NREDO = 5              # Number of K-means redos to find best clustering
USE_GPU_FAISS = torch.cuda.is_available()  # Use GPU for FAISS if available

os.makedirs(SAVE_DIR, exist_ok=True)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ----------------------------
# MODEL + PROCESSOR
# ----------------------------
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.train()

# ----------------------------
# DATASET / DATALOADER
# ----------------------------
class CocoSubsetDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx),
        }

dataset = CocoSubsetDataset(df)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, 
                       num_workers=4, pin_memory=True)

# ----------------------------
# FAISS CLUSTERING
# ----------------------------
def compute_n_clusters(n_samples):
    """Auto-compute optimal number of clusters"""
    return max(10, int(np.sqrt(n_samples)))

def build_clusters_faiss(embeddings, n_clusters=None, niter=20, nredo=5, 
                        use_gpu=False, verbose=True, seed=SEED):
    """
    FAISS-based K-Means clustering - extremely fast and scalable.
    
    Args:
        embeddings: numpy array of shape (n_samples, embedding_dim)
        n_clusters: number of clusters (auto-computed if None)
        niter: number of K-means iterations
        nredo: number of random initializations (best one is kept)
        use_gpu: whether to use GPU acceleration
        verbose: print clustering info
        seed: random seed for reproducibility
    
    Returns:
        cluster_ids: numpy array of cluster assignments
    """
    # Ensure float32 (FAISS requirement)
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    # Auto-compute clusters if not specified
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running FAISS K-Means with {n_clusters} clusters...")
        print(f"  Dataset: {n_samples} samples, {d} dimensions")
        print(f"  Parameters: niter={niter}, nredo={nredo}, GPU={use_gpu}")
    
    # Initialize FAISS K-means
    kmeans = faiss.Kmeans(
        d=d,                    # dimension
        k=n_clusters,           # number of clusters
        niter=niter,            # number of iterations
        nredo=nredo,            # number of random initializations
        verbose=verbose,
        gpu=use_gpu,
        seed=seed
    )
    
    # Train the clustering model
    kmeans.train(embeddings)
    
    # Get cluster assignments
    # Search returns (distances, indices) - we only need indices
    _, cluster_ids = kmeans.index.search(embeddings, 1)
    cluster_ids = cluster_ids.flatten()
    
    # Print cluster statistics
    if verbose:
        unique, counts = np.unique(cluster_ids, return_counts=True)
        print(f"  Created {len(unique)} clusters")
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, "
              f"Avg: {counts.mean():.1f}, Std: {counts.std():.1f}")
        print(f"  Inertia (sum of squared distances): {kmeans.obj[-1]:.2f}")
    
    return cluster_ids

def build_clusters_faiss_advanced(embeddings, n_clusters=None, use_spherical=True,
                                  use_gpu=False, verbose=True):
    """
    Advanced FAISS clustering with spherical K-means (better for normalized embeddings).
    Spherical K-means uses cosine similarity instead of Euclidean distance.
    """
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running FAISS Spherical K-Means with {n_clusters} clusters...")
    
    if use_spherical:
        # Normalize embeddings for spherical K-means (if not already normalized)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = embeddings / (norms + 1e-8)
    
    # Initialize K-means
    kmeans = faiss.Kmeans(
        d=d,
        k=n_clusters,
        niter=25,
        nredo=10,
        verbose=verbose,
        gpu=use_gpu,
        spherical=use_spherical,  # Use cosine similarity
        seed=SEED
    )
    
    kmeans.train(embeddings)
    _, cluster_ids = kmeans.index.search(embeddings, 1)
    cluster_ids = cluster_ids.flatten()
    
    if verbose:
        unique, counts = np.unique(cluster_ids, return_counts=True)
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, Avg: {counts.mean():.1f}")
    
    return cluster_ids

def build_clusters_faiss_hierarchical(embeddings, n_clusters=None, use_gpu=False, verbose=True):
    """
    Two-level hierarchical FAISS clustering for better quality.
    First creates coarse clusters, then refines them.
    """
    embeddings = embeddings.astype('float32')
    n_samples, d = embeddings.shape
    
    if n_clusters is None:
        n_clusters = compute_n_clusters(n_samples)
    
    if verbose:
        print(f"  Running Hierarchical FAISS K-Means...")
    
    # Level 1: Coarse clustering
    n_coarse = max(5, n_clusters // 4)
    kmeans_coarse = faiss.Kmeans(d=d, k=n_coarse, niter=15, gpu=use_gpu, seed=SEED)
    kmeans_coarse.train(embeddings)
    _, coarse_ids = kmeans_coarse.index.search(embeddings, 1)
    coarse_ids = coarse_ids.flatten()
    
    # Level 2: Fine clustering within each coarse cluster
    final_cluster_ids = np.zeros(n_samples, dtype=np.int32)
    next_cluster_id = 0
    
    for coarse_cluster in range(n_coarse):
        mask = coarse_ids == coarse_cluster
        cluster_size = mask.sum()
        
        if cluster_size < 10:
            # Too small, assign single cluster
            final_cluster_ids[mask] = next_cluster_id
            next_cluster_id += 1
        else:
            # Sub-cluster
            sub_embeddings = embeddings[mask]
            n_sub_clusters = max(2, int(np.sqrt(cluster_size)))
            
            kmeans_fine = faiss.Kmeans(d=d, k=n_sub_clusters, niter=10, 
                                      gpu=use_gpu, seed=SEED)
            kmeans_fine.train(sub_embeddings)
            _, sub_ids = kmeans_fine.index.search(sub_embeddings, 1)
            
            final_cluster_ids[mask] = sub_ids.flatten() + next_cluster_id
            next_cluster_id += n_sub_clusters
    
    if verbose:
        unique, counts = np.unique(final_cluster_ids, return_counts=True)
        print(f"  Created {len(unique)} clusters from {n_coarse} coarse clusters")
        print(f"  Cluster sizes - Min: {counts.min()}, Max: {counts.max()}, Avg: {counts.mean():.1f}")
    
    return final_cluster_ids

# ----------------------------
# HELPERS
# ----------------------------
@torch.no_grad()
def compute_text_embeddings(model, processor, texts, batch_size=64, device=DEVICE):
    model.eval()
    embs = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = processor(text=batch_texts, return_tensors="pt", 
                          padding=True, truncation=True).to(device)
        text_feats = model.get_text_features(**inputs)
        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
        embs.append(text_feats.cpu().float())
    embs = torch.cat(embs, dim=0)
    model.train()
    return embs.numpy()

def build_cluster_map(cluster_ids):
    cluster_map = defaultdict(list)
    for i, c in enumerate(cluster_ids):
        cluster_map[int(c)].append(i)
    return cluster_map

def sample_hard_negatives(global_idx, cluster_ids, cluster_map, n_hard_neg, df_size):
    """Sample hard negatives with mixed strategy"""
    c = int(cluster_ids[global_idx])
    candidates = [x for x in cluster_map[c] if x != global_idx]
    
    if len(candidates) >= n_hard_neg:
        return random.sample(candidates, n_hard_neg)
    else:
        # Mixed: in-cluster + random
        sampled = candidates.copy()
        remaining = n_hard_neg - len(sampled)
        pool = list(range(df_size))
        pool = [x for x in pool if x != global_idx and x not in sampled]
        sampled.extend(random.sample(pool, min(remaining, len(pool))))
        return sampled

# ----------------------------
# INITIAL CLUSTERING
# ----------------------------
print("Computing initial text embeddings...")
all_texts = df["text"].astype(str).tolist()
text_embs = compute_text_embeddings(model, processor, all_texts, batch_size=64)

print("\nInitial clustering...")
# Choose clustering method:
# Option 1: Standard FAISS K-means (fastest)
# cluster_ids = build_clusters_faiss(text_embs, n_clusters=N_CLUSTERS, 
#                                   niter=FAISS_NITER, nredo=FAISS_NREDO,
#                                   use_gpu=USE_GPU_FAISS, verbose=True)

# Option 2: Spherical K-means (better for normalized embeddings like CLIP)
cluster_ids = build_clusters_faiss_advanced(text_embs, n_clusters=N_CLUSTERS,
                                           use_spherical=True, use_gpu=USE_GPU_FAISS)

# Option 3: Hierarchical (best quality, slower)
# cluster_ids = build_clusters_faiss_hierarchical(text_embs, n_clusters=N_CLUSTERS,
#                                                use_gpu=USE_GPU_FAISS)

cluster_map = build_cluster_map(cluster_ids)

# ----------------------------
# TRAINING LOOP
# ----------------------------
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
temperature = 1.0

global_step = 0
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in pbar:
        image_paths = batch["image_path"]
        texts = batch["text"]
        idxs = batch["idx"].numpy()
        B = len(texts)

        # Sample hard negatives
        hard_neg_indices = []
        for global_idx in idxs:
            sampled = sample_hard_negatives(global_idx, cluster_ids, cluster_map,
                                          N_HARD_NEG_PER_QUERY, len(df))
            hard_neg_indices.extend(sampled)

        # Build text pool
        text_pool_texts = list(texts)
        hard_texts = [df.loc[i, "text"] for i in hard_neg_indices]
        text_pool_texts.extend(hard_texts)

        # Process inputs
        imgs = [Image.open(p).convert("RGB") for p in image_paths]
        inputs = processor(text=text_pool_texts, images=imgs, 
                          return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        # Get features
        outputs = model(**inputs)
        text_features = outputs.text_embeds
        image_features = outputs.image_embeds
        
        # Normalize
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Compute contrastive loss
        logits = image_features @ text_features.t() / temperature
        labels = torch.arange(B, device=DEVICE)
        
        loss_img_to_text = criterion(logits, labels)
        
        logits_t2i = logits.t()
        logits_pos_texts = logits_t2i[:B, :]
        loss_text_to_img = criterion(logits_pos_texts, labels)
        
        loss = (loss_img_to_text + loss_text_to_img) / 2.0

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        global_step += 1
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    avg_loss = epoch_loss / len(dataloader)
    print(f"\nEpoch {epoch+1} finished. Avg loss: {avg_loss:.4f}")

    # Re-cluster
    if (epoch + 1) % CLUSTER_EVERY_N_EPOCHS == 0:
        print(f"\nRecomputing text embeddings & reclustering (epoch {epoch+1})...")
        text_embs = compute_text_embeddings(model, processor, all_texts, batch_size=64)
        
        # Re-cluster with FAISS
        cluster_ids = build_clusters_faiss(text_embs, n_clusters=N_CLUSTERS,
                                          niter=FAISS_NITER, nredo=FAISS_NREDO,
                                          use_gpu=USE_GPU_FAISS, verbose=True)
        cluster_map = build_cluster_map(cluster_ids)
        print("Re-clustering done.\n")

    # Save checkpoint
    ckpt_path = os.path.join(SAVE_DIR, f"checkpoint_epoch_{epoch+1}.pt")
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

# Final save
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("\nTraining complete. Model saved.")

Computing initial text embeddings...

Initial clustering...
  Running FAISS Spherical K-Means with 40 clusters...

Clustering 5000 points in 512D to 40 clusters, redo 10 times, 25 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 10
  Iteration 24 (0.08 s, search 0.07 s): objective=4074.83 imbalance=1.321 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 10
  Iteration 24 (0.14 s, search 0.12 s): objective=4085.57 imbalance=1.236 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 10
  Iteration 24 (0.20 s, search 0.17 s): objective=4081.55 imbalance=1.256 nsplit=0       
Outer iteration 3 / 10
  Iteration 24 (0.25 s, search 0.21 s): objective=4087.77 imbalance=1.236 nsplit=0       
Objective improved: keep new clusters
Outer iteration 4 / 10
  Iteration 24 (0.31 s, search 0.25 s): objective=4070.39 imbalance=1.266 nsplit=0       
Outer iteration 5 / 10
  Iteration 24 (0.36 s, search 0.30 s): objective=4080.22 imbalance=1.275 nsplit=

Epoch 1/7: 100%|██████████| 313/313 [02:17<00:00,  2.28it/s, loss=2.5466]



Epoch 1 finished. Avg loss: 3.1419

Recomputing text embeddings & reclustering (epoch 1)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Objective improved: keep new clusters
Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.07 s, search 0.06 s): objective=607.173 imbalance=1.604 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.11 s, search 0.09 s): objective=643.497 imbalance=1.628 nsplit=0       
Outer iteration 2 / 5
  Iteration 19 (0.16 s, search 0.13 s): objective=620.57 imbalance=1.387 nsplit=0        
Outer iteration 3 / 5
  Iteration 19 (0.20 s, search 0.16 s): objective=720.295 imbalance=1.564 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.20 s): objective=616.691 imbalance=1.538 nsplit=0       
  Cluster sizes - Min: 16, Max: 510, Avg: 1

Epoch 2/7: 100%|██████████| 313/313 [02:15<00:00,  2.31it/s, loss=2.3399]



Epoch 2 finished. Avg loss: 3.0380

Recomputing text embeddings & reclustering (epoch 2)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.05 s, search 0.04 s): objective=574.706 imbalance=1.638 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.09 s, search 0.08 s): objective=551.515 imbalance=1.569 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 19 (0.14 s, search 0.11 s): objective=523.437 imbalance=1.316 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 5
  Iteration 19 (0.18 s, search 0.15 s): objective=676.181 imbalance=1.543 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.18 s): objective=534.253 imbalance=1.447 nsplit=0       
  Clu

Epoch 3/7: 100%|██████████| 313/313 [02:12<00:00,  2.37it/s, loss=2.3839]



Epoch 3 finished. Avg loss: 3.0077

Recomputing text embeddings & reclustering (epoch 3)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.05 s, search 0.04 s): objective=460.382 imbalance=1.499 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.09 s, search 0.07 s): objective=555.001 imbalance=1.604 nsplit=0       
Outer iteration 2 / 5
  Iteration 19 (0.13 s, search 0.11 s): objective=581.293 imbalance=1.419 nsplit=0       
Outer iteration 3 / 5
  Iteration 19 (0.18 s, search 0.14 s): objective=532.145 imbalance=1.404 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.18 s): objective=497.513 imbalance=1.588 nsplit=0       
  Cluster sizes - Min: 4, Max: 351, Avg: 125.0, Std: 88.3
  Inertia (sum of squar

Epoch 4/7: 100%|██████████| 313/313 [02:14<00:00,  2.33it/s, loss=2.3727]



Epoch 4 finished. Avg loss: 3.0083

Recomputing text embeddings & reclustering (epoch 4)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.07 s, search 0.06 s): objective=429.654 imbalance=1.435 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.12 s, search 0.10 s): objective=524.801 imbalance=1.569 nsplit=0       
Outer iteration 2 / 5
  Iteration 19 (0.17 s, search 0.14 s): objective=549.202 imbalance=1.431 nsplit=0       
Outer iteration 3 / 5
  Iteration 19 (0.22 s, search 0.18 s): objective=618.368 imbalance=1.631 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.22 s): objective=497.64 imbalance=1.577 nsplit=0        
  Cluster sizes - Min: 5, Max: 343, Avg: 125.0, Std: 82.5
  Inertia (sum of squar

Epoch 5/7: 100%|██████████| 313/313 [02:17<00:00,  2.28it/s, loss=2.3518]



Epoch 5 finished. Avg loss: 3.0023

Recomputing text embeddings & reclustering (epoch 5)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.05 s, search 0.04 s): objective=463.875 imbalance=1.507 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.10 s, search 0.08 s): objective=449.136 imbalance=1.604 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 19 (0.14 s, search 0.12 s): objective=487.438 imbalance=1.493 nsplit=0       
Outer iteration 3 / 5
  Iteration 19 (0.20 s, search 0.16 s): objective=529.127 imbalance=1.489 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.20 s): objective=459.962 imbalance=1.601 nsplit=0       
  Cluster sizes - Min: 14, Max: 376, Avg: 1

Epoch 6/7: 100%|██████████| 313/313 [02:15<00:00,  2.31it/s, loss=2.4606]



Epoch 6 finished. Avg loss: 2.9801

Recomputing text embeddings & reclustering (epoch 6)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.04 s, search 0.04 s): objective=448.777 imbalance=1.516 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.09 s, search 0.07 s): objective=390.478 imbalance=1.590 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 19 (0.13 s, search 0.10 s): objective=496.862 imbalance=1.586 nsplit=0       
Outer iteration 3 / 5
  Iteration 19 (0.17 s, search 0.14 s): objective=449.994 imbalance=1.450 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.17 s): objective=450.935 imbalance=1.704 nsplit=0       
  Cluster sizes - Min: 13, Max: 378, Avg: 1

Epoch 7/7: 100%|██████████| 313/313 [02:12<00:00,  2.36it/s, loss=2.4875]



Epoch 7 finished. Avg loss: 2.9668

Recomputing text embeddings & reclustering (epoch 7)...
  Running FAISS K-Means with 40 clusters...
  Dataset: 5000 samples, 512 dimensions
  Parameters: niter=20, nredo=5, GPU=True

Clustering 5000 points in 512D to 40 clusters, redo 5 times, 20 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 19 (0.04 s, search 0.04 s): objective=444.546 imbalance=1.577 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 19 (0.09 s, search 0.07 s): objective=437.713 imbalance=1.494 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 19 (0.13 s, search 0.10 s): objective=473.535 imbalance=1.520 nsplit=0       
Outer iteration 3 / 5
  Iteration 19 (0.17 s, search 0.14 s): objective=633.475 imbalance=1.661 nsplit=0       
Outer iteration 4 / 5
  Created 40 clusterss, search 0.18 s): objective=377.989 imbalance=1.518 nsplit=0       
  Cluster sizes - Min: 10, Max: 383, Avg: 1

In [24]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df_og = pd.DataFrame({"image_path": image_paths, "text": texts})
    df_og.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df_og = pd.read_csv(train_csv_path)


Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=0.95s)
creating index...
index created!


CSV File Already Exists: dataset_train.csv


In [25]:
df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [26]:
# Step 1: Remove the train subset from the full dataframe
df_remaining = df_og.drop(df_subset.index)

# Step 2: Sample exactly 1,000 rows for test set
df_test = df_remaining.sample(n=1000, random_state=42)

# Optional: Reset indices
df_test = df_test.reset_index(drop=True)
df_remaining = df_remaining.reset_index(drop=True)

print("Train size:", len(df_subset))
print("Test size:", len(df_test))
print("Remaining (unused):", len(df_remaining))


Train size: 5000
Test size: 1000
Remaining (unused): 113287


In [27]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from tqdm import tqdm

# ----------------------------
# Config
# ----------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32

# Load your trained model
model_path = "./clip_ictq_model"
model = CLIPModel.from_pretrained(model_path).to(DEVICE)
processor = CLIPProcessor.from_pretrained(model_path)
model.eval()

# ----------------------------
# Dataset for testing
# ----------------------------
class TestDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx)
        }

test_dataset = TestDataset(df_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ----------------------------
# Compute embeddings
# ----------------------------
all_image_embeds = []
all_text_embeds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Computing embeddings"):
        # Images
        images = [Image.open(p).convert("RGB") for p in batch["image_path"]]
        texts = batch["text"]

        inputs = processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        outputs = model(**inputs)

        # Normalize embeddings
        img_embeds = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        txt_embeds = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)

        all_image_embeds.append(img_embeds.cpu())
        all_text_embeds.append(txt_embeds.cpu())

# Concatenate embeddings
all_image_embeds = torch.cat(all_image_embeds, dim=0)  # shape: (num_images, dim)
all_text_embeds = torch.cat(all_text_embeds, dim=0)    # shape: (num_texts, dim)

# ----------------------------
# Compute similarity: Text -> Image
# ----------------------------
similarity = all_text_embeds @ all_image_embeds.T  # (num_texts, num_images)
labels = torch.arange(len(df_test))  # ground-truth indices

# ----------------------------
# Compute Text->Image retrieval metrics
# ----------------------------
def compute_retrieval_metrics(similarity, labels):
    num_queries = similarity.size(0)
    ranks = []
    rr = []

    for i in range(num_queries):
        sim_row = similarity[i]  # similarity scores for text i
        sorted_indices = torch.argsort(sim_row, descending=True)
        rank = (sorted_indices == labels[i]).nonzero(as_tuple=True)[0].item()
        ranks.append(rank + 1)
        rr.append(1.0 / (rank + 1))

    ranks = np.array(ranks)
    rr = np.array(rr)

    r1 = np.mean(ranks <= 1)
    r5 = np.mean(ranks <= 5)
    r10 = np.mean(ranks <= 10)
    medr = np.median(ranks)
    mrr = np.mean(rr)

    return {"R@1": r1, "R@5": r5, "R@10": r10, "MedR": medr, "MRR": mrr}

metrics_t2i = compute_retrieval_metrics(similarity, labels)

print("Text -> Image Retrieval Metrics:")
for k, v in metrics_t2i.items():
    if k.startswith("R@"):
        print(f"{k}: {v*100:.2f}%")
    else:
        print(f"{k}: {v:.2f}")


Computing embeddings: 100%|██████████| 32/32 [00:13<00:00,  2.45it/s]


Text -> Image Retrieval Metrics:
R@1: 9.70%
R@5: 29.50%
R@10: 42.80%
MedR: 14.00
MRR: 0.20
