In [1]:
!pip install open_clip_torch torch torchvision

Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0


In [2]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"\nGPU Name: {torch.cuda.get_device_name(0)}")


Using device: cuda

GPU Available: True

GPU Name: Tesla T4


In [3]:
import open_clip

print("\nAvailable CLIP models:")
print(open_clip.list_models())


Available CLIP models:
['coca_base', 'coca_roberta-ViT-B-32', 'coca_ViT-B-32', 'coca_ViT-L-14', 'convnext_base', 'convnext_base_w', 'convnext_base_w_320', 'convnext_large', 'convnext_large_d', 'convnext_large_d_320', 'convnext_small', 'convnext_tiny', 'convnext_xlarge', 'convnext_xxlarge', 'convnext_xxlarge_320', 'EVA01-g-14', 'EVA01-g-14-plus', 'EVA02-B-16', 'EVA02-E-14', 'EVA02-E-14-plus', 'EVA02-L-14', 'EVA02-L-14-336', 'MobileCLIP2-B', 'MobileCLIP2-L-14', 'MobileCLIP2-S0', 'MobileCLIP2-S2', 'MobileCLIP2-S3', 'MobileCLIP2-S4', 'MobileCLIP-B', 'MobileCLIP-S1', 'MobileCLIP-S2', 'mt5-base-ViT-B-32', 'mt5-xl-ViT-H-14', 'nllb-clip-base', 'nllb-clip-base-siglip', 'nllb-clip-large', 'nllb-clip-large-siglip', 'PE-Core-B-16', 'PE-Core-bigG-14-448', 'PE-Core-L-14-336', 'PE-Core-S-16-384', 'PE-Core-T-16-384', 'RN50', 'RN50-quickgelu', 'RN50x4', 'RN50x4-quickgelu', 'RN50x16', 'RN50x16-quickgelu', 'RN50x64', 'RN50x64-quickgelu', 'RN101', 'RN101-quickgelu', 'roberta-ViT-B-32', 'swin_base_patch4_

In [4]:
import open_clip

model_name = "ViT-B-32"
print(f"\nLoading CLIP Model: {model_name}")

model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)
model = model.to(device)

print(f"\nModel Loaded Successfully")


Loading CLIP Model: ViT-B-32


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]




Model Loaded Successfully


In [5]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image

class ImageTextDataset(Dataset):
    def __init__(self, csv_path, preprocess):
        df = pd.read_csv(csv_path)
        self.images = df['image_path'].tolist()
        self.texts = df['text'].astype(str).tolist()
        self.preprocess = preprocess

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img = Image.open(self.images[idx]).convert("RGB")
        img = self.preprocess(img)
        txt = self.texts[idx]
        return img, txt

In [6]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df = pd.DataFrame({"image_path": image_paths, "text": texts})
    df.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df = pd.read_csv(train_csv_path)

Using Colab cache for faster access to the 'coco-2017-dataset' dataset.

Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=1.90s)
creating index...
index created!



Processing COCO Captions: 100%|██████████| 118287/118287 [22:06<00:00, 89.18it/s]



Saved CSV To: dataset_train.csv


In [7]:
df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df = df.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [8]:
df['image_id'] = df['image_path'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))


In [9]:
df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)


###ICT-Q


In [10]:
# df_subset_ictq = df_subset.drop(columns=["neg_img_idx"])
df_subset_ictq = df_subset

In [11]:
!pip install torch torchvision transformers scikit-learn faiss-cpu pandas pillow


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [18]:
import os
import random
from tqdm import tqdm
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
from collections import defaultdict

# ----------------------------
# USER CONFIG
# ----------------------------
df = df_subset_ictq.reset_index(drop=True)
assert "image_path" in df.columns and "text" in df.columns

MODEL_NAME = "openai/clip-vit-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

EPOCHS = 7
BATCH_SIZE = 32  # Increased for better GPU utilization
LR = 2e-5
WEIGHT_DECAY = 0.01
CLUSTER_EVERY_N_EPOCHS = 1  # Reduced frequency - clusters don't change much each epoch
N_CLUSTERS = 40
N_HARD_NEG_PER_QUERY = 7
SEED = 67
SAVE_DIR = "./clip_ictq_model"
os.makedirs(SAVE_DIR, exist_ok=True)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ----------------------------
# PRE-CACHE DATA
# ----------------------------
all_texts_list = df["text"].astype(str).tolist()  # Cache as list for fast lookup

# ----------------------------
# MODEL + PROCESSOR
# ----------------------------
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.train()

# ----------------------------
# DATASET
# ----------------------------
class CocoSubsetDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx),
        }

dataset = CocoSubsetDataset(df)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    prefetch_factor=2  # Prefetch batches
)

# ----------------------------
# HELPERS
# ----------------------------
@torch.no_grad()
def compute_text_embeddings(model, processor, texts, batch_size=128, device=DEVICE):
    model.eval()
    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings"):
        batch_texts = texts[i:i+batch_size]
        inputs = processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        text_feats = model.get_text_features(**inputs)
        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
        embs.append(text_feats.cpu())
    embs = torch.cat(embs, dim=0)
    model.train()
    return embs.numpy()

def build_clusters(embeddings, n_clusters=N_CLUSTERS):
    kmeans = MiniBatchKMeans(
        n_clusters=min(n_clusters, max(2, embeddings.shape[0]//5)),
        random_state=SEED,
        batch_size=2000,  # Increased batch size
        n_init=3
    )
    cluster_ids = kmeans.fit_predict(embeddings)
    return cluster_ids

def build_cluster_map(cluster_ids):
    cluster_map = defaultdict(list)
    for i, c in enumerate(cluster_ids):
        cluster_map[int(c)].append(i)
    return cluster_map

# ----------------------------
# INITIAL CLUSTERING
# ----------------------------
print("Computing initial text embeddings...")
text_embs = compute_text_embeddings(model, processor, all_texts_list, batch_size=128)
cluster_ids = build_clusters(text_embs, n_clusters=N_CLUSTERS)
cluster_map = build_cluster_map(cluster_ids)
print(f"Created {len(cluster_map)} clusters")

# ----------------------------
# TRAINING
# ----------------------------
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
temperature = 1.0

global_step = 0
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for batch in pbar:
        image_paths = batch["image_path"]
        texts = batch["text"]
        idxs = batch["idx"].numpy()
        B = len(texts)

        # Collect hard negative indices
        hard_neg_indices = []
        for i, global_idx in enumerate(idxs):
            c = int(cluster_ids[global_idx])
            candidates = [x for x in cluster_map[c] if x != global_idx]

            if len(candidates) >= N_HARD_NEG_PER_QUERY:
                sampled = random.sample(candidates, N_HARD_NEG_PER_QUERY)
            else:
                pool = [x for x in range(len(df)) if x != global_idx]
                sampled = random.sample(pool, N_HARD_NEG_PER_QUERY)
            hard_neg_indices.extend(sampled)

        # Build text pool - use cached list instead of df.loc
        text_pool_texts = list(texts)  # positives
        text_pool_texts.extend([all_texts_list[i] for i in hard_neg_indices])  # hard negatives

        # Load images
        imgs = [Image.open(p).convert("RGB") for p in image_paths]

        # Process inputs
        inputs = processor(
            text=text_pool_texts,
            images=imgs,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        # Forward pass
        outputs = model(**inputs)
        text_features = outputs.text_embeds
        image_features = outputs.image_embeds

        # Normalize
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Compute logits
        logits = image_features @ text_features.t() / temperature
        labels = torch.arange(B, device=DEVICE)

        # Bidirectional loss
        loss_img_to_text = criterion(logits, labels)
        logits_t2i = logits.t()
        logits_pos_texts = logits_t2i[:B, :]
        loss_text_to_img = criterion(logits_pos_texts, labels)
        loss = (loss_img_to_text + loss_text_to_img) / 2.0

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        global_step += 1
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1} finished. Avg loss: {avg_loss:.4f}")

    # Re-cluster periodically
    if (epoch + 1) % CLUSTER_EVERY_N_EPOCHS == 0 and (epoch + 1) < EPOCHS:
        print(f"Recomputing text embeddings & reclustering...")
        text_embs = compute_text_embeddings(model, processor, all_texts_list, batch_size=128)
        cluster_ids = build_clusters(text_embs, n_clusters=N_CLUSTERS)
        cluster_map = build_cluster_map(cluster_ids)
        print(f"Re-clustering done. {len(cluster_map)} clusters")

    # Save checkpoint
    ckpt_path = os.path.join(SAVE_DIR, f"checkpoint_epoch_{epoch+1}.pt")
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

# Final save
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("Training complete. Model saved.")



Computing initial text embeddings...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 11.54it/s]


Created 40 clusters


Epoch 1/7: 100%|██████████| 157/157 [03:01<00:00,  1.15s/it, loss=2.5672]


Epoch 1 finished. Avg loss: 3.8044
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 10.86it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_1.pt


Epoch 2/7: 100%|██████████| 157/157 [03:03<00:00,  1.17s/it, loss=2.3328]


Epoch 2 finished. Avg loss: 3.6721
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 10.73it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_2.pt


Epoch 3/7: 100%|██████████| 157/157 [03:02<00:00,  1.16s/it, loss=2.3476]


Epoch 3 finished. Avg loss: 3.6389
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 10.84it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_3.pt


Epoch 4/7: 100%|██████████| 157/157 [03:04<00:00,  1.17s/it, loss=2.3567]


Epoch 4 finished. Avg loss: 3.6278
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 11.00it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_4.pt


Epoch 5/7: 100%|██████████| 157/157 [03:05<00:00,  1.18s/it, loss=2.3106]


Epoch 5 finished. Avg loss: 3.6326
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 10.53it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_5.pt


Epoch 6/7: 100%|██████████| 157/157 [03:05<00:00,  1.18s/it, loss=2.3897]


Epoch 6 finished. Avg loss: 3.6172
Recomputing text embeddings & reclustering...


Computing embeddings: 100%|██████████| 40/40 [00:03<00:00, 10.99it/s]


Re-clustering done. 40 clusters
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_6.pt


Epoch 7/7: 100%|██████████| 157/157 [03:04<00:00,  1.17s/it, loss=2.3417]


Epoch 7 finished. Avg loss: 3.6050
Saved checkpoint to ./clip_ictq_model/checkpoint_epoch_7.pt
Training complete. Model saved.


In [26]:
from torchvision.datasets import CocoCaptions
import os
import pandas as pd
from tqdm import tqdm
import kagglehub

path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print(f"\nPath to Dataset Files: {path}")

coco2017_path = os.path.join(path, "coco2017")
coco_root = os.path.join(coco2017_path, "train2017")
ann_file = os.path.join(coco2017_path, "annotations/captions_train2017.json")
train_csv_path = "dataset_train.csv"

print()
coco = CocoCaptions(root=coco_root, annFile=ann_file)
print()

if not os.path.exists(train_csv_path):
    image_paths = []
    texts = []

    for idx in tqdm(range(len(coco)), desc="Processing COCO Captions"):
        img, captions = coco[idx]
        img_path = os.path.join(coco_root, coco.coco.imgs[coco.ids[idx]]['file_name'])
        for caption in captions:
            image_paths.append(img_path)
            texts.append(caption)

    df_og = pd.DataFrame({"image_path": image_paths, "text": texts})
    df_og.to_csv(train_csv_path, index=False)
    print(f"\nSaved CSV To: {train_csv_path}")
else:
    print(f"\nCSV File Already Exists: {train_csv_path}")
    df_og = pd.read_csv(train_csv_path)

Using Colab cache for faster access to the 'coco-2017-dataset' dataset.

Path to Dataset Files: /kaggle/input/coco-2017-dataset

loading annotations into memory...
Done (t=0.91s)
creating index...
index created!


CSV File Already Exists: dataset_train.csv


In [27]:
df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)

  df_og = df_og.groupby('image_path').apply(lambda x: x.sample(1)).reset_index(drop=True)


In [28]:
# Step 1: Remove the train subset from the full dataframe
df_remaining = df_og.drop(df_subset.index)

# Step 2: Sample exactly 1,000 rows for test set
df_test = df_remaining.sample(n=1000, random_state=42)

# Optional: Reset indices
df_test = df_test.reset_index(drop=True)
df_remaining = df_remaining.reset_index(drop=True)

print("Train size:", len(df_subset))
print("Test size:", len(df_test))
print("Remaining (unused):", len(df_remaining))


Train size: 5000
Test size: 1000
Remaining (unused): 113287


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from tqdm import tqdm

# ----------------------------
# Config
# ----------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32

# Load your trained model
model_path = "./clip_ictq_model"
model = CLIPModel.from_pretrained(model_path).to(DEVICE)
processor = CLIPProcessor.from_pretrained(model_path)
model.eval()

# ----------------------------
# Dataset for testing
# ----------------------------
class TestDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "image_path": row["image_path"],
            "text": str(row["text"]),
            "idx": int(idx)
        }

test_dataset = TestDataset(df_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ----------------------------
# Compute embeddings
# ----------------------------
all_image_embeds = []
all_text_embeds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Computing embeddings"):
        # Images
        images = [Image.open(p).convert("RGB") for p in batch["image_path"]]
        texts = batch["text"]

        inputs = processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        outputs = model(**inputs)

        # Normalize embeddings
        img_embeds = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        txt_embeds = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)

        all_image_embeds.append(img_embeds.cpu())
        all_text_embeds.append(txt_embeds.cpu())

# Concatenate embeddings
all_image_embeds = torch.cat(all_image_embeds, dim=0)  # shape: (num_images, dim)
all_text_embeds = torch.cat(all_text_embeds, dim=0)    # shape: (num_texts, dim)

# ----------------------------
# Compute similarity: Text -> Image
# ----------------------------
similarity = all_text_embeds @ all_image_embeds.T  # (num_texts, num_images)
labels = torch.arange(len(df_test))  # ground-truth indices

# ----------------------------
# Compute Text->Image retrieval metrics
# ----------------------------
def compute_retrieval_metrics(similarity, labels):
    num_queries = similarity.size(0)
    ranks = []
    rr = []

    for i in range(num_queries):
        sim_row = similarity[i]  # similarity scores for text i
        sorted_indices = torch.argsort(sim_row, descending=True)
        rank = (sorted_indices == labels[i]).nonzero(as_tuple=True)[0].item()
        ranks.append(rank + 1)
        rr.append(1.0 / (rank + 1))

    ranks = np.array(ranks)
    rr = np.array(rr)

    r1 = np.mean(ranks <= 1)
    r5 = np.mean(ranks <= 5)
    r10 = np.mean(ranks <= 10)
    medr = np.median(ranks)
    mrr = np.mean(rr)

    return {"R@1": r1, "R@5": r5, "R@10": r10, "MedR": medr, "MRR": mrr}

metrics_t2i = compute_retrieval_metrics(similarity, labels)

print("Text -> Image Retrieval Metrics:")
for k, v in metrics_t2i.items():
    if k.startswith("R@"):
        print(f"{k}: {v*100:.2f}%")
    else:
        print(f"{k}: {v:.2f}")


Computing embeddings: 100%|██████████| 32/32 [00:13<00:00,  2.39it/s]


Text -> Image Retrieval Metrics:
R@1: 14.00%
R@5: 37.60%
R@10: 54.10%
MedR: 9.00
MRR: 0.26
