<a href="https://colab.research.google.com/github/vanshika-poojari/Clothiq-react/blob/main/CLIP_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ===============================================
# CLIP Cosine Similarity Experiment (All 5 Images)
# ===============================================

# 1) Install dependencies
!pip install transformers ftfy regex tqdm torch torchvision pillow

# 2) Imports
from PIL import Image
import requests
import torch
import torch.nn.functional as F
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# 3) Load model & processor
# The model and its processor must come from the same checkpoint, so the
# checkpoint name is stated once and shared by both loads.
clip_checkpoint = 'openai/clip-vit-base-patch32'
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CLIPModel.from_pretrained(clip_checkpoint).to(device)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# 4) Helper functions
def image_from_url(url, timeout=30):
    """Download the image at *url* and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        OSError: If the payload cannot be decoded as an image.
    """
    from io import BytesIO

    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Decode from the fully downloaded (and content-decoded) body instead of
    # the raw socket stream, so no open connection is left behind.
    return Image.open(BytesIO(resp.content)).convert('RGB')

def embed_image(image):
    """Return the L2-normalized CLIP image features for *image* on the CPU.

    The image is preprocessed by the module-level ``processor``, encoded by
    ``model.get_image_features`` without gradient tracking, scaled to unit
    length along the last dimension, and moved to CPU memory.
    """
    pixel_batch = processor(images=image, return_tensors='pt')
    pixel_batch = pixel_batch.to(device)
    with torch.no_grad():
        features = model.get_image_features(**pixel_batch)
        unit_features = F.normalize(features, dim=-1)
    return unit_features.cpu()

def embed_text(texts):
    """Return the L2-normalized CLIP text features for *texts* on the CPU.

    Args:
        texts: The text(s) to encode; forwarded to the processor as ``text=``.

    Returns:
        A CPU tensor of text features, each scaled to unit length along the
        last dimension.
    """
    # padding=True pads the batch to a common length; truncation=True clips
    # over-long captions to the tokenizer's maximum length so they do not
    # raise an error inside the text encoder.
    inputs = processor(
        text=texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        txt_emb = model.get_text_features(**inputs)
    txt_emb = F.normalize(txt_emb, dim=-1)
    return txt_emb.cpu()

# 5) Image sources: three Pexels photos followed by two Flickr photos
image_urls = [
    # Swap these three for your original images if needed
    'https://images.pexels.com/photos/36744/agriculture-arable-clouds-countryside.jpg',
    'https://images.pexels.com/photos/825947/pexels-photo-825947.jpeg',
    'https://images.pexels.com/photos/34044163/pexels-photo-34044163.jpeg',
    # Flickr additions
    'https://live.staticflickr.com/840/43380549381_004601c7ac_h.jpg',
    'https://live.staticflickr.com/2404/2020522557_d1aa0a1066_k.jpg',
]

# 6) Fetch every image. A failed fetch is reported and stored as None so that
#    positions in `images` keep lining up with `image_urls`.
images = []
for url in image_urls:
    try:
        fetched = image_from_url(url)
    except Exception as err:
        print('Failed to download', url, err)
        fetched = None
    images.append(fetched)

# 7) Embed each downloaded image once; entries whose download failed stay None
img_embs = [embed_image(img) if img is not None else None for img in images]

# 8) Define word list
words = [
    'sunset', 'sun', 'tree', 'bird', 'dog', 'cat', 'forest', 'woods',
    'car', 'building', 'city', 'beach', 'mountain', 'person', 'man',
    'woman', 'child', 'flower', 'food', 'toaster', 'room', 'bed', 'hotel', 'iron'
]

# 9) Compute best and worst single-word matches
print("\n===== Single Word Similarity Results =====")
# The word embeddings are the same for every image, so encode them once
# instead of re-running the text encoder on each loop iteration.
text_embs = embed_text(words)
for i, emb in enumerate(img_embs):
    if emb is None:
        print(f'Image {i}: no embedding')
        continue
    # Embeddings are unit length, so the dot product is the cosine similarity;
    # squeeze(0) drops the leading image dimension to leave one score per word.
    sims = (emb @ text_embs.T).squeeze(0)
    best_idx = sims.argmax().item()
    worst_idx = sims.argmin().item()
    print(f'Image {i}: BEST WORD = "{words[best_idx]}" (sim={sims[best_idx].item():.4f}) | WORST WORD = "{words[worst_idx]}" (sim={sims[worst_idx].item():.4f})')

# 10) Structured captions ("A photo of a W")
print("\n===== Structured Caption Similarity Results =====")
structured = [f'A photo of a {w}' for w in words]
# The caption embeddings are the same for every image, so encode them once
# instead of re-running the text encoder on each loop iteration.
st_embs = embed_text(structured)
for i, emb in enumerate(img_embs):
    if emb is None:
        # Images that failed to download have no embedding to score.
        continue
    # Dot product of unit-length embeddings: one cosine score per caption.
    sims = (emb @ st_embs.T).squeeze(0)
    best_idx = sims.argmax().item()
    worst_idx = sims.argmin().item()
    print(f'Image {i}: BEST STRUCTURED = "{structured[best_idx]}" (sim={sims[best_idx].item():.4f}) | WORST STRUCTURED = "{structured[worst_idx]}" (sim={sims[worst_idx].item():.4f})')

# 11) Arbitrary captions (example sentences)
print("\n===== Arbitrary Caption Results =====")
candidates = [
    'A close-up of a smiling baby wearing a party hat.',
    'A red sports car driving at high speed on a racetrack.',
    'Aerial view of a city at night with neon signs.',
    'A plate of sushi served on a wooden board.',
    'An astronaut riding a horse on the moon.',
    'A messy hotel room with an iron board in the middle.',
    'A cartoon illustration of a running dog with floppy ears.'
]
# The caption embeddings are the same for every image, so encode them once
# instead of re-running the text encoder on each loop iteration.
cand_embs = embed_text(candidates)
for i, emb in enumerate(img_embs):
    if emb is None:
        # Images that failed to download have no embedding to score.
        continue
    # Dot product of unit-length embeddings: one cosine score per caption.
    sims = (emb @ cand_embs.T).squeeze(0)
    best_idx = sims.argmax().item()
    worst_idx = sims.argmin().item()
    print(f'Image {i}: LOWEST SIM CAPTION = "{candidates[worst_idx]}" (sim={sims[worst_idx].item():.4f}) | HIGHEST SIM CAPTION = "{candidates[best_idx]}" (sim={sims[best_idx].item():.4f})')

# 12) Notes for Part 2
print("\nFor Part 2: You can extend this code by providing your own captions dataset (like COCO Captions or LAION tags) and comparing similarities to find the overall highest scoring image-caption pair.")



===== Single Word Similarity Results =====
Image 0: BEST WORD = "sunset" (sim=0.2658) | WORST WORD = "toaster" (sim=0.1769)
Image 1: BEST WORD = "dog" (sim=0.2825) | WORST WORD = "sunset" (sim=0.1690)
Image 2: BEST WORD = "forest" (sim=0.2677) | WORST WORD = "flower" (sim=0.1584)
Image 3: BEST WORD = "dog" (sim=0.2535) | WORST WORD = "sunset" (sim=0.1543)
Image 4: BEST WORD = "room" (sim=0.2802) | WORST WORD = "tree" (sim=0.1699)

===== Structured Caption Similarity Results =====
Image 0: BEST STRUCTURED = "A photo of a sunset" (sim=0.2787) | WORST STRUCTURED = "A photo of a building" (sim=0.1916)
Image 1: BEST STRUCTURED = "A photo of a dog" (sim=0.2922) | WORST STRUCTURED = "A photo of a forest" (sim=0.1741)
Image 2: BEST STRUCTURED = "A photo of a woods" (sim=0.2921) | WORST STRUCTURED = "A photo of a flower" (sim=0.1763)
Image 3: BEST STRUCTURED = "A photo of a dog" (sim=0.2593) | WORST STRUCTURED = "A photo of a sunset" (sim=0.1331)
Image 4: BEST STRUCTURED = "A photo of a bed" (

In [6]:
# === PART 2: Find (image, caption) pair with largest cosine similarity ===

import itertools

# Some high-quality, regular real-world images
image_urls_part2 = [
    # 1. Red apple
    "https://images.pexels.com/photos/39803/pexels-photo-39803.jpeg",

    # 2. White cat
    "https://images.pexels.com/photos/127028/pexels-photo-127028.jpeg",

    # 3. Pepperoni pizza
    "https://images.pexels.com/photos/315755/pexels-photo-315755.jpeg",

    # 4. Mountain landscape
    "https://images.pexels.com/photos/674010/pexels-photo-674010.jpeg",

    # 5. Person with laptop (working replacement)
    "https://images.pexels.com/photos/4065876/pexels-photo-4065876.jpeg"
]

# Candidate captions (normal, descriptive, not artificial)
captions = [
    "A photo of a red apple on a white background.",
    "A close-up photo of a white cat with blue eyes.",
    "A delicious pepperoni pizza on a wooden table.",
    "A scenic photo of mountains under a blue sky.",
    "A person typing on a laptop at a desk."
]

# Compute embeddings (a failed download raises here; there is no fallback)
imgs = [image_from_url(url) for url in image_urls_part2]
img_embs2 = [embed_image(img) for img in imgs]
text_embs2 = embed_text(captions)

# Compute cosine similarities between every image & every caption.
# Concatenate the per-image embeddings and take a single matrix product, so
# `.T` is only ever applied to a 2-D tensor (using it on a 1-D tensor is
# deprecated in PyTorch and emits a warning).
# Assumes each embed_image() result holds exactly one row, so row i of the
# matrix belongs to image_urls_part2[i] -- TODO confirm if batching is added.
sim_matrix = torch.cat(img_embs2, dim=0) @ text_embs2.T
num_images, num_captions = sim_matrix.shape
sims = [
    (sim_matrix[i, j].item(), i, j)
    for i, j in itertools.product(range(num_images), range(num_captions))
]

# Sort and print best matches
sims.sort(reverse=True)
best_score, best_img, best_cap = sims[0]

print(f"Highest similarity: {best_score:.4f}")
print(f"Image URL: {image_urls_part2[best_img]}")
print(f"Caption: {captions[best_cap]}")


Highest similarity: 0.3338
Image URL: https://images.pexels.com/photos/39803/pexels-photo-39803.jpeg
Caption: A photo of a red apple on a white background.


  score = float((emb_i @ emb_t.T).item())
