In [None]:
!pip install transformers torch torchvision ftfy regex tqdm

In [None]:


import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import pandas as pd
from tqdm import tqdm
import os
import json
from google.colab import drive

# ===== 1. Mount Google Drive =====
# Both the generated images and the caption CSV are read from Drive paths
# below, so the mount has to happen before anything else.
drive.mount('/content/drive')

# ===== 2. Paths =====
image_root_folder = "/content/drive/MyDrive/TTI_Models_images"  # Folder containing Flux-Dev, SD2, SDXL
baseline_file = "/content/drive/MyDrive/mscoco_captions.csv"     # MSCOCO captions CSV

# ===== 3. Load MSCOCO captions =====
# Read image names as text. Without an explicit dtype pandas parses
# numeric-looking names as numbers, which drops leading zeros
# (e.g. "000000000139" -> 139) and makes the later file lookup miss.
baseline_df = pd.read_csv(baseline_file, dtype={"image_name": str})
# Keep only the two columns the scoring loop reads.
baseline_df = baseline_df[["image_name", "mscoco_caption"]]

# ===== 4. Load CLIP model =====
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model weights and the processor are loaded from the same checkpoint
# name; only the model needs to be moved onto the selected device.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# ===== 5. Compute average CLIPScore per model =====
model_folders = ["Flux-Dev", "SD2", "SDXL"]
average_clip_scores = {}

# Extensions tried, in this order, when resolving an image name to a file.
possible_extensions = [".jpg", ".jpeg", ".png"]


def _find_image(folder, image_name):
    """Return the first existing ``<image_name><ext>`` path in *folder*.

    Extensions are tried in the order given by ``possible_extensions``.
    Returns ``None`` when no candidate file exists.
    """
    for ext in possible_extensions:
        candidate_path = os.path.join(folder, f"{image_name}{ext}")
        if os.path.exists(candidate_path):
            return candidate_path
    return None


def _clip_similarity(image_path, caption):
    """Return the image/caption cosine similarity of the CLIP embeddings.

    The value is the raw cosine similarity between ``image_embeds`` and
    ``text_embeds`` as returned by the model; no rescaling or clamping is
    applied.
    """
    # Image.open is lazy and keeps the file handle open; convert() loads the
    # pixel data into a new image, so the handle can be released right away.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # truncation=True keeps over-long captions within the text encoder's
    # maximum sequence length instead of failing inside the model.
    inputs = processor(
        text=[caption],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    sim = torch.nn.functional.cosine_similarity(
        outputs.image_embeds, outputs.text_embeds
    )
    return sim.item()


for model_name in model_folders:
    print(f"\nProcessing model: {model_name}")
    folder_path = os.path.join(image_root_folder, model_name)
    model_scores = []
    skipped = 0

    pairs = zip(baseline_df["image_name"], baseline_df["mscoco_caption"])
    for image_name, raw_caption in tqdm(pairs, total=len(baseline_df)):
        # An empty CSV cell is NaN; str(NaN) would otherwise be scored as
        # the literal caption "nan" (or looked up as the file "nan.jpg").
        if pd.isna(image_name) or pd.isna(raw_caption):
            skipped += 1
            continue

        caption = str(raw_caption).strip()
        # Look for the image inside the current model folder.
        image_path = _find_image(folder_path, image_name) if caption else None
        if image_path is None:
            skipped += 1  # missing image or empty caption
            continue

        model_scores.append(_clip_similarity(image_path, caption))

    # Average score for this model; 0.0 is kept as the "nothing scored"
    # value so the saved JSON stays valid, but it is flagged explicitly.
    if not model_scores:
        print(f"Warning: no image/caption pairs were scored for {model_name}")
    avg_score = sum(model_scores) / len(model_scores) if model_scores else 0.0
    average_clip_scores[model_name] = avg_score
    print(f"Scored {len(model_scores)} pairs, skipped {skipped}")
    print(f"{model_name} → Average CLIPScore: {avg_score:.4f}")

# ===== 6. Save average scores =====
# The per-model averages are written as JSON inside the image root folder,
# then echoed to the notebook output.
out_json = os.path.join(image_root_folder, "mscoco_clip_avg_scores.json")
with open(out_json, "w") as f:
    f.write(json.dumps(average_clip_scores, indent=4))

print(f"\n Average CLIP scores saved to: {out_json}")
print(average_clip_scores)
