In [5]:
import os
import json
import pandas as pd
from tqdm import tqdm
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# ===== Locations on the mounted Drive =====
# Parent directory that holds one image sub-folder per evaluated model.
root_images = "/content/drive/MyDrive/eval_images"
# CSV with an "image_name" column and a "Prompts" column.
baseline_file = "/content/drive/MyDrive/csvs/DrawBenchPrompts.csv"
# Destination file for the per-model average scores.
output_json = "/content/drive/MyDrive/csvs/clip_average_scores.json"

# Display name of each model -> name of its sub-folder under root_images.
model_folders = {
    "Flux-Dev": "Flux-Dev_drawbench_images",
    "SDXL": "SDXL_drawbench_images",
    "SD2": "SD2_drawbench_images",
}

# ===== Prompt lookup: image name -> prompt text =====
baseline_df = pd.read_csv(baseline_file)
name_column = baseline_df["image_name"]
prompt_column = baseline_df["Prompts"]
# If an image name occurs more than once, the last row's prompt is kept.
baseline_dict = dict(zip(name_column, prompt_column))

# ===== CLIP model and its matching preprocessor =====
clip_checkpoint = "openai/clip-vit-base-patch32"
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = CLIPModel.from_pretrained(clip_checkpoint)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# ===== Compute CLIP score for all models =====
# File extensions tried, in this order, when resolving an image name to a file.
# Hoisted out of the loops because it never changes between iterations.
possible_ext = ["png", "jpg", "jpeg"]


def _find_image_path(folder, image_name, extensions):
    """Return the first existing ``<folder>/<image_name>.<ext>`` path, or None.

    Extensions are tried in the order given; None means no candidate exists.
    """
    for ext in extensions:
        candidate = os.path.join(folder, f"{image_name}.{ext}")
        if os.path.exists(candidate):
            return candidate
    return None


def _clip_similarity(image_path, prompt):
    """Return the cosine similarity between an image file and a text prompt.

    Uses the module-level ``model``, ``processor`` and ``device``. The result
    is a Python float obtained from the L2-normalised CLIP image and text
    embeddings.
    """
    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded image that stays valid after the file closes.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # truncation=True asks the tokenizer to cut over-long prompts rather than
    # pass a sequence longer than the text encoder accepts.
    inputs = processor(
        text=[prompt],
        images=image,
        return_tensors="pt",
        truncation=True,
    ).to(device)

    with torch.no_grad():
        image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])
        text_embeds = model.get_text_features(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )

    # Unit-normalise so the dot product below is the cosine similarity.
    image_embeds /= image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds /= text_embeds.norm(p=2, dim=-1, keepdim=True)

    # One image against one prompt -> a 1x1 matrix, read out as a float.
    return (image_embeds @ text_embeds.T).item()


# Model display name -> mean similarity over the images that were found.
average_scores = {}

for model_name, folder_name in model_folders.items():
    print(f"Calculating CLIP scores for {model_name}...")
    folder_path = os.path.join(root_images, folder_name)
    scores = []

    for image_name, prompt in tqdm(baseline_dict.items()):
        image_path = _find_image_path(folder_path, image_name, possible_ext)
        if image_path is None:
            print(f"Warning: Image {image_name} not found in {folder_path}")
            continue
        scores.append(_clip_similarity(image_path, prompt))

    # Skipped images shrink the sample the mean is taken over, which makes the
    # averages of different models incomparable; say so explicitly.
    missing_count = len(baseline_dict) - len(scores)
    if missing_count:
        print(
            f"Warning: average for {model_name} covers only "
            f"{len(scores)}/{len(baseline_dict)} prompts "
            f"({missing_count} images missing)"
        )

    # Average similarity; stays 0 when nothing was scored so the JSON output
    # remains a plain number (the warning above flags that case).
    avg_score = sum(scores) / len(scores) if scores else 0
    average_scores[model_name] = avg_score
    print(f"Average CLIP score for {model_name}: {avg_score:.4f}")

# ===== Persist the per-model averages as JSON =====
# Serialise first, then write the finished text to the output file.
serialized_scores = json.dumps(average_scores, indent=4)
with open(output_json, "w") as out_file:
    out_file.write(serialized_scores)

# Echo the destination and the raw mapping for a quick visual check.
print("Average CLIP scores saved to:", output_json)
print(average_scores)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating CLIP scores for Flux-Dev...


100%|██████████| 200/200 [01:07<00:00,  2.95it/s]


Average CLIP score for Flux-Dev: 0.3147
Calculating CLIP scores for SDXL...


100%|██████████| 200/200 [02:24<00:00,  1.38it/s]


Average CLIP score for SDXL: 0.3206
Calculating CLIP scores for SD2...


100%|██████████| 200/200 [02:08<00:00,  1.56it/s]

Average CLIP score for SD2: 0.3219
Average CLIP scores saved to: /content/drive/MyDrive/csvs/clip_average_scores.json
{'Flux-Dev': 0.314676451086998, 'SDXL': 0.32061889708042146, 'SD2': 0.3218668834865093}



