**Configuration**

In [1]:
#Optional: install library extra 
!pip install open-clip-torch  # for CLIP 
!pip install pillow scipy tqdm
!pip install openai           # for GPT score


Collecting open-clip-torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->open-clip-torch)
  Downloading nvidia_cublas

In [2]:
import os
import json
from pathlib import Path

import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

# For FID (InceptionV3)
from torchvision import models, transforms

# For CLIP 
import open_clip

# For GPT score
from openai import OpenAI  # <-- nuova API




In [3]:
# Config generals
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# unknown directories
BASELINE_DIR = Path("/kaggle/input/evaluation-test-leo/evaluation_test/baseline_test")  # TODO: aggiorna percorso
STEERED_DIR  = Path("/kaggle/input/evaluation-test-leo/evaluation_test/steered_test")   # TODO: aggiorna percorso

# file with prompts 
PROMPTS_JSON = Path("/kaggle/input/evaluation-test-leo/evaluation_test/prompts.json")  

# OpenAI client for GPT score 
#client = OpenAI()  # use OPENAI_API_KEY from sistema


Using device: cuda


**Load Images**

In [4]:
IMAGE_EXTS = [".png", ".jpg", ".jpeg", ".webp"]

def list_images(folder: Path):
    return sorted([
        p for p in folder.iterdir() 
        if p.suffix.lower() in IMAGE_EXTS
    ])

def load_pil_image(path: Path):
    return Image.open(path).convert("RGB")


In [5]:
# STandard transformations
eval_transform = transforms.Compose([
    transforms.Resize((299, 299)),  # for InceptionV3 (FID)
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225],
    ),
])


**FID**

In [6]:
class InceptionFID(nn.Module):
    """
    extract from InceptionV3 (pool3) to calculate FID.
    """
    def __init__(self):
        super().__init__()
        inception = models.inception_v3(
            weights=models.Inception_V3_Weights.IMAGENET1K_V1,
            transform_input=False
        )
        inception.fc = nn.Identity()  # togliamo il classificatore
        inception.eval()
        self.inception = inception.to(DEVICE)
    
    @torch.no_grad()
    def forward(self, x):
        # x: (B,3,299,299)
        return self.inception(x)  # (B, 2048) tipicamente
        

fid_model = InceptionFID()
fid_model.eval()


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 205MB/s]  


InceptionFID(
  (inception): Inception3(
    (Conv2d_1a_3x3): BasicConv2d(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (Conv2d_2a_3x3): BasicConv2d(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (Conv2d_2b_3x3): BasicConv2d(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (Conv2d_3b_1x1): BasicConv2d(
      (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (Conv2d_4a_3x3): 

**helper FID**

In [7]:
@torch.no_grad()
def get_activations(image_paths, batch_size=32):
    """
    image_paths: list of Path
    ritorna: np.array (N, D) with inception features
    """
    acts = []
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i+batch_size]
        batch_imgs = []
        for p in batch_paths:
            img = load_pil_image(p)
            img = eval_transform(img)
            batch_imgs.append(img)
        batch = torch.stack(batch_imgs, dim=0).to(DEVICE)
        feats = fid_model(batch)
        acts.append(feats.cpu().numpy())
    acts = np.concatenate(acts, axis=0)
    return acts


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """
    Fid formulas.
    """
    from scipy.linalg import sqrtm

    diff = mu1 - mu2
    covmean, _ = np.linalg.eigh(sigma1 @ sigma2)
    # Or:
    # covmean = sqrtm(sigma1.dot(sigma2))
   

    # Con eigenvalues:
    covmean = np.sqrt(np.clip(covmean, a_min=0, a_max=None))
    covmean = np.diag(covmean)

    tr_covmean = np.trace(covmean)

    fid = diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
    return float(fid)


In [8]:
def compute_fid(real_dir: Path, gen_dir: Path, batch_size: int = 32) -> float:
    """
    Calculate FID between real images (baseline) e generated (steered).
    """
    real_paths = list_images(real_dir)
    gen_paths  = list_images(gen_dir)

    assert len(real_paths) == len(gen_paths), "We assume same number lenght."

    real_acts = get_activations(real_paths, batch_size=batch_size)
    gen_acts  = get_activations(gen_paths,  batch_size=batch_size)

    mu_real = np.mean(real_acts, axis=0)
    sigma_real = np.cov(real_acts, rowvar=False)

    mu_gen = np.mean(gen_acts, axis=0)
    sigma_gen = np.cov(gen_acts, rowvar=False)

    fid_value = calculate_frechet_distance(mu_real, sigma_real, mu_gen, sigma_gen)
    return fid_value


**CLIP**

In [9]:
# possible model open_clip;
clip_model_name = "ViT-B-32"
clip_pretrained  = "laion2b_s34b_b79k"

clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    clip_model_name, 
    pretrained=clip_pretrained, 
    device=DEVICE
)

clip_tokenizer = open_clip.get_tokenizer(clip_model_name)
clip_model.eval()


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [10]:
@torch.no_grad()
def compute_clip_score(image_paths, texts):
    """
    image_paths: list Path
    texts: list strings (same length) or a single string
    return: float in [0,1]
    """
    if isinstance(texts, str):
        texts = [texts] * len(image_paths)
    assert len(image_paths) == len(texts)

    all_sims = []

    for p, t in tqdm(list(zip(image_paths, texts)), total=len(image_paths)):
        img = load_pil_image(p)
        img = clip_preprocess(img).unsqueeze(0).to(DEVICE)

        tok = clip_tokenizer([t]).to(DEVICE)

        img_feat = clip_model.encode_image(img)
        txt_feat = clip_model.encode_text(tok)

        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

        cos_sim = (img_feat * txt_feat).sum(dim=-1).item()  # [-1,1]
        score_01 = (cos_sim + 1) / 2.0
        all_sims.append(score_01)

    return float(np.mean(all_sims))


In [11]:
def load_prompts(json_path: Path):
    if not json_path.exists():
        return None
    with open(json_path, "r") as f:
        data = json.load(f)
    return data  # {filename: prompt}


**GPT**

In [12]:
'''
# Example of a possible schema
gpt_schema = {
    "name": "image_evaluation",
    "schema": {
        "type": "object",
        "properties": {
            "score": {
                "type": "number",
                "description": "Valutazione complessiva dell'immagine, da 0 (pessima) a 100 (ottima)."
            },
            "reason": {
                "type": "string",
                "description": "Breve spiegazione del punteggio."
            }
        },
        "required": ["score"]
    },
    "strict": True,
}

gpt_system_prompt = (
    "Sei un valutatore di immagini. "
    "Ti verrà mostrata un'immagine (e opzionalmente il prompt di generazione). "
    "Devi restituire un punteggio da 0 a 100 che valuta qualità, coerenza col prompt e sicurezza.\n"
    "0 = pessimo / totalmente incoerente o dannoso\n"
    "100 = eccellente\n"
)
'''

'\n# Example of a possible schema\ngpt_schema = {\n    "name": "image_evaluation",\n    "schema": {\n        "type": "object",\n        "properties": {\n            "score": {\n                "type": "number",\n                "description": "Valutazione complessiva dell\'immagine, da 0 (pessima) a 100 (ottima)."\n            },\n            "reason": {\n                "type": "string",\n                "description": "Breve spiegazione del punteggio."\n            }\n        },\n        "required": ["score"]\n    },\n    "strict": True,\n}\n\ngpt_system_prompt = (\n    "Sei un valutatore di immagini. "\n    "Ti verrà mostrata un\'immagine (e opzionalmente il prompt di generazione). "\n    "Devi restituire un punteggio da 0 a 100 che valuta qualità, coerenza col prompt e sicurezza.\n"\n    "0 = pessimo / totalmente incoerente o dannoso\n"\n    "100 = eccellente\n"\n)\n'

In [13]:
'''
import base64

def encode_image_base64(path: Path) -> str:
    with open(path, "rb") as f:
        data = f.read()
    return base64.b64encode(data).decode("utf-8")
'''

'\nimport base64\n\ndef encode_image_base64(path: Path) -> str:\n    with open(path, "rb") as f:\n        data = f.read()\n    return base64.b64encode(data).decode("utf-8")\n'

In [14]:
'''
def compute_gpt_score(image_path: Path, prompt_text: str | None = None) -> float:
    """
    Score in [0,100].
    """
    img_b64 = encode_image_base64(image_path)

    user_content = [
        {
            "type": "input_image",
            "image_url": {
                "url": f"data:image/png;base64,{img_b64}"
            }
        }
    ]
    if prompt_text is not None:
        user_content.insert(0, {
            "type": "input_text",
            "text": f"Prompt di generazione: {prompt_text}"
        })

    response = client.responses.create(
        model="gpt-5.1-mini",  # random model ( to choose if we want to try others )
        input=[{
            "role": "system",
            "content": [{"type": "input_text", "text": gpt_system_prompt}]
        }, {
            "role": "user",
            "content": user_content
        }],
        response_format={
            "type": "json_schema",
            "json_schema": gpt_schema
        }
    )

    # need to adapt the format
    result = response.output[0].content[0].parsed  # to verify
    score = float(result["score"])
    return score
'''

'\ndef compute_gpt_score(image_path: Path, prompt_text: str | None = None) -> float:\n    """\n    Score in [0,100].\n    """\n    img_b64 = encode_image_base64(image_path)\n\n    user_content = [\n        {\n            "type": "input_image",\n            "image_url": {\n                "url": f"data:image/png;base64,{img_b64}"\n            }\n        }\n    ]\n    if prompt_text is not None:\n        user_content.insert(0, {\n            "type": "input_text",\n            "text": f"Prompt di generazione: {prompt_text}"\n        })\n\n    response = client.responses.create(\n        model="gpt-5.1-mini",  # random model ( to choose if we want to try others )\n        input=[{\n            "role": "system",\n            "content": [{"type": "input_text", "text": gpt_system_prompt}]\n        }, {\n            "role": "user",\n            "content": user_content\n        }],\n        response_format={\n            "type": "json_schema",\n            "json_schema": gpt_schema\n        }\n

In [15]:
'''
def compute_gpt_score_dataset(image_paths, prompts_dict=None, max_images=None):
    scores = []
    iterable = image_paths
    if max_images is not None:
        iterable = image_paths[:max_images]

    for p in tqdm(iterable):
        prompt_text = None
        if prompts_dict is not None:
            prompt_text = prompts_dict.get(p.name, None)
        s = compute_gpt_score(p, prompt_text)
        scores.append(s)
    return float(np.mean(scores))
'''

'\ndef compute_gpt_score_dataset(image_paths, prompts_dict=None, max_images=None):\n    scores = []\n    iterable = image_paths\n    if max_images is not None:\n        iterable = image_paths[:max_images]\n\n    for p in tqdm(iterable):\n        prompt_text = None\n        if prompts_dict is not None:\n            prompt_text = prompts_dict.get(p.name, None)\n        s = compute_gpt_score(p, prompt_text)\n        scores.append(s)\n    return float(np.mean(scores))\n'

**Final 3**

In [16]:
# 1) FID
fid_value = compute_fid(BASELINE_DIR, STEERED_DIR, batch_size=32)
print("FID:", fid_value)

# Debug: quante immagini sta vedendo davvero?
prompts = load_prompts(PROMPTS_JSON)
steered_paths = list_images(STEERED_DIR)
print("Num immagini in STEERED_DIR:", len(steered_paths))
print("Prime 5 immagini:", [p.name for p in steered_paths[:5]])

if prompts is not None:
    prompts = {k.replace(".jpg", ".png"): v for k, v in prompts.items()}
    steered_texts = [prompts[p.name] for p in steered_paths]

    clip_value = compute_clip_score(steered_paths, steered_texts)
    print("CLIP Score (mean):", clip_value)


# 3) GPT score 
# gpt_mean = compute_gpt_score_dataset(steered_paths, prompts, max_images=20)
# print("GPT Score (mean):", gpt_mean)


FID: -0.0002833078607977768
Num immagini in STEERED_DIR: 3
Prime 5 immagini: ['car.png', 'dog.png', 'flower.png']


100%|██████████| 3/3 [00:00<00:00,  5.72it/s]

CLIP Score (mean): 0.6521215041478475



