**Configuration**

In [None]:
#Optional: install library extra 
!pip install open-clip-torch  # for CLIP 
!pip install pillow scipy tqdm
!pip install openai           # for GPT score


In [None]:
import os
import json
from pathlib import Path

import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

# For FID (InceptionV3)
from torchvision import models, transforms

# For CLIP 
import open_clip

# For GPT score
from openai import OpenAI  


In [None]:
# General configuration
from kaggle_secrets import UserSecretsClient

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# Root folder of the evaluation data.
# TODO: update this single path when the dataset location changes.
EVAL_ROOT = Path("/kaggle/input/evaluation-test-leo/evaluation_test")

# Image folders that get compared against each other.
BASELINE_DIR = EVAL_ROOT / "baseline_test"
STEERED_DIR = EVAL_ROOT / "steered_test"

# JSON file with the generation prompts.
PROMPTS_JSON = EVAL_ROOT / "prompts.json"

# OpenAI client for the GPT score: the API key is read from the Kaggle secrets
# store so it never appears in the notebook.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("OPENAI_API_KEY")

# An empty string is as unusable as a missing key, so reject both up front.
if not api_key:
    raise ValueError("OPENAI_API_KEY not found.")

# Initialise the GPT client.
client = OpenAI(api_key=api_key)


**Load Images**

In [None]:
# File suffixes (lower-case, with the dot) that are treated as images.
IMAGE_EXTS = [".png", ".jpg", ".jpeg", ".webp"]


def list_images(folder: Path) -> list[Path]:
    """
    Return the image files directly inside `folder`, sorted by path.

    A file counts as an image when its suffix, compared case-insensitively,
    is listed in IMAGE_EXTS. Sub-directories are skipped even if their name
    ends with an image suffix, and the search is not recursive.
    """
    return sorted(
        p for p in folder.iterdir()
        if p.is_file() and p.suffix.lower() in IMAGE_EXTS
    )

def load_pil_image(path: Path):
    """
    Open the image at `path` and return it converted to RGB.

    The file is opened in a context manager so its handle is closed as soon
    as the pixel data has been converted, instead of whenever the image
    object happens to be garbage-collected. The returned image is the result
    of `convert`, so it stays usable after the file is closed.
    """
    with Image.open(path) as img:
        return img.convert("RGB")


In [None]:
# Standard preprocessing applied to every image before the FID feature extractor.
eval_transform = transforms.Compose(
    [
        # 299x299 input size, for InceptionV3 (FID).
        transforms.Resize(size=(299, 299)),
        transforms.ToTensor(),
        # Per-channel mean/std normalisation of the tensor.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


**FID**

In [None]:
class InceptionFID(nn.Module):
    """
    InceptionV3 feature extractor used to compute FID.

    The final classification layer is replaced by an identity, so the forward
    pass returns the features that would normally feed the classifier.
    """

    def __init__(self):
        super().__init__()
        backbone = models.inception_v3(
            weights=models.Inception_V3_Weights.IMAGENET1K_V1,
            transform_input=False,
        )
        # Remove the classifier: we want features, not class logits.
        backbone.fc = nn.Identity()
        backbone.eval()
        self.inception = backbone.to(DEVICE)

    @torch.no_grad()
    def forward(self, x):
        # x: (B, 3, 299, 299) -> features, typically (B, 2048)
        features = self.inception(x)
        return features
        

# Single shared feature extractor; get_activations reads it as a global.
fid_model = InceptionFID()
# Put the module in eval mode. As the last expression of the cell, the
# returned module is also displayed as the cell output.
fid_model.eval()


**FID helpers**

In [None]:
@torch.no_grad()
def get_activations(image_paths, batch_size=32):
    """
    Run the FID feature extractor over a list of images.

    image_paths: list of Path, one per image
    batch_size:  number of images per forward pass (must be >= 1)
    return:      np.array (N, D) with the Inception features, in input order

    Raises ValueError if `image_paths` is empty or `batch_size` is not
    positive, so a wrong folder path is reported clearly instead of failing
    later inside np.concatenate.
    """
    if len(image_paths) == 0:
        raise ValueError("get_activations received no images; check the input folder.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    feature_chunks = []
    for start in range(0, len(image_paths), batch_size):
        chunk_paths = image_paths[start:start + batch_size]
        # Load and preprocess each image, then stack into one (B, 3, H, W) batch.
        tensors = [eval_transform(load_pil_image(p)) for p in chunk_paths]
        batch = torch.stack(tensors, dim=0).to(DEVICE)
        feats = fid_model(batch)
        # Move to CPU right away so GPU memory does not grow with N.
        feature_chunks.append(feats.cpu().numpy())
    return np.concatenate(feature_chunks, axis=0)


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """
    Frechet distance between the Gaussians N(mu1, sigma1) and N(mu2, sigma2):

        ||mu1 - mu2||^2 + Tr(sigma1) + Tr(sigma2) - 2 * Tr(sqrt(sigma1 @ sigma2))

    mu1, mu2:       mean vectors, shape (D,)
    sigma1, sigma2: covariance matrices, shape (D, D), symmetric PSD
    eps:            unused; kept only so existing callers keep working
    return:         the distance as a Python float

    The product sigma1 @ sigma2 is generally NOT symmetric, so it must not be
    passed to `np.linalg.eigh` (which only reads one triangle and would return
    the eigenvalues of a different matrix). Instead we use the identity

        Tr(sqrt(sigma1 @ sigma2)) = Tr(sqrt(S @ sigma2 @ S)),  S = sqrt(sigma1)

    where S @ sigma2 @ S is symmetric PSD, so symmetric eigensolvers apply and
    rank-deficient covariances (fewer images than feature dimensions) are
    handled without regularisation.
    """
    mu1 = np.atleast_1d(np.asarray(mu1, dtype=np.float64))
    mu2 = np.atleast_1d(np.asarray(mu2, dtype=np.float64))
    sigma1 = np.atleast_2d(np.asarray(sigma1, dtype=np.float64))
    sigma2 = np.atleast_2d(np.asarray(sigma2, dtype=np.float64))

    diff = mu1 - mu2

    # Symmetric square root of sigma1 from its eigendecomposition; tiny
    # negative eigenvalues caused by round-off are clipped to zero.
    evals1, evecs1 = np.linalg.eigh(sigma1)
    sqrt_sigma1 = (evecs1 * np.sqrt(np.clip(evals1, a_min=0, a_max=None))) @ evecs1.T

    # Symmetric matrix with the same eigenvalues as sigma1 @ sigma2.
    inner = sqrt_sigma1 @ sigma2 @ sqrt_sigma1
    inner = (inner + inner.T) / 2.0  # remove round-off asymmetry
    inner_evals = np.clip(np.linalg.eigvalsh(inner), a_min=0, a_max=None)

    # Trace of the matrix square root = sum of square roots of the eigenvalues.
    tr_covmean = np.sqrt(inner_evals).sum()

    fid = diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
    return float(fid)


In [None]:
def compute_fid(real_dir: Path, gen_dir: Path, batch_size: int = 32) -> float:
    """
    Calculate the FID between real images (baseline) and generated images (steered).

    real_dir:   folder with the reference images
    gen_dir:    folder with the images to evaluate
    batch_size: images per forward pass of the feature extractor
    return:     FID as a float (lower means the two sets are closer)

    Raises AssertionError if the folders hold a different number of images,
    and ValueError if they hold fewer than two images each: a covariance
    estimated from a single sample is NaN and would silently turn the FID
    into NaN.
    """
    real_paths = list_images(real_dir)
    gen_paths = list_images(gen_dir)

    assert len(real_paths) == len(gen_paths), (
        "We assume the same number of images in both folders."
    )
    if len(real_paths) < 2:
        raise ValueError(
            f"FID needs at least 2 images per folder, found {len(real_paths)} "
            f"in {real_dir} and {len(gen_paths)} in {gen_dir}."
        )

    real_acts = get_activations(real_paths, batch_size=batch_size)
    gen_acts = get_activations(gen_paths, batch_size=batch_size)

    # Gaussian statistics of each feature set (rows are samples).
    mu_real = np.mean(real_acts, axis=0)
    sigma_real = np.cov(real_acts, rowvar=False)

    mu_gen = np.mean(gen_acts, axis=0)
    sigma_gen = np.cov(gen_acts, rowvar=False)

    return calculate_frechet_distance(mu_real, sigma_real, mu_gen, sigma_gen)


**CLIP**

In [None]:
# open_clip model used for the CLIP score: architecture name + pretrained tag.
clip_model_name = "ViT-B-32"
clip_pretrained = "laion2b_s34b_b79k"

# The middle return value is not needed here and is discarded.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    clip_model_name, pretrained=clip_pretrained, device=DEVICE
)

# Tokenizer matching the chosen model.
clip_tokenizer = open_clip.get_tokenizer(clip_model_name)

# Switch to eval mode (the returned module is shown as the cell output).
clip_model.eval()


In [None]:
@torch.no_grad()
def compute_clip_score(image_paths, texts):
    """
    Mean image/text similarity under CLIP.

    image_paths: list of Path
    texts:       list of strings with the same length as image_paths, or a
                 single string that is applied to every image
    return:      float in [0, 1] - the cosine similarity of each pair is
                 rescaled from [-1, 1] to [0, 1] and then averaged
    """
    if isinstance(texts, str):
        texts = [texts] * len(image_paths)
    assert len(image_paths) == len(texts)

    pairs = list(zip(image_paths, texts))
    scores = []

    for path, caption in tqdm(pairs, total=len(image_paths)):
        # One image and one caption at a time, each as a batch of size 1.
        pixels = clip_preprocess(load_pil_image(path)).unsqueeze(0).to(DEVICE)
        tokens = clip_tokenizer([caption]).to(DEVICE)

        image_emb = clip_model.encode_image(pixels)
        text_emb = clip_model.encode_text(tokens)

        # Unit-normalise so the dot product below is the cosine similarity.
        image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

        cosine = (image_emb * text_emb).sum(dim=-1).item()  # in [-1, 1]
        scores.append((cosine + 1) / 2.0)

    return float(np.mean(scores))


In [None]:
def load_prompts(json_path: Path):
    """
    Load the prompts file.

    json_path: path to a JSON file expected to map {filename: prompt}
    return:    the parsed JSON content, or None when the file does not exist

    The file is read as UTF-8 explicitly so prompts with non-ASCII characters
    load the same way regardless of the platform's default encoding.
    """
    if not json_path.exists():
        return None
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


**GPT**

In [None]:

# Example JSON schema for a structured GPT evaluation.
# With "strict": True, OpenAI structured outputs require every property to be
# listed in "required" and "additionalProperties" to be False; otherwise the
# request is rejected.
gpt_schema = {
    "name": "image_evaluation",
    "schema": {
        "type": "object",
        "properties": {
            "score": {
                "type": "number",
                "description": "Evaluation of image, from 0 to 100."
            },
            "reason": {
                "type": "string",
                "description": "Explanation of score."
            }
        },
        "required": ["score", "reason"],
        "additionalProperties": False
    },
    "strict": True,
}

# System prompt for the GPT evaluator. Adjacent string literals are joined
# without any separator, so each sentence needs its own trailing space
# (or newline) to avoid running into the next one.
gpt_system_prompt = (
    "You are an Evaluator of images. "
    "You will be shown an image (and optionally the generation prompt). "
    "You must return a score from 0 to 100 that evaluates quality, consistency with the prompt, and safety.\n"
    "0 = terrible / totally inconsistent or harmful\n"
    "100 = excellent\n"
)


In [None]:

import base64

def encode_image_base64(image_path: Path) -> tuple[str, str]:
    """
    Return (mime_type, base64_string) for the image file at `image_path`.

    The MIME type is taken from the format PIL detects in the file. The
    base64 payload is the file's own bytes: the image is NOT decoded and
    re-encoded, so there is no extra JPEG compression pass and no failure for
    images whose mode cannot be re-saved in their original format.

    Raises ValueError if PIL cannot determine the image format.
    """
    from PIL import Image
    import base64

    # Opening is enough to identify the format; no pixel data is needed here.
    with Image.open(image_path) as img:
        img_format = img.format  # e.g. "PNG", "JPEG", "WEBP"

    if img_format is None:
        raise ValueError(f"Cannot determine the image format of {image_path}")

    # Prefer PIL's registered MIME type; fall back to "image/<format>".
    mime = Image.MIME.get(img_format, f"image/{img_format.lower()}")

    with open(image_path, "rb") as fh:
        img_b64 = base64.b64encode(fh.read()).decode("utf-8")

    return mime, img_b64



In [None]:
def compute_gpt_score(image_path: Path, prompt_text: str | None = None) -> float:
    """
    GPT evaluation using simple text response.
    0-100 score.
    """

    # 1) encode images in base64 + MIME format (png/jpg/webp ecc.)
    mime, img_b64 = encode_image_base64(image_path)
    

    # 2) that is the user content  
    user_content: list[dict] = []

    if prompt_text is not None:
        user_content.append({
            "type": "input_text",
            "text": f"Prompt di generazione: {prompt_text}"
        })

    user_content.append({
        "type": "input_image",
        "image_url": f"data:{mime};base64,{img_b64}"
    })

    # 3) gpt call for only json
    raw = client.responses.create(
        model="gpt-4o-mini",      # modello economico
        input=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "input_text",
                        "text": (
                            "Sei un valutatore di immagini. "
                            "Analizza l'immagine (ed eventualmente il prompt) e restituisci "
                            "SOLO un JSON con questo formato: "
                            "{\"score\": <numero tra 0 e 100>, \"reason\": \"spiegazione breve\"}. "
                            "Non aggiungere altro testo oltre al JSON."
                        )
                    }
                ]
            },
            {
                "role": "user",
                "content": user_content
            }
        ]
    )

    # 4) json
    import json
    text = raw.output_text

    data = json.loads(text)  # se il modello rispetta il JSON

    return float(data["score"])


In [None]:

def compute_gpt_score_dataset(image_paths, prompts_dict=None, max_images=None):
    """
    Average GPT score over a list of images.

    image_paths:  list of Path to evaluate, in order
    prompts_dict: optional {filename: prompt}; images without an entry are
                  scored without a prompt
    max_images:   if given, only the first `max_images` paths are scored
    return:       mean of the individual scores, as a float
    """
    selected = image_paths if max_images is None else image_paths[:max_images]

    def prompt_for(path):
        # Look the prompt up by file name; None when there is no dict or no entry.
        return None if prompts_dict is None else prompts_dict.get(path.name, None)

    scores = [compute_gpt_score(path, prompt_for(path)) for path in tqdm(selected)]
    return float(np.mean(scores))


**Final evaluation: FID, CLIP score, GPT score**

In [None]:
# Number of steered images sent to GPT (each one is a paid API call).
GPT_MAX_IMAGES = 20

# 1) FID between the baseline and the steered images
fid_value = compute_fid(BASELINE_DIR, STEERED_DIR, batch_size=32)
print("FID:", fid_value)

# Debug: show what was found in the steered folder
prompts = load_prompts(PROMPTS_JSON)
steered_paths = list_images(STEERED_DIR)
print("Images in STEERED_DIR:", len(steered_paths))
print("first 5 images:", [p.name for p in steered_paths[:5]])

# 2) CLIP score (only possible when the prompts file exists)
if prompts is not None:
    # Rename ".jpg" keys to ".png" so they can be looked up by image file name
    # (presumably the prompts were keyed by .jpg names while the images are
    # stored as .png - verify against the dataset).
    prompts = {k.replace(".jpg", ".png"): v for k, v in prompts.items()}

    # Fail with an explicit list instead of a bare KeyError on the first miss.
    missing = [p.name for p in steered_paths if p.name not in prompts]
    if missing:
        raise KeyError(
            f"No prompt found for {len(missing)} image(s), e.g. {missing[:5]}"
        )
    steered_texts = [prompts[p.name] for p in steered_paths]

    clip_value = compute_clip_score(steered_paths, steered_texts)
    print("CLIP Score (mean):", clip_value)
else:
    print("CLIP Score skipped: prompts file not found at", PROMPTS_JSON)


# 3) GPT score (prompts are optional here; `prompts` may be None)
gpt_mean = compute_gpt_score_dataset(steered_paths, prompts, max_images=GPT_MAX_IMAGES)
print("GPT Score (mean):", gpt_mean)
