In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [6]:
import os
import pandas as pd
import json
from nltk import word_tokenize, pos_tag
from tqdm import tqdm

# ===== Paths =====
root_csv = "/content/drive/MyDrive/csvs"
baseline_file = os.path.join(root_csv, "DrawBenchPrompts.csv")

generated_files = {
    "Flux-Dev": "meta_captions_Flux-Dev.csv",
    "SDXL": "meta_captions_sdxl.csv",
    "SD2": "meta_captions_sd_2.csv"
}

output_json = os.path.join(root_csv, "spice_scores.json")

# ===== Load baseline prompts =====
baseline_df = pd.read_csv(baseline_file)
baseline_dict = dict(zip(baseline_df["image_name"], baseline_df["Prompts"]))

# ===== Preprocessing =====
def clean_text(text):
    return str(text).lower().strip()

# ===== SPICE-like F1 calculation =====
def spice_score(ref, cand):
    ref_tokens = [w for w, t in pos_tag(word_tokenize(ref)) if t.startswith(("NN", "JJ", "VB"))]
    cand_tokens = [w for w, t in pos_tag(word_tokenize(cand)) if t.startswith(("NN", "JJ", "VB"))]

    if not ref_tokens or not cand_tokens:
        return 0.0

    overlap = set(ref_tokens) & set(cand_tokens)
    precision = len(overlap) / len(cand_tokens)
    recall = len(overlap) / len(ref_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)  # F1

# ===== Compute for all models =====
average_scores = {}

for model_name, file_name in generated_files.items():
    print(f"Calculating SPICE for {model_name}...")
    file_path = os.path.join(root_csv, file_name)
    df = pd.read_csv(file_path).dropna(subset=["Meta Caption"])

    scores = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        image_id = row["image_name"]
        ref = baseline_dict.get(image_id)
        cand = row["Meta Caption"]
        if ref and cand:
            scores.append(spice_score(clean_text(ref), clean_text(cand)))

    avg_score = sum(scores) / len(scores) if scores else 0.0
    average_scores[model_name] = avg_score
    print(f"SPICE score for {model_name}: {avg_score:.4f}")

# ===== Save JSON =====
with open(output_json, "w") as f:
    json.dump(average_scores, f, indent=4)

print("SPICE scores saved to:", output_json)
print(average_scores)


Calculating SPICE for Flux-Dev...


100%|██████████| 170/170 [00:01<00:00, 146.54it/s]


SPICE score for Flux-Dev: 0.1308
Calculating SPICE for SDXL...


100%|██████████| 200/200 [00:00<00:00, 316.56it/s]


SPICE score for SDXL: 0.1034
Calculating SPICE for SD2...


100%|██████████| 200/200 [00:00<00:00, 456.38it/s]

SPICE score for SD2: 0.0935
SPICE scores saved to: /content/drive/MyDrive/csvs/spice_scores.json
{'Flux-Dev': 0.1308033132278193, 'SDXL': 0.10341236130339217, 'SD2': 0.09354926999176799}



