In [None]:
!pip install rouge
!pip install bert-score


In [None]:
import pandas as pd
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from bert_score import score
from tqdm import tqdm

# ===== File paths (Google Drive) =====
# File names are appended to this folder verbatim, so the trailing slash is
# required.
base_path = "/content/drive/MyDrive/"

# Read the reference prompts first, then one caption table per model; the
# file order below matches the names on the left-hand side.
ref_df, flux_df, sd2_df, sdxl_df = (
    pd.read_csv(base_path + csv_name)
    for csv_name in (
        "DrawBenchPrompts.csv",
        "meta_captions_Flux-Dev.csv",
        "meta_captions_sd_2.csv",
        "meta_captions_sdxl.csv",
    )
)

# ===== Merge on image_name =====
# Start from the reference prompt columns and attach one caption column per
# model, renamed from "Meta Caption" to the model's name. merge() is called
# without ``how``, so pandas' default inner join applies: only image_name
# values present in every table remain in ``merged``.
merged = ref_df[["image_name", "Prompts", "Category"]]
for caption_col, caption_df in (
    ("Flux-Dev", flux_df),
    ("sd_2", sd2_df),
    ("sdxl", sdxl_df),
):
    model_captions = caption_df[["image_name", "Meta Caption"]].rename(
        columns={"Meta Caption": caption_col}
    )
    merged = merged.merge(model_captions, on="image_name")

# ===== Metric Setup =====
# Scorer objects created once here and reused for every caption pair below.
# ROUGE scorer: its get_scores() result is read for the "rouge-l" F-measure.
rouge = Rouge()
# Bound ``method1`` of NLTK's SmoothingFunction, passed to sentence_bleu as
# ``smoothing_function`` for both the BLEU-1 and BLEU-4 computations.
smooth = SmoothingFunction().method1

# ===== Initialize accumulators =====
# One entry per evaluated model. Each model gets its own dict holding the
# four reported metrics, all starting at 0.
avg_scores = {
    model_name: dict.fromkeys(
        ("BLEU-1", "BLEU-4", "ROUGE-L_F1", "BERTScore_F1"), 0
    )
    for model_name in ("Flux-Dev", "sd_2", "sdxl")
}

num_images = len(merged)

# ===== Compute BLEU + ROUGE =====
# For every model: score each caption against its reference prompt with
# BLEU-1, BLEU-4 and ROUGE-L F1, score the whole batch with BERTScore, and
# store the per-model averages in ``avg_scores``.

# Averaging over zero rows would otherwise only surface further down as a
# ZeroDivisionError, so fail up front with a message that names the cause.
if num_images == 0:
    raise ValueError(
        "No rows to evaluate: `merged` is empty, so per-model averages are undefined."
    )

# The reference prompts do not depend on the model, so build the list once.
# Missing values become empty strings so every entry is a str.
refs = merged["Prompts"].fillna("").astype(str).tolist()

for model in ["Flux-Dev", "sd_2", "sdxl"]:
    cands = merged[model].fillna("").astype(str).tolist()

    bleu1_total, bleu4_total, rougeL_total = 0, 0, 0
    # Number of metric computations that raised and were counted as 0.
    failed_metrics = 0

    for ref, cand in zip(refs, cands):
        # Whitespace-tokenize once; both BLEU variants share the tokens.
        ref_tokens = [ref.split()]
        cand_tokens = cand.split()

        # The handlers below catch Exception rather than using a bare
        # ``except`` so that KeyboardInterrupt / SystemExit can still stop a
        # long run; a failing metric is still scored as 0 (best effort).

        # BLEU-1
        try:
            bleu1 = sentence_bleu(ref_tokens, cand_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
        except Exception:
            bleu1 = 0.0
            failed_metrics += 1
        bleu1_total += bleu1

        # BLEU-4
        try:
            bleu4 = sentence_bleu(ref_tokens, cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
        except Exception:
            bleu4 = 0.0
            failed_metrics += 1
        bleu4_total += bleu4

        # ROUGE-L F1 (a failure contributes 0 to the total)
        try:
            rouge_scores = rouge.get_scores(cand, ref)[0]
            rougeL_total += rouge_scores["rouge-l"]["f"]
        except Exception:
            failed_metrics += 1

    # Make silently zeroed scores visible, since they pull the averages down.
    if failed_metrics:
        print(f"[{model}] {failed_metrics} metric computation(s) failed and were scored as 0")

    # BERTScore (batch); only the F1 tensor is averaged.
    P, R, F1 = score(cands, refs, lang="en", verbose=True)
    avg_bert_f1 = F1.mean().item()

    # Save averages
    avg_scores[model]["BLEU-1"] = bleu1_total / num_images
    avg_scores[model]["BLEU-4"] = bleu4_total / num_images
    avg_scores[model]["ROUGE-L_F1"] = rougeL_total / num_images
    avg_scores[model]["BERTScore_F1"] = avg_bert_f1

# ===== Save to JSON =====
# Write the per-model averages next to the input CSVs as UTF-8 JSON with a
# 4-space indent; non-ASCII characters are written as-is, not escaped.
out_path = base_path + "caption_eval_avg_scores.json"
with open(out_path, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(avg_scores, indent=4, ensure_ascii=False))

print(f" Average scores saved to {out_path}")
