In [None]:
!pip install rouge
!pip install bert-score
!pip install git+https://github.com/salaniz/pycocoevalcap.git
!pip install nltk


In [None]:
# Importing necessary libraries for data processing and evaluation metrics
import pandas as pd
import json
import string
import os
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk import word_tokenize, pos_tag
from rouge import Rouge
from bert_score import score
from tqdm import tqdm
import nltk

# Downloading the NLTK resources used later for tokenizing, POS tagging and METEOR
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

# ===== Configuration =====
# Setting base paths for file operations
BASE_PATH = "/content/drive/MyDrive/"
CSV_PATH = "/content/drive/MyDrive/mscoco_csvs/"  # Adjust if needed

# ===== File paths =====
# Defining file paths for reference and model output data.
# Each variable must point at the CSV of the model it is named after: the
# merge below labels flux_df as "Flux-Dev" and sdxl_df as "sdxl", so a swapped
# file name would silently report each model's scores under the other's name.
ref_file = BASE_PATH + "mscoco_captions.csv"
flux_file = BASE_PATH + "fluxdev_meta.csv"
sd2_file = BASE_PATH + "sd2_meta.csv"
sdxl_file = BASE_PATH + "sdxl_meta.csv"

# ===== Load and merge data =====
print("Loading and merging data...")
# Loading CSV files into pandas DataFrames
ref_df = pd.read_csv(ref_file)
flux_df = pd.read_csv(flux_file)
sd2_df = pd.read_csv(sd2_file)
sdxl_df = pd.read_csv(sdxl_file)

# Merging all dataframes on image_name to create a unified dataset.
# These are inner joins (the pandas default), so only images present in the
# reference file AND in every model file are kept. Each model's
# "Meta Caption" column is renamed to the model label used during evaluation.
merged = ref_df[["image_name", "mscoco_caption"]].merge(
    flux_df[["image_name", "Meta Caption"]].rename(columns={"Meta Caption": "Flux-Dev"}),
    on="image_name"
).merge(
    sd2_df[["image_name", "Meta Caption"]].rename(columns={"Meta Caption": "sd_2"}),
    on="image_name"
).merge(
    sdxl_df[["image_name", "Meta Caption"]].rename(columns={"Meta Caption": "sdxl"}),
    on="image_name"
)

# ===== Utility functions =====
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a caption: lowercase it and remove ASCII punctuation.

    Args:
        text: The raw caption. Non-string values are converted with ``str``.

    Returns:
        The cleaned caption, or ``None`` when the input is missing (NaN/None)
        or contains nothing but whitespace and punctuation.
    """
    if pd.isna(text):
        return None
    # Strip AFTER removing punctuation: deleting a trailing "." or a leading
    # quote can expose whitespace that an earlier strip would have missed.
    cleaned = str(text).lower().translate(_PUNCTUATION_TABLE).strip()
    # An empty result (blank or punctuation-only input) is reported as None,
    # the same way as a missing value.
    return cleaned or None

def spice_score(ref, cand):
    """Compute a SPICE-like F1 score from content-word overlap.

    This is a lightweight approximation, not the official SPICE metric: both
    captions are tokenized and POS-tagged, and only words whose tag starts
    with "NN", "JJ" or "VB" (noun, adjective and verb tags) are compared.

    Args:
        ref: Reference caption string.
        cand: Candidate caption string.

    Returns:
        F1 of the overlap between the two sets of distinct content words, in
        [0.0, 1.0]. Returns 0.0 when either caption has no content words or
        when the two sets share no word.
    """
    content_tags = ("NN", "JJ", "VB")

    # Sets on both sides: the overlap is a set, so the denominators must count
    # distinct words too. Dividing by list lengths would penalize a caption
    # for repeating a word and keep identical captions from scoring 1.0.
    ref_words = {w for w, t in pos_tag(word_tokenize(ref)) if t.startswith(content_tags)}
    cand_words = {w for w, t in pos_tag(word_tokenize(cand)) if t.startswith(content_tags)}

    if not ref_words or not cand_words:
        return 0.0

    overlap = ref_words & cand_words
    if not overlap:
        # Precision and recall are both zero; avoid dividing by zero below.
        return 0.0

    precision = len(overlap) / len(cand_words)
    recall = len(overlap) / len(ref_words)
    return 2 * precision * recall / (precision + recall)

# ===== Initialize metrics =====
# Setting up evaluation metric objects
rouge = Rouge()
smooth = SmoothingFunction().method1
models = ["Flux-Dev", "sd_2", "sdxl"]

# Initializing results dictionary to store evaluation scores
results = {}

# The reference captions and the pair count are the same for every model,
# so they are prepared once. Missing captions become empty strings.
refs = merged["mscoco_caption"].fillna("").astype(str).tolist()
num_images = len(merged)

# Iterating through each model for evaluation
for model in models:
    print(f"\nEvaluating {model}...")

    # Candidate captions of the current model, aligned row-by-row with refs
    cands = merged[model].fillna("").astype(str).tolist()

    # Initializing model results with default values
    model_results = {
        "BLEU-1": 0, "BLEU-4": 0,
        "ROUGE-L_F1": 0, "ROUGE-L_Recall": 0,
        "BERTScore_F1": 0, "BERTScore_Recall": 0,
        "METEOR": 0, "SPICE": 0
    }

    # Nothing to score (e.g. the merge matched no image): keep the zero
    # defaults instead of dividing by zero further down.
    if num_images == 0:
        print(f"No caption pairs available for {model}; reporting zeros.")
        results[model] = model_results
        continue

    # Number of per-pair computations that raised and were therefore scored 0
    failures = {"BLEU-1": 0, "BLEU-4": 0, "ROUGE-L": 0, "METEOR": 0, "SPICE": 0}

    # Initializing cumulative scores for BLEU and ROUGE metrics
    bleu1_total, bleu4_total, rougeL_f1_total, rougeL_recall_total = 0, 0, 0, 0

    # Calculating BLEU and ROUGE scores for each image-caption pair.
    # Scoring is best-effort: a pair whose metric raises contributes 0 to the
    # total. `except Exception` (rather than a bare `except`) keeps
    # KeyboardInterrupt / SystemExit able to stop the run.
    for ref, cand in tqdm(zip(refs, cands), total=num_images, desc=f"{model} BLEU/ROUGE"):
        # Whitespace tokenization, done once per pair and shared by both BLEU calls
        ref_tokens = ref.split()
        cand_tokens = cand.split()

        # Calculating BLEU-1 score
        try:
            bleu1_total += sentence_bleu([ref_tokens], cand_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
        except Exception:
            failures["BLEU-1"] += 1

        # Calculating BLEU-4 score
        try:
            bleu4_total += sentence_bleu([ref_tokens], cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
        except Exception:
            failures["BLEU-4"] += 1

        # Calculating ROUGE-L scores
        try:
            rouge_l = rouge.get_scores(cand, ref)[0]["rouge-l"]
            rouge_f, rouge_r = rouge_l["f"], rouge_l["r"]
        except Exception:
            failures["ROUGE-L"] += 1
        else:
            rougeL_f1_total += rouge_f
            rougeL_recall_total += rouge_r

    # Calculating BERTScore for the entire dataset
    print(f"Calculating BERTScore for {model}...")
    P, R, F1 = score(cands, refs, lang="en", verbose=True)

    # Calculating METEOR and SPICE scores
    meteor_total, spice_total = 0, 0
    valid_pairs = 0

    # Processing each reference-candidate pair for semantic metrics.
    # Pairs with a blank reference or candidate are skipped and do not count
    # towards valid_pairs, the denominator of these two averages.
    for ref, cand in tqdm(zip(refs, cands), total=num_images, desc=f"{model} METEOR/SPICE"):
        if not (ref.strip() and cand.strip()):
            continue

        # Calculating METEOR score
        try:
            meteor_total += meteor_score([ref.split()], cand.split())
        except Exception:
            failures["METEOR"] += 1

        # Calculating SPICE score
        try:
            spice_total += spice_score(ref, cand)
        except Exception:
            failures["SPICE"] += 1

        valid_pairs += 1

    # Making swallowed errors visible instead of letting them silently lower the averages
    failed = {name: count for name, count in failures.items() if count}
    if failed:
        print(f"Warning: {model} pairs whose metric raised and were scored as 0: {failed}")

    # Calculating average scores. BLEU and ROUGE are averaged over all pairs,
    # METEOR and SPICE only over pairs where both captions are non-blank.
    model_results["BLEU-1"] = bleu1_total / num_images
    model_results["BLEU-4"] = bleu4_total / num_images
    model_results["ROUGE-L_F1"] = rougeL_f1_total / num_images
    model_results["ROUGE-L_Recall"] = rougeL_recall_total / num_images
    model_results["BERTScore_F1"] = F1.mean().item()
    model_results["BERTScore_Recall"] = R.mean().item()
    model_results["METEOR"] = meteor_total / valid_pairs if valid_pairs > 0 else 0
    model_results["SPICE"] = spice_total / valid_pairs if valid_pairs > 0 else 0

    # Storing results for the current model
    results[model] = model_results

# ===== Save results =====
print("\nSaving results...")

# The complete per-model score table is written to one JSON document
comprehensive_output = BASE_PATH + "multiscore.json"
with open(comprehensive_output, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)


def _pick_metrics(wanted):
    """Return {model: {metric: score}} limited to the metric names in `wanted`."""
    picked = {}
    for name in models:
        picked[name] = {k: v for k, v in results[name].items() if k in wanted}
    return picked


# Metric subsets, kept as separate tables for compatibility with different evaluation needs
bleu_rouge_bert = _pick_metrics(["BLEU-1", "BLEU-4", "ROUGE-L_F1", "BERTScore_F1"])
recall_metrics = _pick_metrics(["ROUGE-L_Recall", "BERTScore_Recall"])
semantic_metrics = _pick_metrics(["METEOR", "SPICE"])

# Each subset is written to its own JSON file next to the comprehensive one
for filename, payload in (
    ("bleu_rouge_bert_scores.json", bleu_rouge_bert),
    ("recall_scores.json", recall_metrics),
    ("semantic_scores.json", semantic_metrics),
):
    with open(BASE_PATH + filename, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)

# Reporting completion and where the main results were written
print("Comprehensive evaluation completed!")
print(f"Main results saved to: {comprehensive_output}")
print("Individual metric groups also saved for compatibility")

# Console summary: every metric of every model, rounded to four decimals
print("\n=== FINAL RESULTS ===")
for model, metrics in results.items():
    print(f"\n{model}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")