In [15]:
# ==============================
#          IMPORTS
# ==============================
# Importing necessary libraries for handling CSV/Excel files, text processing, and type annotations
import pandas as pd       # Handling tabular data (CSV/Excel) efficiently
import numpy as np        # Handling numerical operations if needed
import spacy              # Performing natural language processing (tokenization, parsing)
from typing import Set, Dict  # Providing type hints for functions and variables
import re                 # Using regular expressions to extract patterns from text

# ==============================
#        LOAD SPACY MODEL
# ==============================
# Trying to load the small English SpaCy model for natural language processing
try:
    nlp = spacy.load("en_core_web_sm")  # Loading pre-trained English model
except OSError:
    # Downloading model if not already installed
    !python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")  # Loading it after download

# Adding a pipeline component to merge multi-token entities into single tokens
# This is helping to treat entities like "New York" or "Red Apple" as single units
nlp.add_pipe("merge_entities")

# ==============================
#        FINE-GRAINED METRICS
# ==============================
# Defining a class for computing object-level metrics (color, number, text, position)
class FineGrainedMetrics:

    # Checking if a given attribute word is related to a noun in the parsed document
    @staticmethod
    def related_to_noun(doc, attribute: str, noun: str) -> bool:
        # Iterating through each token in the document
        for token in doc:
            if token.text == noun:
                # Looking inside the noun's subtree to see if the attribute exists
                if attribute in [t.text for t in token.subtree]:
                    return True
        return False

    # Computing color consistency between meta and original captions
    @classmethod
    def color(cls, meta, orig, generated_nouns: Set[str]) -> float:
        # Defining a set of colors we are considering
        COLORS = {'red','blue','green','yellow','black','white','gray','grey',
                  'orange','pink','purple','brown','violet','indigo','turquoise',
                  'cyan','magenta'}
        score, original_colors = 0.0, set()
        # Iterating over each noun in the generated caption
        for noun in generated_nouns:
            # Extracting colors linked to the noun in meta and original captions
            meta_colors = {t.text for t in meta
                           if cls.related_to_noun(meta, t.text, noun) and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            orig_colors = {t.text for t in orig
                           if cls.related_to_noun(orig, t.text, noun) and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            original_colors.update(orig_colors)
            if orig_colors:
                # Adding overlap between meta and original colors to score
                score += len(orig_colors & meta_colors)
        # Returning -1 if no color is found in original (cannot compute), else computing ratio
        return -1 if not original_colors else score / len(original_colors)

    # Computing number consistency (quantity of objects) between meta and original captions
    @classmethod
    def number(cls, meta, orig, generated_nouns: Set[str]) -> float:
        # Mapping common words to numbers for comparison
        QUANTITY_MAP = {'a':'1','an':'1','the':'1','one':'1','two':'2','three':'3',
                        'couple':'2','few':'3','several':'4','many':'5','dozen':'12'}
        score, original_numbers = 0.0, set()
        for noun in generated_nouns:
            # Extracting numbers associated with nouns in meta and original captions
            meta_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in meta
                         if cls.related_to_noun(meta, t.text, noun) and t.dep_ in {'nummod','det'}}
            orig_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in orig
                         if cls.related_to_noun(orig, t.text, noun) and t.dep_ in {'nummod','det'}}
            original_numbers.update(orig_nums)
            if orig_nums:
                score += len(orig_nums & meta_nums)
        if not original_numbers:
            return -1
        raw_val = score / len(original_numbers)
        return min(raw_val, 1.0)  # <-- FIX: Cap at 1.0 so Number FiG never exceeds 1

    # Computing consistency for textual content (quotes or written words)
    @classmethod
    def text(cls, meta, orig, _: Set[str]) -> float:
        TEXT_INDICATORS = {'written','saying','says','reading','text'}
        QUOTE_PATTERN = r'[\"\'«»“”]([^\"\'«»“”]*)[\"\'«»“”]'
        if any(t.text in TEXT_INDICATORS for t in orig):
            # Extracting quoted text from captions
            orig_matches = re.findall(QUOTE_PATTERN, orig.text)
            meta_matches = re.findall(QUOTE_PATTERN, meta.text)
            if not orig_matches:
                return -1  # Cannot compute if original has no quotes
            # Normalizing text to lowercase and removing spaces for comparison
            orig_norm = [''.join(s.lower().split()) for s in orig_matches]
            meta_norm = [''.join(s.lower().split()) for s in meta_matches]
            # Counting how many original quotes appear in meta
            matches = sum(any(o in m for m in meta_norm) for o in orig_norm)
            return matches / len(orig_matches)
        return -1  # Returning -1 if no textual indicators

    # Extracting spatial relationships (prepositions) to evaluate position consistency
    @staticmethod
    def extract_spatial_relations(doc):
        rels = set()
        for token in doc:
            if token.dep_ == 'prep':
                # Collecting objects related to prepositions
                pobjects = [child for child in token.children if child.dep_ == 'pobj']
                if pobjects:
                    pobj = pobjects[0].text
                    subj = None
                    # Searching for subject/object linked to preposition
                    for anc in token.ancestors:
                        if anc.dep_ in {'nsubj','nsubjpass','dobj','pobj'}:
                            subj = anc.text
                            break
                    if subj:
                        rels.add((subj, token.text, pobj))
        return rels

    # Computing positional consistency
    @classmethod
    def position(cls, meta, orig, _: Set[str]) -> float:
        orig_rel = cls.extract_spatial_relations(orig)
        meta_rel = cls.extract_spatial_relations(meta)
        # Returning -1 if no original relationships exist
        return -1 if not orig_rel else len(orig_rel & meta_rel) / len(orig_rel)

# ==============================
#        ANALYZE CAPTION PAIR
# ==============================
# Comparing meta-generated caption with original to compute all fine-grained metrics
def analyze_caption_pair(meta_caption: str, orig_caption: str) -> Dict[str, float]:
    meta_caption = str(meta_caption) if pd.notna(meta_caption) else ""  # Handling missing values
    orig_caption = str(orig_caption) if pd.notna(orig_caption) else ""
    # Parsing captions using SpaCy NLP model
    meta_doc, orig_doc = nlp(meta_caption), nlp(orig_caption)
    # Extracting nouns and proper nouns
    meta_nouns = {t.text for t in meta_doc if t.pos_ in {"NOUN","PROPN"}}
    orig_nouns = {t.text for t in orig_doc if t.pos_ in {"NOUN","PROPN"}}
    common_nouns = orig_nouns & meta_nouns
    noun_recall = len(common_nouns) / len(orig_nouns) if orig_nouns else 0
    # Returning dictionary of fine-grained metrics
    return {
        "Object FiG": noun_recall,
        "Colour FiG": FineGrainedMetrics.color(meta_doc, orig_doc, common_nouns),
        "Number FiG": FineGrainedMetrics.number(meta_doc, orig_doc, common_nouns),
        "Positional FiG": FineGrainedMetrics.position(meta_doc, orig_doc, common_nouns),
        "Text FiG": FineGrainedMetrics.text(meta_doc, orig_doc, common_nouns)
    }

# ==============================
#       PROCESS MODEL CSV
# ==============================
# Reading model-generated CSV and computing metrics for each row
def process_model_csv(file_path: str):
    df = pd.read_csv(file_path).dropna(subset=["mscoco_caption", "Meta Caption"]).reset_index(drop=True)
    results = [analyze_caption_pair(row["Meta Caption"], row["mscoco_caption"]) for _, row in df.iterrows()]
    metrics_df = pd.DataFrame(results)
    # Combining original CSV data with computed metrics
    return pd.concat([df, metrics_df], axis=1)

# ==============================
#      PARSE HUMAN RESPONSES
# ==============================
# Reading human annotations and organizing them per image and model
def parse_human_responses(file_path: str) -> pd.DataFrame:
    df = pd.read_excel(file_path, header=None)
    rows = []
    for col in df.columns:
        caption_cell = str(df.iloc[0, col])
        match = re.match(r"\[(Flux-Dev|SD2|SDXL)\] Image: (.*?) \| Caption: (.*)", caption_cell)
        if not match:
            continue
        model, image, caption = match.groups()
        human_labels = []
        # Looping over 3 annotators
        for annot_idx in range(1, 4):
            resp_cell = str(df.iloc[annot_idx, col]).strip()
            human_label = "Yes" if resp_cell.startswith("Yes") else "No"
            not_aligning = ", ".join(re.findall(r"(Color|Position|Text|Number|Object|Others)", resp_cell)) if human_label=="No" else ""
            human_labels.append(f"{human_label} ({not_aligning})" if not_aligning else human_label)
        rows.append({
            "Model": model,
            "Image": image,
            "Caption": caption,
            "Human_Responses": "; ".join(human_labels)
        })
    return pd.DataFrame(rows)

# ==============================
#          HUMAN MAJORITY
# ==============================
# Computing majority vote among human annotators
def compute_human_majority(human_df: pd.DataFrame) -> pd.DataFrame:
    def majority_vote(responses):
        # Counting how many annotators said Yes
        return 'Yes' if sum([r.startswith('Yes') for r in responses.split("; ")]) >= 2 else 'No'
    human_df['Human_Majority'] = human_df['Human_Responses'].apply(majority_vote)
    return human_df

# ==============================
#          MERGE & CONSISTENCY
# ==============================
# Merging model metrics with human judgments and computing consistency
def merge_with_human(model_df: pd.DataFrame, human_df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    # Joining datasets on image name and caption text
    merged = pd.merge(
        model_df,
        human_df[human_df['Model']==model_name][['Image','Caption','Human_Responses','Human_Majority']],
        left_on=['image_name','mscoco_caption'],
        right_on=['Image','Caption'],
        how='left'
    ).drop(columns=['Image','Caption'])

    fig_columns = ['Object FiG','Colour FiG','Number FiG','Positional FiG','Text FiG']

    # Computing consistency per FiG metric
    def check_consistency(row):
        consistency = {}
        for col in fig_columns:
            val = row.get(col, None)
            if val == -1 or pd.isna(val):
                # -1 or NaN indicates we cannot evaluate this metric
                consistency[col+'_Consistency'] = None
            else:
                # Special handling for Number FiG: cap at 1
                val_to_check = min(val, 1) if col == 'Number FiG' else val
                if row['Human_Majority'] == 'Yes':
                    consistency[col+'_Consistency'] = 1 if val_to_check == 1 else 0
                else:
                    consistency[col+'_Consistency'] = 1 if val_to_check < 1 else 0
        return pd.Series(consistency)

    # Apply consistency computation to all rows
    merged[[c+'_Consistency' for c in fig_columns]] = merged.apply(check_consistency, axis=1)
    merged['Model'] = model_name
    return merged

# ==============================
#            MAIN PIPELINE
# ==============================
if __name__ == "__main__":

    # --- Step 1: Process model CSVs ---
    sd2_df = process_model_csv("mscoco_sd2_caps.csv")
    sdxl_df = process_model_csv("mscoco_sdxl_caps.csv")
    flux_df = process_model_csv("mscoco_fluxdev_caps.csv")

    # --- Step 2: Parse human responses and compute majority votes ---
    human_df = parse_human_responses("human_responses.xlsx")
    human_df = compute_human_majority(human_df)

    # --- Step 3: Merge metrics with human judgments ---
    sd2_merged = merge_with_human(sd2_df, human_df, "SD2")
    sdxl_merged = merge_with_human(sdxl_df, human_df, "SDXL")
    flux_merged = merge_with_human(flux_df, human_df, "Flux-Dev")

    # --- Step 4: Combine all models into one CSV ---
    final_df = pd.concat([sd2_merged, sdxl_merged, flux_merged], ignore_index=True)
    final_df.to_csv("all_models_human_fig_detailed.csv", index=False)
    print("Final merged CSV with detailed human responses saved as all_models_human_fig_detailed.csv")

    # --- Step 5: Compute average consistency per FiG metric for each model ---
    fig_consistency_cols = ['Object FiG_Consistency','Colour FiG_Consistency','Number FiG_Consistency',
                            'Positional FiG_Consistency','Text FiG_Consistency']
    avg_consistency = final_df.groupby('Model')[fig_consistency_cols].mean().round(3).reset_index()
    avg_consistency.to_csv("avg_fig_consistency_per_model.csv", index=False)
    print("Average consistency per model saved as avg_fig_consistency_per_model.csv")
    print(avg_consistency)


Final merged CSV with detailed human responses saved as all_models_human_fig_detailed.csv
Average consistency per model saved as avg_fig_consistency_per_model.csv
      Model  Object FiG_Consistency  Colour FiG_Consistency  \
0  Flux-Dev                   0.100                   0.500   
1       SD2                   0.556                     NaN   
2      SDXL                   0.800                   0.667   

   Number FiG_Consistency  Positional FiG_Consistency  Text FiG_Consistency  
0                   0.714                       0.000                   NaN  
1                   0.286                       0.667                   NaN  
2                   0.200                       0.500                   NaN  


In [27]:
import pandas as pd

# Load the detailed CSV
df = pd.read_csv("all_models_human_fig_detailed.csv")

# List of FiG metric columns
fig_cols = ['Object FiG','Colour FiG','Number FiG','Positional FiG','Text FiG']

# Replace -1 with NaN (cannot compute)
df[fig_cols] = df[fig_cols].replace(-1, pd.NA)

# Compute average normal FiG scores per model
avg_fig_scores = df.groupby('Model')[fig_cols].mean().round(3).reset_index()

# Save to CSV
avg_fig_scores.to_csv("avg_fig_scores_per_model.csv", index=False)

print("Average FiG scores per model saved as avg_fig_scores_per_model.csv")
print(avg_fig_scores)


Average FiG scores per model saved as avg_fig_scores_per_model.csv
      Model  Object FiG Colour FiG Number FiG Positional FiG Text FiG
0  Flux-Dev       0.465        1.0   0.928571            0.0      NaN
1       SD2       0.509        NaN   0.857143            0.0      NaN
2      SDXL       0.292   0.666667        1.0            0.0      NaN


In [28]:
import pandas as pd
import re

# ==============================
#      CHAIR SCORE FUNCTION
# ==============================
# CHAIR score measures object hallucination: fraction of objects in generated caption that are *not* in ground-truth
def compute_chair(meta_caption: str, orig_caption: str) -> float:
    meta_caption = str(meta_caption).lower()
    orig_caption = str(orig_caption).lower()

    # Extract nouns (very simple version using words with letters only)
    meta_nouns = set(re.findall(r'\b[a-z]+\b', meta_caption))
    orig_nouns = set(re.findall(r'\b[a-z]+\b', orig_caption))

    if not meta_nouns:
        return -1  # cannot compute
    # Objects in meta that are NOT in original (hallucinated)
    hallucinated = meta_nouns - orig_nouns
    return len(hallucinated) / len(meta_nouns)

# ==============================
#       PROCESS MODEL CSV
# ==============================
def add_chair_score(file_path: str, model_name: str):
    df = pd.read_csv(file_path)
    df['CHAIR'] = df.apply(lambda row: compute_chair(row['Meta Caption'], row['mscoco_caption']), axis=1)
    df['Model'] = model_name
    return df

# ==============================
#         MAIN
# ==============================
if __name__ == "__main__":
    sd2_df = add_chair_score("mscoco_sd2_caps.csv", "SD2")
    sdxl_df = add_chair_score("mscoco_sdxl_caps.csv", "SDXL")
    flux_df = add_chair_score("mscoco_fluxdev_caps.csv", "Flux-Dev")

    # Combine all models
    all_models_chair = pd.concat([sd2_df, sdxl_df, flux_df], ignore_index=True)

    # Save to CSV
    all_models_chair.to_csv("all_models_chair_scores.csv", index=False)
    print("CHAIR scores computed and saved to all_models_chair_scores.csv")

    # Optional: average CHAIR per model
    avg_chair = all_models_chair.groupby("Model")['CHAIR'].mean().round(3).reset_index()
    print(avg_chair)


CHAIR scores computed and saved to all_models_chair_scores.csv
      Model  CHAIR
0  Flux-Dev  0.869
1       SD2  0.816
2      SDXL  0.906


In [29]:
import os
import zipfile
from google.colab import files

# Name of the output zip
zip_name = "FiG-human_comparision.zip"

# Find all CSV files in the current directory
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]

# Create the zip and add all CSVs
with zipfile.ZipFile(zip_name, 'w') as zipf:
    for f in csv_files:
        zipf.write(f)

# Trigger download in Colab
files.download(zip_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>