In [None]:
# ==============================
#          IMPORTS
# ==============================
import pandas as pd
import numpy as np
import spacy
from typing import Set, Dict
import re

# ==============================
#        LOAD SPACY MODEL
# ==============================
try:
    # Loading the English language model for NLP processing
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Downloading the model if it's not found locally
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Merging entities to treat multi-word entities as single tokens
nlp.add_pipe("merge_entities")

# ==============================
#        FINE-GRAINED METRICS
# ==============================
class FineGrainedMetrics:
    """
    Defining a class that is calculating fine-grained similarity metrics
    between original and generated captions
    """

    @staticmethod
    def related_to_noun(doc, attribute: str, noun: str) -> bool:
        """
        Checking if a specific attribute is grammatically related to a given noun in the document
        by examining the noun's syntactic subtree
        """
        for token in doc:
            if token.text == noun:
                # Looking for the attribute within the noun's grammatical subtree
                if attribute in [t.text for t in token.subtree]:
                    return True
        return False

    @classmethod
    def color(cls, meta, orig, generated_nouns: Set[str]) -> float:
        """
        Calculating color attribute matching score by finding color adjectives
        that are grammatically connected to common nouns
        """
        # Defining a comprehensive set of color terms
        COLORS = {'red','blue','green','yellow','black','white','gray','grey',
                  'orange','pink','purple','brown','violet','indigo','turquoise',
                  'cyan','magenta'}
        score, original_colors = 0.0, set()

        for noun in generated_nouns:
            # Finding colors in metadata caption that are modifying this noun
            meta_colors = {t.text for t in meta
                           if cls.related_to_noun(meta, t.text, noun) and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            # Finding colors in original caption that are modifying this noun
            orig_colors = {t.text for t in orig
                           if cls.related_to_noun(orig, t.text, noun) and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            original_colors.update(orig_colors)

            # Adding to score for each matching color
            if orig_colors:
                score += len(orig_colors & meta_colors)

        # Returning -1 if no colors found in original, otherwise calculating precision score
        return -1 if not original_colors else score / len(original_colors)

    @classmethod
    def number(cls, meta, orig, generated_nouns: Set[str]) -> float:
        """
        Calculating number/quantity attribute matching by finding numeric modifiers
        and quantifiers connected to common nouns
        """
        # Mapping textual quantity words to their numeric equivalents
        QUANTITY_MAP = {'a':'1','an':'1','the':'1','one':'1','two':'2','three':'3',
                        'couple':'2','few':'3','several':'4','many':'5','dozen':'12'}
        score, original_numbers = 0.0, set()

        for noun in generated_nouns:
            # Extracting numeric modifiers from metadata caption
            meta_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in meta
                         if cls.related_to_noun(meta, t.text, noun) and t.dep_ in {'nummod','det'}}
            # Extracting numeric modifiers from original caption
            orig_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in orig
                         if cls.related_to_noun(orig, t.text, noun) and t.dep_ in {'nummod','det'}}
            original_numbers.update(orig_nums)

            # Adding to score for each matching number
            if orig_nums:
                score += len(orig_nums & meta_nums)

        if not original_numbers:
            return -1

        # Calculating ratio and ensuring it doesn't exceed 1.0
        raw_val = score / len(original_numbers)
        return min(raw_val, 1.0)

    @classmethod
    def text(cls, meta, orig, _: Set[str]) -> float:
        """
        Calculating text/quote matching score by extracting quoted text
        when text indicators are present
        """
        TEXT_INDICATORS = {'written','saying','says','reading','text'}
        QUOTE_PATTERN = r'[\"\'«»"]([^"\'«»"]*)[\"\'«»"]'

        # Checking if original caption contains text indicators
        if any(t.text in TEXT_INDICATORS for t in orig):
            # Extracting all quoted text from both captions
            orig_matches = re.findall(QUOTE_PATTERN, orig.text)
            meta_matches = re.findall(QUOTE_PATTERN, meta.text)

            if not orig_matches:
                return -1

            # Normalizing text by removing spaces and converting to lowercase
            orig_norm = [''.join(s.lower().split()) for s in orig_matches]
            meta_norm = [''.join(s.lower().split()) for s in meta_matches]

            # Counting how many original quotes appear in metadata quotes
            matches = sum(any(o in m for m in meta_norm) for o in orig_norm)
            return matches / len(orig_matches)

        return -1

    @staticmethod
    def extract_spatial_relations(doc):
        """
        Extracting spatial relationships in the form of (subject, preposition, object) tuples
        to understand positional information
        """
        rels = set()
        for token in doc:
            if token.dep_ == 'prep':  # Looking for preposition tokens
                # Finding prepositional objects
                pobjects = [child for child in token.children if child.dep_ == 'pobj']
                if pobjects:
                    pobj = pobjects[0].text
                    subj = None
                    # Finding the subject related to this preposition
                    for anc in token.ancestors:
                        if anc.dep_ in {'nsubj','nsubjpass','dobj','pobj'}:
                            subj = anc.text
                            break
                    if subj:
                        # Storing the spatial relation triple
                        rels.add((subj, token.text, pobj))
        return rels

    @classmethod
    def position(cls, meta, orig, _: Set[str]) -> float:
        """
        Calculating positional relationship matching score by comparing
        spatial relation tuples between captions
        """
        orig_rel = cls.extract_spatial_relations(orig)
        meta_rel = cls.extract_spatial_relations(meta)

        # Returning -1 if no spatial relations in original, otherwise calculating overlap ratio
        return -1 if not orig_rel else len(orig_rel & meta_rel) / len(orig_rel)

# ==============================
#        ANALYZE CAPTION PAIR
# ==============================
def analyze_caption_pair(meta_caption: str, orig_caption: str) -> Dict[str, float]:
    """
    Analyzing a pair of captions (metadata vs original) and calculating
    multiple fine-grained similarity metrics
    """
    # Handling NaN values by converting to empty strings
    meta_caption = str(meta_caption) if pd.notna(meta_caption) else ""
    orig_caption = str(orig_caption) if pd.notna(orig_caption) else ""

    # Processing both captions with spaCy NLP pipeline
    meta_doc, orig_doc = nlp(meta_caption), nlp(orig_caption)

    # Extracting nouns and proper nouns from both captions
    meta_nouns = {t.text for t in meta_doc if t.pos_ in {"NOUN","PROPN"}}
    orig_nouns = {t.text for t in orig_doc if t.pos_ in {"NOUN","PROPN"}}

    # Finding common nouns between both captions
    common_nouns = orig_nouns & meta_nouns

    # Calculating noun recall (object-level similarity)
    noun_recall = len(common_nouns) / len(orig_nouns) if orig_nouns else 0

    # Returning dictionary of all fine-grained metrics
    return {
        "Object FiG": noun_recall,
        "Colour FiG": FineGrainedMetrics.color(meta_doc, orig_doc, common_nouns),
        "Number FiG": FineGrainedMetrics.number(meta_doc, orig_doc, common_nouns),
        "Positional FiG": FineGrainedMetrics.position(meta_doc, orig_doc, common_nouns),
        "Text FiG": FineGrainedMetrics.text(meta_doc, orig_doc, common_nouns)
    }

# ==============================
#       PROCESS MODEL CSV
# ==============================
def process_model_csv(file_path: str):
    """
    Processing a CSV file containing model outputs and calculating
    fine-grained metrics for each caption pair
    """
    # Reading CSV and removing rows with missing prompts or metadata captions
    df = pd.read_csv(file_path).dropna(subset=["Prompts", "Meta Caption"]).reset_index(drop=True)

    # Analyzing each caption pair and storing results
    results = [analyze_caption_pair(row["Meta Caption"], row["Prompts"]) for _, row in df.iterrows()]
    metrics_df = pd.DataFrame(results)

    # Combining original data with calculated metrics
    return pd.concat([df, metrics_df], axis=1)

# ==============================
#      PARSE HUMAN RESPONSES
# ==============================
def parse_human_responses(file_path: str) -> pd.DataFrame:
    """
    Parsing human evaluation data from Excel file and extracting
    alignment judgments from annotators
    """
    df = pd.read_excel(file_path, header=None)
    rows = []

    for col in df.columns:
        # Parsing the caption information cell
        caption_cell = str(df.iloc[0, col])
        match = re.match(r"\[(.*?)\] Image: (\d+) \| Prompt: (.*)", caption_cell)
        if not match:
            continue

        model, image, caption = match.groups()
        human_labels = []

        # Processing responses from two annotators
        for annot_idx in range(1, 3):
            resp_cell = str(df.iloc[annot_idx, col]).strip()
            # Determining if response indicates alignment
            human_label = "Yes" if resp_cell.startswith("Yes") else "No"
            # Extracting which aspects are not aligning (for "No" responses)
            not_aligning = ", ".join(re.findall(r"(Color|Position|Text|Number|Object|Others)", resp_cell)) if human_label=="No" else ""
            human_labels.append(f"{human_label} ({not_aligning})" if not_aligning else human_label)

        rows.append({
            "Model": model,
            "Image": image,
            "Caption": caption,
            "Human_Responses": "; ".join(human_labels)
        })

    return pd.DataFrame(rows)

# ==============================
#          HUMAN MAJORITY
# ==============================
def compute_human_majority(human_df: pd.DataFrame) -> pd.DataFrame:
    """
    Computing majority vote from human annotators for each caption evaluation
    """
    def majority_vote(responses):
        # Counting "Yes" responses and taking majority (>=2 out of 3)
        return 'Yes' if sum([r.startswith('Yes') for r in responses.split("; ")]) >= 2 else 'No'

    human_df['Human_Majority'] = human_df['Human_Responses'].apply(majority_vote)
    return human_df

def merge_with_human(model_df: pd.DataFrame, human_df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Merging model output data with human evaluation data and calculating
    consistency between fine-grained metrics and human judgments
    """
    # Ensuring consistent data types for merging
    model_df['image_name'] = model_df['image_name'].astype(str)
    human_df['Image'] = human_df['Image'].astype(str)

    # Merging model data with human evaluation data
    merged = pd.merge(
        model_df,
        human_df[human_df['Model']==model_name][['Image','Caption','Human_Responses','Human_Majority']],
        left_on=['image_name','Prompts'],
        right_on=['Image','Caption'],
        how='left'
    ).drop(columns=['Image','Caption'])

    # Defining the fine-grained metric columns
    fig_columns = ['Object FiG','Colour FiG','Number FiG','Positional FiG','Text FiG']

    def check_consistency(row):
        """
        Checking consistency between fine-grained metrics and human majority vote:
        - If humans say "Yes" (aligned), expecting all FiG scores = 1
        - If humans say "No" (not aligned), expecting at least one FiG score < 1
        """
        consistency = {}
        for col in fig_columns:
            val = row.get(col, None)
            if val == -1 or pd.isna(val):
                # Marking as None if metric is not applicable
                consistency[col+'_Consistency'] = None
            else:
                # Handling Number FiG separately since it can exceed 1.0
                val_to_check = min(val, 1) if col == 'Number FiG' else val

                if row['Human_Majority'] == 'Yes':
                    # Expecting perfect scores for human-approved alignments
                    consistency[col+'_Consistency'] = 1 if val_to_check == 1 else 0
                else:
                    # Expecting imperfect scores for human-rejected alignments
                    consistency[col+'_Consistency'] = 1 if val_to_check < 1 else 0
        return pd.Series(consistency)

    # Applying consistency check to all rows
    merged[[c+'_Consistency' for c in fig_columns]] = merged.apply(check_consistency, axis=1)
    merged['Model'] = model_name
    return merged

# ==============================
#            MAIN PIPELINE
# ==============================
if __name__ == "__main__":
    """
    Executing the main analysis pipeline that processes model outputs,
    combines with human evaluations, and generates final metrics
    """

    # Processing fine-grained metrics for each model's outputs
    sd2_df = process_model_csv("drawbench_sd2.csv")
    sdxl_df = process_model_csv("drawbench_sdxl.csv")
    flux_df = process_model_csv("drawbench_fluxdev.csv")

    # Parsing and processing human evaluation data
    human_df = parse_human_responses("human_response.xlsx")
    human_df = compute_human_majority(human_df)

    # Merging model metrics with human evaluations
    sd2_merged = merge_with_human(sd2_df, human_df, "SD2")
    sdxl_merged = merge_with_human(sdxl_df, human_df, "SDXL")
    flux_merged = merge_with_human(flux_df, human_df, "Flux-Dev")

    # Combining all model results into final dataframe
    final_df = pd.concat([sd2_merged, sdxl_merged, flux_merged], ignore_index=True)
    final_df.to_csv("drawbench_all_models_fig_detailed.csv", index=False)
    print("Saved -> drawbench_all_models_fig_detailed.csv")

    # Calculating average consistency metrics per model
    fig_consistency_cols = ['Object FiG_Consistency','Colour FiG_Consistency','Number FiG_Consistency',
                            'Positional FiG_Consistency','Text FiG_Consistency']
    avg_consistency = final_df.groupby('Model')[fig_consistency_cols].mean().round(3).reset_index()
    avg_consistency.to_csv("drawbench_avg_fig_consistency_per_model.csv", index=False)
    print("Saved -> drawbench_avg_fig_consistency_per_model.csv")
    print(avg_consistency)

Saved -> drawbench_all_models_fig_detailed.csv
Saved -> drawbench_avg_fig_consistency_per_model.csv
      Model  Object FiG_Consistency  Colour FiG_Consistency  \
0  Flux-Dev                    0.75                   0.000   
1       SD2                    0.95                   0.500   
2      SDXL                    0.90                   0.667   

   Number FiG_Consistency  Positional FiG_Consistency  Text FiG_Consistency  
0                   0.562                       0.800                  1.00  
1                   0.286                       1.000                  1.00  
2                   0.357                       0.909                  0.75  


In [None]:


# ==============================
#      LOAD DETAILED CSV
# ==============================
df = pd.read_csv("drawbench_all_models_fig_detailed.csv")  # Use your detailed CSV

# ==============================
#      LIST OF FIG COLUMNS
# ==============================
fig_cols = ['Object FiG','Colour FiG','Number FiG','Positional FiG','Text FiG']

# Replace -1 with NaN since -1 indicates metric could not be computed
df[fig_cols] = df[fig_cols].replace(-1, pd.NA)

# ==============================
#      COMPUTE AVERAGE FIG SCORES PER MODEL
# ==============================
avg_fig_scores = df.groupby('Model')[fig_cols].mean().round(3).reset_index()

# ==============================
#      SAVE TO CSV
# ==============================
avg_fig_scores.to_csv("drawbench_avg_fig_scores_per_model.csv", index=False)

# ==============================
#      PRINT RESULTS
# ==============================
print("Average FiG scores per model saved as drawbench_avg_fig_scores_per_model.csv")
print(avg_fig_scores)


Average FiG scores per model saved as drawbench_avg_fig_scores_per_model.csv
      Model  Object FiG Colour FiG Number FiG Positional FiG Text FiG
0  Flux-Dev       0.431       0.75     0.8125            0.0     0.25
1       SD2       0.307        0.5   0.642857            0.0      0.0
2      SDXL       0.356        1.0   0.821429            0.0     0.25


In [None]:
# ==============================
#      CHAIR SCORE FUNCTION
# ==============================
# CHAIR score measures object hallucination: fraction of objects in generated caption that are *not* in ground-truth
def compute_chair(meta_caption: str, orig_caption: str) -> float:
    meta_caption = str(meta_caption).lower()
    orig_caption = str(orig_caption).lower()

    # Extract nouns (simplified as words with letters only)
    meta_nouns = set(re.findall(r'\b[a-z]+\b', meta_caption))
    orig_nouns = set(re.findall(r'\b[a-z]+\b', orig_caption))

    if not meta_nouns:
        return -1  # cannot compute
    # Objects in meta that are NOT in original (hallucinated)
    hallucinated = meta_nouns - orig_nouns
    return len(hallucinated) / len(meta_nouns)

# ==============================
#       PROCESS MODEL CSV
# ==============================
def add_chair_score(file_path: str, model_name: str):
    df = pd.read_csv(file_path)
    df['CHAIR'] = df.apply(lambda row: compute_chair(row['Meta Caption'], row['Prompts']), axis=1)
    df['Model'] = model_name
    return df

# ==============================
#         MAIN
# ==============================
if __name__ == "__main__":
    # Process each DrawBench model
    sd2_df = add_chair_score("drawbench_sd2.csv", "SD2")
    sdxl_df = add_chair_score("drawbench_sdxl.csv", "SDXL")
    flux_df = add_chair_score("drawbench_fluxdev.csv", "Flux-Dev")

    # Combine all models
    all_models_chair = pd.concat([sd2_df, sdxl_df, flux_df], ignore_index=True)

    # Save to CSV
    all_models_chair.to_csv("drawbench_all_models_chair_scores.csv", index=False)
    print("CHAIR scores computed and saved to drawbench_all_models_chair_scores.csv")

    # Optional: average CHAIR per model
    avg_chair = all_models_chair.groupby("Model")['CHAIR'].mean().round(3).reset_index()
    print("Average CHAIR scores per model:")
    print(avg_chair)



CHAIR scores computed and saved to drawbench_all_models_chair_scores.csv
Average CHAIR scores per model:
      Model  CHAIR
0  Flux-Dev  0.858
1       SD2  0.885
2      SDXL  0.880


In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from tabulate import tabulate

# ==============================
#       LOAD CSV
# ==============================
df = pd.read_csv("drawbench_all_models_fig_detailed.csv")

# ==============================
#       FiG CATEGORIES
# ==============================
fig_cols = ['Object', 'Colour', 'Number', 'Positional', 'Text']

# ==============================
#   HELPER: PARSE Human_Responses
# ==============================
def parse_human_response(resp):
    if pd.isna(resp) or resp.strip() == "":
        return []
    parts = [p.strip().split('(')[-1].replace(')', '').strip().lower() for p in resp.split(';')]
    return parts

# ==============================
#   FUNCTION TO COMPUTE PRECISION / RECALL / F1
# ==============================
def compute_model_metrics(df, model_name, fig_cols):
    model_df = df[df['Model'] == model_name]
    metrics = {}

    for col in fig_cols:
        col_lower = col.lower()

        # y_true: 1 = no hallucination, 0 = hallucination
        y_true = model_df['Human_Responses'].apply(
            lambda x: 0 if col_lower in parse_human_response(x) else 1
        )

        # y_pred: 1 = model score = 1 (perfect), 0 = anything < 1
        y_pred = model_df[col + ' FiG_Consistency'].apply(
            lambda x: 1 if pd.notna(x) and x == 1 else 0
        )

        # Filter out NaNs
        mask = y_pred.notna() & y_true.notna()
        y_true_filtered = y_true[mask]
        y_pred_filtered = y_pred[mask]

        # Compute metrics
        metrics[col + '_Precision'] = round(
            precision_score(y_true_filtered, y_pred_filtered, zero_division=0), 3
        ) if len(y_true_filtered) > 0 else np.nan

        metrics[col + '_Recall'] = round(
            recall_score(y_true_filtered, y_pred_filtered, zero_division=0), 3
        ) if len(y_true_filtered) > 0 else np.nan

        metrics[col + '_F1'] = round(
            f1_score(y_true_filtered, y_pred_filtered, zero_division=0), 3
        ) if len(y_true_filtered) > 0 else np.nan

    # Overall metrics (mean across columns)
    metrics['Overall_Precision'] = round(np.nanmean([metrics[c+'_Precision'] for c in fig_cols]), 3)
    metrics['Overall_Recall'] = round(np.nanmean([metrics[c+'_Recall'] for c in fig_cols]), 3)
    metrics['Overall_F1'] = round(np.nanmean([metrics[c+'_F1'] for c in fig_cols]), 3)

    metrics['Model'] = model_name
    return metrics

# ==============================
#   BUILD TABLES FOR PRECISION / RECALL / F1
# ==============================
all_models = df['Model'].unique()
results = [compute_model_metrics(df, m, fig_cols) for m in all_models]
results_df = pd.DataFrame(results)

# Overall average across all models
numeric_cols = [c for c in results_df.columns if c != 'Model']
overall_avg = pd.DataFrame([{**results_df[numeric_cols].mean(numeric_only=True).round(3), 'Model': 'Overall Avg (All Models)'}])
results_df = pd.concat([results_df, overall_avg], ignore_index=True)

# ==============================
#   ACCURACY TABLE (original mean of FiG consistency)
# ==============================
fig_consistency_cols = [c + ' FiG_Consistency' for c in fig_cols]

avg_consistency = (
    df.groupby('Model')[fig_consistency_cols]
      .mean(numeric_only=True)
      .round(3)
      .reset_index()
)

avg_consistency['Overall_Accuracy'] = avg_consistency[fig_consistency_cols].mean(axis=1, skipna=True).round(3)

overall_summary = pd.DataFrame({
    'Model': ['Overall Avg (All Models)'],
    **{col: [avg_consistency[col].mean().round(3)] for col in fig_consistency_cols},
    'Overall_Accuracy': [avg_consistency['Overall_Accuracy'].mean().round(3)]
})

# Merge per-model and overall into one table
combined_accuracy = pd.concat([avg_consistency, overall_summary], ignore_index=True)

# ==============================
#   PRINT TABLES
# ==============================
# Accuracy Table
acc_cols = ['Model'] + fig_consistency_cols + ['Overall_Accuracy']
print("\n=== DrawBench FiG Accuracy vs Human Responses ===")
print(tabulate(combined_accuracy[acc_cols].fillna("-"), headers="keys", tablefmt="pretty", showindex=False))

# Precision Table
prec_cols = ['Model'] + [c+'_Precision' for c in fig_cols] + ['Overall_Precision']
print("\n=== DrawBench FiG Precision vs Human Responses ===")
print(tabulate(results_df[prec_cols].fillna("-"), headers="keys", tablefmt="pretty", showindex=False))

# Recall Table
rec_cols = ['Model'] + [c+'_Recall' for c in fig_cols] + ['Overall_Recall']
print("\n=== DrawBench FiG Recall vs Human Responses ===")
print(tabulate(results_df[rec_cols].fillna("-"), headers="keys", tablefmt="pretty", showindex=False))

# F1 Table
f1_cols = ['Model'] + [c+'_F1' for c in fig_cols] + ['Overall_F1']
print("\n=== DrawBench FiG F1 Score vs Human Responses ===")
print(tabulate(results_df[f1_cols].fillna("-"), headers="keys", tablefmt="pretty", showindex=False))



=== DrawBench FiG Accuracy vs Human Responses ===
+--------------------------+------------------------+------------------------+------------------------+----------------------------+----------------------+------------------+
|          Model           | Object FiG_Consistency | Colour FiG_Consistency | Number FiG_Consistency | Positional FiG_Consistency | Text FiG_Consistency | Overall_Accuracy |
+--------------------------+------------------------+------------------------+------------------------+----------------------------+----------------------+------------------+
|         Flux-Dev         |          0.75          |          0.0           |         0.562          |            0.8             |         1.0          |      0.622       |
|           SD2            |          0.95          |          0.5           |         0.286          |            1.0             |         1.0          |      0.747       |
|           SDXL           |          0.9           |         0.667       

In [None]:

import os
import zipfile
from google.colab import files

# Name of the output zip
zip_name = "Drawbench-FiG-human_comparision.zip"

# Find all CSV files in the current directory
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]

# Create the zip and add all CSVs
with zipfile.ZipFile(zip_name, 'w') as zipf:
    for f in csv_files:
        zipf.write(f)

# Trigger download in Colab
files.download(zip_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>