In [1]:
# ==============================
#          IMPORTS
# ==============================
# Importing necessary libraries for data processing and analysis
import pandas as pd  # Using pandas for data manipulation and DataFrame operations
import numpy as np  # Using numpy for numerical computations and array operations
import spacy  # Using spaCy for natural language processing tasks
import re  # Using regular expressions for text pattern matching
from typing import Set, Dict  # Using type hints for better code documentation
from sklearn.metrics import precision_score, recall_score, f1_score  # Importing evaluation metrics
from tabulate import tabulate  # Using tabulate for creating formatted tables

# ==============================
#        LOAD SPACY MODEL
# ==============================
# Attempting to load the English spaCy model with small word vectors
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Downloading the model if it's not available locally
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Adding entity merging to the pipeline for better text analysis
nlp.add_pipe("merge_entities")

# ==============================
#        FINE-GRAINED METRICS
# ==============================
# Defining a class for calculating fine-grained evaluation metrics
class FineGrainedMetrics:
    @staticmethod
    def related_to_noun(doc, attribute: str, noun: str) -> bool:
        # Checking if an attribute is syntactically related to a specific noun
        for token in doc:
            if token.text == noun:
                # Looking for the attribute within the noun's syntactic subtree
                if attribute in [t.text for t in token.subtree]:
                    return True
        return False

    @classmethod
    def color(cls, meta, orig, generated_nouns: Set[str]) -> float:
        # Defining a comprehensive set of color words for detection
        COLORS = {'red','blue','green','yellow','black','white','gray','grey',
                  'orange','pink','purple','brown','violet','indigo','turquoise',
                  'cyan','magenta'}
        score, original_colors = 0.0, set()
        # Analyzing each generated noun for color associations
        for noun in generated_nouns:
            # Extracting colors that are modifying nouns in metadata
            meta_colors = {t.text for t in meta
                           if cls.related_to_noun(meta, t.text, noun)
                           and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            # Extracting colors that are modifying nouns in original text
            orig_colors = {t.text for t in orig
                           if cls.related_to_noun(orig, t.text, noun)
                           and t.dep_ in {'acomp','amod'} and t.text.lower() in COLORS}
            # Collecting all original colors for scoring
            original_colors.update(orig_colors)
            # Calculating score based on color matches
            if orig_colors:
                score += len(orig_colors & meta_colors)
        # Returning -1 if no colors found, otherwise calculating precision
        return -1 if not original_colors else score / len(original_colors)

    @classmethod
    def number(cls, meta, orig, generated_nouns: Set[str]) -> float:
        # Creating mapping from quantity words to numerical values
        QUANTITY_MAP = {'a':'1','an':'1','the':'1','one':'1','two':'2','three':'3',
                        'couple':'2','few':'3','several':'4','many':'5','dozen':'12'}
        score, original_numbers = 0.0, set()
        # Analyzing each noun for numerical quantifiers
        for noun in generated_nouns:
            # Extracting numerical modifiers from metadata
            meta_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in meta
                         if cls.related_to_noun(meta, t.text, noun) and t.dep_ in {'nummod','det'}}
            # Extracting numerical modifiers from original text
            orig_nums = {QUANTITY_MAP.get(t.text.lower(), t.text) for t in orig
                         if cls.related_to_noun(orig, t.text, noun) and t.dep_ in {'nummod','det'}}
            # Collecting all original numbers for scoring
            original_numbers.update(orig_nums)
            # Calculating score based on number matches
            if orig_nums:
                score += len(orig_nums & meta_nums)
        # Handling cases where no numbers are found
        if not original_numbers:
            return -1
        # Calculating precision and ensuring it doesn't exceed 1.0
        raw_val = score / len(original_numbers)
        return min(raw_val, 1.0)

    @classmethod
    def text(cls, meta, orig, _: Set[str]) -> float:
        # Defining words that indicate text content is being described
        TEXT_INDICATORS = {'written','saying','says','reading','text'}
        # Pattern for detecting quoted text in captions
        QUOTE_PATTERN = r'[\"\'«»"]([^"\'«»"]*)[\"\'«»"]'
        # Checking if original caption contains text indicators
        if any(t.text in TEXT_INDICATORS for t in orig):
            # Extracting quoted text from both documents
            orig_matches = re.findall(QUOTE_PATTERN, orig.text)
            meta_matches = re.findall(QUOTE_PATTERN, meta.text)
            if not orig_matches:
                return -1
            # Normalizing text by removing spaces and converting to lowercase
            orig_norm = [''.join(s.lower().split()) for s in orig_matches]
            meta_norm = [''.join(s.lower().split()) for s in meta_matches]
            # Counting how many original quotes appear in metadata
            matches = sum(any(o in m for m in meta_norm) for o in orig_norm)
            return matches / len(orig_matches)
        return -1

    # ==============================
    # UPDATED SPATIAL RELATIONS FOR POSITION
    # ==============================
    @staticmethod
    def extract_spatial_relations(doc):
        """
        Extracting spatial relationships as (subject, preposition, object) tuples.
        Uses token ancestors to find subjects (nsubj, dobj, pobj) for prepositions.
        """
        rels = set()
        # Looking for preposition tokens in the document
        for token in doc:
            if token.dep_ == 'prep':
                # Finding prepositional objects
                for pobj in [c for c in token.children if c.dep_ == 'pobj']:
                    subj = None
                    # Traversing ancestors to find the subject
                    for anc in token.ancestors:
                        if anc.dep_ in {'nsubj', 'nsubjpass', 'dobj', 'pobj'}:
                            subj = anc.lemma_
                            break
                    # Creating spatial relation triple if subject is found
                    if subj:
                        rels.add((subj, token.lemma_, pobj.lemma_))
        return rels

    @classmethod
    def position(cls, meta, orig, _: Set[str]) -> float:
        # Extracting spatial relations from both documents
        orig_rel = cls.extract_spatial_relations(orig)
        meta_rel = cls.extract_spatial_relations(meta)
        if not orig_rel:
            return -1
        # Calculating precision of spatial relation matches
        return len(orig_rel & meta_rel) / len(orig_rel)

# ==============================
#        ANALYZE CAPTION PAIR
# ==============================
def analyze_caption_pair(meta_caption: str, orig_caption: str) -> Dict[str, float]:
    # Converting captions to strings and handling missing values
    meta_caption = str(meta_caption) if pd.notna(meta_caption) else ""
    orig_caption = str(orig_caption) if pd.notna(orig_caption) else ""
    # Processing text with spaCy to create document objects
    meta_doc, orig_doc = nlp(meta_caption), nlp(orig_caption)

    # Object FiG using simpler noun-recall logic
    # Extracting nouns and proper nouns from both documents
    meta_nouns = {t.text for t in meta_doc if t.pos_ in {"NOUN","PROPN"}}
    orig_nouns = {t.text for t in orig_doc if t.pos_ in {"NOUN","PROPN"}}
    # Finding common nouns between original and metadata
    common_nouns = orig_nouns & meta_nouns
    # Calculating noun recall metric
    noun_recall = len(common_nouns) / len(orig_nouns) if orig_nouns else 0

    # Returning dictionary with all fine-grained metrics
    return {
        "Object FiG": noun_recall,
        "Colour FiG": FineGrainedMetrics.color(meta_doc, orig_doc, common_nouns),
        "Number FiG": FineGrainedMetrics.number(meta_doc, orig_doc, common_nouns),
        "Positional FiG": FineGrainedMetrics.position(meta_doc, orig_doc, common_nouns),
        "Text FiG": FineGrainedMetrics.text(meta_doc, orig_doc, common_nouns)
    }

# ==============================
#       PROCESS MODEL CSV
# ==============================
def process_model_csv(file_path: str):
    # Reading CSV file and removing rows with missing prompts or captions
    df = pd.read_csv(file_path).dropna(subset=["Prompts", "Meta Caption"]).reset_index(drop=True)
    # Analyzing each caption pair to compute metrics
    results = [analyze_caption_pair(row["Meta Caption"], row["Prompts"]) for _, row in df.iterrows()]
    # Creating DataFrame from results and combining with original data
    metrics_df = pd.DataFrame(results)
    return pd.concat([df, metrics_df], axis=1)

# ==============================
#      PARSE HUMAN RESPONSES
# ==============================
def parse_human_responses(file_path: str) -> pd.DataFrame:
    # Reading Excel file with human annotation data
    df = pd.read_excel(file_path, header=None)
    rows = []
    # Processing each column in the Excel file
    for col in df.columns:
        caption_cell = str(df.iloc[0, col])
        # Using regex to extract model, image, and caption information
        match = re.match(r"\[(.*?)\] Image: (\d+) \| Prompt: (.*)", caption_cell)
        if not match:
            continue
        model, image, caption = match.groups()
        human_labels = []
        # Processing responses from two human annotators
        for annot_idx in range(1, 3):
            resp_cell = str(df.iloc[annot_idx, col]).strip()
            # Determining if response is positive (Yes) or negative (No)
            human_label = "Yes" if resp_cell.startswith("Yes") else "No"
            # Extracting alignment issues from negative responses
            not_aligning = ", ".join(re.findall(r"(Object|Colour|Number|Position|Text|Others)", resp_cell)) if human_label=="No" else ""
            # Formatting human label with alignment issues
            human_labels.append(f"{human_label} ({not_aligning})" if not_aligning else human_label)
        # Storing parsed data for each image-caption pair
        rows.append({
            "Model": model,
            "Image": image,
            "Caption": caption,
            "Human_Responses": "; ".join(human_labels)
        })
    return pd.DataFrame(rows)

# ==============================
#          HUMAN MAJORITY
# ==============================
def compute_human_majority(human_df: pd.DataFrame) -> pd.DataFrame:
    def majority_vote(responses):
        # Calculating majority vote (Yes if at least 2 out of 2 say Yes)
        return 'Yes' if sum([r.startswith('Yes') for r in responses.split("; ")]) >= 2 else 'No'
    # Applying majority vote to create consolidated human judgment
    human_df['Human_Majority'] = human_df['Human_Responses'].apply(majority_vote)
    return human_df

# ==============================
#       MERGE WITH HUMAN
# ==============================

def merge_with_human(model_df: pd.DataFrame, human_df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    # Ensuring consistent data types for merging
    model_df['image_name'] = model_df['image_name'].astype(str)
    human_df['Image'] = human_df['Image'].astype(str)

    # Merging model outputs with human evaluations
    merged = pd.merge(
        model_df,
        human_df[human_df['Model']==model_name][['Image','Caption','Human_Responses','Human_Majority']],
        left_on=['image_name','Prompts'],
        right_on=['Image','Caption'],
        how='left'
    ).drop(columns=['Image','Caption'])

    # Object FiG alignment
    def check_object_alignment(row):
        # Checking if Object FiG metric aligns with human judgment
        val = row.get("Object FiG", None)
        if val == -1 or pd.isna(val):
            return None
        # For positive human judgment, expecting perfect object recall
        if row['Human_Majority'] == 'Yes':
            return 1 if val == 1 else 0
        # For negative human judgment, expecting imperfect object recall
        else:
            return 1 if val < 1 else 0
    merged["Object FiG_alignment"] = merged.apply(check_object_alignment, axis=1)

    # Other FiG metrics alignment, with special logic for Positional and Text FiG
    fig_columns = ['Colour FiG','Number FiG','Positional FiG','Text FiG']
    def check_other_alignment(row):
        alignment = {}
        human = row.get("Human_Majority", "Yes")
        # Checking alignment for each fine-grained metric
        for col in fig_columns:
            val = row.get(col, None)
            if val == -1 or pd.isna(val):
                alignment[col+'_alignment'] = None
                continue
            val = min(val,1.0)
            # SPECIAL LOGIC for Positional and Text FiG
            if col in ['Positional FiG','Text FiG']:
                # Considering zero score with negative human judgment as aligned
                if val == 0 and human == 'No':
                    alignment[col+'_alignment'] = 1
                    continue
                alignment[col+'_alignment'] = 1 if val == 1 else 0
            else:
                alignment[col+'_alignment'] = 1 if val == 1 else 0
        return pd.Series(alignment)

    # Applying alignment checks to all fine-grained metrics
    merged[[c+'_alignment' for c in fig_columns]] = merged.apply(check_other_alignment, axis=1)
    merged['Model'] = model_name
    return merged


# ==============================
#   HELPER: PARSE HUMAN RESPONSE
# ==============================
def parse_human_response(resp):
    # Parsing human responses to extract alignment issues
    if pd.isna(resp) or resp.strip() == "":
        return []
    # Extracting the alignment issue categories from parentheses
    parts = [p.strip().split('(')[-1].replace(')','').strip().lower() for p in resp.split(';')]
    return parts

# ==============================
#       COMPUTE METRICS PER CATEGORY
# ==============================
def compute_fig_metrics(df, category):
    # Computing evaluation metrics for each fine-grained category
    alignment_col = f"{category} FiG_alignment"
    # Creating true labels from human responses
    y_true = df["Human_Responses"].apply(lambda x: 0 if category.lower() in parse_human_response(x) else 1)
    # Creating predicted labels from alignment scores
    y_pred = df[alignment_col].apply(lambda x: 1 if pd.notna(x) and x == 1 else 0)
    # Filtering out rows with missing values
    mask = y_true.notna() & y_pred.notna()
    y_true, y_pred = y_true[mask], y_pred[mask]
    num_samples = len(y_true)
    # Calculating various evaluation metrics
    accuracy = round(df[alignment_col].mean(), 3)
    precision = round(precision_score(y_true, y_pred, zero_division=0), 3)
    recall = round(recall_score(y_true, y_pred, zero_division=0), 3)
    f1 = round(f1_score(y_true, y_pred, zero_division=0), 3)
    return {"Category": category, "Accuracy":accuracy, "Precision":precision, "Recall":recall, "F1":f1, "Num_Samples":num_samples}

# ==============================
#            MAIN PIPELINE
# ==============================
if __name__ == "__main__":
    # Processing model CSVs for different AI models
    sd2_df = process_model_csv("drawbench_sd2.csv")
    sdxl_df = process_model_csv("drawbench_sdxl.csv")
    flux_df = process_model_csv("drawbench_fluxdev.csv")

    # Parsing and processing human evaluation responses
    human_df = parse_human_responses("human_response.xlsx")
    human_df = compute_human_majority(human_df)

    # Merging model outputs with human evaluations
    sd2_merged = merge_with_human(sd2_df, human_df, "SD2")
    sdxl_merged = merge_with_human(sdxl_df, human_df, "SDXL")
    flux_merged = merge_with_human(flux_df, human_df, "Flux-Dev")

    # Combining all model results into final dataset
    final_df = pd.concat([sd2_merged, sdxl_merged, flux_merged], ignore_index=True)
    final_df.to_csv("drawbench_all_models_fig_detailed.csv", index=False)
    print("Saved -> drawbench_all_models_fig_detailed.csv")

    # Generating per-category filtered CSVs for analysis
    categories = ["Object","Colour","Number","Positional","Text"]
    results = []
    for cat in categories:
        fig_col = f"{cat} FiG"
        alignment_col = f"{cat} FiG_alignment"
        # Creating filtered datasets for each category
        if cat == "Object":
            filtered_df = final_df[["image_name","Prompts","Meta Caption",fig_col,"Human_Responses","Human_Majority",alignment_col]]
        else:
            # Filtering out rows where metric is -1 (not applicable)
            filtered_df = final_df[final_df[fig_col]!=-1][["image_name","Prompts","Meta Caption",fig_col,"Human_Responses","Human_Majority",alignment_col]]
        out_name = f"{cat.lower()}_fig_non-1.csv"
        filtered_df.to_csv(out_name,index=False)
        print(f"Saved {out_name} with {len(filtered_df)} valid rows.")
        # Computing metrics for each category
        metrics = compute_fig_metrics(filtered_df, cat)
        results.append(metrics)

    # Combine and compute overall
    metrics_df = pd.DataFrame(results)
    total_samples = metrics_df["Num_Samples"].sum()
    if total_samples>0:
        # Calculating weighted averages across all categories
        overall = {
            "Category":"Overall Avg",
            "Accuracy": round(np.nansum(metrics_df["Accuracy"]*metrics_df["Num_Samples"])/total_samples,3),
            "Precision": round(np.nansum(metrics_df["Precision"]*metrics_df["Num_Samples"])/total_samples,3),
            "Recall": round(np.nansum(metrics_df["Recall"]*metrics_df["Num_Samples"])/total_samples,3),
            "F1": round(np.nansum(metrics_df["F1"]*metrics_df["Num_Samples"])/total_samples,3),
            "Num_Samples": total_samples
        }
        metrics_df = pd.concat([metrics_df,pd.DataFrame([overall])],ignore_index=True)

    # Printing summary results in formatted table
    print("\n=== DrawBench FiG Metrics Summary ===")
    print(tabulate(metrics_df, headers="keys", tablefmt="pretty", showindex=False))
    metrics_df.to_csv("fig_category_metrics_summary.csv",index=False)
    print("Saved metrics summary to 'fig_category_metrics_summary.csv'")

Saved -> drawbench_all_models_fig_detailed.csv
Saved object_fig_non-1.csv with 60 valid rows.
Saved colour_fig_non-1.csv with 7 valid rows.
Saved number_fig_non-1.csv with 44 valid rows.
Saved positional_fig_non-1.csv with 27 valid rows.
Saved text_fig_non-1.csv with 11 valid rows.

=== DrawBench FiG Metrics Summary ===
+-------------+----------+-----------+--------+-------+-------------+
|  Category   | Accuracy | Precision | Recall |  F1   | Num_Samples |
+-------------+----------+-----------+--------+-------+-------------+
|   Object    |  0.867   |   0.731   | 0.844  | 0.784 |     60      |
|   Colour    |  0.714   |    1.0    | 0.714  | 0.833 |      7      |
|   Number    |   0.75   |   0.97    |  0.78  | 0.865 |     44      |
| Positional  |  0.852   |    1.0    | 0.852  | 0.92  |     27      |
|    Text     |   1.0    |   0.273   |  1.0   | 0.429 |     11      |
| Overall Avg |  0.832   |   0.829   | 0.832  | 0.809 |     149     |
+-------------+----------+-----------+--------+-

In [2]:
import shutil
from google.colab import files
import os

# Set the name of the zip file
zip_name = "all_files.zip"

# Zip the entire runtime folder (current working directory)
shutil.make_archive("all_files", 'zip', os.getcwd())

# Download the zip file
files.download(zip_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>