In [1]:
# Importing required libraries for NLP processing, visualization, data handling and typing
import spacy
from spacy import displacy
import pandas as pd
import re
from typing import List, Set, Tuple, Dict, Optional

# Loading the English language model from spaCy
nlp = spacy.load("en_core_web_sm")
# Adding a pipeline component to merge named entities into single tokens
nlp.add_pipe("merge_entities")

class FineGrainedMetrics:
    """
    A class for evaluating fine-grained matching between original and generated captions.
    Provides methods to compare different aspects like color, number, position, etc.
    """

    @staticmethod
    def related_to_noun(doc: spacy.tokens.Doc, attribute: str, noun: str) -> bool:
        """
        Checking if an attribute is grammatically related to a specific noun in the document.

        Args:
            doc: The spaCy processed document
            attribute: The word/text to check for relation
            noun: The noun to check relation with

        Returns:
            bool: True if attribute is related to noun, False otherwise
        """
        # Iterating through all tokens in the document
        for token in doc:
            # Checking if current token matches the target noun
            if token.text == noun:
                # Collecting all words in the noun's syntactic subtree
                subtree_words = [t.text for t in token.subtree]
                # Verifying if the attribute appears in the subtree
                if attribute in subtree_words:
                    return True
        return False

    @classmethod
    def color(cls, meta: spacy.tokens.Doc, orig: spacy.tokens.Doc, generated_nouns: Set[str]) -> float:
        """
        Comparing color attributes between original and generated captions.

        Args:
            meta: Processed generated caption
            orig: Processed original caption
            generated_nouns: Set of nouns that appear in both captions

        Returns:
            float: Matching score (0-1), or -1 if no colors in original
        """
        # Defining a set of common color terms for comparison
        COLORS = {
            'red', 'blue', 'green', 'yellow', 'black', 'white', 'gray', 'grey',
            'orange', 'pink', 'purple', 'brown', 'violet', 'indigo',
            'turquoise', 'cyan', 'magenta'
        }

        # Initializing score counter and storage for original colors
        color_score = 0.0
        original_colors = set()

        # Checking color attributes for each common noun
        for noun in generated_nouns:
            # Finding color adjectives related to current noun in generated caption
            meta_colors = {
                token.text for token in meta
                if (cls.related_to_noun(meta, token.text, noun) and
                    token.dep_ in {'acomp', 'amod'} and
                    token.text.lower() in COLORS)
            }

            # Finding color adjectives related to current noun in original caption
            orig_colors = {
                token.text for token in orig
                if (cls.related_to_noun(orig, token.text, noun) and
                    token.dep_ in {'acomp', 'amod'} and
                    token.text.lower() in COLORS)
            }

            # Adding found colors to the original colors set
            original_colors.update(orig_colors)

            # Only comparing colors if they exist in the original caption
            if orig_colors:
                # Counting matching colors between original and generated
                color_score += len(orig_colors & meta_colors)

        # Returning -1 if no colors were found in original caption
        if not original_colors:
            return -1

        # Calculating final score as ratio of matches to total original colors
        return color_score / len(original_colors)

    @classmethod
    def number(cls, meta: spacy.tokens.Doc, orig: spacy.tokens.Doc, generated_nouns: Set[str]) -> float:
        """
        Comparing numeric quantities between original and generated captions.

        Args:
            meta: Processed generated caption
            orig: Processed original caption
            generated_nouns: Set of nouns that appear in both captions

        Returns:
            float: Matching score (0-1), or -1 if no numbers in original
        """
        # Creating mapping from words to their numeric equivalents
        QUANTITY_MAP = {
            'a': '1',
            'an': '1',
            'the': '1',
            'one': '1',
            'two': '2',
            'three': '3',
            'couple': '2',
            'few': '3',
            'several': '4',
            'many': '5',
            'dozen': '12'
        }

        # Initializing score counter and storage for original numbers
        number_score = 0.0
        original_numbers = set()

        # Checking numeric quantities for each common noun
        for noun in generated_nouns:
            # Finding numeric modifiers in generated caption
            meta_nums = {
                QUANTITY_MAP.get(token.text.lower(), token.text)
                for token in meta
                if (cls.related_to_noun(meta, token.text, noun) and
                    token.dep_ in {'nummod', 'det'})
            }

            # Finding numeric modifiers in original caption
            orig_nums = {
                QUANTITY_MAP.get(token.text.lower(), token.text)
                for token in orig
                if (cls.related_to_noun(orig, token.text, noun) and
                    token.dep_ in {'nummod', 'det'})
            }

            # Adding found numbers to the original numbers set
            original_numbers.update(orig_nums)

            # Only comparing numbers if they exist in the original caption
            if orig_nums:
                # Counting matching numbers between original and generated
                number_score += len(orig_nums & meta_nums)

        # Returning -1 if no numbers were found in original caption
        if not original_numbers:
            return -1

        # Calculating final score as ratio of matches to total original numbers
        return number_score / len(original_numbers)

    @classmethod
    def text(cls, meta: spacy.tokens.Doc, orig: spacy.tokens.Doc, _: Set[str]) -> float:
        """
        Comparing quoted text between original and generated captions.

        Args:
            meta: Processed generated caption
            orig: Processed original caption
            _: Ignored (for interface consistency)

        Returns:
            float: Matching score (0-1), or -1 if no text in original
        """
        # Defining indicators that text might follow in the caption
        TEXT_INDICATORS = {'written', 'saying', 'says', 'reading', 'text'}
        # Creating pattern to match quoted text segments
        QUOTE_PATTERN = r'[\"\'«»“”]([^\"\'«»“”]*)[\"\'«»“”]'

        # Checking if original caption contains any text indicators
        has_text_indicator = any(token.text in TEXT_INDICATORS for token in orig)

        # Proceeding only if text indicators are present
        if has_text_indicator:
            # Finding all quoted segments in both captions
            orig_matches = re.findall(QUOTE_PATTERN, orig.text)
            meta_matches = re.findall(QUOTE_PATTERN, meta.text)

            # Returning -1 if no quoted text found in original
            if not orig_matches:
                return -1

            # Normalizing text by removing spaces and converting to lowercase
            orig_normalized = [''.join(s.lower().split()) for s in orig_matches]
            meta_normalized = [''.join(s.lower().split()) for s in meta_matches]

            # Counting matches between original and generated quoted text
            matches = 0
            for orig_text in orig_normalized:
                if any(orig_text in meta_text for meta_text in meta_normalized):
                    matches += 1

            # Calculating score as ratio of matches to total original quotes
            return matches / len(orig_matches)

        return -1

    @staticmethod
    def extract_spatial_relations(doc: spacy.tokens.Doc) -> Set[Tuple[str, str, str]]:
        """
        Extracting spatial relationships as (subject, preposition, object) tuples.

        Args:
            doc: Processed spaCy document

        Returns:
            Set of (subject, relation, object) tuples
        """
        relations = set()

        # Analyzing each token in the document
        for token in doc:
            # Looking for prepositional phrases
            if token.dep_ == 'prep':
                # Finding prepositional objects
                pobjects = [child for child in token.children if child.dep_ == 'pobj']

                if pobjects:
                    pobject = pobjects[0].text
                    # Initializing subject as None before searching
                    subject = None

                    # Checking ancestors to find the related subject
                    for ancestor in token.ancestors:
                        if ancestor.dep_ in {'nsubj', 'nsubjpass', 'dobj', 'pobj'}:
                            subject = ancestor.text
                            break

                    # Checking for compound nouns if no subject found yet
                    if subject is None and token.head.dep_ == 'compound':
                        subject = token.head.text

                    # Adding relation if subject was found
                    if subject:
                        relations.add((subject, token.text, pobject))

        return relations

    @classmethod
    def position(cls, meta: spacy.tokens.Doc, orig: spacy.tokens.Doc, _: Set[str]) -> float:
        """
        Comparing spatial relationships between original and generated captions.

        Args:
            meta: Processed generated caption
            orig: Processed original caption
            _: Ignored (for interface consistency)

        Returns:
            float: Matching score (0-1), or -1 if no positions in original
        """
        # Extracting spatial relations from both captions
        orig_relations = cls.extract_spatial_relations(orig)
        meta_relations = cls.extract_spatial_relations(meta)

        # Returning -1 if no spatial relations in original
        if not orig_relations:
            return -1

        # Calculating matching relations between original and generated
        matches = orig_relations & meta_relations
        # Returning score as ratio of matches to total original relations
        return len(matches) / len(orig_relations)

    @classmethod
    def shape(cls, meta: spacy.tokens.Doc, orig: spacy.tokens.Doc, generated_nouns: Set[str]) -> float:
        """
        Comparing shape descriptions between original and generated captions.

        Args:
            meta: Processed generated caption
            orig: Processed original caption
            generated_nouns: Set of nouns that appear in both captions

        Returns:
            float: Matching score (0-1), or -1 if no shapes in original
        """
        # Defining set of common shape descriptors
        SHAPES = {
            'circular', 'round', 'square', 'triangular', 'rectangular',
            'oval', 'hexagonal', 'pentagonal', 'octagonal', 'spherical',
            'cubical', 'cylindrical', 'conical', 'pyramidal', 'flat', 'curved'
        }

        # Initializing score counter and storage for original shapes
        shape_score = 0.0
        original_shapes = set()

        # Checking shape descriptions for each common noun
        for noun in generated_nouns:
            # Finding shape adjectives in generated caption
            meta_shapes = {
                token.text for token in meta
                if (cls.related_to_noun(meta, token.text, noun) and
                    token.dep_ in {'acomp', 'amod'} and
                    token.text.lower() in SHAPES)
            }

            # Finding shape adjectives in original caption
            orig_shapes = {
                token.text for token in orig
                if (cls.related_to_noun(orig, token.text, noun) and
                    token.dep_ in {'acomp', 'amod'} and
                    token.text.lower() in SHAPES)
            }

            # Adding found shapes to the original shapes set
            original_shapes.update(orig_shapes)

            # Counting matching shapes if they exist in original
            if orig_shapes:
                shape_score += len(orig_shapes & meta_shapes)

        # Returning -1 if no shapes were found in original
        if not original_shapes:
            return -1

        # Calculating final score as ratio of matches to total original shapes
        return shape_score / len(original_shapes)


def analyze_caption_pair(meta_caption: str, orig_caption: str) -> Dict[str, float]:
    """
    Analyzing a pair of captions (original and generated) across multiple dimensions.

    Args:
        meta_caption: The generated caption text
        orig_caption: The original prompt text

    Returns:
        Dictionary containing all metrics and their scores
    """
    # Processing both captions with spaCy NLP pipeline
    meta_doc = nlp(meta_caption)
    orig_doc = nlp(orig_caption)

    # Extracting nouns and proper nouns from both captions
    meta_nouns = {token.text for token in meta_doc if token.pos_ in {"NOUN", "PROPN"}}
    orig_nouns = {token.text for token in orig_doc if token.pos_ in {"NOUN", "PROPN"}}

    # Calculating noun recall score
    noun_recall = len(orig_nouns & meta_nouns) / len(orig_nouns) if orig_nouns else 0
    # Identifying missing nouns from generated caption
    missing_nouns = orig_nouns - meta_nouns
    # Finding common nouns between both captions
    common_nouns = orig_nouns & meta_nouns

    # Calculating all fine-grained metrics
    metrics = {
        'noun_recall': noun_recall,
        'missing_nouns': list(missing_nouns),
        'color': FineGrainedMetrics.color(meta_doc, orig_doc, common_nouns),
        'number': FineGrainedMetrics.number(meta_doc, orig_doc, common_nouns),
        'text': FineGrainedMetrics.text(meta_doc, orig_doc, common_nouns),
        'position': FineGrainedMetrics.position(meta_doc, orig_doc, common_nouns),
        'shape': FineGrainedMetrics.shape(meta_doc, orig_doc, common_nouns)
    }

    return metrics


def main():
    """Main execution function to process all caption pairs."""
    # Defining target categories for analysis
    target_categories = ['Colors', 'Positional', 'Counting', 'Descriptions']
    # Loading generated captions and original prompts datasets
    sdxl_captions = pd.read_csv('meta_captions_sdxl.csv')
    prompt_df = pd.read_csv('DrawBenchPrompts.csv')
    # Filtering prompts to only include target categories
    prompt_df = prompt_df.loc[prompt_df['Category'].isin(target_categories)].reset_index()

    # Initializing list to store analysis results
    results = []

    # Processing each caption pair in the dataset
    for _, row in sdxl_captions.iterrows():
        # Analyzing current caption pair
        metrics = analyze_caption_pair(row['Meta Caption'], row['Prompts'])
        # Adding additional metadata to results
        metrics.update({
            'category': row['Category'],
            'original_prompt': row['Prompts'],
            'generated_caption': row['Meta Caption']
        })
        results.append(metrics)

        # Printing progress every 50 processed pairs
        if len(results) % 50 == 0:
            print(f"Processed {len(results)} caption pairs")

    # Converting results to DataFrame and saving to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv('caption_analysis_results.csv', index=False)
    print("Analysis complete. Results saved to caption_analysis_results.csv")


if __name__ == "__main__":
    main()

Processed 50 caption pairs
Processed 100 caption pairs
Processed 150 caption pairs
Processed 200 caption pairs
Analysis complete. Results saved to caption_analysis_results.csv


In [5]:
# Importing required libraries for data analysis and numerical operations
import pandas as pd
from typing import Dict
import numpy as np

def analyze_drawbench_vs_metacaptions(drawbench_csv: str, metacaptions_csv: str) -> Dict[str, float]:
    """
    Comparing DrawBench prompts with their corresponding Meta Captions using fine-grained metrics.

    Args:
        drawbench_csv: Path to DrawBench prompts CSV
        metacaptions_csv: Path to Meta Captions CSV

    Returns:
        Dictionary of average metric scores across all comparisons
    """
    # Loading both datasets into pandas DataFrames
    drawbench = pd.read_csv(drawbench_csv)
    metacaptions = pd.read_csv(metacaptions_csv)

    # Merging datasets on the 'Prompts' column to align matching pairs
    merged = pd.merge(drawbench, metacaptions,
                     left_on='Prompts', right_on='Prompts',
                     how='inner', suffixes=('_db', '_meta'))

    # Initializing dictionary to store metric values for each comparison
    metrics = {
        'Object_FtG': [],
        'Colour_FtG': [],
        'Number_FtG': [],
        'Positional_FtG': [],
        'Text_FtG': []
    }

    # Processing each prompt-caption pair in the merged dataset
    for _, row in merged.iterrows():
        # Processing both texts with spaCy's NLP pipeline
        meta_doc = nlp(row['Meta Caption'])
        orig_doc = nlp(row['Prompts'])

        # Stage 1: Calculating noun recall (Object Fine-to-Grained metric)
        meta_nouns = {t.text for t in meta_doc if t.pos_ in {"NOUN", "PROPN"}}
        orig_nouns = {t.text for t in orig_doc if t.pos_ in {"NOUN", "PROPN"}}
        # Computing recall as ratio of matching nouns to original nouns
        noun_recall = len(orig_nouns & meta_nouns) / len(orig_nouns) if orig_nouns else 0
        metrics['Object_FtG'].append(noun_recall)

        # Stage 2: Calculating fine-grained metrics for shared nouns
        common_nouns = orig_nouns & meta_nouns
        metrics['Colour_FtG'].append(FineGrainedMetrics.color(meta_doc, orig_doc, common_nouns))
        metrics['Number_FtG'].append(FineGrainedMetrics.number(meta_doc, orig_doc, common_nouns))
        metrics['Positional_FtG'].append(FineGrainedMetrics.position(meta_doc, orig_doc, common_nouns))
        metrics['Text_FtG'].append(FineGrainedMetrics.text(meta_doc, orig_doc, common_nouns))

    # Calculating average scores while handling -1 (non-applicable cases)
    results = {}
    for metric, values in metrics.items():
        # Filtering out non-applicable (-1) values
        valid_values = [v for v in values if v != -1]
        # Converting to percentage and storing, or marking as N/A if no valid values
        results[metric] = np.mean(valid_values) * 100 if valid_values else "N/A"

    return results

def format_results_table(results: Dict[str, float]) -> pd.DataFrame:
    """
    Formatting the analysis results into a table matching the reference style.

    Args:
        results: Dictionary of metric scores

    Returns:
        Formatted DataFrame with one row showing the comparison
    """
    # Creating multi-level column structure for the results table
    columns = pd.MultiIndex.from_tuples([
        ('DrawBench vs MetaCaptions', 'Stage-1', 'Object FtG'),
        ('DrawBench vs MetaCaptions', 'Stage-2', 'Colour FtG'),
        ('DrawBench vs MetaCaptions', 'Stage-2', 'Number FtG'),
        ('DrawBench vs MetaCaptions', 'Stage-2', 'Positional FtG'),
        ('DrawBench vs MetaCaptions', 'Stage-2', 'Text FtG')
    ])

    # Creating DataFrame with formatted percentage values
    df = pd.DataFrame([[
        f"{results.get('Object_FtG', 'N/A'):.2f}",
        f"{results.get('Colour_FtG', 'N/A'):.2f}",
        f"{results.get('Number_FtG', 'N/A'):.2f}",
        f"{results.get('Positional_FtG', 'N/A'):.2f}",
        f"{results.get('Text_FtG', 'N/A'):.2f}"
    ]], columns=columns, index=['SDXL'])

    return df

# Example usage:
if __name__ == "__main__":
    # Defining paths to input CSV files
    drawbench_path = "DrawBenchPrompts.csv"
    metacaptions_path = "meta_captions_sdxl.csv"

    # Running the comparison analysis
    comparison_results = analyze_drawbench_vs_metacaptions(drawbench_path, metacaptions_path)

    # Formatting results into table structure
    results_table = format_results_table(comparison_results)

    # Displaying and saving results
    print("Comparison Results:")
    print(results_table)
    # Saving results to CSV file
    results_table.to_csv("drawbench_metacaptions_comparison.csv")
    print("\nResults saved to drawbench_metacaptions_comparison.csv")

Comparison Results:
     DrawBench vs MetaCaptions                                              
                       Stage-1    Stage-2                                   
                    Object FtG Colour FtG Number FtG Positional FtG Text FtG
SDXL                     35.67      72.73     111.07           0.00    23.81

Results saved to drawbench_metacaptions_comparison.csv


In [6]:
from tabulate import tabulate
# Creating a table for results
def display_tabulated_results(results: Dict[str, float]):
    headers = [
        "Stage-1: Object FtG",
        "Stage-2: Colour FtG",
        "Stage-2: Number FtG",
        "Stage-2: Positional FtG",
        "Stage-2: Text FtG"
    ]

    row = [
        f"{results.get('Object_FtG', 'N/A'):.2f}",
        f"{results.get('Colour_FtG', 'N/A'):.2f}",
        f"{results.get('Number_FtG', 'N/A'):.2f}",
        f"{results.get('Positional_FtG', 'N/A'):.2f}",
        f"{results.get('Text_FtG', 'N/A'):.2f}"
    ]

    table = tabulate([row], headers=headers, tablefmt="grid", showindex=["SDXL"])
    print("\n Comparison Table:\n")
    print(table)

# After getting results:
display_tabulated_results(comparison_results)



 Comparison Table:

+------+-----------------------+-----------------------+-----------------------+---------------------------+---------------------+
|      |   Stage-1: Object FtG |   Stage-2: Colour FtG |   Stage-2: Number FtG |   Stage-2: Positional FtG |   Stage-2: Text FtG |
| SDXL |                 35.67 |                 72.73 |                111.07 |                         0 |               23.81 |
+------+-----------------------+-----------------------+-----------------------+---------------------------+---------------------+
