In [6]:
import pandas as pd
import spacy
import re
from typing import Set, Dict, List, Tuple

# Loading the English language model with entity recognition capabilities
nlp = spacy.load("en_core_web_sm")
# Enhancing the pipeline to merge recognized entities into single tokens
nlp.add_pipe("merge_entities")

class FineGrainedMetrics:
    """
    Implementing the fine-grained evaluation metrics as specified in the original paper.
    The class provides methods for comparing various linguistic aspects between captions.
    """

    @classmethod
    def related_to_noun(cls, doc, attr: str, noun: str) -> bool:
        """
        Determining if a given attribute is grammatically associated with a specific noun.
        Examining the syntactic subtree of each noun occurrence to verify attribute relationships.
        """
        for token in doc:
            if token.text == noun:
                subtree = [t.text for t in token.subtree]
                if attr in subtree:
                    return True
        return False

    @classmethod
    def colour(cls, meta, orig, generated_nouns: Set[str]) -> float:
        """
        Calculating color attribute matching between original and generated captions.
        Processing all nouns present in both captions and comparing associated color adjectives.
        Returning -1 when no color attributes exist in the original caption.
        """
        colours = {
            'red', 'blue', 'green', 'yellow', 'black', 'white', 'gray', 'grey',
            'orange', 'pink', 'purple', 'brown', 'violet', 'indigo',
            'turquoise', 'cyan', 'magenta'
        }
        colour_avg = 0.
        orig_colour_set = set()

        for noun in generated_nouns:
            meta_adjectives = {
                token.text for token in meta
                if cls.related_to_noun(meta, token.text, noun)
                and token.dep_ in ['acomp', 'amod']
            }
            orig_adjectives = {
                token.text for token in orig
                if cls.related_to_noun(orig, token.text, noun)
                and token.dep_ in ['acomp', 'amod']
            }

            meta_colours = colours.intersection(meta_adjectives)
            orig_colours = colours.intersection(orig_adjectives)
            orig_colour_set.update(orig_colours)

            if orig_colours:
                colour_avg += len(orig_colours & meta_colours)

        return colour_avg / len(orig_colour_set) if orig_colour_set else -1

    @classmethod
    def number(cls, meta, orig, generated_nouns: Set[str]) -> float:
        """
        Evaluating numerical quantity matching between captions.
        Normalizing textual number representations (e.g., 'a' → '1') before comparison.
        Handling both explicit numerals and quantifier words through a mapping dictionary.
        """
        quantities_map = {
            'a': '1', 'an': '1', 'the': '1', 'couple': '2',
            'dozen': '12', 'one': '1', 'two': '2', 'three': '3'
        }
        num_avg = 0.
        orig_num_set = set()

        for noun in generated_nouns:
            meta_nums = {
                quantities_map.get(token.text.lower(), token.text)
                for token in meta
                if cls.related_to_noun(meta, token.text, noun)
                and token.dep_ in ['nummod', 'det']
            }
            orig_nums = {
                quantities_map.get(token.text.lower(), token.text)
                for token in orig
                if cls.related_to_noun(orig, token.text, noun)
                and token.dep_ in ['nummod', 'det']
            }

            orig_num_set.update(orig_nums)
            if orig_nums:
                num_avg += len(orig_nums & meta_nums)

        return num_avg / len(orig_num_set) if orig_num_set else -1

    @classmethod
    def text(cls, meta, orig, _) -> float:
        """
        Assessing text/quote matching between captions.
        Identifying quoted segments when preceded by specific indicator words.
        Implementing case-insensitive comparison after whitespace normalization.
        """
        indicators = {'written', 'saying', 'says', 'reading', 'text'}
        pattern = r'[\"\'«»"]([^\"\'«»"]*)[\"\'«»"]'

        if any(token.text in indicators for token in orig):
            orig_matches = re.findall(pattern, orig.text)
            meta_matches = re.findall(pattern, meta.text)

            if not orig_matches:
                return -1

            orig_normalized = [''.join(s.lower().split()) for s in orig_matches]
            meta_normalized = [''.join(s.lower().split()) for s in meta_matches]

            matches = sum(
                1 for orig_text in orig_normalized
                if any(orig_text in meta_text for meta_text in meta_normalized)
            )
            return matches / len(orig_matches)
        return -1

    @classmethod
    def extract_triplets(cls, doc) -> Set[Tuple[str, str, str]]:
        """
        Extracting spatial relationships as subject-preposition-object triplets.
        Analyzing dependency parse trees to identify prepositional phrases and their governors.
        """
        triplets = set()
        for token in doc:
            if token.dep_ == 'prep':
                pobjects = [child for child in token.children if child.dep_ == 'pobj']
                if pobjects:
                    subject = None
                    for ancestor in token.ancestors:
                        if ancestor.dep_ in ['nsubj', 'nsubjpass']:
                            subject = ancestor.text
                            break
                    if subject:
                        triplets.add((subject, token.text, pobjects[0].text))
        return triplets

    @classmethod
    def position(cls, meta, orig, _) -> float:
        """
        Computing positional relationship matching using extracted triplets.
        Comparing spatial relations between original and generated captions.
        Only evaluating when spatial relations exist in the original caption.
        """
        orig_triplets = cls.extract_triplets(orig)
        meta_triplets = cls.extract_triplets(meta)

        if not orig_triplets:
            return -1
        return len(orig_triplets & meta_triplets) / len(orig_triplets)

    @classmethod
    def shape(cls, meta, orig, generated_nouns: Set[str]) -> float:
        """
        Evaluating shape attribute matching for shared nouns.
        Utilizing a predefined vocabulary of shape descriptors.
        Considering only adjectives grammatically connected to the target nouns.
        """
        shapes = {
            'circular', 'round', 'square', 'triangular', 'rectangular',
            'oval', 'hexagonal', 'pentagonal', 'octagonal', 'spherical',
            'cubical', 'cylindrical', 'conical', 'pyramidal', 'flat', 'curved'
        }
        shape_avg = 0.
        orig_shape_set = set()

        for noun in generated_nouns:
            meta_shapes = {
                token.text for token in meta
                if cls.related_to_noun(meta, token.text, noun)
                and token.dep_ in ['acomp', 'amod']
            }
            orig_shapes = {
                token.text for token in orig
                if cls.related_to_noun(orig, token.text, noun)
                and token.dep_ in ['acomp', 'amod']
            }

            meta_filtered = shapes.intersection(meta_shapes)
            orig_filtered = shapes.intersection(orig_shapes)
            orig_shape_set.update(orig_filtered)

            if orig_filtered:
                shape_avg += len(orig_filtered & meta_filtered)

        return shape_avg / len(orig_shape_set) if orig_shape_set else -1

def stage_one_metric(meta, orig):
    """
    Performing first-stage evaluation: noun phrase recall analysis.
    Identifying:
    1. The proportion of original nouns present in generated caption (recall)
    2. Nouns missing from the generated caption
    3. Nouns successfully generated (intersection)
    """
    meta_nouns = {token.text for token in meta if token.pos_ in {"NOUN", "PROPN"}}
    orig_nouns = {token.text for token in orig if token.pos_ in {"NOUN", "PROPN"}}
    non_generated = orig_nouns - meta_nouns
    noun_recall = len(orig_nouns & meta_nouns) / len(orig_nouns) if orig_nouns else 0
    generated_nouns = orig_nouns & meta_nouns
    return noun_recall, non_generated, generated_nouns

def stage_two_metric(meta, orig, generated_nouns: Set[str]) -> Dict[str, float]:
    """
    Conducting second-stage fine-grained attribute evaluation.
    Computing five specific metrics for the shared nouns:
    1. Color adjective matching
    2. Numerical quantity matching
    3. Text/quote matching
    4. Positional relationship matching
    5. Shape descriptor matching
    """
    return {
        'color': FineGrainedMetrics.colour(meta, orig, generated_nouns),
        'number': FineGrainedMetrics.number(meta, orig, generated_nouns),
        'text': FineGrainedMetrics.text(meta, orig, generated_nouns),
        'position': FineGrainedMetrics.position(meta, orig, generated_nouns),
        'shape': FineGrainedMetrics.shape(meta, orig, generated_nouns)
    }

def process_caption_comparison(input_csv: str, output_csv: str):
    """
    Executing the complete caption evaluation pipeline:
    1. Loading and processing the input CSV containing caption pairs
    2. Performing linguistic analysis on each caption pair
    3. Calculating both stage-one and stage-two metrics
    4. Saving comprehensive results to output CSV
    5. Generating and displaying average performance metrics
    """
    df = pd.read_csv(input_csv)
    results = []

    for idx, row in df.iterrows():
        # Processing captions through spaCy's NLP pipeline
        meta_blip = nlp(row['blip_caption'])
        meta_vit = nlp(row['vit_caption'])
        orig = nlp(row['mscoco_caption'])

        # Evaluating BLIP model performance
        blip_recall, blip_missing, blip_nouns = stage_one_metric(meta_blip, orig)
        blip_metrics = stage_two_metric(meta_blip, orig, blip_nouns)
        results.append({
            'image_name': row['image_name'],
            'model': 'BLIP',
            'noun_recall': blip_recall,
            **blip_metrics,
            'missing_nouns': ', '.join(blip_missing)
        })

        # Evaluating ViT model performance
        vit_recall, vit_missing, vit_nouns = stage_one_metric(meta_vit, orig)
        vit_metrics = stage_two_metric(meta_vit, orig, vit_nouns)
        results.append({
            'image_name': row['image_name'],
            'model': 'ViT',
            'noun_recall': vit_recall,
            **vit_metrics,
            'missing_nouns': ', '.join(vit_missing)
        })

        if (idx + 1) % 20 == 0:
            print(f"Processed {idx + 1} images")

    # Compiling and saving all evaluation results
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_csv, index=False)

    # Computing and displaying model-wise average metrics
    numeric_cols = results_df.select_dtypes(include=['number']).columns
    avg_results = results_df.groupby('model')[numeric_cols].mean()
    print("\nAverage Metrics:")
    print(avg_results.to_markdown(floatfmt=".2f"))

    return results_df

# Executing the complete evaluation pipeline
results = process_caption_comparison(
    input_csv='mscoco-blip-vit_captions.csv',
    output_csv='mscoco-blip-vit-metrics.csv'
)

Processed 20 images
Processed 40 images
Processed 60 images
Processed 80 images
Processed 100 images
Processed 120 images
Processed 140 images
Processed 160 images
Processed 180 images
Processed 200 images

Average Metrics:
| model   |   noun_recall |   color |   number |   text |   position |   shape |
|:--------|--------------:|--------:|---------:|-------:|-----------:|--------:|
| BLIP    |          0.35 |   -0.89 |     0.54 |  -1.00 |      -0.94 |   -0.99 |
| ViT     |          0.42 |   -0.89 |     0.76 |  -1.00 |      -0.94 |   -0.99 |
