In [19]:
# If your annotation_file/result_file live in Drive, you can mount:
from google.colab import drive
drive.mount('/content/drive')

FOLDERNAME = "cs231n/project/"
assert FOLDERNAME is not None, "[!] Enter the foldername."

import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Process model-customized output captions_eval.json to generate COCO-format annotation json files**

Transform your custom annotation input.
```
[
  {
    "image_path": "/content/drive/My Drive/cs231n/project/test/ALandscapewithaRuinedCastleandaChurch.png",
    "ground_truth": [
      "On a pool are three swans."
    ],
    "baseline_caption": "a painting of a river with a castle in the background",
    "finetuned_caption": "in the foreground, with a view of the countryside beyond, is an oil - on - canvas painting by vincent van gogh"
  }
]
```

to
 1. a COCO-style annotation JSON file("images" + annotations" sections")
 2. Two or more sepearte "result" JSON files in the format that COCO's evalution code expects:
  *   baseline model's captions
  *   fine-tuned model's captions

In [20]:
import json
import os

def convert_custom_to_coco(
    input_path: str,
    groundtruth_annotation_path: str,
    baseline_results_path: str,
    finetuned_results_path: str
):
    """
    Read your custom-format JSON and produce:
      1) a COCO-style ground-truth annotation JSON (written to groundtruth_annotation_path)
      2) a "baseline" result JSON (written to baseline_results_path)
      3) a "finetuned" result JSON (written to finetuned_results_path)

    Args:
        input_path (str):
            Path to your original JSON. It should look like:
            [
                {
                    "image_path": "/content/drive/My Drive/cs231n/project/test/ALandscapewithaRuinedCastleandaChurch.png",
                    "ground_truth": [
                        "On a pool are three swans."
                    ],
                    "baseline_caption": "a painting of a river with a castle in the background",
                    "finetuned_caption": "in the foreground, with a view of the countryside beyond, is an oil-on-canvas painting by vincent van gogh"
                },
                {
                    "image_path": "/content/drive/My Drive/cs231n/project/test/AloneintheWorld_Bouguereau_.png",
                    "ground_truth": [
                        "This indicates that the girl is standing on the Pont de Solférino."
                    ],
                    "baseline_caption": "a painting of a young girl holding a violin",
                    "finetuned_caption": "the painting depicts a young woman holding a violin in front of a cityscape"
                },
                {
                    "image_path": "/content/drive/My Drive/cs231n/project/test/AmericanProgress.png",
                    "ground_truth": [
                        "Progress lays a telegraph wire with one hand and carries a school book in the other.",
                        "As she moves westward, indigenous people and a herd of buffalo are seen fleeing her and the settlers."
                    ],
                    "baseline_caption": "a painting of a woman flying in the air",
                    "finetuned_caption": "in the center of the painting, the woman in the foreground is the subject of the artist's work."
                },
                …
            ]

        groundtruth_annotation_path (str):
            Where to write out the COCO-style ground-truth annotation JSON, e.g.
            "/content/drive/MyDrive/cs231n/project/test/groundtruth_annotations.json".

        baseline_results_path (str):
            Where to write the baseline model’s results JSON, e.g.
            "/content/drive/MyDrive/cs231n/project/test/baseline_results.json".

        finetuned_results_path (str):
            Where to write the finetuned model’s results JSON, e.g.
            "/content/drive/MyDrive/cs231n/project/test/finetuned_results.json".
    """

    # 1) Load your custom-format JSON
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    coco_root = {
        "images": [],
        "annotations": []
    }

    baseline_results = []
    finetuned_results = []

    annotation_id = 0

    # Assign a unique integer ID to each image (start from 1)
    for idx, entry in enumerate(data, start=1):
        image_id = idx
        image_path = entry["image_path"]
        file_name = os.path.basename(image_path)

        # Add this image to the "images" list
        coco_root["images"].append({
            "id": image_id,
            "file_name": file_name
        })

        # Add one annotation object per ground-truth caption
        for gt_caption in entry.get("ground_truth", []):
            annotation_id += 1
            coco_root["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "caption": gt_caption
            })

        # Build the baseline result entry
        baseline_results.append({
            "image_id": image_id,
            "caption": entry["baseline_caption"]
        })

        # Build the finetuned result entry
        finetuned_results.append({
            "image_id": image_id,
            "caption": entry["finetuned_caption"]
        })

    # 2) Write out the COCO-style ground-truth JSON
    with open(groundtruth_annotation_path, "w", encoding="utf-8") as f:
        json.dump(coco_root, f, ensure_ascii=False, indent=2)

    # 3) Write out the baseline results JSON
    with open(baseline_results_path, "w", encoding="utf-8") as f:
        json.dump(baseline_results, f, ensure_ascii=False, indent=2)

    # 4) Write out the finetuned results JSON
    with open(finetuned_results_path, "w", encoding="utf-8") as f:
        json.dump(finetuned_results, f, ensure_ascii=False, indent=2)

    print(f"Wrote ground-truth annotations → {groundtruth_annotation_path}")
    print(f"Wrote baseline results            → {baseline_results_path}")
    print(f"Wrote finetuned results           → {finetuned_results_path}")

In [21]:

# ──────────────────────────────────────────────────────────────────────────────
# Example of how to call the converter in Colab:
#
# Suppose your original custom JSON is:
input_json = "/content/drive/MyDrive/cs231n/project/eval_data/captions_eval.json"

# Define output paths (all on Drive, for instance):
groundtruth_annotation_path = "/content/drive/MyDrive/cs231n/project/eval_data/groundtruth_annotations.json"
baseline_results_path       = "/content/drive/MyDrive/cs231n/project/eval_data/baseline_results.json"
finetuned_results_path      = "/content/drive/MyDrive/cs231n/project/eval_data/finetuned_results.json"

convert_custom_to_coco(
    input_path=input_json,
    groundtruth_annotation_path=groundtruth_annotation_path,
    baseline_results_path=baseline_results_path,
    finetuned_results_path=finetuned_results_path
)

Wrote ground-truth annotations → /content/drive/MyDrive/cs231n/project/eval_data/groundtruth_annotations.json
Wrote baseline results            → /content/drive/MyDrive/cs231n/project/eval_data/baseline_results.json
Wrote finetuned results           → /content/drive/MyDrive/cs231n/project/eval_data/finetuned_results.json


In [22]:
%cd /content/drive/MyDrive/cs231n/project/
!git clone https://github.com/salaniz/pycocoevalcap.git

/content/drive/.shortcut-targets-by-id/1wr3oM_e1S6oqX5xOKf31n-KmlL40G0MY/cs231n/project
fatal: destination path 'pycocoevalcap' already exists and is not an empty directory.


In [23]:
%cd /content/drive/MyDrive/cs231n/project/pycocoevalcap
!pip install -e .
!pip install pycocotools

/content/drive/.shortcut-targets-by-id/1wr3oM_e1S6oqX5xOKf31n-KmlL40G0MY/cs231n/project/pycocoevalcap
Obtaining file:///content/drive/.shortcut-targets-by-id/1wr3oM_e1S6oqX5xOKf31n-KmlL40G0MY/cs231n/project/pycocoevalcap
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: pycocoevalcap
  Attempting uninstall: pycocoevalcap
    Found existing installation: pycocoevalcap 1.2
    Uninstalling pycocoevalcap-1.2:
      Successfully uninstalled pycocoevalcap-1.2
  Running setup.py develop for pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [24]:
%cd /content/drive/MyDrive/cs231n/project/

/content/drive/.shortcut-targets-by-id/1wr3oM_e1S6oqX5xOKf31n-KmlL40G0MY/cs231n/project


Assume you have two JSON files:
- human_annoation.json(COCO-format ground-truth annotations)

- caption_results.json(your generated captions in COCO result format)

You can place/upload these files in your drive /content.

In [25]:
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

def evaluate_captions(annotation_file, result_file):
    """
    Evaluate generated captions against COCO ground-truth annotations.

    Args:
        annotation_file (str): Path to the COCO-format annotation JSON.
        result_file (str): Path to the JSON file containing generated captions.
                           Example format:
                           [
                             {"image_id": 42, "caption": "a man riding a horse"},
                             {"image_id": 73, "caption": "two dogs playing in the park"},
                             ...
                           ]

    Returns:
        dict: A dictionary of evaluation metrics (BLEU, METEOR, ROUGE_L, CIDEr, SPICE, etc.).
    """
    # 1) Load COCO ground-truth annotations
    coco = COCO(annotation_file)

    # 2) Load the generated captions (must match COCO result format)
    cocoRes = coco.loadRes(result_file)

    # 3) Create the COCO evaluator
    cocoEval = COCOEvalCap(coco, cocoRes)

    # 4) Evaluate only on the images present in your result file
    cocoEval.params['image_id'] = cocoRes.getImgIds()

    # 5) Run evaluation
    cocoEval.evaluate()

    # 6) Return the computed metrics as a dictionary
    return cocoEval.eval

COCO’s caption evaluator (COCOEvalCap) by default computes these seven scores for each set of predicted captions:

1. Bleu_1, Bleu_2, Bleu_3, Bleu_4

  BLEU (BiLingual Evaluation Understudy) n-gram precision scores, where Bleu_1 uses unigrams, Bleu_2 uses bigrams, and so on up to Bleu_4 (4-grams). Higher n means a stricter match.

  See the original BLEU paper for details: https://www.aclweb.org/anthology/P02-1040.pdf

  And a summary in the COCO-caption repo:
https://github.com/salaniz/pycocoevalcap/blob/master/pycocoevalcap/bleu/bleu.py

2. METEOR

Stands for “Metric for Evaluation of Translation with Explicit ORdering.” It aligns unigrams by exact, stem, synonym, and paraphrase matches, then computes a precision/recall harmonic mean with a penalty for word-order differences.

More info: https://www.cs.cmu.edu/~alavie/METEOR/

3. ROUGE_L

ROUGE-L measures the longest common subsequence (LCS) between candidate and reference captions, capturing sentence-level structure.

Details: https://aclanthology.org/W04-1013.pdf

4. CIDEr

  “Consensus‐Based Image Description Evaluation.” It weights n-grams by how frequently they appear in the reference corpus (TF-IDF style), then computes a cosine similarity between candidate and references. This tends to reward captions that match the consensus of human annotations.

  Read the CIDEr paper: https://arxiv.org/abs/1411.5726

5. SPICE

  “Semantic Propositional Image Caption Evaluation.” Instead of n-gram overlap, SPICE parses both candidate and reference captions into scene graphs (objects, attributes, relations) and computes an F-score over those. It tends to correlate better with human judgments of semantics.

  See the SPICE paper: https://arxiv.org/abs/1604.08889


In [26]:
#Run evaluations and print raw metric dictionaries

import pandas as pd
from IPython.display import display

# Paths to our JSON files
groundtruth_ann = "/content/drive/MyDrive/cs231n/project/eval_data/groundtruth_annotations.json"
baseline_res    = "/content/drive/MyDrive/cs231n/project/eval_data/baseline_results.json"
finetuned_res   = "/content/drive/MyDrive/cs231n/project/eval_data/finetuned_results.json"

# ----- Baseline Model -----
print("Baseline model evaluation:")
baseline_metrics = evaluate_captions(groundtruth_ann, baseline_res)
print(baseline_metrics)

# ----- Finetuned Model -----
print("\nFinetuned model evaluation:")
finetuned_metrics = evaluate_captions(groundtruth_ann, finetuned_res)
print(finetuned_metrics)

Baseline model evaluation:
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 803, 'reflen': 2007, 'guess': [803, 716, 629, 542], 'correct': [244, 30, 5, 1]}
ratio: 0.4000996512205281
Bleu_1: 0.068
Bleu_2: 0.025
Bleu_3: 0.010
Bleu_4: 0.005
computing METEOR score...
METEOR: 0.041
computing Rouge score...
ROUGE_L: 0.131
computing CIDEr score...
CIDEr: 0.047
computing SPICE score...
SPICE: 0.057
{'Bleu_1': 0.067842677280321, 'Bleu_2': 0.025192423813897684, 'Bleu_3': 0.010404694216117031, 'Bleu_4': 0.004641189954504011, 'METEOR': 0.041186415640672124, 'ROUGE_L': np.float64(0.1306372175038461), 'CIDEr': np.float64(0.046974308715503926), 'SPICE': np.float64(0.05737535715845713)}

Finetuned model evaluation:
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and prepari

In [27]:
# Baseline metrics table
baseline_df = pd.DataFrame.from_dict(baseline_metrics, orient='index', columns=['Score'])
baseline_df.index.name = 'Metric'
print("Baseline Model Metrics:")
display(baseline_df)

# Finetuned metrics table
finetuned_df = pd.DataFrame.from_dict(finetuned_metrics, orient='index', columns=['Score'])
finetuned_df.index.name = 'Metric'
print("\nFinetuned Model Metrics:")
display(finetuned_df)

Baseline Model Metrics:


Unnamed: 0_level_0,Score
Metric,Unnamed: 1_level_1
Bleu_1,0.067843
Bleu_2,0.025192
Bleu_3,0.010405
Bleu_4,0.004641
METEOR,0.041186
ROUGE_L,0.130637
CIDEr,0.046974
SPICE,0.057375



Finetuned Model Metrics:


Unnamed: 0_level_0,Score
Metric,Unnamed: 1_level_1
Bleu_1,0.220258
Bleu_2,0.088089
Bleu_3,0.035669
Bleu_4,0.016642
METEOR,0.06919
ROUGE_L,0.186814
CIDEr,0.122553
SPICE,0.050681


In [29]:
import json
import pandas as pd
from IPython.display import display

# Original JSON paths
orig_gt_path        = "/content/drive/MyDrive/cs231n/project/eval_data/groundtruth_annotations.json"
orig_baseline_path  = "/content/drive/MyDrive/cs231n/project/eval_data/baseline_results.json"
orig_finetuned_path = "/content/drive/MyDrive/cs231n/project/eval_data/finetuned_results.json"

# Paths for filtered outputs
filtered_gt_path        = "/content/drive/MyDrive/cs231n/project/eval_data/gt_at_least5.json"
filtered_baseline_path  = "/content/drive/MyDrive/cs231n/project/eval_data/baseline_at_least5.json"
filtered_finetuned_path = "/content/drive/MyDrive/cs231n/project/eval_data/finetuned_at_least5.json"

# 1) Load the original ground-truth annotations
with open(orig_gt_path, "r", encoding="utf-8") as f:
    coco = json.load(f)

# 2) Count how many captions each image_id has
caption_counts = {}
for ann in coco["annotations"]:
    img_id = ann["image_id"]
    caption_counts[img_id] = caption_counts.get(img_id, 0) + 1

# 3) Identify image_ids with at least 5 captions
valid_ids = {img_id for img_id, cnt in caption_counts.items() if cnt >= 5}

# 4) Print how many images meet this criterion
print(f"Total images in ground-truth: {len({img['id'] for img in coco['images']})}")
print(f"Images with ≥ 5 captions: {len(valid_ids)}")

# 5) Filter the "images" list
filtered_images = [img for img in coco["images"] if img["id"] in valid_ids]

# 6) Filter the "annotations" list
filtered_annotations = [ann for ann in coco["annotations"] if ann["image_id"] in valid_ids]

# 7) Write out the filtered ground-truth JSON
coco_filtered = {
    "images": filtered_images,
    "annotations": filtered_annotations
}
with open(filtered_gt_path, "w", encoding="utf-8") as f:
    json.dump(coco_filtered, f, ensure_ascii=False, indent=2)

print(f"Filtered annotation file written to: {filtered_gt_path}")
print(f"  → Kept {len(filtered_images)} images (each ≥ 5 captions).")

# 8) Load and filter baseline results
with open(orig_baseline_path, "r", encoding="utf-8") as f:
    baseline = json.load(f)
baseline_filtered = [r for r in baseline if r["image_id"] in valid_ids]
with open(filtered_baseline_path, "w", encoding="utf-8") as f:
    json.dump(baseline_filtered, f, ensure_ascii=False, indent=2)
print(f"Filtered baseline results written to: {filtered_baseline_path}")
print(f"  → Kept {len(baseline_filtered)} entries.")

# 9) Load and filter finetuned results
with open(orig_finetuned_path, "r", encoding="utf-8") as f:
    finetuned = json.load(f)
finetuned_filtered = [r for r in finetuned if r["image_id"] in valid_ids]
with open(filtered_finetuned_path, "w", encoding="utf-8") as f:
    json.dump(finetuned_filtered, f, ensure_ascii=False, indent=2)
print(f"Filtered finetuned results written to: {filtered_finetuned_path}")
print(f"  → Kept {len(finetuned_filtered)} entries.")

# 10) Run evaluation on the filtered sets
print("\nBaseline model evaluation (filtered):")
baseline_metrics = evaluate_captions(filtered_gt_path, filtered_baseline_path)
print(baseline_metrics)

print("\nFinetuned model evaluation (filtered):")
finetuned_metrics = evaluate_captions(filtered_gt_path, filtered_finetuned_path)
print(finetuned_metrics)

# 11) Display results as tables
baseline_df = pd.DataFrame.from_dict(baseline_metrics, orient='index', columns=['Score'])
baseline_df.index.name = 'Metric'
print("\nBaseline Model Metrics (filtered):")
display(baseline_df)

finetuned_df = pd.DataFrame.from_dict(finetuned_metrics, orient='index', columns=['Score'])
finetuned_df.index.name = 'Metric'
print("\nFinetuned Model Metrics (filtered):")
display(finetuned_df)

Total images in ground-truth: 87
Images with ≥ 5 captions: 0
Filtered annotation file written to: /content/drive/MyDrive/cs231n/project/eval_data/gt_at_least5.json
  → Kept 0 images (each ≥ 5 captions).
Filtered baseline results written to: /content/drive/MyDrive/cs231n/project/eval_data/baseline_at_least5.json
  → Kept 0 entries.
Filtered finetuned results written to: /content/drive/MyDrive/cs231n/project/eval_data/finetuned_at_least5.json
  → Kept 0 entries.

Baseline model evaluation (filtered):
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...


IndexError: list index out of range