# Model Evaluation

---

## Setup


In [1]:
# Move up to project root directory (parent directory) for module imports
import os

# Only step up when the `src` package is not already visible from the current
# working directory. An unconditional chdir is not idempotent: re-running this
# cell (or launching the kernel from the project root) would climb one level
# too far and break the `src.*` imports used by this notebook.
if not os.path.isdir("src"):
    os.chdir("../")

# Current working directory should now be project root
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\Ryan Lee\Desktop\50.040 Natural Language Processing\gpt2-image-captioning


In [2]:
# Imports

import json
import tempfile

import matplotlib.pyplot as plt
import torch
from transformers import set_seed

from src.dataset import CocoDataset
from src.eval import compute_caption_metrics, evaluate_captions, generate_and_evaluate
from src.models import ImageCaptioningModel, TransformerMappingNetwork
from src.visualize import create_captioning_dataset
from src.utils import load_gpt2_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Device: use CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [4]:
# Seed
SEED = 42

# `set_seed` is a helper that seeds all relevant libraries in a single call
set_seed(seed=SEED)

In [5]:
# Caption length limit shared by the dataset and the generation calls below
MAX_CAPTION_LENGTH = 50

# Locations, relative to the project root (each keeps its trailing slash so
# file names can be appended directly)
DATA_DIR = "coco_data/"
EMBEDDINGS_PATH = f"{DATA_DIR}embeddings/"
ANNOTATIONS_PATH = f"{DATA_DIR}annotations/"
CHECKPOINTS_PATH = "checkpoints/"

# Tokenizer (shared by the dataset here and the captioning model later on)
gpt2_tokenizer = load_gpt2_tokenizer()

# Validation Dataset: precomputed embeddings paired with the val2017 captions
val_embeddings_file = EMBEDDINGS_PATH + "val_clip_embeddings.pt"
val_annotations_file = ANNOTATIONS_PATH + "captions_val2017.json"

val_dataset = CocoDataset(
    tokenizer=gpt2_tokenizer,
    embeddings_path=val_embeddings_file,
    annotations_path=val_annotations_file,
    max_length=MAX_CAPTION_LENGTH,
    normalize_embeddings=False,
)

# Note: No test dataset as test annotations are not publicly available

Dataset ready: 25014 captions.


---

## Prepare Model


In [6]:
# Load Trained Model
#
# Rebuilds the architecture with the hyper-parameters below, restores the
# trained weights from a checkpoint, and puts the model into evaluation mode.

# Transformer Mapping Network Params
EMBED_DIM = 512  # Embedding dimension
GPT_DIM = 768  # GPT-2 embedding dimension
PREFIX_LENGTH = 40
HIDDEN_LENGTH = 40

# Image Captioning Model Params
# Passed as `freeze_gpt_weights`; True presumably keeps GPT-2 frozen, i.e. it
# is NOT fine-tuned alongside the mapping network (confirm against src.models).
FREEZE_GPT_WEIGHTS = True
PREFIX_TASK_PROMPT: str | None = None

# Mapping Network
mapping_network = TransformerMappingNetwork(
    embed_dim=EMBED_DIM,
    gpt_dim=GPT_DIM,
    prefix_length=PREFIX_LENGTH,
    hidden_length=HIDDEN_LENGTH,
)

# Image Captioning Model
model = ImageCaptioningModel(
    mapping_network=mapping_network,
    prefix_task_prompt=PREFIX_TASK_PROMPT,
    tokenizer=gpt2_tokenizer,
    freeze_gpt_weights=FREEZE_GPT_WEIGHTS,
).to(DEVICE)

# Load checkpoint
model.load_saved_parameters(CHECKPOINTS_PATH + "base_model_epoch_1.pth")

# Switch to evaluation mode. Modules start in training mode, which would leave
# the Dropout layers of the mapping network active and make the evaluation
# below noisier than the trained model really is. This is a no-op if the
# generation helpers already switch modes themselves.
model.eval()

# View model architecture
print(model)



ImageCaptioningModel(
  (mapping_network): TransformerMappingNetwork(
    (linear): Linear(in_features=512, out_features=30720, bias=True)
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0-7): 8 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (linear1): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=3072, out_features=768, bias=True)
          (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (gpt): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(

In [8]:
# Quick qualitative check: caption one small, unshuffled batch of validation data
dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
)

# Just extract one batch
batch = next(iter(dataloader))

# Reference captions are plain batch entries, so they are read outside the no-grad scope
actual_captions = batch["caption_text"]

with torch.no_grad():
    image_embeddings = batch["image_embedding"].to(DEVICE)
    # NOTE: despite the `_ids` suffix, the output displayed further down shows
    # decoded caption strings; the name is kept because later cells use it.
    generated_ids = model.generate_captions(
        image_embeddings,
        max_length=MAX_CAPTION_LENGTH,
    )

In [9]:
# Reference captions (`caption_text`) of the sampled batch
actual_captions

['A black Honda motorcycle parked in front of a garage.',
 'A Honda motorcycle parked in a grass driveway',
 'An office cubicle with four different types of computers.',
 'A small closed toilet in a cramped space.']

In [10]:
# Model output for the same batch; shown as caption strings despite the `_ids` name
generated_ids

['A blue motorcycle parked on the ground',
 'A motorcycle parked in the middle of a grassy field',
 'A desk, laptop, computer, telephone, mirror, and all kinds of other objects.',
 'A toilet with two doors on top.']

---

## Validate Model


In [None]:
# Validation Params

# Generation settings, forwarded to `generate_and_evaluate` in the next cell
EVAL_MAX_CAPTION_LENGTH = MAX_CAPTION_LENGTH  # same limit the dataset was built with
EVAL_TEMPERATURE = 1.0
EVAL_TOP_P = 0.9

# Batching
EVAL_BATCH_SIZE = 64

# NOTE(review): not referenced by any visible cell in this notebook; confirm it is still needed
EVAL_EVERY_EPOCH = 1

In [None]:
# Generate captions for the validation set and evaluate them; the helper
# returns the predictions together with the computed metrics.
val_annotations_file = ANNOTATIONS_PATH + "captions_val2017.json"

predictions, metrics = generate_and_evaluate(
    model=model,
    dataset=val_dataset,
    annotations_path=val_annotations_file,
    device=DEVICE,
    batch_size=EVAL_BATCH_SIZE,
    num_workers=0,
    max_length=EVAL_MAX_CAPTION_LENGTH,
    temperature=EVAL_TEMPERATURE,
    top_p=EVAL_TOP_P,
)

In [None]:
# Display both results of the evaluation run as a tuple.
# NOTE(review): `predictions` may be large for the full validation set — consider
# showing `metrics` alone (or a small sample of predictions) before sharing.
predictions, metrics