In [18]:
%load_ext autoreload
%autoreload 2

## Setup

In [19]:
# Add project root (parent directory) to system path (for module imports)
import sys

sys.path.append("../")

In [29]:
import torch
import os
import importlib

import src.models
import src.train
import src.database.image_store

importlib.reload(src.models)
importlib.reload(src.train)
importlib.reload(src.database.image_store)

from src.dataset import CocoDataset, split_coco_annotations
from src.models import RetrievalAugmentedTransformer, TransformerMappingNetwork, ImageCaptioningModel
from src.train import train, train_rat
from src.database.image_store import create_objectbox_store

In [9]:
# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [10]:
# Seed
SEED = 42
# TODO: Set seed for each relevant library (torch, numpy, random, etc.)

## Prepare Datasets

In [11]:
MAX_CAPTION_LENGTH = 50
EMBEDDINGS_PATH = "/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/embeddings/"
ANNOTATIONS_PATH = "/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/annotations/"

In [24]:
# We split the original COCO 2014 training set into a new training and validation set
split_coco_annotations(
    annotations_path=ANNOTATIONS_PATH + "captions_train2014.json",
    output_dir=ANNOTATIONS_PATH,
    split_ratio=0.8,
    seed=SEED,
)

Splitting: 66226 Train images, 16557 Val images.
Created:
- /mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/annotations/train_split.json
- /mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/annotations/val_split.json


In [31]:
# Training Dataset (orig. COCO 2014 TRAIN)
train_dataset = CocoDataset(
    embeddings_path=EMBEDDINGS_PATH + "train_clip_embeddings.pt",
    annotations_path=ANNOTATIONS_PATH + "captions_train2014.json",
    max_length=MAX_CAPTION_LENGTH,
    normalize_embeddings=False,  # `.pt` files already contain normalized embeddings
)

# # Validation Dataset (orig. COCO 2014 TRAIN)
# val_dataset = CocoDataset(
#     embeddings_path=EMBEDDINGS_PATH + "train_val_clip_embeddings.pt",
#     annotations_path=ANNOTATIONS_PATH + "val_split.json",
#     max_length=MAX_CAPTION_LENGTH,
#     normalize_embeddings=False,
# )

# # Test Dataset (orig. COCO 2017 Val)
# test_dataset = CocoDataset(
#     embeddings_path=EMBEDDINGS_PATH + "test_clip_embeddings.pt",
#     annotations_path=ANNOTATIONS_PATH + "captions_val2017.json",
#     max_length=MAX_CAPTION_LENGTH,
#     normalize_embeddings=False,
# )

Dataset ready: 414113 captions.


## Prepare Model

In [32]:
# Models
mapping_network = TransformerMappingNetwork(
    embed_dim=512,  # CLIP embedding dimension
    gpt_dim=768,  # GPT-2 embedding dimension
    prefix_length=40,
    hidden_length=40,
)

model = RetrievalAugmentedTransformer(
    embed_dim=512,
    mapping_network=mapping_network,
    freeze_gpt_weights=True,  # We only fine-tune the mapping network during training
).to(DEVICE)

print(model)



RetrievalAugmentedTransformer(
  (mapping_network): TransformerMappingNetwork(
    (linear): Linear(in_features=512, out_features=30720, bias=True)
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0-7): 8 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (linear1): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=3072, out_features=768, bias=True)
          (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (gpt): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): E

## Train Model

In [33]:
import os
# Add project root (parent directory) to system path (for module imports)
import sys

sys.path.append("../")
import src.database
from src.database.image_store import create_objectbox_store

In [34]:
# Only for RAT: specify ObjectBox database store path and create store.
DB_STORE_PATH = os.path.join(os.path.expanduser("~"), "val_vector_db")

db_store = create_objectbox_store(db_directory=DB_STORE_PATH)

CoreException: 10001 (ILLEGAL_STATE) - Cannot open store: another store is still open using the same path: "/home/hoxiaoyang/val_vector_db"

In [35]:
#query dbstore to get items to see
from src.database.entities import Image, Caption

chunks = db_store.box(Image).get_all()
print(f"Number of items in db store: {len(chunks)}")

Number of items in db store: 5000


In [24]:
# load first image from train_dataset
# try to perform objectbox query

In [36]:
# Train image captioning model
if isinstance(model, RetrievalAugmentedTransformer):

    train_rat(
        train_dataset=train_dataset,
        model=model,
        db_store=db_store,
        top_k=2,
        top_i=10,
        batch_size=64,
        num_epochs=1,
        device=DEVICE,
    )

elif isinstance(model, ImageCaptioningModel):

    train(
        train_dataset=train_dataset,
        model=model, batch_size=64,
        num_epochs=1,
        device=DEVICE
    )


Epoch 1/1:   0%|          | 0/6471 [00:00<?, ?it/s]

Retrieved Images and Similarity Scores:
Filename: 000000013597.jpg, Similarity Score: 0.1263067126274109
Filename: 000000504711.jpg, Similarity Score: 0.17074048519134521
Filename: 000000327780.jpg, Similarity Score: 0.23165571689605713
Filename: 000000182417.jpg, Similarity Score: 0.23378324508666992
Filename: 000000089670.jpg, Similarity Score: 0.2346813678741455
Filename: 000000098633.jpg, Similarity Score: 0.23823601007461548
Filename: 000000157138.jpg, Similarity Score: 0.24878859519958496
Filename: 000000242060.jpg, Similarity Score: 0.24946147203445435
Filename: 000000496954.jpg, Similarity Score: 0.25243818759918213
Filename: 000000136772.jpg, Similarity Score: 0.2532321810722351
Retrieved Images and Similarity Scores:
Filename: 000000067406.jpg, Similarity Score: 0.22996193170547485
Filename: 000000077460.jpg, Similarity Score: 0.2589000463485718
Filename: 000000527528.jpg, Similarity Score: 0.2641037106513977
Filename: 000000159399.jpg, Similarity Score: 0.2676318883895874
Fi

: 