In [27]:
import json

# load json file
def load_json(file_path: str) -> dict:
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

data = load_json("/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/annotations/captions_train2014.json")

In [28]:
data.keys()

dict_keys(['info', 'images', 'licenses', 'annotations'])

In [29]:
data['annotations'][0]

{'image_id': 318556,
 'id': 48,
 'caption': 'A very clean and well decorated empty bathroom'}

In [None]:
import torch

# load the pt file
data_pt = torch.load("/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/embeddings/train_val_clip_embeddings.pt")

In [31]:
# analyze the data_pt structure
data_pt.keys()

dict_keys(['filenames', 'embeddings'])

In [37]:
data_pt['filenames']

['COCO_train2014_000000263229.jpg',
 'COCO_train2014_000000381595.jpg',
 'COCO_train2014_000000147733.jpg',
 'COCO_train2014_000000559395.jpg',
 'COCO_train2014_000000374072.jpg',
 'COCO_train2014_000000233539.jpg',
 'COCO_train2014_000000213863.jpg',
 'COCO_train2014_000000471409.jpg',
 'COCO_train2014_000000487632.jpg',
 'COCO_train2014_000000242092.jpg',
 'COCO_train2014_000000203982.jpg',
 'COCO_train2014_000000540384.jpg',
 'COCO_train2014_000000186883.jpg',
 'COCO_train2014_000000447520.jpg',
 'COCO_train2014_000000316443.jpg',
 'COCO_train2014_000000087507.jpg',
 'COCO_train2014_000000069256.jpg',
 'COCO_train2014_000000259616.jpg',
 'COCO_train2014_000000494409.jpg',
 'COCO_train2014_000000351298.jpg',
 'COCO_train2014_000000464415.jpg',
 'COCO_train2014_000000044954.jpg',
 'COCO_train2014_000000079696.jpg',
 'COCO_train2014_000000096573.jpg',
 'COCO_train2014_000000098257.jpg',
 'COCO_train2014_000000517936.jpg',
 'COCO_train2014_000000368731.jpg',
 'COCO_train2014_00000046213

In [32]:
data_pt['embeddings']

tensor([[ 0.0044,  0.0386,  0.0086,  ...,  0.1112, -0.0252,  0.0167],
        [-0.0074,  0.0594,  0.0263,  ...,  0.0550,  0.0061, -0.0366],
        [ 0.0126,  0.0143,  0.0014,  ...,  0.0509,  0.0260,  0.0367],
        ...,
        [-0.0117,  0.0183, -0.0192,  ...,  0.1000,  0.0164, -0.0103],
        [-0.0308, -0.0024,  0.0178,  ...,  0.0631,  0.0074,  0.0170],
        [ 0.0118,  0.0193, -0.0262,  ...,  0.0361, -0.0006,  0.0091]])

In [33]:
data_pt['filenames']

# extract out the image ids from the filenames
def get_image_id_from_filename(filename: str) -> int:
    """Extracts the image ID from a COCO filename.
    E.g., 'COCO_train2014_000000123456.jpg' -> 123456
    """
    base_name = filename.split('_')[-1]  # '000000123456.jpg'
    image_id_str = base_name.split('.')[0]  # '000000123456'
    return int(image_id_str)

get_image_id_from_filename(data_pt['filenames'][0])

263229

In [34]:
# Create a mapping from image_id to all its captions
from collections import defaultdict

image_to_captions = defaultdict(list)
for ann in data['annotations']:
    image_to_captions[ann['image_id']].append({
        'caption_id': ann['id'],
        'caption': ann['caption']
    })

print(f"Total unique images with captions: {len(image_to_captions)}")
print(f"Sample captions for first image: {list(image_to_captions.values())[0]}")

Total unique images with captions: 82783
Sample captions for first image: [{'caption_id': 48, 'caption': 'A very clean and well decorated empty bathroom'}, {'caption_id': 126, 'caption': 'A blue and white bathroom with butterfly themed wall tiles.'}, {'caption_id': 219, 'caption': 'A bathroom with a border of butterflies and blue paint on the walls above it.'}, {'caption_id': 255, 'caption': 'An angled view of a beautifully decorated bathroom.'}, {'caption_id': 3555, 'caption': 'A clock that blends in with the wall hangs in a bathroom. '}]


In [35]:
# Now we need caption embeddings - load or compute them
# Assuming you have a model to generate embeddings for captions
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()

def get_caption_embedding(caption: str) -> torch.Tensor:
    """Generate embedding for a caption using GPT2."""
    inputs = tokenizer(caption, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the mean of the last hidden state as the embedding
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embedding

In [36]:
# Process all images and their captions
processed_data = []

for filename in data_pt['filenames']:
    image_id = get_image_id_from_filename(filename)
    
    # Get all captions for this image
    captions_info = image_to_captions.get(image_id, [])
    
    if not captions_info:
        continue
    
    # Generate embeddings for each caption
    caption_embeddings = []
    for cap_info in captions_info:
        embedding = get_caption_embedding(cap_info['caption'])
        caption_embeddings.append({
            'caption_id': cap_info['caption_id'],
            'embedding': embedding
        })
    
    # Structure as requested
    processed_data.append({
        'filename': image_id,
        'caption_embeddings': caption_embeddings
    })

print(f"Processed {len(processed_data)} images")

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
# Save the processed data
output_path = "/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/embeddings/caption_embeddings_structured.pt"
torch.save(processed_data, output_path)
print(f"Saved to {output_path}")

# Verify the structure
sample = processed_data[0]
print(f"\nSample structure:")
print(f"Filename (image_id): {sample['filename']}")
print(f"Number of captions: {len(sample['caption_embeddings'])}")
print(f"First caption_id: {sample['caption_embeddings'][0]['caption_id']}")
print(f"Embedding shape: {sample['caption_embeddings'][0]['embedding'].shape}")

In [None]:


run_indexing_pipeline(db_directory="/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/src/database/objectbox_db",
                      image_embedding_file_path="/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/embeddings/train_val_clip_embeddings.pt",
                      caption_embedding_file_path="/mnt/c/Users/hoxia/Documents/NLDeeznuts/gpt2-image-captioning/data/data/coco/embeddings/caption_embeddings_structured.pt"
                      )



ModuleNotFoundError: No module named 'database'