In [1]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip annotations_trainval2017.zip


--2025-09-25 19:00:32--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.164.113, 3.5.21.203, 16.15.192.185, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.164.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2025-09-25 19:00:35 (96.5 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  


In [1]:
# Training captions file from annotations_trainval2017.zip. The archive is
# unpacked relative to the working directory (annotations/...), so the path is
# kept relative as well instead of assuming the Colab-only "/content" root.
train_captions = "annotations/captions_train2017.json"


In [2]:
import json

Loading Captions

In [3]:
# Parse the COCO captions annotation file into a plain dict.
# JSON text is UTF-8, so the encoding is pinned explicitly rather than left to
# the platform's locale default.
with open(train_captions, "r", encoding="utf-8") as f:
    data = json.load(f)

# Quick structural check: top-level sections, number of caption records, and
# the values of the first record.
print(data.keys())
print(len(data["annotations"]))
print(data["annotations"][0].values())

dict_keys(['info', 'licenses', 'images', 'annotations'])
591753
dict_values([203564, 37, 'A bicycle replica with a clock as the front wheel.'])


In [4]:
# Pull the caption records and the image metadata records out of the parsed
# file, then show the first entry of each to see which fields they carry.
annotations, images = data["annotations"], data["images"]

print(annotations[0])
print(images[0])

{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}
{'license': 3, 'file_name': '000000391895.jpg', 'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg', 'height': 360, 'width': 640, 'date_captured': '2013-11-14 11:18:45', 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', 'id': 391895}


Data Pairing

In [5]:
# Directory the image paths are built against.
image_dir = "/content/train2017"

# One (image_path, caption) tuple per annotation. The file name is the image id
# zero-padded to 12 digits plus ".jpg", e.g. 203564 -> 000000203564.jpg.
pairs = [
    (f"{image_dir}/{ann['image_id']:012d}.jpg", ann["caption"])
    for ann in annotations
]

Data Cleaning

In [6]:
import re

def clean_caption(caption: str) -> str:
    """Normalize a caption for tokenization.

    The text is lowercased, every character other than a-z, 0-9 and
    whitespace is removed (not replaced), and leading/trailing whitespace is
    trimmed. Interior whitespace is left exactly as it was.
    """
    without_symbols = re.sub(r"[^a-z0-9\s]", "", caption.lower())
    return without_symbols.strip()

# Rebuild the pairs with each caption normalized; image paths pass through
# unchanged.
pairs = [(entry[0], clean_caption(entry[1])) for entry in pairs]


In [7]:
# Keep only the caption text (second element of every pair), in pair order.
captions = [entry[1] for entry in pairs]

Tokenization

In [8]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer resources before tokenizing.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# One token list per caption, in the same order as `captions`.
tokenized_captions = list(map(word_tokenize, captions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [11]:
import numpy as np

Text Embedding

In [9]:
from gensim.models import Word2Vec
# Train word vectors on the tokenized captions and persist the model to disk.
w2v_model = Word2Vec(
    sentences=tokenized_captions,
    vector_size=300,  # dimensionality of each word vector
    window=5,
    min_count=2,
    workers=4,
)
w2v_model.save("coco_word2vec.model")

In [10]:
def caption_to_vec(caption_tokens, model):
    """Average the word vectors of one tokenized caption into a single vector.

    Args:
        caption_tokens: iterable of token strings for one caption.
        model: trained model exposing ``model.wv`` (supporting ``token in wv``
            and ``wv[token]``) and ``model.vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in ``model.wv``, or an all-zero
        float32 vector when none of the tokens is in the vocabulary.
    """
    vectors = [model.wv[token] for token in caption_tokens if token in model.wv]
    if not vectors:
        # gensim stores word vectors as float32, but np.zeros defaults to
        # float64; a single float64 row would upcast the whole stacked
        # embedding matrix. Keep the fallback in the same dtype.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [12]:
import numpy as np

In [13]:
# Embed every tokenized caption with the trained model, keeping caption order.
caption_embeddings = list(
    map(lambda tokens: caption_to_vec(tokens, w2v_model), tokenized_captions)
)

print("Number of captions:", len(caption_embeddings))
print("Shape of first embedding:", caption_embeddings[0].shape)

# Sanity check on the first sample: its image path, cleaned caption and the
# leading components of its embedding.
first_path, first_caption = pairs[0]
print("Image path:", first_path)
print("Caption:", first_caption)
print("Embedding vector (first 10 dims):", caption_embeddings[0][:10])

Number of captions: 591753
Shape of first embedding: (300,)
Image path: /content/train2017/000000203564.jpg
Caption: a bicycle replica with a clock as the front wheel
Embedding vector (first 10 dims): [ 0.14693505 -0.4813532  -0.31320772  0.25158104 -0.11906289  0.33852834
  0.00603118 -0.66434944  0.07965587 -0.17264692]


Save Embeddings

In [14]:
# Convert the list of per-caption vectors into a single array and write it to
# disk in NumPy's .npy format.
caption_embeddings = np.array(caption_embeddings)
with open("caption_embeddings.npy", "wb") as out_file:
    np.save(out_file, caption_embeddings)

In [15]:
# To reload the saved embeddings in a later session, uncomment the next line:
# caption_embeddings = np.load("caption_embeddings.npy")
