# downloading the captions train data

## imports 

In [63]:
import os
import json
import urllib.request
import zipfile

## Paths


In [64]:
ANNOTATIONS_ZIP = "annotations_trainval2017.zip"
ANNOTATIONS_URL = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
ANNOTATIONS_DIR = "annotations"
CAPTIONS_FILE_TRAIN = os.path.join(ANNOTATIONS_DIR, "captions_train2017.json")
CAPTIONS_FILE_VAL   = os.path.join(ANNOTATIONS_DIR, "captions_val2017.json")

# download and extract the data

In [65]:
def download_and_extract():
    if not (os.path.exists(CAPTIONS_FILE_TRAIN) and os.path.exists(CAPTIONS_FILE_VAL)):
        print("Downloading COCO annotations (captions)")
        urllib.request.urlretrieve(ANNOTATIONS_URL, ANNOTATIONS_ZIP)
        print("Extracting")
        with zipfile.ZipFile(ANNOTATIONS_ZIP, 'r') as zip_ref:
            zip_ref.extractall(".")
        print("Done")
    else:
        print("Captions already downloaded")

# loading the data

In [66]:
def load_captions(split='train'):
    """split should be 'train' or 'val'"""
    download_and_extract()
    path = CAPTIONS_FILE_TRAIN if split == 'train' else CAPTIONS_FILE_VAL
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

In [67]:
if __name__ == "__main__":
    data_train = load_captions('train')
    print("Train captions count:", len(data_train["annotations"]))
    print("One example:", data_train["annotations"][0])

Downloading COCO annotations (captions)


KeyboardInterrupt: 

# Extract captions

###### we extracting the captions into data so it will be easier to use in pre- processing

In [None]:
import os
import json

In [None]:
captions = []

for ann in data_train["annotations"] : 
    captions.append(ann["caption"])

print(len(captions))
captions[:5]

591753


['A bicycle replica with a clock as the front wheel.',
 'A room with blue walls and a white sink and door.',
 'A car that seems to be parked illegally behind a legally parked car',
 'A large passenger airplane flying through the air.',
 'There is a GOL plane taking off in a partly cloudy sky.']

In [None]:
lower_captions = [c.lower() for c in captions]
lower_captions[0]

'a bicycle replica with a clock as the front wheel.'

In [None]:
import spacy, string  
nlp = spacy.load("en_core_web_sm")  

stopwords = nlp.Defaults.stop_words  

def preprocess_caption(caption):
    doc = nlp(caption)
    tokens = []
    for token in doc:
        if token.text in string.punctuation: # removed all the punctuation {. , ! ,.... }
            continue
        if token.is_stop: # removed all the stop words {a , as ,with , .... }
            continue
        if token.like_num: # removed all the numbers
            continue
        lemma = token.lemma_.strip()  # turn tokens into their lemma
        if lemma != "":
            tokens.append(lemma)
    return tokens


processed_captions = [preprocess_caption(c) for c in lower_captions] #  preprocessing on the lowercase

print( "before : " , lower_captions[0])
print("after : " , processed_captions[0])

before :  a bicycle replica with a clock as the front wheel.
after :  ['bicycle', 'replica', 'clock', 'wheel']


In [None]:
processed_captions[:]

[['bicycle', 'replica', 'clock', 'wheel'],
 ['room', 'blue', 'wall', 'white', 'sink', 'door'],
 ['car', 'park', 'illegally', 'legally', 'park', 'car'],
 ['large', 'passenger', 'airplane', 'fly', 'air'],
 ['gol', 'plane', 'take', 'partly', 'cloudy', 'sky'],
 ['blue', 'white', 'color', 'scheme', 'small', 'bathroom'],
 ['blue', 'white', 'bathroom', 'wall', 'sink', 'lifesaver', 'wall'],
 ['blue', 'boat', 'theme', 'bathroom', 'life', 'preserver', 'wall'],
 ['bike', 'clock', 'tire'],
 ['car', 'park', 'sidewalk', 'street'],
 ['airplane', 'land', 'take'],
 ['bathroom', 'wall', 'paint', 'baby', 'blue'],
 ['bathroom', 'toilet', 'sink', 'shower'],
 ['long', 'minimal', 'modern', 'skylit', 'home', 'kitchen'],
 ['bathroom', 'sink', 'toiletry', 'counter'],
 ['bathroom', 'sink', 'personal', 'hygiene', 'item'],
 ['open', 'box', 'contain', 'cucumber'],
 ['old',
  'fashioned',
  'green',
  'station',
  'wagon',
  'park',
  'shady',
  'driveway'],
 ['gas', 'stove', 'stainless', 'steel', 'kitchen', 'sink',

# Vocabulary Building

In [None]:
from collections import Counter


In [None]:
tokens = [] 
for caption in processed_captions :
  for token in caption :
   tokens.append(token)
vocab = Counter(tokens)
vocab

Counter({'man': 84055,
         'sit': 65733,
         'stand': 50366,
         'people': 41970,
         'woman': 39133,
         'white': 36387,
         'hold': 32193,
         'table': 31547,
         'street': 30732,
         'person': 25142,
         'large': 24790,
         'group': 21859,
         'plate': 21167,
         'dog': 21134,
         'field': 20854,
         'ride': 20773,
         'small': 20507,
         'train': 19752,
         'near': 19614,
         'tennis': 19595,
         'black': 19483,
         'park': 19200,
         'cat': 18639,
         'walk': 18481,
         'room': 18376,
         'sign': 17964,
         'red': 17408,
         'young': 17228,
         'water': 16852,
         'look': 16790,
         'play': 16605,
         'baseball': 15725,
         'building': 15390,
         'bus': 14663,
         'bathroom': 14464,
         'tree': 14156,
         'food': 13990,
         'blue': 13803,
         'pizza': 13674,
         'kitchen': 13402,
         

In [None]:
filtered_vocab = {
    word: freq
    for word, freq in vocab.items()
    if freq >= 2  # min frequency = 2
}

In [None]:
special_tokens = {
    '<pad>': 0,
    '<start>': 1,
    '<end>': 2
}

In [None]:
word_to_idx = dict(special_tokens)

In [None]:
idx_to_word = {idx: word
                for word, idx in special_tokens.items()
                }

In [None]:
for idx, word in enumerate(filtered_vocab.keys(), start=3):
    word_to_idx[word] = idx
    idx_to_word[idx] = word

In [None]:
numerical_captions = []
for caption_tokens in processed_captions:
    indices = [word_to_idx['<start>']]
    for token in caption_tokens:
        if token in word_to_idx:  
            indices.append(word_to_idx[token])
    indices.append(word_to_idx['<end>'])
    numerical_captions.append(indices)

In [None]:

print("Original:", processed_captions[0])
print("Numerical:", numerical_captions[0])

Original: ['bicycle', 'replica', 'clock', 'wheel']
Numerical: [1, 3, 4, 5, 6, 2]


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
max_length = max(len(caption) for caption in numerical_captions)
max_length

31

In [None]:
padded_captions = pad_sequences( numerical_captions, 
                                maxlen=max_length, 
                                padding='post',
                                value=word_to_idx['<pad>'] )

In [None]:
padded_captions[:3]

array([[ 1,  3,  4,  5,  6,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  7,  8,  9, 10, 11, 12,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 13, 14, 15, 16, 14, 13,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [None]:
print(len(numerical_captions))


591753


In [None]:
import pickle

with open("padded_captions.pkl", "wb") as f:
    pickle.dump(padded_captions, f)