In [22]:
import re
import time
import pickle

from pycocotools.coco import COCO
from nltk import FreqDist
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np

In [None]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [None]:
def add_suffixes_and_prefixes(descriptions):
    for k in descriptions.keys():
        value = descriptions[k]
        caption_list = []
        for ec in value:

            # replaces specific and general phrases
            sent = decontracted(ec)
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)

            # startseq is for kick starting the partial sequence generation and endseq is to stop while predicting.
            # for more referance please check https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
            image_cap = 'startseq ' + sent.lower() + ' endseq'
            caption_list.append(image_cap)
        descriptions[k] = caption_list
    return descriptions

In [None]:
dataDir='coco'
dataType='train2014'
annFile='{}/annotations/captions_{}.json'.format(dataDir,dataType)

In [None]:
coco=COCO(annFile)

In [None]:
annIds = coco.getAnnIds(imgIds=35783)
anns = coco.loadAnns(annIds)
coco.showAnns(anns)
# print(type(coco.getImgIds()[0]))

In [None]:
descriptions = {}
imgIds = coco.getImgIds()
# imgIds = [151, 260, 307, 404, 450, 491, 514, 529, 575, 671] # dummy list because I don't have all images extracted

# print(len(imgIds))
start = time.time()
for imgId in imgIds:
    annIds = coco.getAnnIds(imgIds=imgId)
    # print(len(annIds))
    anns = coco.loadAnns(annIds)
    for annotation in anns:
        if imgId in descriptions:
            descriptions[imgId].append(annotation['caption'])
        else:
            descriptions[imgId] = list()
            descriptions[imgId].append(annotation['caption'])
print("Created Descriptions Dict in {:0.2f}s".format(time.time() - start))

In [None]:
start = time.time()
descriptions = add_suffixes_and_prefixes(descriptions)
print("Added suffixes and prefixes in {:0.2f}s".format(time.time() - start))


In [None]:
for k, v in descriptions.items():
    print(v)
    break

In [None]:
len(descriptions)

In [3]:
def dump_descriptions(descriptions):
    """Dump processed captions into a pickle"""
    with open("coco_descriptions.pkl", "wb") as f:
        pickle.dump(descriptions, f)

def load_descriptions(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

In [None]:
dump_descriptions(descriptions)

### Note
Go to `pycocoImageEmbedding` and find out how many corrupt images are present, and remove their captions from the above pickle before moving on.

Here, we're using the `corruption_free_coco_descriptions.pkl` to generate stats about our datasets.

In [4]:
new_desc = load_descriptions("./corruption_free_coco_descriptions.pkl")

In [None]:
type(new_desc)

In [None]:
len(new_desc)

In [5]:
for k, v in new_desc.items():
    print(v)
    break

['startseq a restaurant has modern wooden tables and chairs  endseq', 'startseq a long restaurant table with rattan rounded back chairs  endseq', 'startseq a long table with a plant on top of it surrounded with wooden chairs  endseq', 'startseq a long table with a flower arrangement in the middle for meetings endseq', 'startseq a table is adorned with wooden chairs with blue accents  endseq']


In [6]:
corpus = ""
start = time.time()
for ec in new_desc.values():
    for el in ec:
        corpus += " "+el
print("Generated Corpus in {:.2f}s".format(time.time() - start))

total_words = corpus.split()
vocabulary = set(total_words)
print("The size of vocabulary is {}".format(len(vocabulary)))

Generated Corpus in 0.31s
The size of vocabulary is 23124


In [7]:
# creating frequency distribution of words
freq_dist = FreqDist(total_words)
freq_dist.most_common(5)

[('a', 684603),
 ('startseq', 414108),
 ('endseq', 414108),
 ('on', 150689),
 ('of', 142762)]

In [8]:
#removing least common words from vocabulary
for ew in list(vocabulary):
    if(freq_dist[ew]<10):
        vocabulary.remove(ew)

In [9]:
VOCAB_SIZE = len(vocabulary)+1
print("Total unique words after removing less frequent word from our corpus = {}".format(VOCAB_SIZE))

Total unique words after removing less frequent word from our corpus = 6321


In [10]:
caption_list = []
for el in new_desc.values():
    for ec in el:
        caption_list.append(ec)
print("The total caption present = {}".format(len(caption_list)))

The total caption present = 414108


In [11]:
token = Tokenizer(num_words=VOCAB_SIZE)
token.fit_on_texts(caption_list)

In [12]:
# index to words are assigned according to frequency. i.e the most frequent word has index of 1
ix_to_word = token.index_word

In [13]:
for k in list(ix_to_word):
    if k>=6321:
        ix_to_word.pop(k, None)

In [14]:
word_to_ix = dict()
for k,v in ix_to_word.items():
    word_to_ix[v] = k

In [15]:
print(len(word_to_ix))
print(len(ix_to_word))

6320
6320


In [16]:
# finding the max_length caption
MAX_LENGTH = 0
temp = 0
for ec in caption_list:
    temp = len(ec.split())
    if(MAX_LENGTH<=temp):
        MAX_LENGTH = temp

print("Maximum caption has length of {}".format(MAX_LENGTH))

Maximum caption has length of 52


### Generating Glove Vectors file
using 300 dimensions glove file since the article we're following uses a embedding size of 300

Download pre-trained glove_vectors from [this link](https://nlp.stanford.edu/projects/glove/)

We'll load the text file (`glove.6B.300d.txt`) and save it as a pickle (`glove_vectors.pkl`) to save us time.

In [23]:
def save_glove_pickle(file_path):
    embeddings_index = {}
    start = time.time()
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print("Created embeddings_index in {:.2f}s".format(time.time() - start))
    
    print("Saving embeddings_index as glove_vectors.pkl")
    start = time.time()
    with open("glove_vectors.pkl", "wb") as f:
        pickle.dump(embeddings_index, f)
    print("Saved glove_vectors in {:.2f}s".format(time.time() - start))

In [24]:
save_glove_pickle("glove/glove.6B.300d.txt")

Created embeddings_index in 27.34s
Saving embeddings_index as glove_vectors.pkl
Saved glove_vectors in 35.15s


In [33]:
def load_glove_vectors(file_path):
    start = time.time()
    with open(file_path, "rb") as f:
        glove = pickle.load(f)
        glove_words = set(glove.keys())
    print("Loaded {} in {:.2f}s".format(file_path, time.time() - start))
    return glove, glove_words

In [34]:
glove, glove_words = load_glove_vectors("glove_vectors.pkl")

Loaded glove_vectors.pkl in 2.03s


In [31]:
# len(glove_words) # 400k
# type(glove_words) # set

In [35]:
EMBEDDING_SIZE = 300

# Get 300-dim dense vector for each of the words in vocabulary
embedding_matrix = np.zeros((VOCAB_SIZE,EMBEDDING_SIZE))
embedding_matrix.shape

start = time.time()
for word, i in word_to_ix.items():
    embedding_vector = np.zeros(300)
    if word in glove_words:
        embedding_vector = glove[word]
        embedding_matrix[i] = embedding_vector
    else:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
print("Generated embedding_matrix in {:.2f}".format(time.time() - start))

Generated embedding_matrix in 0.49


In [36]:
# save the embedding matrix to file
with open("embedding_matrix.pkl","wb") as f:
    pickle.dump(embedding_matrix,f)