In [2]:
import re
import time
import pickle

from pycocotools.coco import COCO
from nltk import FreqDist
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular contractions "won't" and "can't" are rewritten first,
    because the generic "n't" rule would otherwise turn them into
    "wo not" / "ca not". A capitalised first letter ("Won't", "Can't") is
    matched as well and carried over into the expansion, so contractions
    at the start of a sentence are expanded correctly.

    Note that every "'s" becomes " is", so possessives are expanded too
    ("dog's bowl" -> "dog is bowl"). Fully upper-case contractions are not
    handled by any of the rules.

    Args:
        phrase: Text to process.

    Returns:
        A new string with the contractions expanded.
    """
    # specific: irregular stems, must run before the generic "n't" rule.
    # The captured first letter keeps the original capitalisation.
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [3]:
def add_suffixes_and_prefixes(descriptions):
    """Clean every caption and wrap it in ``startseq`` / ``endseq`` markers.

    Each caption has its contractions expanded with ``decontracted``, the
    literal two-character escapes (a backslash followed by r, a double
    quote, or n) replaced by a space, every run of characters outside
    A-Z, a-z and 0-9 replaced by a single space, and is then lower-cased.

    Args:
        descriptions: Mapping from a key to a list of caption strings.

    Returns:
        The same mapping object; its lists are replaced in place with the
        cleaned, marker-wrapped captions.
    """
    def _wrap(caption):
        # Contractions are expanded first, while apostrophes are still present.
        text = decontracted(caption)
        for escaped in ('\\r', '\\"', '\\n'):
            text = text.replace(escaped, ' ')
        text = re.sub('[^A-Za-z0-9]+', ' ', text)

        # startseq kick-starts partial sequence generation and endseq tells
        # the decoder where to stop while predicting. For more background see
        # https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
        return 'startseq ' + text.lower() + ' endseq'

    for image_key in descriptions.keys():
        descriptions[image_key] = [_wrap(caption) for caption in descriptions[image_key]]
    return descriptions

In [4]:
# Directory and split name used to build the path of the captions file.
dataDir = 'coco'
dataType = 'train2014'

# Captions annotation file for the chosen split.
annFile = '{}/annotations/captions_{}.json'.format(dataDir, dataType)

In [5]:
# Load the caption annotations from annFile; the COCO constructor also builds
# its index (see the log output of this cell).
coco=COCO(annFile)

loading annotations into memory...
Done (t=0.88s)
creating index...
index created!


In [6]:
# Sanity check: display the captions stored for one sample image id.
SAMPLE_IMG_ID = 35783
annIds = coco.getAnnIds(imgIds=SAMPLE_IMG_ID)
anns = coco.loadAnns(annIds)
coco.showAnns(anns)

A stop sign sits in an empty parking lot
A stop sign in front of a bunch of trees on a cloudy day.
A stop sign stands out in front of the clouds.
A traffic sign near a rail and several trees.
A stop sign with a very pretty sky filled with clouds.


In [7]:
# Map every image id to the list of its raw caption strings.
descriptions = {}
imgIds = coco.getImgIds()
# To experiment without the full image set, override imgIds with a short list,
# e.g. [151, 260, 307, 404, 450, 491, 514, 529, 575, 671].

start = time.time()
for imgId in imgIds:
    for annotation in coco.loadAnns(coco.getAnnIds(imgIds=imgId)):
        # An image id only gets an entry once its first caption is seen.
        descriptions.setdefault(imgId, []).append(annotation['caption'])
print("Created Descriptions Dict in {:0.2f}s".format(time.time() - start))

Created Descriptions Dict in 0.82s


In [8]:
# Clean all captions and add the startseq/endseq markers, timing the pass.
# NOTE: add_suffixes_and_prefixes rewrites the captions in place and always
# prepends/appends the markers, so running this cell twice adds them twice.
start = time.time()
descriptions = add_suffixes_and_prefixes(descriptions)
elapsed = time.time() - start
print("Added suffixes and prefixes in {:0.2f}s".format(elapsed))


Added suffixes and prefixes in 4.37s


In [9]:
# Print the caption list of the first entry only.
for captions in descriptions.values():
    print(captions)
    break

['startseq a restaurant has modern wooden tables and chairs  endseq', 'startseq a long restaurant table with rattan rounded back chairs  endseq', 'startseq a long table with a plant on top of it surrounded with wooden chairs  endseq', 'startseq a long table with a flower arrangement in the middle for meetings endseq', 'startseq a table is adorned with wooden chairs with blue accents  endseq']


In [10]:
# Number of image ids that received at least one caption.
len(descriptions)

82783

In [3]:
def dump_descriptions(descriptions, file_path="coco_descriptions.pkl"):
    """Dump processed captions into a pickle.

    Args:
        descriptions: Object to serialise (the caption dict in this notebook).
        file_path: Destination file. Defaults to "coco_descriptions.pkl",
            the path this function previously always wrote to, so existing
            one-argument calls behave as before.
    """
    with open(file_path, "wb") as f:
        pickle.dump(descriptions, f)

def load_descriptions(file_path):
    """Read back a pickled object (e.g. the caption dict) from ``file_path``.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    that come from a trusted source.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The object stored in the file.
    """
    with open(file_path, "rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [12]:
# Persist the cleaned captions to coco_descriptions.pkl.
dump_descriptions(descriptions)

### Note
Go to `pycocoImageEmbedding`, find out how many corrupt images are present, and remove their captions from the pickle written above (`coco_descriptions.pkl`) before moving on.

From here on, we use `corruption_free_coco_descriptions.pkl` to generate statistics about our dataset.

In [4]:
# Captions with the entries of the corrupt images already removed.
new_desc = load_descriptions("./corruption_free_coco_descriptions.pkl")

In [8]:
# Sanity check on the type of the loaded object.
type(new_desc)

dict

In [9]:
# Number of entries in the loaded, corruption-free caption dict.
len(new_desc)

82782

In [10]:
# Print the caption list of the first entry only.
for captions in new_desc.values():
    print(captions)
    break

['startseq a restaurant has modern wooden tables and chairs  endseq', 'startseq a long restaurant table with rattan rounded back chairs  endseq', 'startseq a long table with a plant on top of it surrounded with wooden chairs  endseq', 'startseq a long table with a flower arrangement in the middle for meetings endseq', 'startseq a table is adorned with wooden chairs with blue accents  endseq']


In [11]:
# Concatenate all captions into one string, each caption preceded by a space.
# A single join builds the same string as repeated `+=` without repeatedly
# copying the growing corpus.
start = time.time()
corpus = "".join(
    " " + caption
    for captions in new_desc.values()
    for caption in captions
)
print("Generated Corpus in {:.2f}s".format(time.time() - start))

# All whitespace-separated tokens (with repeats) and the set of distinct ones.
total_words = corpus.split()
vocabulary = set(total_words)
print("The size of vocabulary is {}".format(len(vocabulary)))

Generated Corpus in 0.30s
The size of vocabulary is 23124


In [12]:
# Count how often each token occurs in the corpus, then show the five most
# frequent tokens.
freq_dist = FreqDist(total_words)
freq_dist.most_common(5)

[('a', 684603),
 ('startseq', 414108),
 ('endseq', 414108),
 ('on', 150689),
 ('of', 142762)]

In [13]:
# Remove every word that occurs fewer than MIN_WORD_COUNT times in the corpus.
MIN_WORD_COUNT = 10
rare_words = {word for word in vocabulary if freq_dist[word] < MIN_WORD_COUNT}
vocabulary -= rare_words  # in-place: the same set object is kept

In [14]:
# One more than the number of kept words; presumably the extra slot leaves
# index 0 unused by the word indices (TODO confirm against the model code).
VOCAB_SIZE = 1 + len(vocabulary)
print("Total unique words after removing less frequent word from our corpus = {}".format(VOCAB_SIZE))

Total unique words after removing less frequent word from our corpus = 6321


In [15]:
# Flatten the per-image caption lists into one list of caption strings.
caption_list = [caption for captions in new_desc.values() for caption in captions]
print("The total caption present = {}".format(len(caption_list)))

The total caption present = 414108


In [16]:
# Fit a Keras tokenizer on all captions. num_words limits the tokenizer to the
# most frequent words when texts are converted (see the Keras Tokenizer docs).
token = Tokenizer(num_words=VOCAB_SIZE)
token.fit_on_texts(caption_list)

In [17]:
# Indices are assigned to words by frequency, i.e. the most frequent word has
# index 1.
# NOTE: no copy is made here; ix_to_word is bound to whatever object
# token.index_word returns, so later edits to ix_to_word act on that object.
ix_to_word = token.index_word

In [18]:
# Drop every index at or above VOCAB_SIZE so only the kept vocabulary remains.
# VOCAB_SIZE is used instead of the literal 6321 so the cut-off follows the
# computed vocabulary if the data or the frequency threshold changes.
# Entries are removed in place, as before.
for ix in [ix for ix in ix_to_word if ix >= VOCAB_SIZE]:
    del ix_to_word[ix]

In [19]:
# Inverse lookup of ix_to_word: word -> index.
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

In [20]:
# Compare the sizes of the two mappings; they match when every word is unique.
print(len(word_to_ix))
print(len(ix_to_word))

6320
6320


In [21]:
# Length, in whitespace-separated tokens, of the longest caption
# (0 when there are no captions).
MAX_LENGTH = max((len(caption.split()) for caption in caption_list), default=0)

print("Maximum caption has length of {}".format(MAX_LENGTH))

Maximum caption has length of 52
