In [1]:
from pycocotools.coco import COCO
import re
import time
import pickle

In [2]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [3]:
def add_suffixes_and_prefixes(descriptions):
    for k in descriptions.keys():
        value = descriptions[k]
        caption_list = []
        for ec in value:

            # replaces specific and general phrases
            sent = decontracted(ec)
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)

            # startseq is for kick starting the partial sequence generation and endseq is to stop while predicting.
            # for more referance please check https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
            image_cap = 'startseq ' + sent.lower() + ' endseq'
            caption_list.append(image_cap)
        descriptions[k] = caption_list
    return descriptions

In [4]:
dataDir='coco'
dataType='train2014'
annFile='{}/annotations/captions_{}.json'.format(dataDir,dataType)

In [5]:
coco=COCO(annFile)

loading annotations into memory...
Done (t=0.87s)
creating index...
index created!


In [6]:
# annIds = coco.getAnnIds(imgIds=35783)
# anns = coco.loadAnns(annIds)
# coco.showAnns(anns)

In [7]:
descriptions = {}
imgIds = coco.getImgIds()
# print(len(imgIds))
start = time.time()
for imgId in imgIds:
    annIds = coco.getAnnIds(imgIds=imgId)
    # print(len(annIds))
    anns = coco.loadAnns(annIds)
    for annotation in anns:
        if imgId in descriptions:
            descriptions[imgId].append(annotation['caption'])
        else:
            descriptions[imgId] = list()
            descriptions[imgId].append(annotation['caption'])
print("Created Descriptions Dict in {:0.2f}s".format(time.time() - start))

Created Descriptions Dict in 0.75s


In [8]:
start = time.time()
descriptions = add_suffixes_and_prefixes(descriptions)
print("Added suffixes and prefixes in {:0.2f}s".format(time.time() - start))


Added suffixes and prefixes in 4.51s


In [9]:
for k, v in descriptions.items():
    print(v)
    break

['startseq a restaurant has modern wooden tables and chairs  endseq', 'startseq a long restaurant table with rattan rounded back chairs  endseq', 'startseq a long table with a plant on top of it surrounded with wooden chairs  endseq', 'startseq a long table with a flower arrangement in the middle for meetings endseq', 'startseq a table is adorned with wooden chairs with blue accents  endseq']


In [10]:
len(descriptions)

82783

In [11]:
def dump_descriptions(descriptions):
    """Dump processed captions into a pickle"""
    with open("coco_descriptions.pkl", "wb") as f:
        pickle.dump(descriptions, f)

def load_descriptions(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

In [12]:
dump_descriptions(descriptions)

In [13]:
new_desc = load_descriptions("./coco_descriptions.pkl")

In [14]:
type(new_desc)

dict

In [15]:
len(new_desc)

82783