In [1]:
import nltk
from collections import Counter
from pycocotools.coco import COCO
import logging
import numpy as np
import pickle

class Vocabulary(object):
    """Simple vocabulary wrapper."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<<unknown>>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)


if __name__ == "__main__":

    json = "annotations/captions_train2014.json"
    portion = 0.995 
    # Manually setting threshold
    threshold = 4
    save_path = "vocab1.pkl"
    # construct coco instance
    coco = COCO(json)
    ids = coco.anns.keys()
    counter = Counter()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if ']' in tokens : 
            print("yes")
        counter.update(tokens)
        if (i+1) % 5000 == 0:
            print("Tokenization Process: {0:.2f}%.".format((i+1)*100/len(ids)))
            
    

loading annotations into memory...
Done (t=0.82s)
creating index...
index created!
Tokenization Process: 1.21%.
Tokenization Process: 2.41%.
Tokenization Process: 3.62%.
yes
Tokenization Process: 4.83%.
Tokenization Process: 6.04%.
Tokenization Process: 7.24%.
Tokenization Process: 8.45%.
Tokenization Process: 9.66%.
Tokenization Process: 10.87%.
Tokenization Process: 12.07%.
Tokenization Process: 13.28%.
Tokenization Process: 14.49%.
Tokenization Process: 15.70%.
yes
Tokenization Process: 16.90%.
Tokenization Process: 18.11%.
Tokenization Process: 19.32%.
Tokenization Process: 20.53%.
Tokenization Process: 21.73%.
Tokenization Process: 22.94%.
Tokenization Process: 24.15%.
Tokenization Process: 25.36%.
Tokenization Process: 26.56%.
Tokenization Process: 27.77%.
Tokenization Process: 28.98%.
Tokenization Process: 30.18%.
Tokenization Process: 31.39%.
Tokenization Process: 32.60%.
Tokenization Process: 33.81%.
Tokenization Process: 35.01%.
Tokenization Process: 36.22%.
Tokenization Proc

In [None]:
words = []
counts = []
for word, count in counter.items():
    if count >= threshold:
        words.append(word)
        counts.append(count)
for i in range(len(words)):
    if(len(words[i]) ==1):
        #print(counts[i],words[i])
        pass
vocab = Vocabulary()
# for padding purpose
vocab.add_word('<<padding>>')
vocab.add_word('<<start>>')
vocab.add_word('<<end>>')
vocab.add_word('<<unknown>>')

# Add the words to the vocabulary.
for word in words:
    #print(word)
    vocab.add_word(word)

with open(save_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))