In [35]:
from collections import Counter
from pycocotools.coco import COCO
import nltk
nltk.download('punkt')
import pickle
import argparse

[nltk_data] Downloading package punkt to /Users/vijay/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [36]:
class Vocabulary:
    """Two-way lookup table between words and consecutive integer ids.

    Ids are handed out in insertion order starting at 0. Calling the
    instance with a word returns its id; a word that was never added
    resolves to the id of the ``'<unk>'`` entry instead.
    """

    def __init__(self):
        # word -> id and id -> word are kept in lockstep by add_word().
        self.word2idx = {}
        self.idx2word = {}
        # Next id to hand out.
        self.index = 0

    def add_word(self, word):
        """Register ``word`` under the next free id; repeats are ignored."""
        if word in self.word2idx:
            return
        slot = self.index
        self.word2idx[word] = slot
        self.idx2word[slot] = word
        self.index = slot + 1

    def __call__(self, word):
        """Return the id of ``word``, or the id of ``'<unk>'`` if unknown.

        Raises:
            KeyError: if ``word`` is unknown and ``'<unk>'`` was never added.
        """
        table = self.word2idx
        return table[word] if word in table else table['<unk>']

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)

In [37]:
def build_vocab(json, threshold):
    """Build a Vocabulary from the captions of a COCO annotation file.

    Args:
        json: value passed straight to ``COCO(...)`` to load the annotations.
        threshold: minimum number of occurrences a token needs (inclusive)
            to be added to the vocabulary.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>', '<unk>' (ids 0-3,
        in that order) followed by every token whose count reached
        ``threshold``, in the order the counter first saw them.
    """
    coco = COCO(json)
    annotations = coco.anns
    ann_ids = annotations.keys()
    word_counts = Counter()

    # Count lower-cased tokens over every caption, reporting progress
    # once per 1000 captions.
    for position, ann_id in enumerate(ann_ids, start=1):
        text = str(annotations[ann_id]['caption']).lower()
        word_counts.update(nltk.tokenize.word_tokenize(text))

        if position % 1000 == 0:
            print("[{}/{}] Tokenized the captions ".format(position, len(ann_ids)))

    vocab = Vocabulary()

    # Special tokens go in first so they receive the lowest ids.
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Keep only tokens that occur at least `threshold` times.
    for token, occurrences in word_counts.items():
        if occurrences >= threshold:
            vocab.add_word(token)

    return vocab

In [38]:
def main(args):
    """Build the caption vocabulary and pickle it to ``args.vocab_path``.

    Args:
        args: object exposing ``json_path`` and ``threshold`` (forwarded to
            ``build_vocab``) and ``vocab_path`` (where the pickle is written).

    The parent directory of ``vocab_path`` is created if it does not exist.
    This happens before the vocabulary is built, so an unusable output
    location is reported before the long tokenization pass rather than
    after it.
    """
    import os  # local import: only needed to prepare the output directory

    vocab_path = args.vocab_path

    # dirname is '' for a bare file name; makedirs('') would raise, so skip it.
    out_dir = os.path.dirname(vocab_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    vocab = build_vocab(json=args.json_path, threshold=args.threshold)
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)

    print('total vocabulary size is {}'.format(len(vocab)))
    print('saved the vocabulary to {}'.format(vocab_path))
    

In [39]:
if __name__ == '__main__':
    # Declare the three options main() reads; each has a default value.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--json_path',
        type=str,
        default='data/annotations/captions_train2014.json',
        help='path for train in annotation file',
    )
    parser.add_argument(
        '--vocab_path',
        type=str,
        default='data/vocab.pkl',
        help='path for vocabulary wrapper',
    )
    parser.add_argument(
        '--threshold',
        type=int,
        default=4,
        help='minimum word count threshold',
    )

    # An explicit empty argument list is parsed instead of sys.argv, so the
    # defaults above are always the values used.
    # NOTE(review): presumably done so the notebook kernel's own argv is
    # not parsed - confirm before reusing this as a script.
    args = parser.parse_args(args=[])
    main(args)

loading annotations into memory...
Done (t=1.49s)
creating index...
index created!
[1000/414113] Tokenized the captions 
[2000/414113] Tokenized the captions 
[3000/414113] Tokenized the captions 
[4000/414113] Tokenized the captions 
[5000/414113] Tokenized the captions 
[6000/414113] Tokenized the captions 
[7000/414113] Tokenized the captions 
[8000/414113] Tokenized the captions 
[9000/414113] Tokenized the captions 
[10000/414113] Tokenized the captions 
[11000/414113] Tokenized the captions 
[12000/414113] Tokenized the captions 
[13000/414113] Tokenized the captions 
[14000/414113] Tokenized the captions 
[15000/414113] Tokenized the captions 
[16000/414113] Tokenized the captions 
[17000/414113] Tokenized the captions 
[18000/414113] Tokenized the captions 
[19000/414113] Tokenized the captions 
[20000/414113] Tokenized the captions 
[21000/414113] Tokenized the captions 
[22000/414113] Tokenized the captions 
[23000/414113] Tokenized the captions 
[24000/414113] Tokenized the captions 

[207000/414113] Tokenized the captions 
[208000/414113] Tokenized the captions 
[209000/414113] Tokenized the captions 
[210000/414113] Tokenized the captions 
[211000/414113] Tokenized the captions 
[212000/414113] Tokenized the captions 
[213000/414113] Tokenized the captions 
[214000/414113] Tokenized the captions 
[215000/414113] Tokenized the captions 
[216000/414113] Tokenized the captions 
[217000/414113] Tokenized the captions 
[218000/414113] Tokenized the captions 
[219000/414113] Tokenized the captions 
[220000/414113] Tokenized the captions 
[221000/414113] Tokenized the captions 
[222000/414113] Tokenized the captions 
[223000/414113] Tokenized the captions 
[224000/414113] Tokenized the captions 
[225000/414113] Tokenized the captions 
[226000/414113] Tokenized the captions 
[227000/414113] Tokenized the captions 
[228000/414113] Tokenized the captions 
[229000/414113] Tokenized the captions 
[230000/414113] Tokenized the captions 
[231000/414113] Tokenized the captions 


[412000/414113] Tokenized the captions 
[413000/414113] Tokenized the captions 
[414000/414113] Tokenized the captions 
total vocabulary size is 9956
saved the vocabulary to data/vocab.pkl
