In [16]:
# coding: utf-8

import collections
import os
import pickle

In [28]:
def mk_vocab1(data_file):
    """Build the vocabulary and word/index lookup tables for a single corpus.

    The corpus is read as UTF-8 text and split on whitespace; every token is
    stored as UTF-8 encoded ``bytes``.

    Args:
        data_file: Path to a whitespace-tokenised text file.

    Returns:
        A ``(vocab, w2i, i2w)`` tuple where

        * ``vocab`` maps each token to its number of occurrences in the file,
        * ``w2i`` maps each token to an integer id. Ids 0-3 are reserved for
          ``_PAD``, ``_GO``, ``_EOS`` and ``_UNK``; every other token gets an
          id from 4 upwards in order of decreasing frequency,
        * ``i2w`` is the inverse mapping of ``w2i``.
    """
    reserved = [b'_PAD', b'_GO', b'_EOS', b'_UNK']

    count = collections.Counter()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale; tokens are re-encoded as UTF-8 bytes below.
    with open(data_file, 'r', encoding='utf-8') as text:
        for line in text:
            count.update(tok.encode('utf8') for tok in line.split())

    w2i = {tok: idx for idx, tok in enumerate(reserved)}
    # A corpus may already contain a reserved symbol (e.g. a pre-inserted
    # _UNK). Such tokens keep their reserved id instead of being re-numbered,
    # which would otherwise leave a hole in the id range.
    vocab_by_freq = [tok for tok, _ in count.most_common() if tok not in w2i]
    w2i.update((tok, idx) for idx, tok in enumerate(vocab_by_freq, start=len(reserved)))
    i2w = {idx: tok for tok, idx in w2i.items()}

    print("vocab size: {0:d}".format(len(w2i)))
    vocab = dict(count)
    return (vocab, w2i, i2w)

In [29]:
def mk_vocab(data_dir):
    """Build and pickle the vocabulary tables for a parallel en/fr corpus.

    Reads ``text.en`` and ``text.fr`` from ``data_dir``, builds the per-language
    tables with ``mk_vocab1`` and writes three pickle files into ``data_dir``:
    ``vocab.dict``, ``w2i.dict`` and ``i2w.dict``. Each pickled object is a
    dict keyed by language code (``'en'``, ``'fr'``).

    Args:
        data_dir: Directory containing ``text.en`` and ``text.fr``; the output
            files are written to the same directory.

    Returns:
        None.
    """
    tables = {'vocab': {}, 'w2i': {}, 'i2w': {}}
    for lang in ('en', 'fr'):
        vocab, w2i, i2w = mk_vocab1(os.path.join(data_dir, 'text.' + lang))
        tables['vocab'][lang] = vocab
        tables['w2i'][lang] = w2i
        tables['i2w'][lang] = i2w

    # Write each table under a context manager so the file is flushed and
    # closed deterministically rather than whenever the handle is collected.
    for name in ('vocab', 'w2i', 'i2w'):
        with open(os.path.join(data_dir, name + '.dict'), 'wb') as out:
            pickle.dump(tables[name], out)

In [30]:
# Build the en/fr tables from in_en_data_50000/text.{en,fr} and write
# vocab.dict, w2i.dict and i2w.dict into the same directory.
mk_vocab('in_en_data_50000')

vocab size: 10000
vocab size: 72738


In [27]:
# Reload the pickled per-language frequency tables and report how many
# distinct tokens each language has (reserved symbols are not included).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join('in_en_data_50000', 'vocab.dict'), 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
print("vocab size: en={0:d}, fr={1:d}".format(len(vocab['en']), len(vocab['fr'])))

vocab size: en=9359, fr=64854
