In [1]:
# coding: utf-8

import collections
import itertools
import os
import pickle

In [2]:
def create_vocab(fn,dev_size=0.1):
    """Create the vocabulary and word/index mappings for a single corpus.

    The file is read as one sentence per line with whitespace-separated
    tokens. Only the leading "training" lines are counted; the trailing
    ``int(num_lines * dev_size)`` lines are left out as the dev set.

    Args:
        fn: Path to the text file.
        dev_size: Fraction of the lines (taken from the end of the file)
            that is excluded from the counts.

    Returns:
        A tuple ``(vocab, w2i, i2w)`` where

        * ``vocab`` maps each token (UTF-8 encoded ``bytes``) to its count
          in the training lines,
        * ``w2i`` maps token bytes to an integer id; ids 0-3 are reserved
          for ``_PAD``, ``_GO``, ``_EOS`` and ``_UNK``, the remaining ids
          are handed out in order of decreasing frequency,
        * ``i2w`` is the inverse of ``w2i``.
    """

    # compute the size of the train/dev set
    # (context manager: the handle used for counting is closed right away
    # instead of being left for the garbage collector)
    with open(fn, 'r') as text:
        num_sentences = sum(1 for _ in text)
    num_dev_sentences = int(num_sentences*dev_size)
    num_training_sentences = num_sentences - num_dev_sentences

    # count tokens in the train set
    count = collections.Counter()
    with open(fn, 'r') as text:
        for line in itertools.islice(text,num_training_sentences):
            for tok in line.split():
                count[tok.encode('utf8')] += 1

    vocab_by_freq = [tok for tok,_ in count.most_common()]
    vocab = dict(count)

    w2i = {b'_PAD': 0, b'_GO': 1, b'_EOS': 2, b'_UNK': 3}
    first_free_id = len(w2i)
    # NOTE: a corpus token spelled exactly like a reserved name (e.g. "_UNK")
    # overwrites the reserved id in this update.
    w2i.update((tok, idx)
               for idx, tok in enumerate(vocab_by_freq, start=first_free_id))
    i2w = {idx: tok for tok,idx in w2i.items()}
    return (vocab, w2i, i2w)

In [3]:
def create_vocab2(text_en,text_fr):
    """Build vocabularies and index mappings for both sides of a parallel corpus.

    ``create_vocab`` is run on ``text_en`` and then on ``text_fr``; the two
    vocabulary sizes are printed, and the three results are regrouped so
    that each returned dict is keyed by language (``'en'`` / ``'fr'``).

    Returns:
        A tuple ``(vocab, w2i, i2w)`` of dicts keyed by ``'en'`` and ``'fr'``.
    """
    # language -> (vocab, w2i, i2w); 'en' is processed before 'fr'
    per_language = {'en': create_vocab(text_en), 'fr': create_vocab(text_fr)}

    size_en = len(per_language['en'][0])
    size_fr = len(per_language['fr'][0])
    print("vocab size: en={0:d}, fr={1:d}".format(size_en, size_fr))

    # transpose: one {language: value} dict per component of the triple
    vocab, w2i, i2w = (dict(zip(per_language, component))
                       for component in zip(*per_language.values()))
    return (vocab, w2i, i2w)

In [4]:
def create_and_dump_vocab2(data_dir,
                           text_en='text.en',
                           text_fr='text.fr',
                           vocab_dict='vocab.dict',
                           w2i_dict='w2i.dict',
                           i2w_dict='i2w.dict'):
    """Build the vocabularies of a parallel corpus and pickle them to disk.

    All file names are resolved relative to ``data_dir``.

    Args:
        data_dir: Directory holding the corpus; the pickles are written here too.
        text_en: File name of the first text, passed to ``create_vocab2`` as ``text_en``.
        text_fr: File name of the second text, passed to ``create_vocab2`` as ``text_fr``.
        vocab_dict: Output file name for the pickled token counts.
        w2i_dict: Output file name for the pickled token -> id mapping.
        i2w_dict: Output file name for the pickled id -> token mapping.
    """
    text_en, text_fr, vocab_dict, w2i_dict, i2w_dict = (
        os.path.join(data_dir, fn)
        for fn in (text_en, text_fr, vocab_dict, w2i_dict, i2w_dict))

    vocab, w2i, i2w = create_vocab2(text_en,text_fr)

    # Write each pickle under a context manager so the file is flushed and
    # closed before the function returns, rather than whenever the handle
    # happens to be garbage collected.
    for obj, path in ((vocab, vocab_dict), (w2i, w2i_dict), (i2w, i2w_dict)):
        with open(path, 'wb') as out:
            pickle.dump(obj, out)

In [14]:
def char_tokenise_file(data_file_in,data_file_out):
    """Write a character-tokenised copy of a text file.

    Each line's content is padded so that every character is surrounded by
    single spaces. An original single space therefore turns into a run of
    three spaces, which is collapsed to two (presumably so that word
    boundaries remain distinguishable from character boundaries).
    Line terminators are copied through unchanged, so the output has
    exactly as many lines as the input.

    Args:
        data_file_in: Path of the text file to read.
        data_file_out: Path of the file to write (overwritten if present).
    """
    # Open the input first: if it is missing, the output is not truncated.
    with open(data_file_in, 'r') as text_in, open(data_file_out, 'w') as text_out:
        for line in text_in:
            newline = '\n' if line.endswith('\n') else ''
            content = line[:len(line) - len(newline)]
            # Pad only the content. Padding the newline as well would emit a
            # space after it, which ends up at the start of the next output
            # line and leaves a spurious space-only line at the end of the
            # file (one more line than the input).
            spaced = content.replace('', ' ').replace('   ', '  ')
            text_out.write(spaced + newline)

In [15]:
def char_tokenise_dir(data_dir_in,data_dir_out,text_en='text.en',text_fr='text.fr'):
    """Character-tokenise both text files of a corpus directory.

    Each of ``text_en`` and ``text_fr`` is read from ``data_dir_in`` and a
    character-tokenised copy with the same file name is written to
    ``data_dir_out``. The output directory is created if it does not exist.

    Args:
        data_dir_in: Directory containing the input text files.
        data_dir_out: Directory receiving the tokenised files.
        text_en: File name of the first text.
        text_fr: File name of the second text.
    """
    # Without this, opening the output files fails with FileNotFoundError
    # when the target directory has not been created beforehand. An empty
    # path means "current directory" and needs no creation.
    if data_dir_out:
        os.makedirs(data_dir_out, exist_ok=True)

    # tokenise by character, file by file (same name on both sides)
    for fn in (text_en, text_fr):
        char_tokenise_file(os.path.join(data_dir_in, fn),
                           os.path.join(data_dir_out, fn))

In [16]:
# data created using filter_and_sample.ipynb
# Build the vocabularies for text.en / text.fr in fr_en_data_50000 and pickle
# them (vocab.dict, w2i.dict, i2w.dict) into that same directory.
create_and_dump_vocab2('fr_en_data_50000')

vocab size: en=41786, fr=55790


In [18]:
# Write a character-tokenised copy of the fr_en_data_50000 texts into
# fr_en_data_char_50000, then build and pickle the vocabularies of that copy.
char_tokenise_dir(data_dir_in='fr_en_data_50000',data_dir_out='fr_en_data_char_50000')
create_and_dump_vocab2('fr_en_data_char_50000')

vocab size: en=154, fr=154


In [19]:
# Same pipeline for the in_en_data_50000 corpus.
# The default file names text.en / text.fr are used, so the "fr" figure in
# the printed summary refers to whatever text.fr contains in this directory.
char_tokenise_dir(data_dir_in='in_en_data_50000',data_dir_out='in_en_data_char_50000')
create_and_dump_vocab2('in_en_data_char_50000')

vocab size: en=46, fr=47


In [20]:
# Same pipeline for the hu_en_data_50000 corpus.
# The default file names text.en / text.fr are used, so the "fr" figure in
# the printed summary refers to whatever text.fr contains in this directory.
char_tokenise_dir(data_dir_in='hu_en_data_50000',data_dir_out='hu_en_data_char_50000')
create_and_dump_vocab2('hu_en_data_char_50000')

vocab size: en=74, fr=73
