In [2]:
# coding: utf-8

import collections
import itertools
import os
import pickle
import shutil
import tqdm

# Creating `.dict` files

In [3]:
def create_vocab_for_corpus(fn,dev_size=0.1):
    """Build the vocabulary and index mappings for a single corpus file.

    The file is read in text mode with the platform default encoding, one
    sentence per line. The trailing ``dev_size`` fraction of the lines is
    treated as the dev set and is *not* counted, so the vocabulary only
    reflects the leading (training) lines.

    Parameters
    ----------
    fn : str
        Path to the corpus file; tokens are separated by whitespace.
    dev_size : float
        Fraction of the lines, taken from the end of the file, to hold out.

    Returns
    -------
    tuple ``(vocab, w2i, i2w)``
        vocab : dict mapping each UTF-8 encoded token (bytes) to its count.
        w2i   : dict mapping token (bytes) to an integer id. Ids 0-3 are
                reserved for ``_PAD``, ``_GO``, ``_EOS`` and ``_UNK``; the
                remaining tokens are numbered from 4 by decreasing frequency.
        i2w   : the inverse of ``w2i``.
    """

    # compute the size of the train/dev set
    # (the file is opened in a ``with`` block so the handle is closed again)
    with open(fn, 'r') as text:
        num_sentences = sum(1 for _ in text)
    num_dev_sentences = int(num_sentences*dev_size)
    num_training_sentences = num_sentences - num_dev_sentences

    # count tokens in the train set
    count = collections.Counter()
    with open(fn, 'r') as text:
        for line in itertools.islice(text,num_training_sentences):
            for tok in line.split():
                count[tok.encode('utf8')] += 1

    vocab_by_freq = [tok for tok,num in count.most_common()]
    vocab = dict(count)
    w2i = {b'_PAD': 0, b'_GO': 1, b'_EOS': 2, b'_UNK': 3}
    # A corpus token that happens to spell a reserved symbol keeps the
    # reserved id; otherwise it would overwrite it and leave a hole in i2w.
    first_free = len(w2i)
    regular = [tok for tok in vocab_by_freq if tok not in w2i]
    w2i.update(zip(regular, range(first_free, first_free + len(regular))))
    i2w = {v: k for k,v in w2i.items()}
    return (vocab, w2i, i2w)

In [4]:
def create_vocab_for_parallel_corpus(text_en,text_fr):
    """Build vocabularies and index mappings for both sides of a parallel corpus.

    `create_vocab_for_corpus` is run on `text_en` and then on `text_fr`.
    The two vocabulary sizes are printed, and the results are regrouped into
    three dicts (vocab, w2i, i2w), each keyed by 'en' and 'fr'.
    """
    per_language = {}
    for lang, path in (('en', text_en), ('fr', text_fr)):
        per_language[lang] = create_vocab_for_corpus(path)

    # position 0 of each triple is that language's token -> count dict
    size_en = len(per_language['en'][0])
    size_fr = len(per_language['fr'][0])
    print("vocab size: en={0:d}, fr={1:d}".format(size_en,size_fr))

    # transpose {lang: (vocab, w2i, i2w)} into (vocab, w2i, i2w) of {lang: ...}
    vocab, w2i, i2w = [
        {lang: triple[position] for lang, triple in per_language.items()}
        for position in range(3)
    ]
    return (vocab, w2i, i2w)

In [5]:
def create_and_dump_vocab(data_dir,
                          text_en='text.en',
                          text_fr='text.fr',
                          vocab_dict='vocab.dict',
                          w2i_dict='w2i.dict',
                          i2w_dict='i2w.dict'):
    """Build the vocab/w2i/i2w dicts for a dataset directory and pickle them.

    All file names are resolved relative to `data_dir`. The two text files
    are read by `create_vocab_for_parallel_corpus`; the three resulting
    dicts are written (overwriting any existing file) as separate pickles.

    Parameters
    ----------
    data_dir : str
        Directory that holds the corpus and receives the pickles.
    text_en, text_fr : str
        File names of the two sides of the parallel corpus.
    vocab_dict, w2i_dict, i2w_dict : str
        File names for the pickled vocab, word-to-id and id-to-word dicts.
    """
    data_files = [text_en,text_fr,vocab_dict,w2i_dict,i2w_dict]
    data_files = [os.path.join(data_dir,fn) for fn in data_files]
    text_en, text_fr, vocab_dict, w2i_dict, i2w_dict = data_files

    vocab, w2i, i2w = create_vocab_for_parallel_corpus(text_en,text_fr)

    # Each pickle is written inside a ``with`` block so the handle is flushed
    # and closed deterministically instead of being left to the GC.
    for obj, path in ((vocab, vocab_dict), (w2i, w2i_dict), (i2w, i2w_dict)):
        with open(path,'wb') as dict_out:
            pickle.dump(obj, dict_out)

# Character-based tokenisation

In [6]:
def char_tokenise_file(data_file_in,data_file_out):
    """Rewrite a text file so that every character is its own token.

    Each input line has a space inserted at every position (before each
    character and after the last one); runs of three spaces are then
    shortened to two. The output file is opened (and truncated) before the
    input file is opened.
    """
    nothing, one, two, three = '', ' ', '  ', '   '
    with open(data_file_out, 'w') as sink, open(data_file_in, 'r') as source:
        sink.writelines(
            raw.replace(nothing, one).replace(three, two) for raw in source
        )

In [7]:
def char_tokenise_dir(data_dir_in,data_dir_out,
                      text_en='text.en',text_fr='text.fr',
                      char_tokenise_en=True,char_tokenise_fr=True):
    """Create a character-tokenised copy of a dataset directory.

    For each of `text_en` and `text_fr`, the file in `data_dir_in` is either
    passed through `char_tokenise_file` (flag truthy) or copied unchanged
    (flag falsy) to the same file name in `data_dir_out`. The output
    directory is created if it does not exist yet.
    """
    # (input path, output path, tokenise?) -- English first, then French
    jobs = [
        (os.path.join(data_dir_in,fn), os.path.join(data_dir_out,fn), tokenise)
        for fn, tokenise in ((text_en, char_tokenise_en),
                             (text_fr, char_tokenise_fr))
    ]

    # make sure the destination exists before anything is written
    os.makedirs(data_dir_out, exist_ok=True)

    for path_in, path_out, tokenise in jobs:
        if tokenise:
            char_tokenise_file(path_in,path_out)
        else:
            shutil.copyfile(path_in,path_out)

# Most common bigram (MCB) tokenisation

In [8]:
def create_toks_dict(toks_list):
    """Index tokens by their first character, longest token first.

    Returns a dict whose keys are first characters (in sorted order) and
    whose values are lists of the tokens starting with that character,
    ordered by decreasing length; tokens of equal length stay in sorted
    order.
    """
    by_first_char = {}
    for tok in sorted(toks_list):
        by_first_char.setdefault(tok[0], []).append(tok)
    for bucket in by_first_char.values():
        # list.sort is stable, so ties keep their lexicographic order
        bucket.sort(key=len, reverse=True)
    return by_first_char

In [9]:
def toks_tokenise_line(line,toks_dict,start=0):
    """Greedily split `line` into tokens, trying the listed candidates in order.

    Parameters
    ----------
    line : str
        Text to tokenise. A single trailing newline is treated as the line
        terminator and is never turned into a token.
    toks_dict : dict
        Maps a first character to the candidate tokens starting with it, in
        the order they should be tried (longest first when the dict was
        built by `create_toks_dict`).
    start : int
        Offset in `line` at which tokenisation begins.

    Returns
    -------
    list of str
        The tokens, in order. A character with no matching candidate
        (including one missing from `toks_dict`) becomes a one-character
        token, so the scan always advances.
    """
    # Only a real newline is skipped; a line without one keeps its last char.
    end = len(line) - 1 if line.endswith('\n') else len(line)
    toks = []
    while start < end:
        first_char = line[start]
        for tok in toks_dict.get(first_char, ()):
            # empty candidates are ignored: they would never advance `start`
            if tok and line.startswith(tok,start,end):
                break
        else:
            # nothing matched: fall back to the bare character instead of
            # looping forever on the same position
            tok = first_char
        toks.append(tok)
        start += len(tok)
    return toks

In [10]:
def toks_tokenise_text(text,toks_dict):
    """Count adjacent token pairs over every line of `text`.

    Each line is split with `toks_tokenise_line`; the returned Counter maps
    each (left token, right token) pair to the number of times the two
    tokens appear next to each other.
    """
    bigram_counts = collections.Counter()
    for sentence in text:
        pieces = toks_tokenise_line(sentence,toks_dict)
        bigram_counts.update(zip(pieces, pieces[1:]))
    return bigram_counts

In [11]:
def create_toks_list(text,max_size):
    """Learn a token inventory by repeatedly merging the most common bigram.

    The inventory starts as the set of distinct non-whitespace characters in
    `text`. On every round the whole text is re-tokenised with the current
    inventory, the most frequent pair of adjacent tokens is concatenated and
    added as a new token. Learning stops once the inventory holds `max_size`
    tokens or no adjacent pair is left to merge.

    Parameters
    ----------
    text : iterable of str
        The corpus lines; all whitespace inside them is ignored.
    max_size : int
        Upper bound on the number of tokens in the inventory.

    Returns
    -------
    tuple of str
        All tokens, grouped by first character.
    """
    # Remove every kind of whitespace (not just ' '), so the text contains
    # exactly the characters that end up in the initial inventory.
    squeezed = [''.join(line.split()) for line in text]
    toks_list = set(itertools.chain.from_iterable(squeezed))
    # Each line gets a terminating newline, which toks_tokenise_line leaves
    # untokenised; without it the last real character of every line would be
    # missing from the bigram counts.
    text = [line + '\n' for line in squeezed]
    toks_dict = create_toks_dict(toks_list)
    start_size = len(toks_list)
    for _ in tqdm.tqdm_notebook(range(start_size,max_size)):
        count = toks_tokenise_text(text,toks_dict)
        # most_common(1) avoids sorting the whole counter every round
        top = count.most_common(1)
        if not top:
            # every line is already a single token: nothing left to merge
            break
        most_common = ''.join(top[0][0])
        toks_dict[most_common[0]].append(most_common)
        # keep longest-first order so tokenisation stays longest-match
        toks_dict[most_common[0]].sort(key=len,reverse=True)
    return tuple(itertools.chain(*toks_dict.values()))

In [12]:
def mcb_tokenise_file(data_file_in,data_file_out,fmax_size):
    """Tokenise a text file with a learned most-common-bigram inventory.

    The token inventory is learned from the input file itself with
    `create_toks_list`; every input line is then written to the output file
    as its space-separated tokens, one output line per input line.

    Parameters
    ----------
    data_file_in : str
        Path of the whitespace-tokenised input file.
    data_file_out : str
        Path of the file to write (overwritten if it exists).
    fmax_size : callable
        Called with the number of distinct whitespace-separated tokens in
        the input; its result is used as the maximum inventory size.
    """
    with open(data_file_in, 'r') as text_in:
        lines = list(text_in)
    # compute the max size based on the start corpus size
    num_word_types = len(set(itertools.chain.from_iterable(
        line.split() for line in lines)))
    max_size = fmax_size(num_word_types)
    toks_list = create_toks_list(lines,max_size)
    toks_dict = create_toks_dict(toks_list)
    with open(data_file_out, 'w') as text_out:
        for line in lines:
            # Drop all whitespace and re-append exactly one newline: the
            # tokeniser leaves that terminator untokenised, so a final line
            # that lacks a newline does not lose its last character.
            squeezed = ''.join(line.split()) + '\n'
            toks = toks_tokenise_line(squeezed,toks_dict)
            # The terminator is written back explicitly; otherwise all
            # sentences would be glued together on a single output line.
            text_out.write(' '.join(toks) + '\n')

In [13]:
def mcb_tokenise_dir(data_dir_in,data_dir_out,fmax_size,
                      text_en='text.en',text_fr='text.fr',
                      mcb_tokenise_en=True,mcb_tokenise_fr=True):
    """Create a most-common-bigram tokenised copy of a dataset directory.

    For each of `text_en` and `text_fr`, the file in `data_dir_in` is either
    passed through `mcb_tokenise_file` together with `fmax_size` (flag
    truthy) or copied unchanged (flag falsy) to the same file name in
    `data_dir_out`. The output directory is created if it does not exist.
    """
    # (input path, output path, tokenise?) -- English first, then French
    jobs = [
        (os.path.join(data_dir_in,fn), os.path.join(data_dir_out,fn), tokenise)
        for fn, tokenise in ((text_en, mcb_tokenise_en),
                             (text_fr, mcb_tokenise_fr))
    ]

    # make sure the destination exists before anything is written
    os.makedirs(data_dir_out, exist_ok=True)

    # tokenise by most common bigram, or copy the file as it is
    for path_in, path_out, tokenise in jobs:
        if tokenise:
            mcb_tokenise_file(path_in,path_out,fmax_size)
        else:
            shutil.copyfile(path_in,path_out)

# Creating datasets

In [14]:
# Build and pickle vocab.dict / w2i.dict / i2w.dict from the text.en and
# text.fr already present in this directory. No cell visible in this notebook
# creates 'in_en_data_mcb8_50000' -- presumably it was produced earlier with
# mcb_tokenise_dir; TODO confirm.
create_and_dump_vocab('in_en_data_mcb8_50000')

vocab size: en=9359, fr=34892


In [24]:
# data created using filter_and_sample.ipynb
# Build and pickle the word-level vocab.dict / w2i.dict / i2w.dict for it.
create_and_dump_vocab('fr_en_data_50000')

vocab size: en=41786, fr=55790


In [14]:
# Character-tokenise both text.en and text.fr (both flags default to True),
# then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='fr_en_data_50000',data_dir_out='fr_en_data_char_50000')
create_and_dump_vocab('fr_en_data_char_50000')

vocab size: en=154, fr=154


In [15]:
# Character-tokenise both text.en and text.fr (both flags default to True),
# then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='in_en_data_50000',data_dir_out='in_en_data_char_50000')
create_and_dump_vocab('in_en_data_char_50000')

vocab size: en=46, fr=47


In [16]:
# Character-tokenise both text.en and text.fr (both flags default to True),
# then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='hu_en_data_50000',data_dir_out='hu_en_data_char_50000')
create_and_dump_vocab('hu_en_data_char_50000')

vocab size: en=74, fr=73


In [17]:
# "halfjack" variant: char_tokenise_en=False copies text.en unchanged, so only
# text.fr is character-tokenised; then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='fr_en_data_50000',data_dir_out='fr_en_data_halfjack_50000',char_tokenise_en=False)
create_and_dump_vocab('fr_en_data_halfjack_50000')

vocab size: en=41786, fr=154


In [18]:
# "halfjack" variant: char_tokenise_en=False copies text.en unchanged, so only
# text.fr is character-tokenised; then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='in_en_data_50000',data_dir_out='in_en_data_halfjack_50000',char_tokenise_en=False)
create_and_dump_vocab('in_en_data_halfjack_50000')

vocab size: en=9359, fr=47


In [19]:
# "halfjack" variant: char_tokenise_en=False copies text.en unchanged, so only
# text.fr is character-tokenised; then build the dicts for the new directory.
char_tokenise_dir(data_dir_in='hu_en_data_50000',data_dir_out='hu_en_data_halfjack_50000',char_tokenise_en=False)
create_and_dump_vocab('hu_en_data_halfjack_50000')

vocab size: en=13066, fr=73
