We'll learn the embeddings from WMT and the translation task from Multi30k, so that our results are comparable with others (if we extracted 30k sentences from WMT instead, we couldn't compare with anybody). Besides, this is precisely what the authors of the article do.

First thing: tokenization.

In [None]:
import os
import nltk

# Locations of the raw corpora and of everything this notebook generates.
multi30k_data_dir = '../data/multi30k'
wmt17_data_dir = '../data/wmt17'
generated_data_dir = '../data/generated'

# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail if the directory shows up between a check and the creation.
os.makedirs(generated_data_dir, exist_ok=True)

# word_tokenize relies on the Punkt sentence tokenizer data.
nltk.download('punkt')

# (input path, output path) pairs for every file that has to be tokenized.
files_to_tokenize = []

# Tokenizing multi30k
for file_name in os.listdir(multi30k_data_dir):
    input_file_path = '{}/{}'.format(multi30k_data_dir, file_name)
    output_file_path = '{}/{}.tok'.format(generated_data_dir, file_name)

    # Skip sub-directories: open() on one of them would abort the whole run.
    if not os.path.isfile(input_file_path):
        continue

    files_to_tokenize.append((input_file_path, output_file_path))

# Tokenizing WMT
wmt17_file_name_src = '{}/{}'.format(wmt17_data_dir, 'europarl-v7.de-en.en')
wmt17_file_name_trg = '{}/{}'.format(wmt17_data_dir, 'europarl-v7.de-en.de')
files_to_tokenize.append((wmt17_file_name_src, '%s/wmt17.en.tok' % generated_data_dir))
files_to_tokenize.append((wmt17_file_name_trg, '%s/wmt17.de.tok' % generated_data_dir))


# Tokenization
# NOTE(review): word_tokenize is called with its default language for the
# German files as well — confirm that this is intended.
for input_file_path, output_file_path in files_to_tokenize:
    print('Tokenizing', input_file_path)

    # Stream line by line instead of loading the whole corpus into memory.
    # Iterating the file splits only on real newlines, whereas
    # str.splitlines() also breaks on characters such as U+2028 or \x0c and
    # could therefore add lines and misalign the parallel en/de files.
    with open(input_file_path, 'r', encoding='utf-8') as source_file, \
            open(output_file_path, 'w', encoding='utf-8') as target_file:
        for line in source_file:
            tokens = nltk.word_tokenize(line.rstrip('\n'))
            # Text mode already translates '\n' to the platform line ending;
            # writing os.linesep here would produce '\r\r\n' on Windows.
            target_file.write(' '.join(tokens) + '\n')

OK, we have the tokenized stuff. Let's now compute the BPEs.

In [None]:
%%bash
# TODO: use WMT after dev is done
# src="../data/generated/wmt17.en.tok"
# trg="../data/generated/wmt17.de.tok"
src="../data/generated/train.en.tok"
trg="../data/generated/train.de.tok"
num_bpes=16000

# Outputs of the learning step: the BPE codes and one vocabulary per language.
bpes="../data/generated/bpes"
src_vocab="../data/generated/vocab.en"
trg_vocab="../data/generated/vocab.de"

# Learn BPE codes jointly on both training files.
# Stop on failure so that stale or missing codes are never applied below.
python ../ext-libs/subword-nmt/learn_joint_bpe_and_vocab.py \
    --input "$src" "$trg" \
    -s "$num_bpes" \
    -o "$bpes" \
    --write-vocabulary "$src_vocab" "$trg_vocab" || exit 1

# Let's apply bpe here for all our tokenized files.
# Iterate over the glob itself: `$(ls ...)` would word-split the paths.
for file in ../data/generated/*.tok
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$file" ] || continue

    # The two characters right before ".tok" are taken as the language code
    # (e.g. "train.en.tok" -> "en").
    lang="${file: -6:2}"
    echo "For file $file we use lang: $lang."
    vocab="../data/generated/vocab.$lang"

    python ../ext-libs/subword-nmt/apply_bpe.py -c "$bpes" \
       --vocabulary "$vocab" < "$file" > "$file.bpe"
done

It is not a good idea to learn the embeddings here, but...

In [None]:
import fasttext


# Train CBOW embeddings on the BPE-segmented training files, one model per
# language.
# TODO: skipgram is more accurate (but slower to train)
# TODO: do not forget to use wmt17 after development is done:
# model_src = fasttext.cbow('../data/generated/wmt17.en.tok.bpe', '../trained_models/wmt17.en.tok.bpe_cbow', dim=512)
# model_trg = fasttext.cbow('../data/generated/wmt17.de.tok.bpe', '../trained_models/wmt17.de.tok.bpe_cbow', dim=512)

# Options shared by both models.
cbow_options = {'dim': 512, 'min_count': 1, 'silent': 0}

# NOTE(review): the output names say "wmt17" although the inputs are the
# train.* files — presumably kept so later steps need no change; confirm.
src_corpus_path = '../data/generated/train.en.tok.bpe'
src_model_path = '../trained_models/wmt17.en.tok.bpe_cbow'
model_src = fasttext.cbow(src_corpus_path, src_model_path, **cbow_options)

trg_corpus_path = '../data/generated/train.de.tok.bpe'
trg_model_path = '../trained_models/wmt17.de.tok.bpe_cbow'
model_trg = fasttext.cbow(trg_corpus_path, trg_model_path, **cbow_options)

In [None]:
# Let's remove .bin files which we do not use
!rm ../trained_models/*.bin