In [1]:
# Cell to copy data from Google Storage to the runtime

import os

if not os.path.isdir("/content/Separate Corpuses/FullCorpus/") or not os.path.isdir("/joey_experiments/"):
    !gsutil -m cp -r "gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/" .
    !gsutil -m cp -r "gs://mytranslator-298419-vcm/joey_experiments/" .
    !gsutil -m cp -r "gs://mytranslator-298419-vcm/joeynmt/" .

Copying gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/Corpus_Bel.txt...
/ [0/8 files][    0.0 B/651.5 MiB]   0% Done                                    Copying gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/Corpus_Bel.txt_tc.model...
/ [0/8 files][    0.0 B/651.5 MiB]   0% Done                                    Copying gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/Corpus_Rus.txt_tc.model...
/ [0/8 files][    0.0 B/651.5 MiB]   0% Done                                    Copying gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/Corpus_Rus.txt_tok.txt...
/ [0/8 files][    0.0 B/651.5 MiB]   0% Done                                    Copying gs://mytranslator-298419-vcm/RuBeCorpus_BelapanNews/Separate Corpuses/FullCorpus/Corpus_Bel.txt_tok.txt...
/ [0/8 files][    0.0 B/651.5 MiB]   0% Done                                    Copying gs://mytranslator-

In [2]:
# Helper settings

source_language = "ru"
target_language = "be"
lang_pair = source_language+target_language
tag = "new_full"
model_type = "bpe"

use_cuda = True

if model_type == "bpe":
    # Learn BPEs on the training data.
    number_of_splits = 5000
    os.environ["data_path"] = os.path.join("joeynmt", "data", source_language + target_language)
    bpe_codes_file = "bpe.codes."+str(number_of_splits)

os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
os.environ["tgt"] = target_language
os.environ["lang_pair"] = lang_pair
os.environ["tag"] = tag
os.environ["model_type"] = model_type

runtime_path = "/content/joey_experiments/%s-%s-%s" % (source_language, target_language, tag)
os.environ["runtime_path"] = runtime_path
!echo $runtime_path

raw_source_file = r"/content/FullCorpus/Corpus_Rus.txt"
raw_target_file = r"/content/FullCorpus/Corpus_Bel.txt"

source_tc_model = f"{raw_source_file}_tc.model"

# They should both have the same length.
! wc -l "$raw_source_file"
! wc -l "$raw_target_file"

/content/joey_experiments/ru-be-new_full
429479 /content/FullCorpus/Corpus_Rus.txt
429479 /content/FullCorpus/Corpus_Bel.txt


In [3]:
%%capture
# Install sacremoses; its command-line tool is used by the translation helpers below
# for tokenizing, truecasing, detruecasing and detokenizing text.
# %%capture hides the installer output of this cell.
! pip install sacremoses

In [4]:
! cd joeynmt; pip3 install .

Processing /content/joeynmt
Collecting numpy<1.19.0,>=1.14.5
[?25l  Downloading https://files.pythonhosted.org/packages/d6/c6/58e517e8b1fb192725cfa23c01c2e60e4e6699314ee9684a1c5f5c9b27e1/numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1MB)
[K     |████████████████████████████████| 20.1MB 1.4MB/s 
Collecting torch==1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/90/5d/095ddddc91c8a769a68c791c019c5793f9c4456a688ddd235d6670924ecb/torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8MB)
[K     |████████████████████████████████| 776.8MB 23kB/s 
Collecting torchtext==0.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/13/80/046f0691b296e755ae884df3ca98033cb9afcaf287603b2b7999e94640b8/torchtext-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (7.0MB)
[K     |████████████████████████████████| 7.0MB 49.2MB/s 
[?25hCollecting sacrebleu>=1.3.6
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/s

In [5]:
# testing if joeynmt installed correctly
! cd joeynmt; python3 -m unittest

........................s.......................
----------------------------------------------------------------------
Ran 56 tests in 1.004s

OK (skipped=1)


In [6]:
%%capture
# Install CUDA from NVIDIA's Ubuntu 18.04 apt repository; skipped entirely when
# use_cuda is False. %%capture hides all output of this cell.
if use_cuda:
    # Download NVIDIA's apt pin file and place it under /etc/apt/preferences.d/.
    !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
    !sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
    # Fetch the repository's public key, then register the repository with apt.
    !sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
    !sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
    # Refresh the package index and install the `cuda` package without prompting (-y).
    !sudo apt-get update
    !sudo apt-get -y install cuda

In [7]:
# Copy experiment artifacts from the runtime folder ($runtime_path) into the joeynmt checkout.
# NOTE(review): the `${src}`/`${tgt}` and `{a,b}` forms appear to rely on the shell
# (environment variables exported in the settings cell, brace expansion) rather than
# on Python variables - confirm before editing these lines.

# 1) Both model configs (CUDA and non-CUDA) go into joeynmt/models/<src><tgt>_transformer_new/.
!mkdir -p joeynmt/models/${src}${tgt}_transformer_new/ && cp -r "$runtime_path"/models/{config.yaml,config_non_cuda.yaml} joeynmt/models/${src}${tgt}_transformer_new/
# 2) Everything under the experiment's data/ folder goes into joeynmt/data/<src><tgt>/.
!mkdir -p joeynmt/data/${src}${tgt}/ && cp -r "$runtime_path/data/"* joeynmt/data/${src}${tgt}/
# 3) The BPE codes and source vocabulary are also copied to the working directory, where
#    the translation helpers look for `bpe_codes_file` and `vocab.<source_language>`.
#    The names are hard-coded here; keep them in sync with number_of_splits (5000)
#    and source_language ("ru") from the settings cell.
!cp -r joeynmt/data/${src}${tgt}/{bpe.codes.5000,vocab.ru} ./

In [8]:
# Functions to test translation quality in two modes: 'interactive' and 'file'.

from sacrebleu import corpus_bleu

def interactive_translate(text):
    os.environ["config"] = "config.yaml" if use_cuda else "config_non_cuda.yaml"
    ! echo "$text" | sacremoses tokenize | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary vocab.$source_language > "joeynmt/in.txt"
    ! cd joeynmt; python3 -m joeynmt translate "$runtime_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/" | sed "s/' /'/"


# use_cuda = true is strongly recommended for file translation!
def file_translate(file, mode):
    """Translate every line of ``file`` and return the path of the prediction file.

    Args:
        file: path to the source-language text file. NOTE: the file is modified
            in place - all double-quote characters are removed from it.
        mode: label appended to the output file name; it is not used for anything
            else inside this function.

    Returns:
        The path ``f"{file}_pred_{mode}"`` holding the translations.
    """
    # The bash command below reads $config from the environment.
    os.environ["config"] = "config.yaml" if use_cuda else "config_non_cuda.yaml"
    # Strip every double quote from the input file (in place).
    ! sed -i 's/"//g' "$file"
    # Preprocess: tokenize -> truecase -> apply BPE, written to joeynmt/in.txt.
    ! sacremoses tokenize < "$file" | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary vocab.$source_language > "joeynmt/in.txt"
    # Translate, detruecase, detokenize, and remove the space on either side of the first
    # apostrophe of each line; stderr of every stage is discarded. Because of the
    # preceding `cd joeynmt`, the output lands in joeynmt/joey_pred.txt.
    ! cd joeynmt; python3 -m joeynmt translate "$runtime_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/" | sed "s/' /'/" > "joey_pred.txt"
    # Move the predictions next to the input file, tagged with the mode.
    outfile = f"{file}_pred_{mode}"
    os.rename(os.path.join('joeynmt', 'joey_pred.txt'), outfile)
    return outfile


def get_bleu(reference_file, prediction_file):
    """Return the corpus-level BLEU score of a prediction file against a reference file.

    Both files are read as UTF-8, one segment per line; the lines are handed to
    sacrebleu's ``corpus_bleu`` with the reference file as the only reference set.
    """
    def _read_lines(path):
        with open(path, 'r', encoding="utf-8") as handle:
            return handle.readlines()

    references = _read_lines(reference_file)
    hypotheses = _read_lines(prediction_file)
    bleu = corpus_bleu(hypotheses, [references])
    return bleu.score

In [9]:
# Quick check of the interactive translation pipeline on one Russian sentence.
sample_sentence = "Как объяснили медработники, сделано для того, чтобы защитить стены от возможных ударов каталок."
interactive_translate(sample_sentence)

In [10]:
#Running tests for base translation model and two domain adaptations

path_to_test_data_folder = '/content/joey_experiments/ru-be-new_full/data_test'

for mode in ['base', 'medical', 'legal']:
    print(f"MODE: {mode}.")
    os.environ["checkpoint"] = mode + "_best.ckpt"
    ! cp -r "$runtime_path"/models/"$checkpoint" joeynmt/models/${src}${tgt}_transformer_new/

    interactive_translate("Как объяснили медработники, сделано для того, чтобы защитить стены от возможных ударов каталок.")
    interactive_translate("Коллективный договор может иметь приложения, являющиеся его неотъемлемой составной частью.")
    interactive_translate("В теплое время года на прилегающей к стационарным объектам территории была организована работа летних кафе.")

    for test_file_prefix in ['med_sentences', 'euroradio', 'belapan', 'legal']:
        in_file = os.path.join(path_to_test_data_folder, f"{test_file_prefix}_ru.txt")
        pred_file = file_translate(in_file, mode)
        real_file = os.path.join(path_to_test_data_folder, f"{test_file_prefix}_be.txt")
        print(f"{test_file_prefix}: {get_bleu(real_file, pred_file)}.")
    
    ! rm joeynmt/models/${src}${tgt}_transformer_new/"$checkpoint"

MODE: base.
Як растлумачылі медработнікі, зроблена для таго, каб абараніць сцены ад магчымых удараў каталок.
Калектыўная дагавор можа мець прыкладанні, якія з'яўляюцца яго неад'емнай складнай часткай.
У цёплае час года на прылеглай да стацыянарных аб'ектаў тэрыторыі была арганізавана праца летніх кафэ.
med_sentences: 70.01743131026568.
euroradio: 69.8678115859498.
belapan: 75.31419717393905.
legal: 73.64540803530869.
MODE: medical.
Як патлумачылі медработнікі, зроблена для таго, каб абараніць сцены ад магчымых удараў каталок.
Калектыўны дамову можа мець прыкладання, якія з'яўляюцца яго неад'емнай складнай часткай.
У цеплае час года на прылеглай да стацыянарных аб'ектаў тэрыторыі была арганізавана праца лётных кафе.
med_sentences: 83.06449670035552.
euroradio: 58.06088338423993.
belapan: 58.8655141280291.
legal: 60.92894169379268.
MODE: legal.
Як тлумачылі адработнікі, зроблена для таго, каб абароныць стены ад магчымых удараў каталок
Калектыўны дагавор можа мець прыкладання, якія з'яўля