In [None]:
import os

if not os.path.isdir("/content/Separate Corpuses/FullCorpus/") or not os.path.isdir("/joey_experiments/"):
    !gsutil -m cp -r "gs://mytranslator-298419-vcm/joeynmt/" .

In [None]:
# Mount Google Drive at /content/drive; the corpora and the experiment folder
# configured in the following cells are read from (and written to) that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Configuring model parameters.
# Language codes of the translation direction. They are also used as the file
# extensions of the train/dev/test files written later in this notebook.
source_language = "ru"
target_language = "be"
lang_pair = source_language+target_language
# Domain tag; it becomes the last component of the experiment folder name below.
tag = "legal"
# Name of the JoeyNMT config file; it is copied from gdrive_path before training.
config_name = "legal_data_config.yaml"

# When True, the CUDA installation cell runs its apt commands.
use_cuda = True
# When True, the raw corpora are tokenized and truecased again; when False the
# preprocessing commands are skipped and only the expected file paths are used.
first_run = True

# Experiment folder on Google Drive: BPE codes, vocabularies and the config are
# read from it, and data splits and trained models are copied into it.
gdrive_path = "/content/drive/My Drive/joey_experiments/%s-%s-%s" % (source_language, target_language, tag)

# Raw training corpus: two files that are paired line by line.
raw_source_file = r"/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_ru.txt"
raw_target_file = r"/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_be.txt"

# Raw test corpus, laid out the same way as the training corpus.
raw_source_test_file = r"/content/drive/My Drive/TranslatorData/LegalData/legal_data_test_ru.txt"
raw_target_test_file = r"/content/drive/My Drive/TranslatorData/LegalData/legal_data_test_be.txt"


In [None]:
# Setting helper parameters. Do not change it.
from os import path

os.environ["gdrive_path"] = gdrive_path
! echo $gdrive_path
! mkdir -p "$gdrive_path"

# Sets params in bash as well, since we often use bash scripts.
os.environ["data_path"] = path.join("joeynmt", "data", source_language + target_language)
os.environ["bpe_codes_file"] = path.join(gdrive_path, "data", "bpe.codes.5000")
os.environ["src"] = source_language 
os.environ["tgt"] = target_language
os.environ["lang_pair"] = lang_pair
os.environ["tag"] = tag
os.environ["config_name"] = config_name

/content/drive/My Drive/joey_experiments/ru-be-legal


In [None]:
%%capture
# Install the exact sacremoses and six versions this notebook was written for.
# -I reinstalls even if another version is already present and -v makes pip
# verbose; the %%capture magic above hides all of that output.
! pip install -Iv sacremoses==0.0.40
! pip install -Iv six==1.12.0

In [None]:
def preprocess(filepath, language, force_preprocess=False, pretrained_tc_model=None):
    tok_file = filepath+"_tok.txt"
    tc_model = pretrained_tc_model if pretrained_tc_model else filepath+"_tc.model"
    truecased_file = tok_file+"_true.txt"

    if force_preprocess:
      ! sed -i 's/"//g' "$filepath"
      ! sacremoses tokenize -l language -e 'utf-8' < "$filepath" > "$tok_file"
      if not pretrained_tc_model:
        ! sacremoses train-truecase -m "$tc_model" -j 4 < "$tok_file"
      ! echo "$tc_model"
      ! sacremoses truecase -m "$tc_model" -j 4 < "$tok_file" > "$truecased_file"

    return truecased_file, tc_model

# From here on the notebook works with the tokenized and truecased files, so
# the *_file variables below point at the preprocessed output.
force_preprocess = bool(first_run)

# Training corpora: each language gets its own truecase model.
source_file, source_tc_model = preprocess(
    raw_source_file,
    source_language,
    force_preprocess=force_preprocess,
)
target_file, target_tc_model = preprocess(
    raw_target_file,
    target_language,
    force_preprocess=force_preprocess,
)

# Test corpora: reuse the truecase models obtained from the training corpora.
source_test_file, _model = preprocess(
    raw_source_test_file,
    source_language,
    force_preprocess=force_preprocess,
    pretrained_tc_model=source_tc_model,
)
target_test_file, _model = preprocess(
    raw_target_test_file,
    target_language,
    force_preprocess=force_preprocess,
    pretrained_tc_model=target_tc_model,
)

100% 10691/10691 [00:02<00:00, 3651.92it/s]
10691it [00:04, 2156.18it/s]
/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_ru.txt_tc.model
10691it [00:03, 2897.84it/s]
100% 10691/10691 [00:03<00:00, 3561.40it/s]
10691it [00:05, 2086.47it/s]
/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_be.txt_tc.model
10691it [00:03, 2860.86it/s]
100% 1887/1887 [00:00<00:00, 3544.57it/s]
/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_ru.txt_tc.model
1887it [00:00, 2959.27it/s]
100% 1887/1887 [00:00<00:00, 3521.95it/s]
/content/drive/My Drive/TranslatorData/LegalData/legal_data_train_be.txt_tc.model
1887it [00:00, 2726.94it/s]


In [None]:
# Sanity check: within each split (train, test) the source and target files
# must have the same number of lines, because they are paired line by line
# when the data frames are built.
! wc -l "$raw_source_file"
! wc -l "$raw_target_file"
! wc -l "$raw_source_test_file"
! wc -l "$raw_target_test_file"

10691 /content/drive/My Drive/TranslatorData/LegalData/legal_data_train_ru.txt
10691 /content/drive/My Drive/TranslatorData/LegalData/legal_data_train_be.txt
1887 /content/drive/My Drive/TranslatorData/LegalData/legal_data_test_ru.txt
1887 /content/drive/My Drive/TranslatorData/LegalData/legal_data_test_be.txt


In [None]:
# # Let's take a look at what preprocessing did to the text.
# ! head "$raw_source_file"*
# ! head "$raw_target_file"*
# ! head "$raw_source_test_file"*
# ! head "$raw_target_test_file"*

In [None]:
import pandas as pd
import csv

def create_df(source_f, target_f, lowercase_first_letter=True):
    """Load two line-aligned text files into a de-duplicated, shuffled frame.

    Args:
        source_f: Path of the source-language file (read as UTF-8).
        target_f: Path of the target-language file (read as UTF-8).
        lowercase_first_letter: When True, the first character of every
            sentence is lowercased after surrounding whitespace is removed.

    Returns:
        DataFrame with the columns `source_sentence` and `target_sentence`.
        Duplicate pairs are dropped, the rows are shuffled with a fixed seed
        (42) and the index is reset to 0..n-1. Lines are paired with `zip`,
        so pairing stops at the end of the shorter file.
    """
    def _clean(line):
        # Strip before lowercasing so that the character being lowercased is
        # the first visible one, not leading whitespace. Slicing (instead of
        # indexing) keeps blank lines from raising IndexError.
        line = line.strip()
        if lowercase_first_letter:
            line = line[:1].lower() + line[1:]
        return line

    source = []
    target = []
    # Separate names for the handles: the path parameters are not shadowed.
    with open(source_f, "r", encoding="utf-8") as source_lines, \
            open(target_f, "r", encoding="utf-8") as target_lines:
        for source_line, target_line in zip(source_lines, target_lines):
            source.append(_clean(source_line))
            target.append(_clean(target_line))

    # Preview of the first source sentences, shown in the cell output.
    print(source[:5])
    df = pd.DataFrame({"source_sentence": source, "target_sentence": target})

    df_pp = df.drop_duplicates()
    df_pp = df_pp.sample(frac=1, random_state=42).reset_index(drop=True)
    return df_pp

def write_to_files(prefix, source_df):
    """Write both sides of a sentence-pair frame to plain-text files.

    One sentence is written per line to `<prefix>.<source_language>` and
    `<prefix>.<target_language>`, using the module-level language codes.

    The lines are written directly instead of via `to_csv(sep='\\n')`: the CSV
    writer quotes any sentence that contains a double quote and writes `""`
    for an empty sentence, which would alter the text that is later fed to BPE.

    Args:
        prefix: File name prefix, e.g. "train", "dev" or "test".
        source_df: DataFrame with the string columns `source_sentence` and
            `target_sentence`.
    """
    # Preview of the first source sentences, shown in the cell output.
    print(source_df['source_sentence'][:5])
    columns_by_language = (
        ("source_sentence", source_language),
        ("target_sentence", target_language),
    )
    for column, language in columns_by_language:
        with open(f"{prefix}.{language}", "w", encoding="utf-8") as out_file:
            out_file.writelines(f"{sentence}\n" for sentence in source_df[column])

train_dataset = create_df(source_file, target_file)
print(train_dataset.head())
test_df = create_df(source_test_file, target_test_file)

# The rows are already shuffled and indexed 0..n-1, so the first 90% are taken
# for training and the remaining rows for validation.
num_train_patterns = int(0.9 * len(train_dataset))
train_df = train_dataset.iloc[:num_train_patterns]
val_df = train_dataset.iloc[num_train_patterns:]

# One pair of files per split: train.*, dev.* and test.*
write_to_files("train", train_df)
write_to_files("dev", val_df)
write_to_files("test", test_df)

['материально-техническое обеспечение национальных и сборных команд Республики Беларусь по видам спорта за счет средств республиканского бюджета осуществляется в порядке , установленном республиканским органом государственного управления , проводящим государственную политику в сфере физической культуры и спорта', 'при этом диспаша может оспариваться только на том основании , что она является явно неправильной', 'решение об отказе в предоставлении участка лесного фонда в аренду деревообрабатывающей организации для заготовки древесины может быть обжаловано в судебном порядке', 'граждане , зарегистрированные по месту пребывания , включаются в список только на участке для голосования по месту пребывания', 'счетная комиссия избирает из своего состава председателя , заместителя председателя и секретаря комиссии']
                                     source_sentence                                    target_sentence
0  объектами отношений в области охраны и использ...  аб &apos; ектамі адносі

In [None]:
# ! head train.*
# ! head test.*

In [None]:
%%capture
# Install the CUDA toolkit from NVIDIA's Ubuntu 18.04 apt repository.
# The whole step is skipped when use_cuda is False; %%capture hides the output.
if use_cuda:
    # Pin file that gives the NVIDIA repository priority in apt.
    !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
    !sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
    # Trust the repository signing key, register the repository, then install.
    # NOTE(review): the key id is hard-coded -- TODO confirm it is still the
    # current signing key of this repository.
    !sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
    !sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
    !sudo apt-get update
    !sudo apt-get -y install cuda

In [None]:
# Install JoeyNMT from the local joeynmt/ folder (downloaded in the first cell).
! cd joeynmt; pip3 install .

In [None]:
# Run the unit tests found in the joeynmt/ folder as a check of the installation.
! cd joeynmt; python3 -m unittest

................................s.......................
----------------------------------------------------------------------
Ran 56 tests in 1.242s

OK (skipped=1)


In [None]:
# Apply BPE splits to the train, dev and test data (source and target side).
# NB! We use the vocabulary and BPE codes from the out-of-domain training process:
# both are read from "${gdrive_path}/data" and nothing is learned from this corpus.
# The same codes file is used for both languages; the vocabulary file is per language.
! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${src}" < train.$src > train.bpe.$src
! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${tgt}" < train.$tgt > train.bpe.$tgt

! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${src}" < dev.$src > dev.bpe.$src
! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${tgt}" < dev.$tgt > dev.bpe.$tgt
! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${src}" < test.$src > test.bpe.$src
! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${tgt}" < test.$tgt > test.bpe.$tgt

In [None]:
# Create the JoeyNMT data directory and copy everything we care about into it:
# the BPE-encoded train/test/dev files, plus vocab.txt and the training config
# taken from the experiment folder on Google Drive.
! mkdir -p $data_path
! cp train.bpe* $data_path
! cp test.bpe* $data_path
! cp dev.bpe* $data_path
! cp "${gdrive_path}/data/vocab.txt" $data_path
! cp "${gdrive_path}/${config_name}" $data_path
# List the directory to confirm that every file is in place.
! ls $data_path

dev.bpe.be  legal_data_config.yaml  test.bpe.ru   train.bpe.ru
dev.bpe.ru  test.bpe.be		    train.bpe.be  vocab.txt


In [None]:
# Also copy the plain and BPE-encoded train/test/dev files to the mounted
# Google Drive folder (relevant if running in Colab) at gdrive_path.
# The "$gdrive_path/data" directory is expected to exist already (the BPE codes
# and vocabularies are read from it earlier in this notebook).
! cp train.* "$gdrive_path/data"
! cp test.* "$gdrive_path/data"
! cp dev.* "$gdrive_path/data"
! ls "$gdrive_path"
! ls "$gdrive_path/data"

data  legal_data_config.yaml
bpe.codes.5000	dev.bpe.ru  test.bpe.be  train.be      train.ru  vocab.txt
dev.be		dev.ru	    test.bpe.ru  train.bpe.be  vocab.be
dev.bpe.be	test.be     test.ru	 train.bpe.ru  vocab.ru


In [None]:
# Train the model.
# You can press Ctrl-C to stop, and then run the next cell to save your checkpoints!
# The command changes into joeynmt/ first, so the config (copied into data_path
# earlier) is addressed with an absolute path; this presumes the notebook's
# working directory is /content -- TODO confirm when running outside Colab.
!cd joeynmt; python3 -m joeynmt train "/content/${data_path}/${config_name}"

2021-02-23 08:39:15,132 - INFO - root - Hello! This is Joey-NMT (version 1.0).
2021-02-23 08:39:15,197 - INFO - joeynmt.data - loading training data...
2021-02-23 08:39:15,437 - INFO - joeynmt.data - building vocabulary...
2021-02-23 08:39:15,798 - INFO - joeynmt.data - loading dev data...
2021-02-23 08:39:15,820 - INFO - joeynmt.data - loading test data...
2021-02-23 08:39:15,863 - INFO - joeynmt.data - data loaded.
2021-02-23 08:39:16.352634: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2021-02-23 08:39:19,152 - INFO - joeynmt.training - Total params: 12424192
2021-02-23 08:39:21,024 - INFO - joeynmt.training - Loading model from /content/drive/My Drive/joey_experiments/ru-be-new_full/models/best.ckpt
2021-02-23 08:39:23,874 - INFO - joeynmt.training - Reset optimizer.
2021-02-23 08:39:23,875 - INFO - joeynmt.training - Reset scheduler.
2021-02-23 08:39:23,875 - INFO - joeynmt.training - Reset tracking of the be

In [None]:
# Copy the created models from the notebook storage to Google Drive for persistent storage.
# NOTE(review): the source folder name ends in a hard-coded "_transformer_legal";
# presumably it has to match the model directory set in the config file -- confirm.
!mkdir -p "$gdrive_path/models/" && cp -r joeynmt/models/${src}${tgt}_transformer_legal/* "$gdrive_path/models/"