In [None]:
# Mount Google Drive; the data and experiment paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Experiment configuration: language pair, experiment tag and data locations.
source_language, target_language = "ru", "be"
lang_pair = f"{source_language}{target_language}"
tag = "medical_data"
config_name = "medical_data_config.yaml"

use_cuda = True
# When True, the preprocessing cell below re-runs tokenization and truecasing.
first_run = False

# Experiment folder on the mounted Drive, named <source>-<target>-<tag>.
gdrive_path = "/content/drive/My Drive/joey_experiments/" + "-".join(
    (source_language, target_language, tag)
)

# Raw training files (source / target).
raw_source_file = "/content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_train.txt"
raw_target_file = "/content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_train.txt"

# Raw test files (source / target).
raw_source_test_file = "/content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_test.txt"
raw_target_test_file = "/content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_test.txt"

In [None]:
# Setting helper parameters. Do not change it.
import os
from os import path

os.environ["gdrive_path"] = gdrive_path
!echo $gdrive_path

# Sets params in bash as well, since we often use bash scripts.
os.environ["data_path"] = path.join("joeynmt", "data", source_language + target_language)
os.environ["bpe_codes_file"] = path.join(gdrive_path, "data", "bpe.codes.5000")
os.environ["src"] = source_language 
os.environ["tgt"] = target_language
os.environ["lang_pair"] = lang_pair
os.environ["tag"] = tag
os.environ["config_name"] = config_name

/content/drive/My Drive/joey_experiments/ru-be-medical_data


In [None]:
%%capture
# sacremoses provides the tokenize / truecase / detokenize CLI used by the cells below.
! pip install sacremoses

In [None]:
def preprocess(filepath, language, force_preprocess=False, pretrained_tc_model=None):
    tok_file = filepath+"_tok.txt"
    tc_model = pretrained_tc_model if pretrained_tc_model else filepath+"_tc.model"
    truecased_file = tok_file+"_true.txt"

    if force_preprocess:
      ! sed -i 's/"//g' "$filepath"
      ! sacremoses tokenize -l language -e 'utf-8' < "$filepath" > "$tok_file"
      if not pretrained_tc_model:
        ! sacremoses train-truecase -m "$tc_model" -j 4 < "$tok_file"
      ! echo "$tc_model"
      ! sacremoses truecase -m "$tc_model" -j 4 < "$tok_file" > "$truecased_file"

    return truecased_file, tc_model

# Preprocessing is only executed on the first run; otherwise the calls below
# just compute the names of the already tokenized and truecased files, which
# the rest of the notebook works with.
force_preprocess = bool(first_run)

source_file, source_tc_model = preprocess(
    raw_source_file, source_language, force_preprocess=force_preprocess
)
target_file, target_tc_model = preprocess(
    raw_target_file, target_language, force_preprocess=force_preprocess
)
# source_test_file, _model = preprocess(raw_source_test_file, source_language, force_preprocess=force_preprocess, pretrained_tc_model=source_tc_model)
# target_test_file, _model = preprocess(raw_target_test_file, target_language, force_preprocess=force_preprocess, pretrained_tc_model=target_tc_model)

In [None]:
# Sanity check: the source and target file of each split should have the same number of lines.
# The trailing * on the test paths also lists the derived _tok / _true files next to the raw file.
! wc -l "$raw_source_file"
! wc -l "$raw_target_file"
! wc -l "$raw_source_test_file"*
! wc -l "$raw_target_test_file"*

3798 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_train.txt
3798 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_train.txt
   457 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_test.txt
   458 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_test.txt_tok.txt
   458 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_ru_test.txt_tok.txt_true.txt
  1373 total
   457 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_test.txt
   458 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_test.txt_tok.txt
   458 /content/drive/My Drive/TranslatorData/MedicalData/med_sentences_be_test.txt_tok.txt_true.txt
  1373 total


In [None]:
# # Let's take a look what preprocessing did to the text.
# ! head "$raw_source_file"*
# ! head "$raw_target_file"*
# ! head "$raw_source_test_file"*
# ! head "$raw_target_test_file"*

In [None]:
# import pandas as pd
# import csv

# def create_df(source_f, target_f, lowercase_first_letter=True):
#    source = []
#    target = []
#    with open(source_f, "r") as source_f, open(target_f, "r") as target_f:
#        for source_line, target_line in zip(source_f, target_f):
#          if lowercase_first_letter:
#             source_line = source_line[0].lower() + source_line[1:]
#             target_line = target_line[0].lower() + target_line[1:]
#          source.append(source_line.strip())
#          target.append(target_line.strip())
  
#    print(source[:5])
#    df = pd.DataFrame(zip(source, target), columns=['source_sentence', 'target_sentence'])

#    df_pp = df.drop_duplicates()
#    df_pp = df_pp.sample(frac=1, random_state=42).reset_index(drop=True)
#    return df_pp

# def write_to_files(prefix, source_df):
#   print(source_df['source_sentence'][:5])
#   source_df['source_sentence'].to_csv(f"{prefix}.{source_language}", sep='\n', index=False, header=False)
#   source_df['target_sentence'].to_csv(f"{prefix}.{target_language}", sep='\n', index=False, header=False)

# train_dataset = create_df(source_file, target_file)
# print(train_dataset.head())
# test_df = create_df(source_test_file, target_test_file)

# # We use 90% for train, rest for validation
# num_train_patterns = int(len(train_dataset)*0.9)
# train_df = train_dataset.head(num_train_patterns)
# val_df = train_dataset.drop(train_df.index)

# write_to_files("train", train_df)
# write_to_files("dev", val_df)
# write_to_files("test", test_df)

In [None]:
# ! head train.*
# ! head test.*

In [None]:
# %%capture
# # installing cuda
# if use_cuda:
#     !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
#     !sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
#     !sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
#     !sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
#     !sudo apt-get update
#     !sudo apt-get -y install cuda

In [None]:
# Install JoeyNMT
%%capture
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

In [None]:
# Run the unit tests inside the joeynmt checkout as a sanity check of the installation.
! cd joeynmt; python3 -m unittest

................................s.......................
----------------------------------------------------------------------
Ran 56 tests in 0.917s

OK (skipped=1)


In [None]:
# # Apply BPE splits to the train and test data.
# # NB! We use vocabulary and BPE codes from out-of-domain training process
# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${src}" < train.$src > train.bpe.$src
# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${tgt}" < train.$tgt > train.bpe.$tgt

# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${src}" < dev.$src > dev.bpe.$src
# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${tgt}" < dev.$tgt > dev.bpe.$tgt
# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${src}" < test.$src > test.bpe.$src
# ! subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/vocab.${tgt}" < test.$tgt > test.bpe.$tgt

In [None]:
# # Create directory, move everything we care about to the correct location
# ! mkdir -p $data_path
# ! cp train.bpe* $data_path
# ! cp test.bpe* $data_path
# ! cp dev.bpe* $data_path
# ! cp "${gdrive_path}/vocab.txt" $data_path
# ! cp "${gdrive_path}/${config_name}" $data_path
# ! ls $data_path

In [None]:
# # Also move everything we care about to a mounted location in google drive (relevant if running in colab) at gdrive_path
# ! cp train.* "$gdrive_path/data"
# ! cp test.* "$gdrive_path/data"
# ! cp dev.* "$gdrive_path/data"
# ! ls "$gdrive_path"
# ! ls "$gdrive_path/data"

In [None]:
# # Train the model
# # You can press Ctrl-C to stop. And then run the next cell to save your checkpoints! 
# !cd joeynmt; python3 -m joeynmt train "/content/${data_path}/${config_name}"

In [None]:
# # Copy the created models from the notebook storage to google drive for persistent storage
# !cp -r joeynmt/models/${src}${tgt}_transformer_medical/* "$gdrive_path/models/"

In [None]:
# # Output our validation accuracy
# ! cat "$gdrive_path/models/${src}${tgt}_transformer_new/validations.txt"

In [None]:
# Copy the stored model files and data from Drive into the local joeynmt checkout.
# NOTE(review): mkdir uses the absolute /content/joeynmt path while cp targets the relative
# joeynmt/ path, so this presumably assumes the working directory is /content — confirm.
!mkdir -p /content/joeynmt/models/${src}${tgt}_transformer_medical/ && cp -r "$gdrive_path/models/"* joeynmt/models/${src}${tgt}_transformer_medical/
!mkdir -p /content/joeynmt/data/${src}${tgt}/ && cp -r "$gdrive_path/data/"* joeynmt/data/${src}${tgt}/
# Also place a copy of the BPE codes file in the current working directory.
!cp -r "$bpe_codes_file" ./

In [None]:
# # Test our model
# ! cd joeynmt; python3 -m joeynmt test "$gdrive_path/models/config.yaml" --output_path "$gdrive_path/models/predictions"

In [None]:
def interactive_translate(text):
  """Translate a single piece of text and show the result in the cell output.

  The text is tokenized, truecased and BPE-split on the command line, written
  to /content/joeynmt/in.txt, and then piped through ``joeynmt translate``;
  the model output is detruecased and detokenized before being printed.

  Reads the globals ``config_name`` and ``source_tc_model`` and exports them,
  together with ``text``, as environment variables for the shell commands.
  Returns nothing.
  """
  os.environ["config"] = config_name
  os.environ["text"] = text
  os.environ["source_tc_model"] = source_tc_model
  # NOTE(review): tokenize is called without -l here, unlike preprocess() — confirm this is intended.
  ! echo "$text" | sacremoses tokenize | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "${gdrive_path}/data/vocab.${src}" > "/content/joeynmt/in.txt"
  # stderr of every stage is discarded; the final sed removes the space before an apostrophe
  # (first occurrence per line only, since there is no g flag — TODO confirm that is enough).
  ! cd joeynmt; python3 -m joeynmt translate "$gdrive_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/"

In [None]:
# Try the interactive translation on a single sentence.
interactive_translate("Пусть мой комментарий затеряется, пусть утонет среди других.")

100% 1/1 [00:00<00:00,  6.47it/s]
1it [00:00, 14.07it/s]


In [None]:
# Another single-sentence example; the commented-out call is an alternative input.
#interactive_translate("В Витебске кратковременный снег, мокрый снег.")
interactive_translate("В дневные часы 15 марта кое-где отмечались дожди.")

  0% 0/1 [00:00<?, ?it/s]0it [00:00, ?it/s]100% 1/1 [00:00<00:00,  8.63it/s]100% 1/1 [00:00<00:00,  8.62it/s]
1it [00:00,  9.29it/s]


In [None]:
# Translate several sample sentences one after another.
sample_sentences = [
    "Как объяснили медработники, сделано для того, чтобы защитить стены от возможных ударов каталок.",
    "Также на первом этаже расположены различные диагностические службы.",
    "Например кабинет компьютерной томографии.",
    "Аппарат, с помощью которого проводят это информативное высокотехнологичное исследование, самый современный.",
    "Рядом - изотопная лаборатория, где выполняют радионуклидную диагностику.",
]
for sentence in sample_sentences:
    interactive_translate(sentence)

usage: subword-nmt apply-bpe [-h] [--input PATH] --codes PATH [--merges INT]
                             [--output PATH] [--separator STR]
                             [--vocabulary PATH] [--vocabulary-threshold INT]
                             [--dropout P] [--glossaries STR [STR ...]]
subword-nmt apply-bpe: error: argument --vocabulary: can't open 'vocab.': [Errno 2] No such file or directory: 'vocab.'
0it [00:00, ?it/s]

Traceback (most recent call last):
  File "/usr/local/bin/sacremoses", line 8, in <module>
    sys.exit(cli())
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1066, in invoke
 

In [None]:
def file_translate(file):
  os.environ["config"] = config_name
  os.environ["file"] = file
  ! echo "$gdrive_path/models/$config"
  ! sacremoses tokenize < "$file" | sacremoses truecase -m "$source_tc_model" -j 4 | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "vocab.$src" > "joeynmt/in.txt"
  ! cd joeynmt; python3 -m joeynmt translate "$gdrive_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/" > joey_pred.txt

In [None]:
# Translate the sentences file stored in the data_test folder of the experiment directory.
in_file_name = "med_sentences_ru.txt"
folder = "/".join((gdrive_path, "data_test"))
in_filepath = "/".join((folder, in_file_name))
print(in_filepath)
file_translate(in_filepath)

/content/drive/My Drive/joey_experiments/ru-be-medical_data/data_test/med_sentences_ru.txt
/content/drive/My Drive/joey_experiments/ru-be-medical_data/models/medical_data_config.yaml
100% 4257/4257 [00:01<00:00, 2270.80it/s]
4257it [00:01, 2258.66it/s]


In [None]:
# Copy the predictions and the preprocessed model input next to the source file in `folder`.
!cp -r joeynmt/joey_pred.txt "$folder"
!cp -r joeynmt/in.txt "$folder"

In [None]:
# folder = f'{gdrive_path}/data/data_test/TestDatasets/TsimafeiMiddleCorpusTest'
# in_file_name = "MiddleCorpusTestPart_Rus.txt"
# in_filepath = f'{folder}/{in_file_name}'
# file_translate(in_filepath)

In [None]:
# !cp -r joeynmt/joey_pred.txt "$folder"
# !cp -r joeynmt/in.txt "$folder"