Steps to run the demo after training
1. Tweak the config file from the models dir
  - Change the models dir name to the string *'models'*.
  - Comment out the path to the checkpoint file.

2. Manually create a non-CUDA config by setting the *'use_cuda'* flag to False in the config file from step 1. Name it *'config_non_cuda.yaml'* and put it in the same *'models'* folder.

3. Rename the best checkpoint file to *'TV.ckpt'*. For example, rename *'344000.ckpt'* to *'TV.ckpt'*.

In [None]:
# Mount Google Drive at /content/drive/; the trained model and data are read
# from a folder under this mount point by the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%%capture
# installing sacremoses - tokenization library
! pip install sacremoses
# wget - for downloading files from git
! pip install wget

# cloning joeynmt and installing dependencies
# NOTE(review): later cells call the `subword-nmt` CLI and import `sacrebleu`
# without installing them explicitly; presumably both are pulled in as joeynmt
# dependencies - confirm.
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

In [None]:
import os
import wget
from timeit import default_timer as timer

In [None]:
# Setting main parameters.
# Set use_cuda to True for file translation.
# The flag selects the config used for translation:
# 'config.yaml' when True, 'config_non_cuda.yaml' when False.
use_cuda = True
# root directory name with 'data' and 'models' subfolders left after training
model_name = 'joeynmt_en_ru_TV'
# name of the best checkpoint after training
best_ckpt_name = 'TV.ckpt'

In [None]:
# setting up helper variables, please ignore hardcode
runtime_path = "/content/drive/MyDrive/TV_project/{}".format(model_name)
bpe_codes_file = 'bpe.codes.5000'
source_tc_model = 'en.txt_tc.model'
source_vocab_file = 'vocab.en'

os.environ["runtime_path"] = runtime_path
os.environ["model_name"] = model_name
os.environ["best_ckpt_name"] = best_ckpt_name
os.environ["bpe_codes_file"] = bpe_codes_file
os.environ["source_tc_model"] = source_tc_model
os.environ["source_vocab_file"] = source_vocab_file
! echo $runtime_path

/content/drive/MyDrive/TV_project/en-ru-TV_basic


In [None]:
# testing if joeynmt installed correctly:
# runs the unit tests from inside the cloned 'joeynmt' directory
! cd joeynmt; python3 -m unittest

................................s.........................
----------------------------------------------------------------------
Ran 58 tests in 4.613s

OK (skipped=1)


In [None]:
# Paths to the joeynmt model and data folders.
# These are the default values; metainfo about language pairs or
# architecture can be added to the paths if needed.
joeynmt_models_folder = 'joeynmt/models/'
joeynmt_data_folder = 'joeynmt/data/'

# export both folders so the shell commands in the following cell can use them
os.environ["joeynmt_models_folder"] = joeynmt_models_folder
os.environ["joeynmt_data_folder"] = joeynmt_data_folder

In [None]:
# copying data from runtime folder to joeynmt
! mkdir -p "$joeynmt_models_folder"
# both config variants (CUDA and non-CUDA) and the best checkpoint
! cp -r "$runtime_path"/models/config.yaml "$joeynmt_models_folder"
! cp -r "$runtime_path"/models/config_non_cuda.yaml "$joeynmt_models_folder"
! cp -r "$runtime_path"/models/"$best_ckpt_name" "$joeynmt_models_folder"
# everything from the runtime 'data' folder; the wildcard is kept outside the
# quotes so the shell expands it
! mkdir -p "$joeynmt_data_folder" && cp -r "$runtime_path/data/"* "$joeynmt_data_folder"
# the truecase model, BPE codes and source vocabulary are also copied to the
# current directory, where the preprocessing commands reference them by bare name
! cp -r "$joeynmt_data_folder"/"$source_tc_model" ./
! cp -r "$joeynmt_data_folder"/"$bpe_codes_file" ./
! cp -r "$joeynmt_data_folder"/"$source_vocab_file" ./

In [None]:
# Functions to test translation quality in two modes: 'interactive' and 'file'.
from sacrebleu import corpus_bleu

def interactive_translate(text: str, tokenised_input: bool = False):
    """
    Translates input text. 
    Non_cuda mode recommended to avoid extensive installation of cuda packages.
    """
    os.environ["config"] = "config.yaml" if use_cuda else "config_non_cuda.yaml"
    if not tokenised_input:
        ! echo "$text" | sacremoses tokenize | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "$source_vocab_file" > "joeynmt/in.txt"
    else:
        ! echo "$text" | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "$source_vocab_file" > "joeynmt/in.txt"
    ! cd joeynmt; python3 -m joeynmt translate "$runtime_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/" | sed "s/' /'/"


# use_cuda = true is strongly recommended for file translation!
def file_translate(file: str, tokenised_input: bool = True) -> str:
    """Accepts file to trainslate and returns translation result."""
    os.environ["config"] = "config.yaml" if use_cuda else "config_non_cuda.yaml"
    ! sed -i 's/"//g' "$file"
    if not tokenised_input:
        ! sacremoses tokenize < "$file" | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "$source_vocab_file" > "joeynmt/in.txt"
    else:
        ! cat "$file" | sacremoses truecase -m "$source_tc_model" | subword-nmt apply-bpe -c "$bpe_codes_file" --vocabulary "$source_vocab_file" > "joeynmt/in.txt"
    ! cd joeynmt; python3 -m joeynmt translate "$runtime_path/models/$config" < in.txt 2> /dev/null | sacremoses detruecase 2> /dev/null | sacremoses detokenize 2> /dev/null | sed "s/ '/'/" | sed "s/' /'/" > "joey_pred.txt"
    outfile = f"{file}_pred"
    os.rename(os.path.join('joeynmt', 'joey_pred.txt'), outfile)
    return outfile


def get_bleu(reference_file: str, prediction_file: str) -> float:
    """Return the corpus BLEU score of a prediction file against a reference file.

    Both files are read as UTF-8 text. The lines of the prediction file are
    passed to `corpus_bleu` as hypotheses and the lines of the reference file
    as the single reference set.
    """
    with open(reference_file, 'r', encoding="utf-8") as ref_stream:
        reference_lines = list(ref_stream)
    with open(prediction_file, 'r', encoding="utf-8") as hyp_stream:
        hypothesis_lines = list(hyp_stream)

    bleu = corpus_bleu(hypothesis_lines, [reference_lines])
    return bleu.score

In [None]:
# plain sentence: pre-tokenized input first, then raw input that still gets tokenized
interactive_translate('Where are you going ?', tokenised_input=True)
interactive_translate('Where are you going?', tokenised_input=False)

# It is extremely important to skip the tokenization step for sentences with
# <T>, <V> labels, i.e. to pass them pre-tokenized with tokenised_input=True.
# The tokenised_input=False calls are kept for comparison.
interactive_translate('<T> Where are you going ?', tokenised_input=True)
interactive_translate('<T> Where are you going?', tokenised_input=False)

interactive_translate('<V> Where are you going ?', tokenised_input=True)
interactive_translate('<V> Where are you going?', tokenised_input=False)

Где идет?
Где идет?
Где ты идешь?
< T > Wздесь идет?
Где вы идете?
< V > Wздесь идет?


In [None]:
%%capture
# installing cuda (skipped entirely when use_cuda is False)
if use_cuda:
    # register NVIDIA's CUDA apt repository for Ubuntu 18.04:
    # pin file, repository signing key, then the repository entry itself
    !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
    !sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
    # NOTE(review): the signing key file name is hardcoded; confirm it is still
    # the key NVIDIA publishes for this repository.
    !sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
    !sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
    # refresh the package index and install the 'cuda' package
    !sudo apt-get update
    !sudo apt-get -y install cuda

In [None]:
# Load test data: the source/target test files are fetched from the project
# repository unless a local copy already exists.
git_path = r'https://raw.githubusercontent.com/tsimafeip/TV-distinction/main/data/'

source_test_filename = 'deixis_test_en.tv'
target_test_filename = 'deixis_test_ru'

test_source_git_path = git_path + source_test_filename
test_target_git_path = git_path + target_test_filename

# source file first, then target file
for remote_url, local_name in (
    (test_source_git_path, source_test_filename),
    (test_target_git_path, target_test_filename),
):
    if not os.path.isfile(local_name):
        wget.download(remote_url, local_name)

In [None]:
# Translate the test file, score it against the reference translation and
# report how long the whole run took.
start = timer()
print(f"Started translation of {source_test_filename} ...")
pred_file = file_translate(source_test_filename)
bleu_score = get_bleu(target_test_filename, pred_file)
print(f"Deixis Test BLEU: {bleu_score}.")
# the timer is read only after the BLEU line has been printed
elapsed_seconds = timer() - start
print("\nFinished translation in : %f seconds\n" % elapsed_seconds)

Started translation of deixis_test_en.tv ...
Deixis Test BLEU: 15.142489874395736.

Finished translation in : 370.469855 seconds

