## Setting up the notebook

In [None]:
# installing packages
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install sentencepiece

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [None]:
# Attach Google Drive under /content/drive; force_remount=True asks Colab to
# mount again even if the drive is already attached.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Language codes of the translation pair; `target_code` is interpolated into the
# project paths used by the cells below.
source_code, target_code = 'eng', 'nso'

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing optok
#!git clone https://github.com/tatHi/optok4at.git
%cd optok4at/machineTranslation/optok
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nso/optok4at/machineTranslation/optok
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/optok4at/machineTranslation/optok
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: optok_nmt
  Running setup.py develop for optok_nmt
Successfully installed optok_nmt-0.1.0


In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
%cd optok4at/machineTranslation/fairseq
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nso/optok4at/machineTranslation/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/optok4at/machineTranslation/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sacrebleu (from fairseq==0.9.0)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu->fairseq==0.9.0)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu->fairseq==0.9.0)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.wh

In [None]:
# installing youtokentome
!pip install youtokentome
import youtokentome as yttm

Collecting youtokentome
  Downloading youtokentome-1.0.6.tar.gz (86 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/86.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: youtokentome
  Building wheel for youtokentome (setup.py) ... [?25l[?25hdone
  Created wheel for youtokentome: filename=youtokentome-1.0.6-cp310-cp310-linux_x86_64.whl size=1951577 sha256=376863cc952b27dca1a85e56d7dbf21cf073262d9e2f83788f4e05d506a0d234
  Stored in directory: /root/.cache/pip/wheels/df/85/f8/301d2ba45f43f30bed2fe413efa760bc726b8b660ed9c2900c
Successfully built youtokentome
Installing collected packages: youtokentome
Successfully instal

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# install multigram
#!git clone https://github.com/tatHi/multigram
%cd multigram
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nso/multigram
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/multigram
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: multigram
  Running setup.py develop for multigram
Successfully installed multigram-0.1.0



## Preparing the parallel corpus

In [None]:
# Switch into the target-tok directory of the current language pair, so the
# relative paths used by the following cells resolve there.
target_tok_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok'
os.chdir(target_tok_dir)

In [None]:
# prepare dictionary to be used by our nmt model
!fairseq-preprocess -s eng -t nso \
--trainpref /content/drive/MyDrive/Research/eng-to-nso/ulm/data/train.ULM \
--validpref /content/drive/MyDrive/Research/eng-to-nso/ulm/data/val.ULM \
--testpref /content/drive/MyDrive/Research/eng-to-nso/ulm/data/test.ULM \
--joined-dictionary \
--srcdict /content/drive/MyDrive/Research/eng-to-nso/ulm/data-bin/dict.eng.txt \
--destdir data-bin \
--bpe 'sentencepiece' \
--workers 2

2024-11-01 09:14:21 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, tensorboard_logdir='', seed=None, cpu=False, tpu=False, bf16=False, fp16=False, memory_efficient_bf16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, min_loss_scale=0.0001, threshold_loss_scale=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, checkpoint_suffix='', quantization_config_path=None, profile=False, criterion='cross_entropy', tokenizer=None, bpe='sentencepiece', optimizer=None, lr_scheduler='fixed', scoring='bleu', task='translation', source_lang='eng', target_lang='nso', trainpref='/content/drive/MyDrive/Research/eng-to-nso/ulm/data/train.ULM', validpref='/content/drive/MyDrive/Research/eng-to-nso/ulm/data/val.ULM', testpref='/content/drive/MyDrive/Research/eng-to-nso/ulm/data/test.ULM', align_suffix=None, destdir='data-bin', 

## Training the tokenizer

In [None]:
# Make sure we are in the target-tok directory of the current language pair
# before training, so relative paths such as the checkpoint directory resolve there.
target_tok_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok'
os.chdir(target_tok_dir)

In [None]:
!fairseq-train \
--arch transformer_optok \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion optok_pass_through \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--update-freq 1 \
--max-epoch 30 \
--validate-interval 10 \
--save-interval 10 \
--best-checkpoint-metric 'loss' \
--log-interval 100 \
--curriculum 0 \
--save-dir checkpoints-target-tok \
--ddp-backend=no_c10d \
--optok-sp-src-path /content/drive/MyDrive/Research/eng-to-nso/ulm/data/joint.model \
--optok-sp-tgt-path /content/drive/MyDrive/Research/eng-to-nso/ulm/data/joint.model \
--optok-m 3 \
--sp-alpha 0.2 \
--optok-normal-enc \
--optok-lmemb-dec /content/drive/MyDrive/Research/eng-to-nso/target-tok/checkpoints-target-tok/cache/joint.model.lmemb.pt \
--optok-nlm-dec /content/drive/MyDrive/Research/eng-to-nso/target-tok/checkpoints-target-tok/cache/joint.model.nlm.pt \
/content/drive/MyDrive/Research/eng-to-nso/target-tok/data-bin

2024-11-06 04:13:41 | INFO | fairseq_cli.train | Namespace(no_progress_bar=False, log_interval=100, log_format=None, tensorboard_logdir='', seed=2024, cpu=False, tpu=False, bf16=False, fp16=False, memory_efficient_bf16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, min_loss_scale=0.0001, threshold_loss_scale=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, checkpoint_suffix='', quantization_config_path=None, profile=False, criterion='optok_pass_through', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=1, skip_invalid_size_inputs_valid_test=False, max_tokens=4096, max_sentences=None, required_batch_size_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', validate_interval=10, validate_interval_updates=0, validate_after_updates=0, fixed_valida