## Setting up the notebook

In [None]:
# installing packages
# pip itself is pinned first.
# NOTE(review): presumably this keeps the older dependency pins pulled in by the
# editable fairseq install later in this notebook (hydra-core<1.1, omegaconf<2.1)
# resolvable - confirm before unpinning.
!pip install pip==24.0
# numpy is pinned to an exact version; TODO confirm which dependency requires it.
!pip install numpy==1.23.5
# The remaining packages are unpinned, so a rerun may pick up newer releases.
# TODO(review): pin them for reproducibility (the recorded output of this cell
# shows subword-nmt 0.3.8 being installed).
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece
!pip install wandb

Collecting subword-nmt
  Using cached subword_nmt-0.3.8-py3-none-any.whl.metadata (9.2 kB)
Collecting mock (from subword-nmt)
  Using cached mock-5.1.0-py3-none-any.whl.metadata (3.0 kB)
Using cached subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Using cached mock-5.1.0-py3-none-any.whl (30 kB)
Installing collected packages: mock, subword-nmt
Successfully installed mock-5.1.0 subword-nmt-0.3.8


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm
import wandb

In [None]:
# setting up wandb for logging model performance during training
# SECURITY: never store the API key in the notebook. wandb.login() prompts for
# the key interactively and stores it in the netrc file (see the cell output);
# wandb also documents a WANDB_API_KEY environment variable for non-interactive use.
# NOTE(review): a key was previously written in this cell in plain text - treat
# it as compromised and revoke/rotate it in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# attach Google Drive to the Colab filesystem, forcing a remount if needed
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
# language codes for this run; target_code is interpolated into the Drive
# working-directory paths used by the cells that follow
source_code, target_code = 'eng', 'nso'

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
#!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
!pip install --editable ./

/content/drive/MyDrive/Research/eng-to-nso/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq==0.12.2)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq==0.12.2)
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Colle

## Training NMT with Byte-Pair Encoding

In [None]:
# work from the BPE experiment folder of the chosen target language
bpe_workdir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe'
os.chdir(bpe_workdir)

In [None]:
# Train the Byte-Pair-Encoding translation model with fairseq-train on the
# data directory `data-bin` (relative to the current working directory).
# Model: transformer, 3 encoder + 3 decoder layers, 4 attention heads,
#   256-dim embeddings, 1024-dim feed-forward layers, ReLU, shared embeddings,
#   dropout 0.25.
# Optimisation: Adam (betas 0.9 / 0.999), lr 0.0003 with an inverse_sqrt
#   schedule and 1000 warm-up updates, label-smoothed cross entropy
#   (smoothing 0.1), no weight decay, seed 2024.
# Budget: up to 100 epochs, batches of at most 4096 tokens (3600 for
#   validation), patience 5.
# Validation / checkpoints: every 25 epochs and every 2000 updates; BLEU is
#   computed during validation (beam 5) and the best checkpoint is the one with
#   the highest BLEU; checkpoints go to `checkpoints-bpe`, without per-epoch
#   checkpoint files.
# --eval-bleu-remove-bpe is given without a value here, whereas the
#   SentencePiece-based runs in this notebook pass `sentencepiece`.
# NOTE(review): --wandb-project is a fixed string ending in eng-to-nso; it does
#   not follow target_code, so edit it by hand when the language pair changes.
!fairseq-train data-bin \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 2000 \
--save-interval-updates 2000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe \
--save-dir checkpoints-bpe \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nso'

2024-10-24 07:43:46.984834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 07:43:47.229471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 07:43:47.300481: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-24 07:43:47.691819: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-24 07:43:51 | INFO | numexpr.utils | NumExpr 

## Training NMT with Unigram Language Model

In [None]:
# work from the unigram-language-model experiment folder of the chosen target language
ulm_workdir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm'
os.chdir(ulm_workdir)

In [None]:
# Train the Unigram-Language-Model translation model with fairseq-train on the
# data directory `data-bin` (relative to the current working directory).
# Model: transformer, 3 encoder + 3 decoder layers, 4 attention heads,
#   256-dim embeddings, 1024-dim feed-forward layers, ReLU, shared embeddings,
#   dropout 0.25.
# Optimisation: Adam (betas 0.9 / 0.999), lr 0.0003 with an inverse_sqrt
#   schedule and 1000 warm-up updates, label-smoothed cross entropy
#   (smoothing 0.1), no weight decay, seed 2024.
# Budget: up to 100 epochs, batches of at most 4096 tokens (3600 for
#   validation), patience 5.
# Validation / checkpoints: every 25 epochs and every 2000 updates; BLEU is
#   computed during validation (beam 5) and the best checkpoint is the one with
#   the highest BLEU; checkpoints go to `checkpoints-ulm`, without per-epoch
#   checkpoint files.
# --eval-bleu-remove-bpe is set to `sentencepiece` for this run.
# NOTE(review): --wandb-project is a fixed string ending in eng-to-nso; it does
#   not follow target_code, so edit it by hand when the language pair changes.
!fairseq-train data-bin \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 2000 \
--save-interval-updates 2000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-ulm \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nso'

2024-10-24 08:38:41.096197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 08:38:41.129155: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 08:38:41.139838: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-24 08:38:41.162741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-24 08:38:44 | INFO | numexpr.utils | NumExpr 

## Training NMT with BPE-Dropout

In [None]:
# work from the BPE-dropout experiment folder of the chosen target language
bpedrop_workdir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP'
os.chdir(bpedrop_workdir)

In [None]:
# Train the BPE-dropout translation model with fairseq-train on the data
# directory `data-bin-25` (relative to the current working directory).
# Model: transformer, 3 encoder + 3 decoder layers, 4 attention heads,
#   256-dim embeddings, 1024-dim feed-forward layers, ReLU, shared embeddings,
#   dropout 0.25.
# Optimisation: Adam (betas 0.9 / 0.999), lr 0.0003 with an inverse_sqrt
#   schedule and 1000 warm-up updates, label-smoothed cross entropy
#   (smoothing 0.1), no weight decay, seed 2024.
# Budget: up to 4 epochs (the BPE and ULM runs in this notebook use 100),
#   batches of at most 4096 tokens (3600 for validation), patience 5.
# NOTE(review): presumably `data-bin-25` holds several resampled segmentations
#   of the training data, which would explain the much smaller epoch count -
#   confirm against the preprocessing notebook.
# Validation / checkpoints: every epoch and every 2000 updates; BLEU is
#   computed during validation (beam 5) and the best checkpoint is the one with
#   the highest BLEU; checkpoints go to `checkpoints-bpeDROP`, without
#   per-epoch checkpoint files.
# --eval-bleu-remove-bpe is given without a value here, whereas the
#   SentencePiece-based runs in this notebook pass `sentencepiece`.
# NOTE(review): --wandb-project is a fixed string ending in eng-to-nso; it does
#   not follow target_code, so edit it by hand when the language pair changes.
!fairseq-train data-bin-25 \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 4 \
--validate-interval 1 \
--save-interval 1 \
--validate-interval-updates 2000 \
--save-interval-updates 2000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe \
--save-dir checkpoints-bpeDROP \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nso'

2024-10-24 10:42:03.129814: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 10:42:03.150781: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 10:42:03.157446: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-24 10:42:03.173710: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-24 10:42:05 | INFO | numexpr.utils | NumExpr 

## Training NMT with Unigram Language Model (ULM) Subword Regularization

In [None]:
# work from the ULM subword-regularization experiment folder of the chosen target language
ulmsr_workdir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR'
os.chdir(ulmsr_workdir)

In [None]:
# Train the ULM subword-regularization translation model with fairseq-train on
# the data directory `data-bin-25` (relative to the current working directory).
# Model: transformer, 3 encoder + 3 decoder layers, 4 attention heads,
#   256-dim embeddings, 1024-dim feed-forward layers, ReLU, shared embeddings,
#   dropout 0.25.
# Optimisation: Adam (betas 0.9 / 0.999), lr 0.0003 with an inverse_sqrt
#   schedule and 1000 warm-up updates, label-smoothed cross entropy
#   (smoothing 0.1), no weight decay, seed 2024.
# Budget: up to 4 epochs (the BPE and ULM runs in this notebook use 100),
#   batches of at most 4096 tokens (3600 for validation), patience 5.
# NOTE(review): presumably `data-bin-25` holds several sampled segmentations
#   of the training data, which would explain the much smaller epoch count -
#   confirm against the preprocessing notebook.
# Validation / checkpoints: every epoch and every 2000 updates; BLEU is
#   computed during validation (beam 5) and the best checkpoint is the one with
#   the highest BLEU; checkpoints go to `checkpoints-ulmSR`, without per-epoch
#   checkpoint files.
# --eval-bleu-remove-bpe is set to `sentencepiece` for this run.
# NOTE(review): --wandb-project is a fixed string ending in eng-to-nso; it does
#   not follow target_code, so edit it by hand when the language pair changes.
!fairseq-train data-bin-25 \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 4 \
--validate-interval 1 \
--save-interval 1 \
--validate-interval-updates 2000 \
--save-interval-updates 2000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-ulmSR \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nso'

2024-10-24 09:33:28.418871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 09:33:28.444439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 09:33:28.454752: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-24 09:33:28.473018: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Exception ignored in: <function _xla_gc_callback at 0

## Training NMT with Task-Specific Tokenizer

In [None]:
# work from the task-specific tokenizer experiment folder of the chosen target language
target_tok_workdir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok'
os.chdir(target_tok_workdir)

In [None]:
# Train the task-specific-tokenizer translation model with fairseq-train on the
# data directory `data-bin-nmt` (relative to the current working directory).
# Model: transformer, 3 encoder + 3 decoder layers, 4 attention heads,
#   256-dim embeddings, 1024-dim feed-forward layers, ReLU, shared embeddings,
#   dropout 0.25.
# Optimisation: Adam (betas 0.9 / 0.999), lr 0.0003 with an inverse_sqrt
#   schedule and 1000 warm-up updates, label-smoothed cross entropy
#   (smoothing 0.1), no weight decay, seed 2024.
# Budget: up to 100 epochs, batches of at most 4096 tokens (3600 for
#   validation), patience 5.
# Validation / checkpoints: every 25 epochs and every 2000 updates; BLEU is
#   computed during validation (beam 5) and the best checkpoint is the one with
#   the highest BLEU; checkpoints go to `checkpoints-task-nmt`, without
#   per-epoch checkpoint files.
# --eval-bleu-remove-bpe is set to `sentencepiece` for this run.
# NOTE(review): --wandb-project is a fixed string ending in eng-to-nso; it does
#   not follow target_code, so edit it by hand when the language pair changes.
!fairseq-train data-bin-nmt \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 2000 \
--save-interval-updates 2000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-task-nmt \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nso'

2024-11-06 05:52:48.925798: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 05:52:48.947114: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 05:52:48.953235: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 05:52:48.968888: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 05:52:51 | INFO | numexpr.utils | NumExpr 