## Setting up the notebook

In [None]:
# installing packages
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece
!pip install wandb

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm
import wandb

In [None]:
# setting up wandb for logging model performance during training
# Never store the API key in the notebook: wandb.login() prompts for it, or it
# can be supplied through the WANDB_API_KEY environment variable.
# NOTE(review): a key was previously written in this cell; it should be revoked
# and regenerated at https://wandb.ai/authorize.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# mounting google drive
# Drive is attached at /content/drive; force_remount=True asks Colab to mount
# again even if a mount already exists.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Language codes of the translation pair; target_code is interpolated into the
# Drive paths used by the cells below.
source_code, target_code = 'eng', 'nde'

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
#!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
!pip install --editable ./

/content/drive/MyDrive/Research/eng-to-nde/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nde/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq==0.12.2)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq==0.12.2)
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Colle

## Training NMT model with Byte-Pair Encoding

In [None]:
# change working directory
# Move into the BPE experiment folder; the training cell that follows uses
# paths relative to it (data-bin, checkpoints-bpe).
# Requires `os` and `target_code` from the earlier setup cells.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe')

In [None]:
!fairseq-train data-bin \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 7000 \
--save-interval-updates 7000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe \
--save-dir checkpoints-bpe \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nde'

2024-11-06 15:24:41.542920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 15:24:41.563214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 15:24:41.569578: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 15:24:41.585904: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 15:24:44 | INFO | numexpr.utils | 

## Training NMT with Unigram Language Model

In [None]:
# change working directory
# Move into the unigram-LM experiment folder; the training cell that follows
# uses paths relative to it (data-bin, checkpoints-ulm).
# Requires `os` and `target_code` from the earlier setup cells.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm')

In [None]:
!fairseq-train data-bin \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 7000 \
--save-interval-updates 7000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-ulm \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nde'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

epoch 037 | valid on 'valid' subset:  22% 57/262 [00:31<02:20,  1.46it/s][A




epoch 037 | valid on 'valid' subset:  24% 62/262 [00:35<02:36,  1.28it/s][A
epoch 037 | valid on 'valid' subset:  24% 63/262 [00:36<02:27,  1.35it/s][A



epoch 037 | valid on 'valid' subset:  26% 67/262 [00:38<01:51,  1.75it/s][A
epoch 037 | valid on 'valid' subset:  26% 68/262 [00:38<01:48,  1.79it/s][A




epoch 037 | valid on 'valid' subset:  28% 73/262 [00:41<01:43,  1.83it/s][A
epoch 037 | valid on 'valid' subset:  28% 74/262 [00:42<01:40,  1.88it/s][A


epoch 037 | valid on 'valid' subset:  29% 77/262 [00:43<01:39,  1.87it/s][A
epoch 037 | valid on 'valid' subset:  30% 78/262 [00:44<01:44,  1.76it/s][A
epoch 037 | valid on 'valid' subset:  30% 79/262 [00:44<01:40,  1.83it/s][A



epoch 037 | valid on 'valid' subset:  32% 83/262 [00:47<02:11,  1.36it/s][A
epoch 037 | valid on 'valid' subset:  32% 84/262 [00:48<02:10,  1.36it/

## Training NMT with BPE-Dropout

In [None]:
# change working directory
# Move into the BPE-dropout experiment folder; the training cell that follows
# uses paths relative to it (data-bin-25, checkpoints-bpeDROP).
# Requires `os` and `target_code` from the earlier setup cells.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP')

In [None]:
!fairseq-train data-bin-25 \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 4 \
--validate-interval 1 \
--save-interval 1 \
--validate-interval-updates 7000 \
--save-interval-updates 7000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe \
--save-dir checkpoints-bpeDROP \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nde'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m



epoch 001 | valid on 'valid' subset:  18% 47/257 [00:28<02:53,  1.21it/s][A
epoch 001 | valid on 'valid' subset:  19% 48/257 [00:28<02:35,  1.35it/s][A




epoch 001 | valid on 'valid' subset:  21% 53/257 [00:31<01:57,  1.74it/s][A
epoch 001 | valid on 'valid' subset:  21% 54/257 [00:32<02:11,  1.54it/s][A




epoch 001 | valid on 'valid' subset:  23% 59/257 [00:35<01:50,  1.79it/s][A




epoch 001 | valid on 'valid' subset:  25% 64/257 [00:38<02:01,  1.59it/s][A
epoch 001 | valid on 'valid' subset:  25% 65/257 [00:38<02:09,  1.48it/s][A




epoch 001 | valid on 'valid' subset:  27% 70/257 [00:43<02:57,  1.05it/s][A
epoch 001 | valid on 'valid' subset:  28% 71/257 [00:44<02:37,  1.18it/s][A



epoch 001 | valid on 'valid' subset:  29% 75/257 [00:46<02:03,  1.47it/s][A
epoch 001 | valid on 'valid' subset:  30% 76/257 [00:47<01:55,  1.56it/s][A
epoch 001 | valid on 'valid' subset:  30% 77/257 [00:48<02:04,  1.

## Training NMT with ULM with Subword Regularization

In [None]:
# change working directory
# Move into the subword-regularization experiment folder; the training cell
# that follows uses paths relative to it (data-bin-25, checkpoints-ulmSR).
# Requires `os` and `target_code` from the earlier setup cells.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR')

In [None]:
!fairseq-train data-bin-25 \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 4 \
--validate-interval 1 \
--save-interval 1 \
--validate-interval-updates 7000 \
--save-interval-updates 7000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-ulmSR \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nde'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m



epoch 002 | valid on 'valid' subset:  32% 83/262 [01:04<02:20,  1.27it/s][A
epoch 002 | valid on 'valid' subset:  32% 84/262 [01:05<02:29,  1.19it/s][A



epoch 002 | valid on 'valid' subset:  34% 88/262 [01:09<03:03,  1.06s/it][A
epoch 002 | valid on 'valid' subset:  34% 89/262 [01:10<03:11,  1.11s/it][A
epoch 002 | valid on 'valid' subset:  34% 90/262 [01:11<03:22,  1.18s/it][A



epoch 002 | valid on 'valid' subset:  36% 94/262 [01:14<02:29,  1.13it/s][A
epoch 002 | valid on 'valid' subset:  36% 95/262 [01:15<02:24,  1.15it/s][A
epoch 002 | valid on 'valid' subset:  37% 96/262 [01:16<02:27,  1.13it/s][A


epoch 002 | valid on 'valid' subset:  38% 99/262 [01:19<02:21,  1.15it/s][A
epoch 002 | valid on 'valid' subset:  38% 100/262 [01:20<02:22,  1.13it/s][A
epoch 002 | valid on 'valid' subset:  39% 101/262 [01:20<02:19,  1.15it/s][A
epoch 002 | valid on 'valid' subset:  39% 102/262 [01:21<02:20,  1.14it/s]

## Training NMT with Task-Specific Tokenizer

In [None]:
# change working directory
# Move into the task-specific tokenizer experiment folder; the training cell
# that follows uses paths relative to it (data-bin-nmt, checkpoints-task-nmt).
# Requires `os` and `target_code` from the earlier setup cells.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok')

In [None]:
!fairseq-train data-bin-nmt \
--arch transformer \
--activation-fn relu \
--share-decoder-input-output-embed \
--share-all-embeddings \
--encoder-layers 3 \
--encoder-attention-heads 4 \
--encoder-embed-dim 256 \
--encoder-ffn-embed-dim 1024 \
--decoder-layers 3 \
--decoder-attention-heads 4 \
--decoder-embed-dim 256 \
--decoder-ffn-embed-dim 1024 \
--dropout 0.25 \
--seed 2024 \
--optimizer 'adam' \
--adam-betas '(0.9, 0.999)' \
--lr-scheduler 'inverse_sqrt' \
--patience 5 \
--warmup-updates 1000 \
--criterion 'label_smoothed_cross_entropy' \
--label-smoothing 0.1 \
--lr 0.0003 \
--weight-decay 0.0 \
--max-tokens 4096 \
--max-tokens-valid 3600 \
--required-batch-size-multiple 1 \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--max-epoch 100 \
--validate-interval 25 \
--save-interval 25 \
--validate-interval-updates 7000 \
--save-interval-updates 7000 \
--log-interval 100 \
--curriculum 0 \
--no-epoch-checkpoints \
--eval-bleu \
--eval-bleu-args '{"beam": 5, "max_len_a": 0, "max_len_b": 100, "len_pen": 1}' \
--eval-bleu-detok space \
--eval-bleu-remove-bpe sentencepiece \
--save-dir checkpoints-task-nmt \
--ddp-backend=no_c10d \
--wandb-project 'fairseq-standard-subword-tok-eng-to-nde'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m



epoch 037 | valid on 'valid' subset:  22% 57/262 [00:31<02:37,  1.30it/s][A




epoch 037 | valid on 'valid' subset:  24% 62/262 [00:34<02:06,  1.58it/s][A
epoch 037 | valid on 'valid' subset:  24% 63/262 [00:35<02:00,  1.65it/s][A



epoch 037 | valid on 'valid' subset:  26% 67/262 [00:37<01:52,  1.73it/s][A
epoch 037 | valid on 'valid' subset:  26% 68/262 [00:37<01:51,  1.74it/s][A




epoch 037 | valid on 'valid' subset:  28% 73/262 [00:40<01:45,  1.80it/s][A
epoch 037 | valid on 'valid' subset:  28% 74/262 [00:41<01:44,  1.80it/s][A


epoch 037 | valid on 'valid' subset:  29% 77/262 [00:42<01:51,  1.66it/s][A
epoch 037 | valid on 'valid' subset:  30% 78/262 [00:43<02:00,  1.52it/s][A
epoch 037 | valid on 'valid' subset:  30% 79/262 [00:44<02:28,  1.23it/s][A



epoch 037 | valid on 'valid' subset:  32% 83/262 [00:48<02:32,  1.17it/s][A
epoch 037 | valid on 'valid' subset:  32% 84/262 [00:48<02:23,  1.24i