## Setting up notebook

In [1]:
# installing packages
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [6]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [7]:
# mounting google drive
from google.colab import drive

# every path used by this notebook lives below this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive
Mounted at /content/drive


In [8]:
# language codes of the translation pair; they are used as file extensions
# (train.<code>, val.<code>, ...) and inside the project directory name
source_code, target_code = 'eng', 'nde'

In [9]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
#!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
!pip install --editable ./

/content/drive/MyDrive/Research/eng-to-nde/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nde/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq==0.12.2)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq==0.12.2)
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Colle

## Tokenizing the data

In [10]:
# directory holding the cleaned parallel corpus, one file per split and language
cleaned_data_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data'

# training split
train_source_path = f'{cleaned_data_dir}/train.{source_code}'
train_target_path = f'{cleaned_data_dir}/train.{target_code}'

# validation split
val_source_path = f'{cleaned_data_dir}/val.{source_code}'
val_target_path = f'{cleaned_data_dir}/val.{target_code}'

# test split
test_source_path = f'{cleaned_data_dir}/test.{source_code}'
test_target_path = f'{cleaned_data_dir}/test.{target_code}'

### BPE Dropout

In [11]:
# change working directory
# the cells that follow read and write files relative to this directory
bpe_drop_data_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/data-25'
os.chdir(bpe_drop_data_dir)

In [12]:
# making copies of training set
N_COPIES = 25  # total repetitions of the training set (matches the '25train.*' file names)

with open(train_source_path, 'r', encoding='utf-8') as sf, open(train_target_path, 'r', encoding='utf-8') as tf:
  # Iterate over the files instead of read().splitlines(): splitlines() also
  # breaks on characters such as U+2028, \x0c or \x85, which may appear inside
  # a sentence and would shift source and target lines out of alignment.
  source = [line.rstrip('\n') for line in sf]
  target = [line.rstrip('\n') for line in tf]

# a parallel corpus needs exactly one target line per source line
assert len(source) == len(target), (
    f'parallel corpus mismatch: {len(source)} source lines vs {len(target)} target lines')

# list repetition builds the N_COPIES-fold corpus in one step
rep_source = source * N_COPIES
rep_target = target * N_COPIES

# write the repeated corpus; the file names are relative to the current working directory
with open('25train.source', 'w', encoding='utf-8') as sf, open('25train.target', 'w', encoding='utf-8') as tf:
  sf.writelines(line + '\n' for line in rep_source)
  tf.writelines(line + '\n' for line in rep_target)

In [13]:
# applying tokenization to training, validation and test sets
!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$source_code --dropout 0.1 --seed 2024 < 25train.source > train.BPE.$source_code
!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$target_code --dropout 0.1 --seed 2024 < 25train.target > train.BPE.$target_code

!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$source_code < $val_source_path > val.BPE.$source_code
!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$target_code < $val_target_path > val.BPE.$target_code

!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$source_code < $test_source_path > test.BPE.$source_code
!subword-nmt apply-bpe -c /content/drive/MyDrive/Research/eng-to-nde/bpe/data/bpe.codes.4000 --vocabulary /content/drive/MyDrive/Research/eng-to-nde/bpe/data/vocab.$target_code < $test_target_path > test.BPE.$target_code

In [14]:
# change working directory
# step up to the experiment root so 'data-25/...' paths resolve from here
bpe_drop_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP'
os.chdir(bpe_drop_dir)

In [15]:
# prepare dictionary to be used by our nmt model
!fairseq-preprocess -s eng -t nde \
--trainpref data-25/train.BPE \
--validpref data-25/val.BPE \
--testpref data-25/test.BPE \
--joined-dictionary \
--srcdict /content/drive/MyDrive/Research/eng-to-nde/bpe/data-bin/dict.eng.txt \
--destdir data-bin-25 \
--bpe 'subword_nmt' \
--workers 2

2024-10-20 01:43:08.805909: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 01:43:09.134582: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 01:43:09.228406: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 01:43:09.799697: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-20 01:44:45 | INFO | fairseq_cli.preprocess |

### ULM Subword Regularization

In [16]:
# change working directory
# the cells that follow write the ULM-segmented files relative to this directory
ulm_sr_data_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/data-25'
os.chdir(ulm_sr_data_dir)

In [17]:
# making copies of training set
N_COPIES = 25  # total repetitions of the training set (matches the 'data-25' directory name)

with open(train_source_path, 'r', encoding='utf-8') as sf, open(train_target_path, 'r', encoding='utf-8') as tf:
  # Iterate over the files instead of read().splitlines(): splitlines() also
  # breaks on characters such as U+2028, \x0c or \x85, which may appear inside
  # a sentence and would shift source and target lines out of alignment.
  source = [line.rstrip('\n') for line in sf]
  target = [line.rstrip('\n') for line in tf]

# a parallel corpus needs exactly one target line per source line
assert len(source) == len(target), (
    f'parallel corpus mismatch: {len(source)} source lines vs {len(target)} target lines')

# list repetition builds the N_COPIES-fold corpus in one step (new list objects,
# so later in-place edits of rep_* never touch source / target)
rep_source = source * N_COPIES
rep_target = target * N_COPIES

In [18]:
# applying tokenization to training, validation and test sets
sp = spm.SentencePieceProcessor(model_file=f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/data/joint.model')


def _write_ulm(sentences, out_path, sampling=False):
  """Segment every sentence with `sp` and write one space-joined line each.

  Args:
    sentences: iterable of strings (a list or an open text file); a trailing
      newline, if present, is removed before encoding.
    out_path: path of the file to (over)write.
    sampling: when True, encode with enable_sampling=True, alpha=0.2,
      nbest_size=-1; when False, use the plain encode call.
  """
  with open(out_path, 'w', encoding='utf-8') as wf:
    for sentence in sentences:
      # Strip the newline so every split reaches the encoder in the same form;
      # how a raw '\n' would be treated depends on the model's normalizer.
      text = sentence.rstrip('\n')
      if sampling:
        pieces = sp.encode(text, out_type=str, enable_sampling=True, alpha=0.2, nbest_size=-1)
      else:
        pieces = sp.encode(text, out_type=str)
      wf.write(' '.join(pieces) + '\n')


# Training set: sampled segmentation of the repeated corpus. The encoded text is
# streamed straight to disk and rep_source / rep_target are left untouched, so
# re-running this cell never re-encodes already segmented text.
# NOTE(review): no random seed is set for the sampling here, whereas the
# BPE-dropout cell passes --seed 2024 - confirm whether that is intended.
_write_ulm(rep_source, f'train.ULM.{source_code}', sampling=True)
_write_ulm(rep_target, f'train.ULM.{target_code}', sampling=True)

# Validation and test sets: plain encode call, no sampling.
for split, split_source_path, split_target_path in (
    ('val', val_source_path, val_target_path),
    ('test', test_source_path, test_target_path),
):
  with open(split_source_path, 'r', encoding='utf-8') as rf:
    _write_ulm(rf, f'{split}.ULM.{source_code}')
  with open(split_target_path, 'r', encoding='utf-8') as rf:
    _write_ulm(rf, f'{split}.ULM.{target_code}')

In [19]:
# change working directory
# step up to the experiment root so 'data-25/...' paths resolve from here
ulm_sr_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR'
os.chdir(ulm_sr_dir)

In [20]:
# prepare dictionary to be used by our nmt model
!fairseq-preprocess -s eng -t nde \
--trainpref data-25/train.ULM \
--validpref data-25/val.ULM \
--testpref data-25/test.ULM \
--joined-dictionary \
--srcdict /content/drive/MyDrive/Research/eng-to-nde/ulm/data-bin/dict.eng.txt \
--destdir data-bin-25 \
--bpe 'sentencepiece' \
--workers 2

2024-10-20 02:19:47.313628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 02:19:47.366985: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 02:19:47.380071: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 02:19:47.424163: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-20 02:19:58 | INFO | fairseq_cli.preprocess |