## Setting up notebook

In [None]:
# installing packages
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [None]:
# Make Google Drive available under /content/drive; every project path used
# later in the notebook lives below this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Language codes of the translation pair. They are interpolated into the
# directory and file names used by the cells below.
source_code, target_code = 'eng', 'nso'

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
!pip install --editable ./

Cloning into 'fairseq'...
remote: Enumerating objects: 35379, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 35379 (delta 9), reused 36 (delta 9), pack-reused 35337 (from 1)[K
Receiving objects: 100% (35379/35379), 25.47 MiB | 13.30 MiB/s, done.
Resolving deltas: 100% (25537/25537), done.
Updating files: 100% (1637/1637), done.
/content/drive/MyDrive/Research/eng-to-nso/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl

## Tokenizing the data

In [None]:
# Paths to the train / val / test files of the source and target language.
# All six files sit in the project's cleaned-data directory and are named
# <split>.<language code>.
cleaned_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data'

train_source_path = f'{cleaned_dir}/train.{source_code}'
train_target_path = f'{cleaned_dir}/train.{target_code}'

val_source_path = f'{cleaned_dir}/val.{source_code}'
val_target_path = f'{cleaned_dir}/val.{target_code}'

test_source_path = f'{cleaned_dir}/test.{source_code}'
test_target_path = f'{cleaned_dir}/test.{target_code}'

### BPE (byte-pair encoding with subword-nmt)

In [None]:
# Work inside the BPE data directory: the BPE codes, the vocabularies and the
# encoded splits are written relative to the current directory.
# Create the directory first so the cell also runs on a fresh Drive; os.chdir
# raises FileNotFoundError when the target does not exist.
bpe_data_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/data'
os.makedirs(bpe_data_dir, exist_ok=True)
os.chdir(bpe_data_dir)

In [None]:
# learn joint vocabulary
!subword-nmt learn-joint-bpe-and-vocab --input $train_source_path $train_target_path -s 4000 -o bpe.codes.4000 --write-vocabulary vocab.$source_code vocab.$target_code

100% 4000/4000 [00:11<00:00, 356.13it/s] 


In [None]:
# applying tokenization to training, validation and test sets
!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$source_code < $train_source_path > train.BPE.$source_code
!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$target_code < $train_target_path > train.BPE.$target_code

!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$source_code < $val_source_path > val.BPE.$source_code
!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$target_code < $val_target_path > val.BPE.$target_code

!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$source_code < $test_source_path > test.BPE.$source_code
!subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$target_code < $test_target_path > test.BPE.$target_code

In [None]:
# Step up from bpe/data to the BPE experiment root so that the relative data/
# and data-bin paths used by the next cell resolve there.
bpe_root = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe'
os.chdir(bpe_root)

In [None]:
# prepare dictionary to be used by our nmt model
!fairseq-preprocess -s eng -t nso \
--trainpref data/train.BPE \
--validpref data/val.BPE \
--testpref data/test.BPE \
--joined-dictionary \
--destdir data-bin \
--bpe 'subword_nmt' \
--thresholdsrc 1 \
--thresholdtgt 1 \
--workers 2

2024-10-15 10:12:25.023828: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-15 10:12:25.376763: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-15 10:12:25.473537: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-15 10:12:26.009735: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dictionaries = [ (Dictionary.load(f"{label_dir}/dic

### ULM (unigram language model with SentencePiece)

In [None]:
# Work inside the ULM data directory: the SentencePiece model files and the
# encoded splits are written relative to the current directory.
# Create the directory first so the cell also runs on a fresh Drive; os.chdir
# raises FileNotFoundError when the target does not exist.
ulm_data_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/data'
os.makedirs(ulm_data_dir, exist_ok=True)
os.chdir(ulm_data_dir)

In [None]:
# Train one unigram SentencePiece model on the source and target training files
# together; the output files are prefixed "joint". The ids of the special
# tokens are set explicitly (bos=0, pad=1, eos=2, unk=3).
spm_flags = [
    f'--input={train_source_path},{train_target_path}',
    '--model_prefix=joint',
    '--model_type=unigram',
    '--vocab_size=4000',
    '--character_coverage=1.0',
    '--bos_id=0',
    '--pad_id=1',
    '--eos_id=2',
    '--unk_id=3',
]
spm.SentencePieceTrainer.train(' '.join(spm_flags))

In [None]:
# Apply the trained SentencePiece model to the training, validation and test
# sets of both languages.
sp = spm.SentencePieceProcessor(model_file='joint.model')


def encode_file(processor, in_path, out_path):
    """Write a SentencePiece-segmented copy of ``in_path`` to ``out_path``.

    Every input line is split into pieces with ``processor.encode`` and written
    as one output line of space-separated pieces.

    Args:
        processor: object exposing ``encode(text, out_type=str)``; here the
            loaded ``spm.SentencePieceProcessor``.
        in_path: path of the text file to read, processed line by line.
        out_path: path of the file to write; opened in 'w' mode, so existing
            content is replaced.
    """
    # Read and write explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(in_path, 'r', encoding='utf-8') as rf, \
            open(out_path, 'w', encoding='utf-8') as wf:
        for line in rf:
            wf.write(' '.join(processor.encode(line, out_type=str)) + '\n')


ulm_jobs = [
    ('train', train_source_path, train_target_path),
    ('val', val_source_path, val_target_path),
    ('test', test_source_path, test_target_path),
]

# Outputs are written to <split>.ULM.<language code> in the current directory.
for split, src_path, tgt_path in ulm_jobs:
    encode_file(sp, src_path, f'{split}.ULM.{source_code}')
    encode_file(sp, tgt_path, f'{split}.ULM.{target_code}')

In [None]:
# Step up from ulm/data to the ULM experiment root so that the relative data/
# and data-bin paths used by the next cell resolve there.
ulm_root = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm'
os.chdir(ulm_root)

In [None]:
# prepare dictionary to be used by our nmt model
!fairseq-preprocess -s eng -t nso \
--trainpref data/train.ULM \
--validpref data/val.ULM \
--testpref data/test.ULM \
--joined-dictionary \
--destdir data-bin \
--bpe 'sentencepiece' \
--thresholdsrc 1 \
--thresholdtgt 1 \
--workers 2

2024-10-15 10:13:35.141225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-15 10:13:35.165192: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-15 10:13:35.172200: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-15 10:13:35.189411: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-15 10:13:43 | INFO | fairseq_cli.preprocess |