## Setting up the notebook

In [1]:
# installing packages
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install sentencepiece

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [2]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [3]:
# Make Google Drive available under /content/drive; every project path used
# in this notebook lives below that mount point.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [4]:
# Language codes of the translation pair; they are interpolated into the
# project directory name and used as file extensions of the corpus files.
source_code, target_code = 'eng', 'nde'

In [5]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing optok
#!git clone https://github.com/tatHi/optok4at.git
%cd optok4at/machineTranslation/optok
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nde/optok4at/machineTranslation/optok
Obtaining file:///content/drive/MyDrive/Research/eng-to-nde/optok4at/machineTranslation/optok
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: optok_nmt
  Running setup.py develop for optok_nmt
Successfully installed optok_nmt-0.1.0


In [6]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
%cd optok4at/machineTranslation/fairseq
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nde/optok4at/machineTranslation/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nde/optok4at/machineTranslation/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sacrebleu (from fairseq==0.9.0)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu->fairseq==0.9.0)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu->fairseq==0.9.0)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.wh

In [7]:
# installing youtokentome
!pip install youtokentome
import youtokentome as yttm

Collecting youtokentome
  Downloading youtokentome-1.0.6.tar.gz (86 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/86.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: youtokentome
  Building wheel for youtokentome (setup.py) ... [?25l[?25hdone
  Created wheel for youtokentome: filename=youtokentome-1.0.6-cp310-cp310-linux_x86_64.whl size=1951571 sha256=68b177edd4090a295ed94448c39c81dc64cbfb47b2074dc6dd05bbc3c0b22af1
  Stored in directory: /root/.cache/pip/wheels/df/85/f8/301d2ba45f43f30bed2fe413efa760bc726b8b660ed9c2900c
Successfully built youtokentome
Installing collected packages: youtokentome
Successfully instal

In [8]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# install multigram
#!git clone https://github.com/tatHi/multigram
%cd multigram
!pip install --editable .

/content/drive/MyDrive/Research/eng-to-nde/multigram
Obtaining file:///content/drive/MyDrive/Research/eng-to-nde/multigram
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: multigram
  Running setup.py develop for multigram
Successfully installed multigram-0.1.0


## Tokenizing the data

In [9]:
# Locations of the cleaned parallel corpus: one file per split and language,
# named <split>.<language code> inside the cleaned-data folder.
cleaned_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data'

train_source_path = f'{cleaned_dir}/train.{source_code}'
train_target_path = f'{cleaned_dir}/train.{target_code}'

val_source_path = f'{cleaned_dir}/val.{source_code}'
val_target_path = f'{cleaned_dir}/val.{target_code}'

test_source_path = f'{cleaned_dir}/test.{source_code}'
test_target_path = f'{cleaned_dir}/test.{target_code}'

In [10]:
from multigram import lm
from multigram import tokenizer

In [11]:
# Load the multigram language model stored in checkpoint_best.optok.dec.mlm
# and wrap it in a tokenizer; `tknzr` is used below for the target side.
# The directory name is derived from target_code, like every other project
# path in this notebook, instead of hard-coding the language pair.
mlm = lm.MultigramLM()
mlm.load(f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/checkpoints-target-tok/checkpoint_best.optok.dec.mlm')
tknzr = tokenizer.Tokenizer(mlm)

In [12]:
# Load the SentencePiece model (joint.model); `sp` is used below to segment
# the source side of the training, validation and test sets.
# The directory name is derived from target_code, like every other project
# path in this notebook, instead of hard-coding the language pair.
sp = spm.SentencePieceProcessor(model_file=f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/data/joint.model')

In [13]:
# Switch into the data folder of the target-tokenizer experiment; the
# tokenized files below are written with relative names and so end up here.
output_dir = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/data'
os.chdir(output_dir)

### Training set

In [14]:
# Read the training targets and rewrite every sentence as one string in which
# each whitespace-separated word is prefixed with '▁' and the words are
# concatenated without spaces (e.g. "a b" -> "▁a▁b").
# The file is iterated line by line instead of read().splitlines():
# splitlines() also breaks on form feeds, U+2028 and similar characters,
# which would give the target side more sentences than the source side
# (the source file is read with a plain `for line in rf` loop).
# The encoding is explicit because '▁' is not an ASCII character.
with open(train_target_path, 'r', encoding='utf-8') as tf:
  target = [''.join('▁' + word for word in line.split()) for line in tf]

In [15]:
# Segment every prepared training sentence with the multigram tokenizer.
# `target` changes from a list of strings to a list of piece sequences, so
# this cell must run exactly once after the cell that builds `target`.
target = [tknzr.encode_as_pieces(sentence) for sentence in target]

  logProbTable = np.log(probTable)


In [16]:
# Write the tokenized training targets: one sentence per line, pieces
# separated by single spaces.
# The encoding is explicit because the text fed to the tokenizer carries the
# non-ASCII '▁' marker, which a non-UTF-8 default encoding could not write.
with open(f'train.{target_code}', 'w', encoding='utf-8') as wf:
  for pieces in target:
    wf.write(' '.join(pieces) + '\n')

In [18]:
# Segment the training sources with SentencePiece and write them one sentence
# per line, pieces separated by single spaces.
# Both files are opened as UTF-8 explicitly so reading the corpus and writing
# the pieces does not depend on the locale's default encoding.
with open(train_source_path, 'r', encoding='utf-8') as rf, \
     open(f'train.{source_code}', 'w', encoding='utf-8') as wf:
  for line in rf:
    wf.write(' '.join(sp.encode(line, out_type=str)) + '\n')

### Validation set

In [20]:
# Read the validation targets and rewrite every sentence as one string in
# which each whitespace-separated word is prefixed with '▁' and the words are
# concatenated without spaces (e.g. "a b" -> "▁a▁b").
# The file is iterated line by line instead of read().splitlines():
# splitlines() also breaks on form feeds, U+2028 and similar characters,
# which would give the target side more sentences than the source side
# (the source file is read with a plain `for line in rf` loop).
# The encoding is explicit because '▁' is not an ASCII character.
with open(val_target_path, 'r', encoding='utf-8') as tf:
  target = [''.join('▁' + word for word in line.split()) for line in tf]

In [21]:
# Segment every prepared validation sentence with the multigram tokenizer.
# `target` changes from a list of strings to a list of piece sequences, so
# this cell must run exactly once after the cell that builds `target`.
target = [tknzr.encode_as_pieces(sentence) for sentence in target]

  logProbTable = np.log(probTable)


In [22]:
# Write the tokenized validation targets: one sentence per line, pieces
# separated by single spaces.
# The encoding is explicit because the text fed to the tokenizer carries the
# non-ASCII '▁' marker, which a non-UTF-8 default encoding could not write.
with open(f'val.{target_code}', 'w', encoding='utf-8') as wf:
  for pieces in target:
    wf.write(' '.join(pieces) + '\n')

In [23]:
# Segment the validation sources with SentencePiece and write them one
# sentence per line, pieces separated by single spaces.
# Both files are opened as UTF-8 explicitly so reading the corpus and writing
# the pieces does not depend on the locale's default encoding.
with open(val_source_path, 'r', encoding='utf-8') as rf, \
     open(f'val.{source_code}', 'w', encoding='utf-8') as wf:
  for line in rf:
    wf.write(' '.join(sp.encode(line, out_type=str)) + '\n')

### Test set

In [24]:
# Read the test targets and rewrite every sentence as one string in which
# each whitespace-separated word is prefixed with '▁' and the words are
# concatenated without spaces (e.g. "a b" -> "▁a▁b").
# The file is iterated line by line instead of read().splitlines():
# splitlines() also breaks on form feeds, U+2028 and similar characters,
# which would give the target side more sentences than the source side
# (the source file is read with a plain `for line in rf` loop).
# The encoding is explicit because '▁' is not an ASCII character.
with open(test_target_path, 'r', encoding='utf-8') as tf:
  target = [''.join('▁' + word for word in line.split()) for line in tf]

In [25]:
# Segment every prepared test sentence with the multigram tokenizer.
# `target` changes from a list of strings to a list of piece sequences, so
# this cell must run exactly once after the cell that builds `target`.
target = [tknzr.encode_as_pieces(sentence) for sentence in target]

In [26]:
# Write the tokenized test targets: one sentence per line, pieces separated
# by single spaces.
# The encoding is explicit because the text fed to the tokenizer carries the
# non-ASCII '▁' marker, which a non-UTF-8 default encoding could not write.
with open(f'test.{target_code}', 'w', encoding='utf-8') as wf:
  for pieces in target:
    wf.write(' '.join(pieces) + '\n')

In [27]:
# Segment the test sources with SentencePiece and write them one sentence per
# line, pieces separated by single spaces.
# Both files are opened as UTF-8 explicitly so reading the corpus and writing
# the pieces does not depend on the locale's default encoding.
with open(test_source_path, 'r', encoding='utf-8') as rf, \
     open(f'test.{source_code}', 'w', encoding='utf-8') as wf:
  for line in rf:
    wf.write(' '.join(sp.encode(line, out_type=str)) + '\n')