## Setting up the notebook

In [None]:
# installing packages
# NOTE(review): pip and numpy are pinned but the reason is not recorded here —
# presumably for compatibility with the fairseq checkout installed further down; confirm.
# tensorboardX, subword-nmt and sentencepiece are unpinned, so a fresh runtime may
# resolve different versions than the ones the saved results were produced with.
!pip install pip==24.0
!pip install numpy==1.23.5
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece

Collecting subword-nmt
  Downloading subword_nmt-0.3.8-py3-none-any.whl.metadata (9.2 kB)
Collecting mock (from subword-nmt)
  Downloading mock-5.1.0-py3-none-any.whl.metadata (3.0 kB)
Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Downloading mock-5.1.0-py3-none-any.whl (30 kB)
Installing collected packages: mock, subword-nmt
Successfully installed mock-5.1.0 subword-nmt-0.3.8


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [None]:
# mounting google drive
# Every data, model and output path used below lives under /content/drive/MyDrive,
# so this cell has to run before any cell that reads or writes those paths.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Language codes for the translation direction.
# target_code is interpolated into the Drive paths built below (eng-to-{target_code});
# source_code is not referenced by any cell visible in this part of the notebook.
# NOTE(review): the shell cells hard-code "eng"/"nso" instead of using these
# variables, so changing the values here does not retarget those commands.
source_code = 'eng'
target_code = 'nso'

In [None]:
# change working directory to the project root on Drive
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq from the checkout stored under the project root
# The clone below is commented out — presumably because the checkout already
# exists on Drive from an earlier session; uncomment it on a first run.
#!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
# Editable install, so the fairseq-* commands used later run from this checkout.
# The working directory stays at .../fairseq afterwards; the later cells that
# change directory use absolute paths and are unaffected by that.
!pip install --editable ./

/content/drive/MyDrive/Research/eng-to-nso/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-nso/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq==0.12.2)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq==0.12.2)
  Downloading bitarray-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Colle

## Evaluate Model on Raw Test and Global Test Sets

In [None]:
import sacrebleu

In [None]:
# Punctuation marks that should attach to the preceding token; passed as `punc`
# to remove_whitespace_before by the post-processing cells.
before_punc = '!,.:;?)'
def remove_whitespace_before(segment, punc):
  """
  Remove the single space that directly precedes each punctuation mark.

  Only one space character (" ") is removed per punctuation mark; tabs and
  any further spaces are left in place. A punctuation mark at the very start
  of the segment has nothing before it and is left untouched.

  Args:
    segment (str): segment to be processed.
    punc (str): string of punctuation marks.

  Returns:
    str: The segment with the space before each mark in punc removed.
  """
  punc_positions = [i for i, symbol in enumerate(segment) if symbol in punc]

  # Walk right-to-left so that deleting a character never shifts the
  # positions that are still waiting to be processed.
  for pos in reversed(punc_positions):
    # pos > 0 guard: at pos == 0, segment[pos-1] would wrap around to the LAST
    # character; if that happened to be a space the slice arithmetic below
    # would duplicate the whole segment instead of removing anything.
    if pos > 0 and segment[pos-1] == " ":
      segment = segment[:pos-1] + segment[pos:]

  return segment

### BPE

In [None]:
# Work from the BPE experiment directory so the relative paths in the next cells
# (data-bin/, data/, checkpoints-bpe/, trained_model/) resolve inside it.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe')

In [None]:
# Stage the files the fairseq-interactive cells below read from trained_model/:
# both dictionaries, the BPE codes file and the best checkpoint (copied as model.pt).
# The language codes are hard-coded here rather than taken from source_code/target_code.
!mkdir -p trained_model
!cp data-bin/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin/dict.nso.txt trained_model/dict.nso.txt
!cp data/bpe.codes.4000 trained_model/bpe.codes.4000
!cp checkpoints-bpe/checkpoint_best.pt trained_model/model.pt

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "D-[0-9]+" | cut -f3 > trained_model/translations_gtest

Traceback (most recent call last):
  File "/usr/local/bin/fairseq-interactive", line 5, in <module>
    from fairseq_cli.interactive import cli_main
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq_cli/interactive.py", line 23, in <module>
    from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/__init__.py", line 20, in <module>
    from fairseq.distributed import utils as distributed_utils
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/distributed/__init__.py", line 7, in <module>
    from .fully_sharded_data_parallel import (
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/distributed/fully_sharded_data_parallel.py", line 10, in <module>
    from fairseq.dataclass.configs import DistributedTrainingConfig
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/dataclass/__init__.py", line 6, in <module>
    from .configs impo

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/test.eng \
trained_model/ | grep -P "D-[0-9]+" | cut -f3 > trained_model/translations_test

Traceback (most recent call last):
  File "/usr/local/bin/fairseq-interactive", line 5, in <module>
    from fairseq_cli.interactive import cli_main
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq_cli/interactive.py", line 23, in <module>
    from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/__init__.py", line 20, in <module>
    from fairseq.distributed import utils as distributed_utils
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/distributed/__init__.py", line 7, in <module>
    from .fully_sharded_data_parallel import (
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/distributed/fully_sharded_data_parallel.py", line 10, in <module>
    from fairseq.dataclass.configs import DistributedTrainingConfig
  File "/content/drive/MyDrive/Research/eng-to-nso/fairseq/fairseq/dataclass/__init__.py", line 6, in <module>
    from .configs impo

In [None]:
# Switch to the shared cleaned-data directory: the sacrebleu cells below name the
# reference files (ref1.nso ... ref4.nso, test.nso) relative to the working directory.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
# Raw BPE translations of the global test set (written by the fairseq-interactive
# cell above); read by the post-processing cell and by sacrebleu.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/translations_gtest'

In [None]:
# Post-process the raw BPE global-test translations: remove the space in front of
# each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Score the raw and the post-processed global-test output with sacreBLEU against
# the four reference files (bare file names, so the working directory must be
# cleaned-data). IPython replaces $translations_path / $post_translations_path
# with the values of the Python variables of the same name.
# NOTE(review): --force is passed only for the raw output — presumably to override
# sacreBLEU's "input looks tokenized" check; confirm.
# NOTE(review): the saved output below reports tok:none with add-k smoothing, while
# the test-set cell run with the same flags reports tok:13a/exp, and only one result
# is shown — the saved output looks like it came from a different invocation; re-run
# before quoting the score.
!sacrebleu ref1.nso ref2.nso ref3.nso ref4.nso -i $translations_path -m bleu --force
!sacrebleu ref1.nso ref2.nso ref3.nso ref4.nso -i $post_translations_path -m bleu

{
 "name": "BLEU",
 "score": 17.6,
 "signature": "nrefs:4|case:mixed|eff:no|tok:none|smooth:add-k[1.00]|version:2.4.3",
 "verbose_score": "51.8/24.4/12.1/6.2 (BP = 1.000 ratio = 1.018 hyp_len = 11776 ref_len = 11569)",
 "nrefs": "4",
 "case": "mixed",
 "eff": "no",
 "tok": "none",
 "smooth": "add-k[1.00]",
 "version": "2.4.3"
}
[0m

In [None]:
# Raw BPE translations of the test set (written by the fairseq-interactive cell
# above); this rebinds translations_path, which previously pointed at the gtest file.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/translations_test'

In [None]:
# Post-process the raw BPE test-set translations: remove the space in front of
# each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Score the raw and the post-processed test-set output with sacreBLEU against the
# single reference test.nso (bare file name, so the working directory must be
# cleaned-data). IPython replaces $translations_path / $post_translations_path
# with the values of the Python variables of the same name.
# NOTE(review): --force is passed only for the raw output — presumably to override
# sacreBLEU's "input looks tokenized" check; confirm.
# The saved output shows the same score for both files.
!sacrebleu test.nso -i $translations_path -m bleu --force
!sacrebleu test.nso -i $post_translations_path -m bleu

{
 "name": "BLEU",
 "score": 18.2,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "49.3/23.7/13.8/8.6 (BP = 0.941 ratio = 0.943 hyp_len = 54813 ref_len = 58147)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m{
 "name": "BLEU",
 "score": 18.2,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "49.3/23.7/13.8/8.6 (BP = 0.941 ratio = 0.943 hyp_len = 54813 ref_len = 58147)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m

### ULM

In [None]:
# Work from the ULM experiment directory so the relative paths in the next cells
# (data-bin/, data/, checkpoints-ulm/, trained_model/) resolve inside it.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm')

In [None]:
# Stage the files the fairseq-interactive cells below read from trained_model/:
# both dictionaries, the SentencePiece model (joint.model) and the best checkpoint
# (copied as model.pt). The language codes are hard-coded here.
!mkdir -p trained_model
!cp data-bin/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin/dict.nso.txt trained_model/dict.nso.txt
!cp data/joint.model trained_model/joint.model
!cp checkpoints-ulm/checkpoint_best.pt trained_model/model.pt

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 08:26:21.830990: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 08:26:21.859909: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 08:26:21.869689: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 08:26:21.893508: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 08:26:32 | INFO | fairseq_cli.interactive 

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/test.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 08:30:40.403397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 08:30:40.454459: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 08:30:40.480514: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 08:30:40.554905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 08:30:52 | INFO | fairseq_cli.interactive 

In [None]:
# Switch to the shared cleaned-data directory. The post-processing cells that
# follow use absolute paths, so this only matters for cells that name files
# relative to the working directory (as the sacrebleu cells in the BPE section do).
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
# Raw ULM translations of the global test set (written by the fairseq-interactive
# cell above); read by the post-processing cell below.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/translations_gtest'

In [None]:
# Post-process the raw ULM global-test translations: remove the space in front of
# each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Raw ULM translations of the test set (written by the fairseq-interactive cell
# above); this rebinds translations_path, which previously pointed at the gtest file.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/translations_test'

In [None]:
# Post-process the raw ULM test-set translations: remove the space in front of
# each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### BPE-Dropout

In [None]:
# Work from the BPE-Dropout experiment directory so the relative paths in the next
# cells (data-bin-25/, checkpoints-bpeDROP/, trained_model/) resolve inside it.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP')

In [None]:
# Stage the files the fairseq-interactive cells below read from trained_model/:
# both dictionaries (from data-bin-25), the BPE codes file and the best checkpoint
# (copied as model.pt). The codes file is copied from the plain BPE experiment's
# data directory, not from this experiment's own directory.
!mkdir -p trained_model
!cp data-bin-25/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-25/dict.nso.txt trained_model/dict.nso.txt
!cp /content/drive/MyDrive/Research/eng-to-nso/bpe/data/bpe.codes.4000 trained_model/bpe.codes.4000
!cp checkpoints-bpeDROP/checkpoint_best.pt trained_model/model.pt

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "D-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 09:16:20.385786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 09:16:20.466807: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 09:16:20.492546: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 09:16:20.548231: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 09:16:39 | INFO | fairseq_cli.interactive 

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/test.eng \
trained_model/ | grep -P "D-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 09:19:47.592644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 09:19:47.616877: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 09:19:47.624039: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 09:19:47.641578: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 09:19:59 | INFO | fairseq_cli.interactive 

In [None]:
# Switch to the shared cleaned-data directory. The post-processing cells that
# follow use absolute paths, so this only matters for cells that name files
# relative to the working directory (as the sacrebleu cells in the BPE section do).
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
# Raw BPE-Dropout translations of the global test set (written by the
# fairseq-interactive cell above); read by the post-processing cell below.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/translations_gtest'

In [None]:
# Post-process the raw BPE-Dropout global-test translations: remove the space in
# front of each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Raw BPE-Dropout translations of the test set (written by the fairseq-interactive
# cell above); this rebinds translations_path, which previously pointed at the gtest file.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/translations_test'

In [None]:
# Post-process the raw BPE-Dropout test-set translations: remove the space in
# front of each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### ULM with Subword Regularization

In [None]:
# Work from the ULM subword-regularization experiment directory so the relative
# paths in the next cells (data-bin-25/, checkpoints-ulmSR/, trained_model/)
# resolve inside it.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR')

In [None]:
# Stage the files the fairseq-interactive cells below read from trained_model/:
# both dictionaries (from data-bin-25), the SentencePiece model and the best
# checkpoint (copied as model.pt). joint.model is copied from the plain ULM
# experiment's data directory, not from this experiment's own directory.
!mkdir -p trained_model
!cp data-bin-25/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-25/dict.nso.txt trained_model/dict.nso.txt
!cp /content/drive/MyDrive/Research/eng-to-nso/ulm/data/joint.model trained_model/joint.model
!cp checkpoints-ulmSR/checkpoint_best.pt trained_model/model.pt

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 09:33:18.466989: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 09:33:18.492209: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 09:33:18.499204: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 09:33:18.517116: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 09:33:31 | INFO | fairseq_cli.interactive 

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/test.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 09:37:58.122371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 09:37:58.162003: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 09:37:58.173934: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 09:37:58.201253: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 09:38:09 | INFO | fairseq_cli.interactive 

In [None]:
# Switch to the shared cleaned-data directory. The post-processing cells that
# follow use absolute paths, so this only matters for cells that name files
# relative to the working directory (as the sacrebleu cells in the BPE section do).
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
# Raw ULM-SR translations of the global test set (written by the
# fairseq-interactive cell above); read by the post-processing cell below.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/translations_gtest'

In [None]:
# Post-process the raw ULM-SR global-test translations: remove the space in front
# of each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Raw ULM-SR translations of the test set (written by the fairseq-interactive cell
# above); this rebinds translations_path, which previously pointed at the gtest file.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/translations_test'

In [None]:
# Post-process the raw ULM-SR test-set translations: remove the space in front of
# each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### Task-specific Tokenization



In [None]:
# Work from the task-specific tokenization experiment directory so the relative
# paths in the next cells (data-bin-nmt/, checkpoints-task-nmt/, trained_model/)
# resolve inside it.
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok')

In [None]:
# Stage the files the fairseq-interactive cells below read from trained_model/:
# both dictionaries (from data-bin-nmt), the SentencePiece model and the best
# checkpoint (copied as model.pt).
# NOTE(review): joint.model is copied from the plain ULM experiment rather than
# from this experiment's own directory — confirm this is the tokenizer the
# task-specific model was trained with.
!mkdir -p trained_model
!cp data-bin-nmt/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-nmt/dict.nso.txt trained_model/dict.nso.txt
!cp /content/drive/MyDrive/Research/eng-to-nso/ulm/data/joint.model trained_model/joint.model
!cp checkpoints-task-nmt/checkpoint_best.pt trained_model/model.pt

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 10:00:26.579153: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 10:00:26.602519: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 10:00:26.609322: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 10:00:26.626659: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 10:00:40 | INFO | fairseq_cli.interactive 

In [None]:
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang nso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-nso/cleaned-data/test.eng \
trained_model/ | grep -P "H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 10:03:46.677705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 10:03:46.721511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 10:03:46.731476: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 10:03:46.755291: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 10:03:58 | INFO | fairseq_cli.interactive 

In [None]:
# Switch to the shared cleaned-data directory. The post-processing cells that
# follow use absolute paths, so this only matters for cells that name files
# relative to the working directory (as the sacrebleu cells in the BPE section do).
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
# Raw task-specific-tokenization translations of the global test set (written by
# the fairseq-interactive cell above); read by the post-processing cell below.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/translations_gtest'

In [None]:
# Post-process the raw task-specific global-test translations: remove the space in
# front of each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# Raw task-specific-tokenization translations of the test set (written by the
# fairseq-interactive cell above); this rebinds translations_path, which previously
# pointed at the gtest file.
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/translations_test'

In [None]:
# Post-process the raw task-specific test-set translations: remove the space in
# front of each punctuation mark listed in before_punc and save one segment per line.
with open(translations_path, 'r', encoding='utf-8') as f:
  # Split on real line endings only. str.splitlines() would also break on
  # characters such as U+2028 inside a hypothesis and change the segment count.
  segments = [line.rstrip('\n') for line in f]

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading instead of the
# platform default, so non-ASCII characters round-trip unchanged.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')