## Setting up the notebook

In [None]:
# installing packages
# NOTE(review): pip and numpy are pinned, presumably for compatibility with the
# fairseq install further down -- confirm the reason and record it here.
!pip install pip==24.0
!pip install numpy==1.23.5
# NOTE(review): the next three installs are unpinned; the recorded output shows
# tensorboardX 2.6.2.2 was resolved. Consider pinning all three for reproducibility.
!pip install tensorboardX
!pip install subword-nmt
!pip install sentencepiece

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
# importing packages
import numpy
import os
import tensorboardX
import sentencepiece as spm

In [None]:
# mounting google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Language codes for this experiment (source -> target).
# target_code is interpolated into the Drive paths used by the Python cells.
# NOTE(review): the shell cells spell out eng/tso literally, so changing these
# values alone does not retarget the whole notebook.
source_code = 'eng'
target_code = 'tso'

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}')

# installing fairseq
# The clone is commented out, so the %cd below requires a fairseq/ checkout to
# already exist inside the project folder.
#!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
# Editable install from the local checkout.
!pip install --editable ./

/content/drive/MyDrive/Research/eng-to-tso/fairseq
Obtaining file:///content/drive/MyDrive/Research/eng-to-tso/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq==0.12.2)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq==0.12.2)
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Colle

## Evaluate Model on Raw Test and Global Test Sets

In [None]:
import sacrebleu

In [None]:
before_punc = '!,.:;?)'
def remove_whitespace_before(segment, punc):
  """
  Remove the single space that directly precedes each punctuation mark.

  Only the one character immediately before a punctuation mark is considered,
  and only if it is a plain space (" "). A punctuation mark at the very start
  of the segment has nothing before it and is left untouched.

  Args:
    segment (str): segment to be processed.
    punc (str): string of punctuation marks.

  Returns:
    The segment with the space removed before punctuation marks in punc.
  """
  kept = []
  for symbol in segment:
    # `kept[-1]` is always the character that preceded `symbol` in the input.
    # The `kept` guard stops a leading punctuation mark from looking at index
    # -1, which previously wrapped to the end of the string and could
    # duplicate the whole segment.
    if symbol in punc and kept and kept[-1] == " ":
      kept.pop()
    kept.append(symbol)

  return "".join(kept)

### BPE

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe')

In [None]:
# Gather the files the fairseq-interactive cells read (dictionaries, BPE codes,
# best checkpoint) into a single trained_model/ directory.
!mkdir -p trained_model
!cp data-bin/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin/dict.tso.txt trained_model/dict.tso.txt
!cp data/bpe.codes.4000 trained_model/bpe.codes.4000
!cp checkpoints-bpe/checkpoint_best.pt trained_model/model.pt

In [None]:
# Translate clean_gtest.eng and keep the third tab-separated field of the D-<id> lines.
# The pattern is anchored with ^ so only lines that start with the D- prefix are kept;
# unanchored, any output line containing e.g. "COVID-19" would also match and add a
# stray row, misaligning hypotheses and references.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "^D-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 10:56:33.035930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 10:56:33.061054: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 10:56:33.068532: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 10:56:33.086453: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 10:56:45 | INFO | fairseq_cli.interactive 

In [None]:
# Translate test.eng and keep the third tab-separated field of the D-<id> lines.
# The pattern is anchored with ^ so only lines that start with the D- prefix are kept;
# unanchored, any output line containing e.g. "COVID-19" would also match and add a
# stray row, misaligning hypotheses and references.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/test.eng \
trained_model/ | grep -P "^D-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 10:59:55.160062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 10:59:55.199115: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 10:59:55.210870: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 10:59:55.237224: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:00:08 | INFO | fairseq_cli.interactive 

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/translations_gtest'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/translations_test'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpe/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### ULM

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm')

In [None]:
# Gather the files the fairseq-interactive cells read (dictionaries,
# SentencePiece model, best checkpoint) into a single trained_model/ directory.
!mkdir -p trained_model
!cp data-bin/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin/dict.tso.txt trained_model/dict.tso.txt
!cp data/joint.model trained_model/joint.model
!cp checkpoints-ulm/checkpoint_best.pt trained_model/model.pt

In [None]:
# Translate clean_gtest.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 11:14:59.548822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:14:59.585674: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:14:59.596221: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:14:59.622453: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:15:13 | INFO | fairseq_cli.interactive 

In [None]:
# Translate test.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/test.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 11:18:37.822432: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:18:37.848920: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:18:37.856837: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:18:37.876961: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:18:50 | INFO | fairseq_cli.interactive 

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/translations_gtest'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/translations_test'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulm/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### BPE-Dropout

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP')

In [None]:
# Gather the files the fairseq-interactive cells read into trained_model/.
# The BPE codes are copied from the plain BPE experiment's data directory,
# not from this experiment's own folder.
!mkdir -p trained_model
!cp data-bin-25/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-25/dict.tso.txt trained_model/dict.tso.txt
!cp /content/drive/MyDrive/Research/eng-to-tso/bpe/data/bpe.codes.4000 trained_model/bpe.codes.4000
!cp checkpoints-bpeDROP/checkpoint_best.pt trained_model/model.pt

In [None]:
# Translate clean_gtest.eng and keep the third tab-separated field of the D-<id> lines.
# The pattern is anchored with ^ so only lines that start with the D- prefix are kept;
# unanchored, any output line containing e.g. "COVID-19" would also match and add a
# stray row, misaligning hypotheses and references.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "^D-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 11:27:12.736314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:27:12.834626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:27:12.852981: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:27:12.894281: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:27:27 | INFO | fairseq_cli.interactive 

In [None]:
# Translate test.eng and keep the third tab-separated field of the D-<id> lines.
# The pattern is anchored with ^ so only lines that start with the D- prefix are kept;
# unanchored, any output line containing e.g. "COVID-19" would also match and add a
# stray row, misaligning hypotheses and references.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'subword_nmt' \
--bpe-codes trained_model/bpe.codes.4000 \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/test.eng \
trained_model/ | grep -P "^D-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 11:30:49.892408: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:30:49.919675: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:30:49.926953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:30:49.945240: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:31:02 | INFO | fairseq_cli.interactive 

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/translations_gtest'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/translations_test'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/bpeDROP/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### ULM with Subword Regularization

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR')

In [None]:
# Gather the files the fairseq-interactive cells read into trained_model/.
# The SentencePiece model is copied from the plain ULM experiment's data
# directory, not from this experiment's own folder.
!mkdir -p trained_model
!cp data-bin-25/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-25/dict.tso.txt trained_model/dict.tso.txt
!cp /content/drive/MyDrive/Research/eng-to-tso/ulm/data/joint.model trained_model/joint.model
!cp checkpoints-ulmSR/checkpoint_best.pt trained_model/model.pt

In [None]:
# Translate clean_gtest.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 11:40:07.791521: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:40:07.816556: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:40:07.823744: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:40:07.841801: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:40:23 | INFO | fairseq_cli.interactive 

In [None]:
# Translate test.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/test.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 11:44:07.283263: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:44:07.337922: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:44:07.354619: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:44:07.392532: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:44:19 | INFO | fairseq_cli.interactive 

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/translations_gtest'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/translations_test'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/ulmSR/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

### Task-specific Tokenization

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok')

In [None]:
# Gather the files the fairseq-interactive cells read into trained_model/.
# The SentencePiece model is copied from the plain ULM experiment's data
# directory, not from this experiment's own folder.
!mkdir -p trained_model
!cp data-bin-nmt/dict.eng.txt trained_model/dict.eng.txt
!cp data-bin-nmt/dict.tso.txt trained_model/dict.tso.txt
!cp /content/drive/MyDrive/Research/eng-to-tso/ulm/data/joint.model trained_model/joint.model
!cp checkpoints-task-nmt/checkpoint_best.pt trained_model/model.pt

In [None]:
# Translate clean_gtest.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/clean_gtest.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_gtest

2024-11-06 11:57:37.132367: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:57:37.158400: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:57:37.165806: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:57:37.186094: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 11:57:50 | INFO | fairseq_cli.interactive 

In [None]:
# Translate test.eng and keep the third tab-separated field of the H-<id> lines.
# The pattern is anchored with ^ so only lines that start with the H- prefix are kept;
# unanchored, any output line that merely contains "H-<digits>" would also match and
# add a stray row, misaligning hypotheses and references.
# NOTE(review): the BPE sections read D-<id> lines instead; presumably H- is used here
# because --remove-bpe sentencepiece already yields plain text -- confirm.
!fairseq-interactive \
--path trained_model/model.pt \
--source-lang eng \
--target-lang tso \
--bpe 'sentencepiece' \
--sentencepiece-model trained_model/joint.model \
--beam 5 \
--lenpen 1 \
--seed 2024 \
--max-len-a 0 \
--max-len-b 100 \
--quiet \
--remove-bpe sentencepiece \
--buffer-size 100 \
--input /content/drive/MyDrive/Research/eng-to-tso/cleaned-data/test.eng \
trained_model/ | grep -P "^H-[0-9]+" | cut -f3 > trained_model/translations_test

2024-11-06 12:01:02.536330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 12:01:02.582730: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 12:01:02.596304: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 12:01:02.634130: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-06 12:01:14 | INFO | fairseq_cli.interactive 

In [None]:
# change working directory
os.chdir(f'/content/drive/MyDrive/Research/eng-to-{target_code}/cleaned-data')

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/translations_gtest'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/post_process_translations_gtest'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# BLEU for the raw and then the post-processed gtest hypotheses against four references.
# The reference file names are relative, so this relies on the working directory
# set by the preceding os.chdir(.../cleaned-data) cell.
# NOTE(review): --force is passed only for the raw output, presumably to silence
# sacrebleu's tokenized-input check -- confirm.
!sacrebleu ref1.tso ref2.tso ref3.tso ref4.tso -i $translations_path -m bleu --force
!sacrebleu ref1.tso ref2.tso ref3.tso ref4.tso -i $post_translations_path -m bleu

{
 "name": "BLEU",
 "score": 28.8,
 "signature": "nrefs:4|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "66.3/38.9/23.1/13.4 (BP = 0.964 ratio = 0.964 hyp_len = 10915 ref_len = 11318)",
 "nrefs": "4",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m{
 "name": "BLEU",
 "score": 28.8,
 "signature": "nrefs:4|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "66.3/38.9/23.1/13.4 (BP = 0.964 ratio = 0.964 hyp_len = 10915 ref_len = 11318)",
 "nrefs": "4",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m

In [None]:
translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/translations_test'

In [None]:
# post-processing predictions: drop the space that precedes punctuation marks
with open(translations_path, 'r', encoding='utf-8') as f:
  segments = f.read().splitlines()

segments = [remove_whitespace_before(segment, before_punc) for segment in segments]

post_translations_path = f'/content/drive/MyDrive/Research/eng-to-{target_code}/target-tok/trained_model/post_process_translations_test'
# Write with the same explicit UTF-8 encoding used for reading, so the output
# does not depend on the platform's default encoding.
with open(post_translations_path, 'w', encoding='utf-8') as f:
  for segment in segments:
    f.write(segment + '\n')

In [None]:
# BLEU for the raw and then the post-processed test hypotheses against test.tso.
# The reference file name is relative, so this relies on the working directory
# set by the preceding os.chdir(.../cleaned-data) cell.
# NOTE(review): --force is passed only for the raw output, presumably to silence
# sacrebleu's tokenized-input check -- confirm.
!sacrebleu test.tso -i $translations_path -m bleu --force
!sacrebleu test.tso -i $post_translations_path -m bleu

{
 "name": "BLEU",
 "score": 35.0,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "64.3/43.9/32.1/24.2 (BP = 0.910 ratio = 0.914 hyp_len = 28625 ref_len = 31326)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m{
 "name": "BLEU",
 "score": 35.0,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "64.3/43.9/32.1/24.2 (BP = 0.910 ratio = 0.914 hyp_len = 28625 ref_len = 31326)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
}
[0m