<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/NLLB-200-CTranslate2-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Translation with NLLB-200 using CTranslate2

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Loading the data

In [None]:
import os

# Root folder of the datasets on the mounted Drive, and the language folder inside it
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Switch the working directory so that later cells can open the data files by bare name;
# the last expression displays the resulting working directory as the cell output
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Load test dataset (source sentences and their reference translations)

source_test_file = "all-filtered.es.real.test"
target_test_file = "all-filtered.en.real.test"

# Read both files line by line, trimming surrounding whitespace from every line
with open(source_test_file, encoding="utf-8") as source, \
     open(target_test_file, encoding="utf-8") as target:
  source_sentences = list(map(str.strip, source))
  target_sentences = list(map(str.strip, target))

# Show the first source/reference pair
print(source_sentences[0])
print(target_sentences[0])

Período de validez después de abierto el envase: 10 horas.
Shelf life after first opening the container: 10 hours.


In [None]:
# Load fuzzy matches from the Context Dataset
#
# Every line of the file holds four fields separated by " ||| ":
#   score ||| fuzzy source sentence ||| source sentence to translate ||| fuzzy target sentence

online_test_file = "all-filtered.esen.ms-multi-12.online.test"

src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

with open(online_test_file, encoding="utf-8") as online:
  # Drop only the line terminator before splitting. Stripping all whitespace first would
  # also remove the space that belongs to the last " ||| " delimiter when the final field
  # is empty, leaving the line one field short.
  lines = [line.rstrip("\r\n").split(" ||| ") for line in online]

# Fail early with the offending line numbers instead of an IndexError further down
malformed = [number for number, fields in enumerate(lines, start=1) if len(fields) < 4]
if malformed:
  raise ValueError(
      f"{online_test_file}: expected 4 fields separated by ' ||| ' on line(s) {malformed[:10]}"
  )

# Each field is stripped individually, so padding around the delimiters is still ignored
scores = [float(fields[0].strip()) for fields in lines]
fuzzy_source_sentences = [fields[1].strip() for fields in lines]
online_source_sentences = [fields[2].strip() for fields in lines]
fuzzy_target_prefixes = [fields[3].strip() for fields in lines]

# Show the first fuzzy source, the sentence to translate, and the fuzzy target
print(fuzzy_source_sentences[0])
print(online_source_sentences[0])
print(fuzzy_target_prefixes[0])

Período de validez después de abierto el envase: 4 semanas
Período de validez después de abierto el envase: 10 horas.
Shelf life after opening the immediate packaging: 4 weeks.


# Load the models

In [None]:
!pip3 install ctranslate2 sentencepiece -q

In [None]:
# List the contents of the CTranslate2 NLLB model directories stored on Drive
!ls /content/drive/MyDrive/models/ct2-nllb*

/content/drive/MyDrive/models/ct2-nllb-200-3.3B-int8:
config.json  model.bin	shared_vocabulary.txt

/content/drive/MyDrive/models/ct2-nllb-200-distilled-1.2B-int8:
config.json  model.bin	shared_vocabulary.txt

/content/drive/MyDrive/models/ct2-nllb-200-distilled-600M-int8:
config.json  model.bin	shared_vocabulary.txt


In [None]:
# Example of converting an NLLB model to CTranslate2 with int8 quantization

# !ct2-transformers-converter --model facebook/nllb-200-3.3B --quantization int8 --output_dir ct2/ct2-nllb-200-3.3B-int8

In [None]:
# Download the SentencePiece model

# !wget https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model

In [None]:
import os

# [Modify] Set paths to the CTranslate2 and SentencePiece models

# Folder on the mounted Drive that holds both the converted model and the tokenizer
drive = "/content/drive/MyDrive/models"

# Both paths are resolved relative to the folder above
ct_model_path, sp_model_path = (
    os.path.join(drive, name)
    for name in ("ct2-nllb-200-3.3B-int8", "flores200_sacrebleu_tokenizer_spm.model")
)

In [None]:
import ctranslate2
import sentencepiece as spm
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Load the SentencePiece model used for subwording
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Load the CTranslate2 model onto the selected device
translator = ctranslate2.Translator(ct_model_path, device=device)

In [None]:
# Inspect how the SentencePiece model splits a short string into pieces
sp.encode_as_pieces("English:")

['▁English', ':']

# Translate (source sentences only)

In [None]:
# Language tags used for the source and target sides
src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

beam_size = 2

source_sents = list(map(str.strip, source_sentences))

# Every hypothesis is forced to start with the target language tag
target_prefix = [[tgt_lang]] * len(source_sents)

# Subword the source sentences and wrap each one as: <src_lang> pieces... </s>
source_sents_subworded = [
    [src_lang] + pieces + ["</s>"]
    for pieces in sp.encode_as_pieces(source_sents)
]

# Translate the source sentences and keep the first hypothesis of each result
translations = [
    result.hypotheses[0]
    for result in translator.translate_batch(
        source_sents_subworded,
        batch_type="tokens",
        max_batch_size=2024,
        beam_size=beam_size,
        target_prefix=target_prefix,
    )
]

# Desubword the target sentences, then cut off the leading target language tag
translations_desubword = [
    text[len(tgt_lang):].strip()
    for text in sp.decode(translations)
]

# Preview the first ten translations, one per line
print("\n".join(translations_desubword[:10]))

Shelf-life after opening the container: 10 hours.
Children and adolescents The use of Telmisartan Teva in children and adolescents up to 18 years of age is not recommended.
·Promoting the improvement, in accordance with international standards, of conditions of detention and the treatment of persons deprived of their liberty.
Of the 1,165 patients treated with Picato in actin keratosis clinical trials conducted with ingenol mebutate gel, 656 patients (56%) were 65 years of age or older, while 241 patients (21%) were 75 years of age or older.
Given the lack of effectiveness of this mechanism in the work of Members and the WTO, NGOs are now advocating that such materials be better organized on the website and even that the Secretariat take a more active stance, indicating some issues for the submission of material based on more predefined deadlines and patterns.
• When the area of skin that has been cleaned is dry, peel it and hold it firmly with one hand.
A commonly used dose to induce 

In [None]:
# Save the translations, one per line.
# The encoding is set explicitly to UTF-8 (the same encoding used to read the test files)
# so the output does not depend on the platform's default; plain "w" is enough because
# the file is only written, never read back.
with open("all-filtered.es.real.test.translated-nllb3.3-ct2-beam2.en", "w", encoding="utf-8") as output:
  output.writelines(translation + "\n" for translation in translations_desubword)

# Translate (with fuzzy matches)

In [None]:
import ctranslate2
import sentencepiece as spm
import torch

src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

beam_size = 2

# `sp` and `translator` are the objects created in the "Load the models" section.
# They are reused here rather than loaded again: rebinding `translator` to a freshly
# constructed ctranslate2.Translator would keep two copies of the model alive until
# the previous one is released.

# Subword the source sentences
fuzzy_source_sentences_subworded = sp.encode_as_pieces(fuzzy_source_sentences)
online_source_sentences_subworded = sp.encode_as_pieces(online_source_sentences)
fuzzy_online_subworded = zip(fuzzy_source_sentences_subworded, online_source_sentences_subworded)

# Pieces for the "•" separator placed between the fuzzy match and the new sentence
separator = sp.encode_as_pieces("•")  # a list of pieces, e.g. ["▁•"]

# Source side: <src_lang> fuzzy source <src_lang> • sentence to translate </s>
source_sents_subworded = [[src_lang] + fuzzy_src + [src_lang] + separator + online_src + ["</s>"]
                          for fuzzy_src, online_src in fuzzy_online_subworded]
print(source_sents_subworded[0])

# Target side: <tgt_lang> fuzzy target <tgt_lang> •
# This is passed as the target prefix, so every hypothesis starts with it
prefixes_subworded = sp.encode_as_pieces(fuzzy_target_prefixes)
target_prefixes = [[tgt_lang] + sent + [tgt_lang] + separator for sent in prefixes_subworded]
print(target_prefixes[0])

# Translate the source sentences and keep the first hypothesis of each result
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          min_decoding_length=2,
                                          max_decoding_length=512,
                                          target_prefix=target_prefixes)
translations = [translation.hypotheses[0] for translation in translations]

# Desubword the target sentences, then cut off the leading target language tag
translations_desubword = sp.decode(translations)
translations_desubword = [sent[len(tgt_lang):].strip() for sent in translations_desubword]

# Keep only the text after the second target language tag (i.e. after the fuzzy target),
# and drop the leading "•" separator so the preview below shows the clean translation
translations_only = [sent.split(tgt_lang)[1].strip() for sent in translations_desubword]
translations_only = [sent[1:].strip() if sent.startswith("•") else sent.strip() for sent in translations_only]

print("\nTranslations:", *translations_desubword[:10], sep="\n")
print("\nTranslations only:", *translations_only[:10], sep="\n")

In [None]:
# For every decoded output: take the text after the target language tag that follows the
# fuzzy target, then remove the leading "•" separator (when present) and surrounding whitespace
translations_only = [
    (text[1:] if text.startswith("•") else text).strip()
    for text in (sent.split(tgt_lang)[1].strip() for sent in translations_desubword)
]

In [None]:
# Display the first cleaned translation as a quick check
translations_only[0]

'Shelf life after opening the packaging: 10 hours.'

In [None]:
# Save the translations, one per line

translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-nllb3.3-ct2-beam2-bulletpoint.en"

# The encoding is set explicitly to UTF-8 (the same encoding used to read the input files)
# so the output does not depend on the platform's default; plain "w" is enough because
# the file is only written, never read back.
with open(translations_file_name, "w", encoding="utf-8") as output:
  output.writelines(translation + "\n" for translation in translations_only)