<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Mistral-CTranslate2-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Translation with Mistral 7B (baseline and fine-tuned models)

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Loading the data

In [36]:
import os

# Google Drive folder with the Spanish-English data files
# (assumes Drive is already mounted at /content/drive - TODO confirm)
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Work from the data directory so the files below can be opened by bare name
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [37]:
# Load test datasets

# Parallel test files: one sentence per line, source and target in the same order
source_test_file = "all-filtered.es.real.test"
target_test_file = "all-filtered.en.real.test"

with open(source_test_file, encoding="utf-8") as source:
  with open(target_test_file, encoding="utf-8") as target:
    # Iterating a text file yields its lines; strip() removes the newline and outer whitespace
    source_sentences = list(map(str.strip, source))
    target_sentences = list(map(str.strip, target))

# Show the first sentence pair as a sanity check
print(source_sentences[0])
print(target_sentences[0])

Período de validez después de abierto el envase: 10 horas.
Shelf life after first opening the container: 10 hours.


In [10]:
# Load the fuzzy matches from the Context Dataset

online_test_file = "all-filtered.esen.ms-multi-12.online.test"

# Every line carries four " ||| "-separated fields, read below as:
# score ||| fuzzy source ||| new source ||| fuzzy target
with open(online_test_file, encoding="utf-8") as online:
  lines = [line.strip().split(" ||| ") for line in online]

# Split the parsed rows into one list per field
scores = [float(fields[0].strip()) for fields in lines]
fuzzy_source_sentences = [fields[1].strip() for fields in lines]
online_source_sentences = [fields[2].strip() for fields in lines]
fuzzy_target_prefixes = [fields[3].strip() for fields in lines]

# Show the first fuzzy match next to the sentence it was retrieved for
print(fuzzy_source_sentences[0])
print(online_source_sentences[0])
print(fuzzy_target_prefixes[0])

Período de validez después de abierto el envase: 4 semanas
Período de validez después de abierto el envase: 10 horas.
Shelf life after opening the immediate packaging: 4 weeks.


# Create the prompts

In [11]:
# Function to create zero-shot and one-shot prompts

def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  one_shot=True
                  ):
  """Build one translation prompt per new source sentence.

  Every prompt ends with the line "<target_lang>:" so the model continues
  with the translation.

  Args:
    source_lang: Label placed before source-language lines.
    target_lang: Label placed before target-language lines.
    fuzzy_sources: Example source sentences (used only when one_shot is True).
    fuzzy_targets: Translations of the example sentences (used only when
      one_shot is True).
    new_sources: Sentences to translate.
    one_shot: If True, each prompt starts with one example pair; the three
      lists are zipped, so the shortest one determines the number of prompts.
      If False, only new_sources is used.

  Returns:
    A list of prompt strings.
  """

  def labelled(lang, text):
    # A single "<language>: <text>" prompt line
    return lang + ": " + text

  if not one_shot:
    # Zero-shot: the sentence to translate followed by the empty target line
    return [labelled(source_lang, new_src) + "\n" + target_lang + ":"
            for new_src in new_sources]

  # One-shot: example pair first, then the sentence to translate
  return [
      "\n".join([labelled(source_lang, fuzzy_src),
                 labelled(target_lang, fuzzy_tgt),
                 labelled(source_lang, new_src),
                 target_lang + ":"])
      for fuzzy_src, fuzzy_tgt, new_src in zip(fuzzy_sources, fuzzy_targets, new_sources)
  ]

In [12]:
# Language labels passed to create_prompt, where they prefix the prompt lines
source_lang = "Spanish"
target_lang = "English"

In [13]:
# Create prompts

# Both prompt sets are built from the same data; only the one_shot flag differs
prompt_inputs = (source_lang,
                 target_lang,
                 fuzzy_source_sentences,
                 fuzzy_target_prefixes,
                 online_source_sentences,
                 )

prompts_zero_shot = create_prompt(*prompt_inputs, one_shot=False)
prompts_one_shot = create_prompt(*prompt_inputs, one_shot=True)

# The two sets should contain the same number of prompts
print(len(prompts_zero_shot))
print(len(prompts_one_shot))

10000
10000


In [14]:
# Show the first zero-shot prompt, an empty line, then the first one-shot prompt
print(prompts_zero_shot[0], "\n")
print(prompts_one_shot[0])

Spanish: Período de validez después de abierto el envase: 10 horas.
English: 

Spanish: Período de validez después de abierto el envase: 4 semanas
English: Shelf life after opening the immediate packaging: 4 weeks.
Spanish: Período de validez después de abierto el envase: 10 horas.
English:


# Loading the model

In [15]:
!pip3 install CTranslate2 transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.8/36.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Google Colab switched to CUDA 12 while CTranslate2 still uses CUDA 11
# RuntimeError: Library libcublas.so.11 is not found or cannot be loaded
# If you get this error during translation, try installing libcublas11:

# !apt install libcublas11

In [None]:
import os
from google.colab import userdata

# Base path read from the Colab user data entry named "shared_drive"
shared_drive = userdata.get("shared_drive")

# The converted model folders are looked up under <shared_drive>/models
directory = os.path.join(shared_drive, "models")

# NOTE: this moves the working directory away from the data folder
os.chdir(directory)
os.getcwd()

In [11]:
# To convert the Mistral baseline (before fine-tuning) to the CTranslate2 format, if you have not done so already:
# !ct2-transformers-converter --model mistralai/Mistral-7B-v0.1 --quantization int8 --output_dir ct2-mistral-7B-v0.1

# To convert Mistral after FINE-TUNING to the CTranslate2 format, check the steps here:
# https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Convert-Mistral-Finetuned-CTranslate2.ipynb

In [39]:
# List the contents of the two fine-tuned model folders to confirm they exist
!ls $directory"/ct2-mistral-finetuned-v1-25Nov"
!ls $directory"/ct2-mistral-finetuned-v2-26Nov"

config.json  model.bin	vocabulary.json
config.json  model.bin	vocabulary.json


In [None]:
# Load the model

import ctranslate2
import transformers
import os

# Choose ONE model/tokenizer pair and keep exactly that pair uncommented.
# A pair must always be active: with all of them commented out, this cell
# fails with a NameError on model_name when run on a fresh kernel.

# Mistral - Baseline model
model_name = "ct2-mistral-7B-v0.1"
tokenizer_name = "mistralai/Mistral-7B-v0.1"

# Mistral - FINE-TUNED model
# model_name = "ct2-mistral-finetuned-v1-25Nov"
# tokenizer_name = "mistralai/Mistral-7B-v0.1"

# model_name = "ct2-mistral-finetuned-v2-26Nov"
# tokenizer_name = "mistralai/Mistral-7B-v0.1"

# Path of the converted CTranslate2 model folder inside the models directory
model = os.path.join(directory, model_name)

generator = ctranslate2.Generator(model, device="cuda")
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)

# Record which model and tokenizer this run uses
print("Model:", model_name)
print("Tokenizer:", tokenizer_name)

# Translation

In [41]:
# Add stopping criteria to avoid over-generation
# References:
# https://github.com/OpenNMT/CTranslate2/issues/1309
# https://github.com/OpenNMT/CTranslate2/issues/1322
# https://stackoverflow.com/questions/69403613/how-to-early-stop-autoregressive-model-with-a-list-of-stop-words

# Mistral: use the tokens obtained by encoding ".\n" as end tokens
stopping_criteria = tokenizer.convert_ids_to_tokens(tokenizer.encode(".\n"))
# Probably also re-add the default end of sentence token, but maybe it is not necessary

# For Llama-3, use the following line INSTEAD of the one above.
# It is kept commented out because, when left active, it silently overwrites
# the Mistral stopping criteria defined above.
# stopping_criteria = tokenizer.convert_ids_to_tokens(tokenizer.encode("<|end_of_text|>.\n"))

In [42]:
# Test: Tokenize and generate (single prompt)

# Index of the prompt to try
n = 0
prompt = prompts_zero_shot[n]

# Length budget: 4 x the number of space-separated words on the prompt's
# second-to-last line (the sentence to translate), not counting the leading
# language label
max_length = len(prompt.split("\n")[-2].split(" ")[1:]) * 4


# The generator is fed token strings: encode the prompt to IDs, then map the IDs back to tokens
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

results = generator.generate_batch([tokens],
                                   sampling_topk=1,  # 1 for greedy search
                                   max_length=max_length,
                                   include_prompt_in_result=False,
                                   end_token=stopping_criteria,
                                   min_length=1,
                                   batch_type="tokens",
                                   max_batch_size=8096,
                                   )
# First sequence of the first (and only) result
output_ids = results[0].sequences_ids[0]
output = tokenizer.decode(output_ids).strip()

# Compare the number of generated tokens with the length budget
output_length = len(output_ids)
print(f"{max_length=}")
print(f"{output_length=}")

print(f"\nTranslation:\n{output}")

max_length=40
output_length=11

Translation:
Shelf life after opening: 10 hours.


In [43]:
# Batch translation function

def translate_batch(prompts,
                    tokenizer,
                    generator,
                    max_length,
                    end_token,
                    topk=1,
                    ):
  """Translate a list of prompts with a single batched generation call.

  Args:
    prompts: List of prompt strings.
    tokenizer: Tokenizer; it is called on the prompts and must provide
      ``convert_ids_to_tokens`` and ``batch_decode``.
    generator: Object providing ``generate_batch`` (in this notebook, a
      ``ctranslate2.Generator``).
    max_length: Forwarded to ``generate_batch`` as ``max_length``.
    end_token: Forwarded to ``generate_batch`` as ``end_token``.
    topk: Forwarded to ``generate_batch`` as ``sampling_topk``
      (1 for greedy search).

  Returns:
    A list with one decoded string per prompt, taken from the first
    sequence of each result and decoded with special tokens skipped.
  """
  # Tokenize the prompts
  tokenized_inputs = tokenizer(prompts)

  # Extract the token IDs for the batch
  input_ids_batch = tokenized_inputs['input_ids']

  # Convert the batch of token IDs to tokens
  tokens_batch = [tokenizer.convert_ids_to_tokens(ids) for ids in input_ids_batch]

  # Generate outputs in a batch
  results = generator.generate_batch(tokens_batch,
                                     sampling_topk=topk,  # honour the caller's value; 1 for greedy search
                                     max_length=max_length,
                                     min_length=1,
                                     include_prompt_in_result=False,
                                     end_token=end_token,
                                     batch_type="tokens",
                                     max_batch_size=8096, # 32384 # try smaller numbers if you run out of memory
                                     )

  # Decode the outputs (first sequence of each result)
  sequences_ids = [result.sequences_ids[0] for result in results]
  translations = tokenizer.batch_decode(sequences_ids,
                                        skip_special_tokens=True,
                                        )

  return translations

In [25]:
# Token strings of the first result from the single-prompt test
results[0].sequences

[['▁Shel',
  'f',
  '▁life',
  '▁after',
  '▁opening',
  ':',
  '▁',
  '1',
  '0',
  '▁hours',
  '.']]

In [49]:
# Parameters
# length_multiplier: factor applied to the longest source sentence (in words) to get the maximum generation length
length_multiplier = 4
# topk: passed to translate_batch as its topk argument
topk = 1

In [44]:
# @title ✳️ Set prompts (zero-shot, one-shot)

# Keep exactly one of the two assignments below active
# prompts = prompts_zero_shot
prompts = prompts_one_shot

# Show the first selected prompt
print(prompts[0])

Spanish: Período de validez después de abierto el envase: 4 semanas
English: Shelf life after opening the immediate packaging: 4 weeks.
Spanish: Período de validez después de abierto el envase: 10 horas.
English:


In [50]:
# Word count of the sentence to translate in each prompt: it sits on the
# second-to-last line, and the leading language label is not counted
length = [
    len(source_line.split(" ")) - 1
    for source_line in (prompt.split("\n")[-2] for prompt in prompts)
]

# Generation length budget derived from the longest sentence
max_len = length_multiplier * max(length)
print(f"Max length: {max_len}")

Max length: 280


In [51]:
# Translate all selected prompts in a single call
translations = translate_batch(prompts=prompts,
                               tokenizer=tokenizer,
                               generator=generator,
                               max_length=max_len,
                               end_token=stopping_criteria,
                               topk=topk,
                               )

In [None]:
# Number of translations produced
print(len(translations))

# Preview the first ten translations, one per line
print("\n".join(translations[:10]))

# Save the translations

In [53]:
import os

# The working directory was switched to the models folder for loading the model;
# go back to the data folder so the translations are saved next to the test files
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [54]:
# Save the translations


# Keep exactly one file name active; it should match the loaded model
# and the selected prompts (zero-shot or one-shot)

# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-zero-shot-batch8096.en"
translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-one-shot-batch8096.en"

# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-finetuned-v1-zero-shot.en"
# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-finetuned-v1-one-shot.en"

# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-finetuned-v2-zero-shot.en"
# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-Mistral-finetuned-v2-one-shot.en"

# Write one translation per line. The encoding is set explicitly to UTF-8,
# the same encoding used to read the input files, and the file is opened
# write-only because it is never read back here.
with open(translations_file_name, "w", encoding="utf-8") as output:
  for translation in translations:
    output.write(translation + "\n")

In [None]:
# Check the line count of the saved file and preview its first ten lines
!wc -l $translations_file_name
!head -n 10 $translations_file_name