<a href="https://colab.research.google.com/github/ymoslem/LLMs/blob/main/inference/Mistral-CTranslate2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Translation with Mistral 7B using CTranslate2

In [1]:
!pip3 install CTranslate2 transformers -q

In [None]:
# Google Colab switched to CUDA 12, while CTranslate2 still uses CUDA 11, which can cause:
# RuntimeError: Library libcublas.so.11 is not found or cannot be loaded
# If you receive this error during translation, install libcublas11 by uncommenting the line below.

# !apt install libcublas11

In [None]:
import os
from google.colab import userdata

# Base location, read from the Colab user-data entry named "shared_drive"
shared_drive = userdata.get("shared_drive")

# Target folder: the "models" subfolder under that location
directory = os.path.join(shared_drive, "models")

# Work from that folder; the bare last expression displays the new working directory
os.chdir(directory)
os.getcwd()

In [None]:
# Convert Mistral to the CTranslate2 format if you have not already done so (uncomment to run)
# !ct2-transformers-converter --model mistralai/Mistral-7B-v0.1 --quantization int8 --output_dir ct2-mistral-7B-v0.1

## Load the model

```python
model = "ct2-mistral-7B-v0.1"
tokenizer = "mistralai/Mistral-7B-v0.1"
```

Other models:
```
model = "ct2-mistral-7B-instruct-v0.1"
tokenizer = "mistralai/Mistral-7B-Instruct-v0.1"

model = "ct2-falcon-7b-instruct"
tokenizer = "tiiuae/falcon-7b-instruct"

model = "ct2-zephyr-7b-beta"
tokenizer = "HuggingFaceH4/zephyr-7b-beta"
```

In [10]:
# Load the model

import ctranslate2
import transformers

# Path of the converted CTranslate2 model (relative to the current working
# directory) and the identifier passed to AutoTokenizer.from_pretrained.
# The identifier gets its own name so that `tokenizer` only ever refers to
# the loaded tokenizer object, never to a string.
model = "ct2-mistral-7B-v0.1"
tokenizer_name = "mistralai/Mistral-7B-v0.1"

# Load the generator (on the GPU) and the tokenizer
generator = ctranslate2.Generator(model, device="cuda")
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)

In [11]:
# Test prompts

src_lang, tgt_lang = "Spanish", "English"

# Zero-shot prompt: the source sentence followed by an empty target cue
prompt_source = "\n".join(
    [
        f"{src_lang}: Período de validez después de abierto el envase: 10 horas.",
        f"{tgt_lang}:",
    ]
)

# Fuzzy one-shot prompt: a similar source/target pair is shown first,
# then the sentence to translate and the empty target cue
prompt_fuzzy = "\n".join(
    [
        f"{src_lang}: Período de validez después de abierto el envase: 4 semanas",
        f"{tgt_lang}: Shelf life after opening the immediate packaging: 4 weeks.",
        f"{src_lang}: Período de validez después de abierto el envase: 10 horas.",
        f"{tgt_lang}:",
    ]
)

prompts = [prompt_source, prompt_fuzzy]

# Show the prompts, separated by a blank line
print("\n\n".join(prompts))

Spanish: Período de validez después de abierto el envase: 10 horas.
English:

Spanish: Período de validez después de abierto el envase: 4 semanas
English: Shelf life after opening the immediate packaging: 4 weeks.
Spanish: Período de validez después de abierto el envase: 10 horas.
English:


In [6]:
# Add stopping criteria to avoid over-generation
# References:
# https://github.com/OpenNMT/CTranslate2/issues/1309
# https://github.com/OpenNMT/CTranslate2/issues/1322
# https://stackoverflow.com/questions/69403613/how-to-early-stop-autoregressive-model-with-a-list-of-stop-words

# Stop on the tokens that make up ".\n". Encoding without special tokens keeps
# tokens such as the beginning-of-sequence marker out of the stop list.
stopping_criteria = tokenizer.convert_ids_to_tokens(
    tokenizer.encode(".\n", add_special_tokens=False)
)

# A custom `end_token` replaces the model's default end-of-sequence token,
# so add it back to keep generation from running past a natural end.
if tokenizer.eos_token is not None and tokenizer.eos_token not in stopping_criteria:
    stopping_criteria.append(tokenizer.eos_token)

In [12]:
# Tokenize and generate (single prompt)

prompt = prompts[0]

# Length cap: 4 x the number of space-separated pieces in the second-to-last
# prompt line (the sentence to translate). The leading language label
# (e.g. "Spanish:") is counted as one of those pieces here.
source_line = prompt.split("\n")[-2]
max_length = 4 * len(source_line.split(" "))

tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

generation_options = dict(
    sampling_topk=1,  # 1 for greedy search
    max_length=max_length,
    include_prompt_in_result=False,
    end_token=stopping_criteria,
)
results = generator.generate_batch([tokens], **generation_options)

# Only one prompt was submitted, so read the first sequence of the first result
output_ids = results[0].sequences_ids[0]
output = tokenizer.decode(output_ids)
output_length = len(output_ids)

print(f"{max_length=}")
print(f"{output_length=}")

print(f"\nTranslation:\n{output}")

max_length=44
output_length=14

Translation:
Period of validity after opening the package: 10 hours.


In [13]:
# Length cap for the batch: word count of the longest sentence to translate
# (second-to-last prompt line, language label dropped via [1:]), times four
longest_prompt = max(
    len(text.split("\n")[-2].split(" ")[1:]) for text in prompts
)
max_length = longest_prompt * 4
print(f"{max_length=}")

max_length=40


In [14]:
# Tokenize the prompts (batch) and pull out the token IDs of every prompt
tokenized_inputs = tokenizer(prompts)
input_ids_batch = tokenized_inputs["input_ids"]

# Convert each ID sequence to its token strings for generate_batch
tokens_batch = list(map(tokenizer.convert_ids_to_tokens, input_ids_batch))

# Generate outputs for the whole batch in one call
results = generator.generate_batch(
    tokens_batch,
    sampling_topk=1,  # 1 for greedy search
    max_length=max_length,
    include_prompt_in_result=False,
    end_token=stopping_criteria,
)

# Decode every returned sequence of every result, in order
outputs = [
    tokenizer.decode(sequence_ids)
    for batch_result in results
    for sequence_ids in batch_result.sequences_ids
]

# Print the decoded texts, separated by a blank line
print(*outputs, sep="\n\n")

Period of validity after opening the package: 10 hours.

Shelf life after opening the immediate packaging: 10 hours.
