<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/vLLM-translate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Translation with vLLM

This notebook is part of the code of my paper,  
*Domain-Specific Translation with Open-Source Large Language Models: Resource-Oriented Analysis* ([link](https://arxiv.org/abs/2412.05862)).


# Install vLLM

In [None]:
# This might need a restart, so better run it in the Terminal first.
!export VLLM_USE_MODELSCOPE=True
!pip install -q vllm

In [None]:
!mkdir -p /workspace/models/cache

In [None]:
import os

model_directory = "/workspace/models/cache"

# Create the cache directory here as well, so this cell does not depend on a
# separate shell step having been run with the same hard-coded path.
os.makedirs(model_directory, exist_ok=True)

# Work from the cache directory and display it as the cell output.
os.chdir(model_directory)
os.getcwd()

In [None]:
# Log in to the Hugging Face Hub with the token passed via $HF_TOKEN.
# NOTE(review): presumably HF_TOKEN is set beforehand as an environment
# variable (or a notebook variable) -- confirm before running.
!huggingface-cli login --token $HF_TOKEN

# Loading the model

In [None]:
# ✳️ Load the model -- modify the model name/path

from vllm import LLM, SamplingParams
import os
import torch


# model_path = "google/gemma-7b"
model_path = "mistralai/Mistral-7B-v0.1"
# model_path = "mistralai/Mixtral-8x7B-v0.1"
# model_path = "meta-llama/Meta-Llama-3-8B"
# model_path = "meta-llama/Meta-Llama-3-70B"
# model_path = "meta-llama/Llama-3.3-70B-Instruct"
# model_path = "meta-llama/Llama-3.1-405B"

# AWQ models
# model_path = "RiversHaveWings/Meta-Llama-3.1-405B-AWQ"
# model_path = "cognitivecomputations/DeepSeek-V3-AWQ"


# Get the number of available GPUs
# Hint: Llama 70B -> 2 H100
# Llama 405B AWQ -> 4 H100
# DeepSeek V3 AWQ -> 4 H200 or 8 H100
num_gpus = torch.cuda.device_count()

max_len = 4096  # increase for longer context (within memory limits)
# AWQ checkpoints are detected from the model name only -- verify based on your model
awq = "-awq" in model_path.lower()

print(f"Number of GPUs: {num_gpus}")
print(f"Max length: {max_len}")
print(f"AWQ: {awq}")


# Arguments shared by both loading modes; the model is sharded over every
# visible GPU and downloaded into `model_directory`.
llm_kwargs = dict(
    model=model_path,
    download_dir=model_directory,
    trust_remote_code=True,
    tensor_parallel_size=num_gpus,
    max_model_len=max_len,
)

if awq:
    # Quantized checkpoint: select the AWQ kernel and leave dtype at its default.
    llm_kwargs["quantization"] = "awq_marlin"
else:
    # Full-precision checkpoint: load the weights in bfloat16.
    llm_kwargs["dtype"] = torch.bfloat16

llm = LLM(**llm_kwargs)

In [None]:
!nvidia-smi

## Test

In [None]:
# Test prompts - Spanish to English

src_lang, tgt_lang = "Spanish", "English"

# Zero-shot prompt: the sentence to translate followed by an empty target slot.
prompt_source = "\n".join([
    f"{src_lang}: Período de validez después de abierto el envase: 10 horas.",
    f"{tgt_lang}:",
])

# Fuzzy one-shot prompt: a similar, already translated pair comes first.
prompt_fuzzy = "\n".join([
    f"{src_lang}: Período de validez después de abierto el envase: 4 semanas",
    f"{tgt_lang}: Shelf life after opening the immediate packaging: 4 weeks.",
    f"{src_lang}: Período de validez después de abierto el envase: 10 horas.",
    f"{tgt_lang}:",
])

test_prompts = [prompt_source, prompt_fuzzy]

# Show both prompts separated by a blank line.
print("\n\n".join(test_prompts))

Spanish: Período de validez después de abierto el envase: 10 horas.
English:

Spanish: Período de validez después de abierto el envase: 4 semanas
English: Shelf life after opening the immediate packaging: 4 weeks.
Spanish: Período de validez después de abierto el envase: 10 horas.
English:


In [None]:
# Test Greedy search
# Temperature 0.0 with top_k=1, at most 30 new tokens, and "\n" as the stop
# string so only a single line is generated per prompt.
test_sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    max_tokens=30,
    skip_special_tokens=False,
    stop=["\n"],
)

outputs = llm.generate(test_prompts, test_sampling_params)

# Print the first candidate of each output, preceded by a blank line.
for output in outputs:
    generated_text = output.outputs[0].text.strip()
    print()
    print(generated_text)

# Loading the data

In [None]:
# ✳️ Load the data -- modify the data directory

import os

data_path = "/workspace/data/"

# Language-pair sub-directory -- keep exactly one of the lines below active.
directory = os.path.join(data_path, "en-fr")    # EN-FR
# directory = os.path.join(data_path, "en-pt")  # EN-PT
# directory = os.path.join(data_path, "en-sw")  # EN-SW
# directory = os.path.join(data_path, "sw-en")  # SW-EN

# The data files are opened by bare name in later cells, so work from here.
os.chdir(directory)
os.getcwd()

'/workspace/data/en-fr'

In [None]:
!ls $directory

In [None]:
# ✳️ Load test datasets

# EN-FR
source_test_file = "all-filtered.en.real.test"
target_test_file = "all-filtered.fr.real.test"

# EN-PT
# source_test_file = "all-filtered.en.real.test"
# target_test_file = "all-filtered.pt.real.test"

# EN-SW - Generic
# source_test_file = "generic.filtered.en.real.test"
# target_test_file = "generic.filtered.sw.real.test"

# EN-SW - Medical
# source_test_file = "medical.filtered.en.real.test"
# target_test_file = "medical.filtered.sw.real.test"

# SW-EN - Generic
# source_test_file = "generic.filtered.sw.real.test"
# target_test_file = "generic.filtered.en.real.test"

# SW-EN - Medical
# source_test_file = "medical.filtered.sw.real.test"
# target_test_file = "medical.filtered.en.real.test"

# Read one sentence per line, stripped of surrounding whitespace.
with open(source_test_file, encoding="utf-8") as source, open(target_test_file, encoding="utf-8") as target:
    source_sentences = [sent.strip() for sent in source]
    target_sentences = [sent.strip() for sent in target]

# The two files are expected to be line-aligned; fail early if they are not,
# rather than silently pairing the wrong sentences later.
assert len(source_sentences) == len(target_sentences), (
    f"{source_test_file} has {len(source_sentences)} lines but "
    f"{target_test_file} has {len(target_sentences)}"
)

print(source_sentences[0])
print(target_sentences[0])

reduce the dosage by initiating the patient at 90% of the previous total daily dosage, with 40% as basal rate and 50% as boluses divided between the three main meals.
Lors du transfert de patients d’ un traitement par injection à la perfusion, il est généralement conseillé de diminuer la posologie en commençant par administrer 90% de la dose journalière totale précédente, dont 40% en débit de base et 50% en bolus répartis entre les trois repas principaux.


In [None]:
# ✳️ Load the fuzzy matches from the Context Dataset

# EN-FR
online_test_file = "all-filtered.en-fr.ms-multi-12.context.test"

# EN-PT
# online_test_file = "all-filtered.en-pt.ms-multi-12.online.test"

# EN-SW
# online_test_file = "generic.filtered.ensw.ms-multi-12.online.test"
# online_test_file = "medical.filtered.ensw.ms-multi-12.online.test"

# SW-EN
# online_test_file = "generic.filtered.swen.ms-multi-12.online.test"
# online_test_file = "medical.filtered.swen.ms-multi-12.online.test"

# Each line holds four "|||"-separated fields:
#   score ||| fuzzy source ||| new source ||| fuzzy target
# Split on the bare delimiter and strip every field afterwards. Stripping the
# whole line first would remove the space after the last delimiter when the
# final field is empty, and the line would then parse into three fields only.
with open(online_test_file, encoding="utf-8") as online:
    lines = []
    for line_number, line in enumerate(online, start=1):
        fields = [field.strip() for field in line.split("|||")]
        if len(fields) < 4:
            raise ValueError(
                f"{online_test_file}, line {line_number}: expected 4 '|||'-separated "
                f"fields, found {len(fields)}"
            )
        lines.append(fields)

scores = [float(fields[0]) for fields in lines]
fuzzy_source_sentences = [fields[1] for fields in lines]
online_source_sentences = [fields[2] for fields in lines]
fuzzy_target_prefixes = [fields[3] for fields in lines]

print(fuzzy_source_sentences[0])
print(online_source_sentences[0])
print(fuzzy_target_prefixes[0])

Titration and maintenance dose The daily dose is subsequently increased by doubling the dose at intervals of one to three days up to the target maintenance dose of 5 mg twice daily.
reduce the dosage by initiating the patient at 90% of the previous total daily dosage, with 40% as basal rate and 50% as boluses divided between the three main meals.
22 Titration et dose d’ entretien La dose quotidienne sera augmentée par la suite en doublant la dose à un à trois jours d’ intervalle jusqu’ à atteindre la dose d’ entretien cible de 5 mg deux fois par jour.


# Create the prompts

In [None]:
# Function to create zero-shot and one-shot prompts

def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  one_shot=True
                  ):
    """Build one translation prompt per new source sentence.

    Zero-shot prompts have the form::

        <source_lang>: <new source>
        <target_lang>:

    One-shot prompts put a fuzzy-match example pair in front::

        <source_lang>: <fuzzy source>
        <target_lang>: <fuzzy target>
        <source_lang>: <new source>
        <target_lang>:

    Args:
        source_lang: Label written before every source sentence.
        target_lang: Label written before every target sentence.
        fuzzy_sources: Source sides of the fuzzy matches (used only when
            ``one_shot`` is true).
        fuzzy_targets: Target sides of the fuzzy matches (used only when
            ``one_shot`` is true).
        new_sources: Sentences to translate.
        one_shot: Whether to include the fuzzy-match example in each prompt.

    Returns:
        A list of prompt strings, one per item of ``new_sources``.

    Raises:
        ValueError: If ``one_shot`` is true and the three input sequences do
            not have the same length. Previously ``zip`` dropped the surplus
            items silently, leaving fewer prompts than sentences to translate.
    """
    if not one_shot:
        return [f"{source_lang}: {new_src}\n{target_lang}:" for new_src in new_sources]

    # Materialize first so that plain iterables are still accepted and can be measured.
    fuzzy_sources = list(fuzzy_sources)
    fuzzy_targets = list(fuzzy_targets)
    new_sources = list(new_sources)

    if not (len(fuzzy_sources) == len(fuzzy_targets) == len(new_sources)):
        raise ValueError(
            "one-shot prompts need aligned inputs, got "
            f"{len(fuzzy_sources)} fuzzy sources, {len(fuzzy_targets)} fuzzy targets "
            f"and {len(new_sources)} new sources"
        )

    return [
        f"{source_lang}: {fuzzy_src}\n"
        f"{target_lang}: {fuzzy_tgt}\n"
        f"{source_lang}: {new_src}\n"
        f"{target_lang}:"
        for fuzzy_src, fuzzy_tgt, new_src in zip(fuzzy_sources, fuzzy_targets, new_sources)
    ]

In [None]:
# ✳️ Define the source and target languages
# Keep exactly one pair active; the names are used as labels inside the prompts.

source_lang, target_lang = "English", "French"
# source_lang, target_lang = "English", "Portuguese"
# source_lang, target_lang = "English", "Swahili"
# source_lang, target_lang = "Swahili", "English"

In [None]:
# Create prompts

# Both prompt sets are built from the same inputs; only `one_shot` differs.
prompt_inputs = (
    source_lang,
    target_lang,
    fuzzy_source_sentences,
    fuzzy_target_prefixes,
    online_source_sentences,
)

prompts_zero_shot = create_prompt(*prompt_inputs, one_shot=False)
prompts_one_shot = create_prompt(*prompt_inputs, one_shot=True)

# Print the size of each prompt set, zero-shot first.
for prompt_set in (prompts_zero_shot, prompts_one_shot):
    print(len(prompt_set))

In [None]:
print(prompts_zero_shot[0], "\n")
print(prompts_one_shot[0])

In [None]:
print(prompts_zero_shot[20], "\n")
print(prompts_one_shot[20])

English: This checklist will remind prescribers how to use the medicine safely.
French: 

English: Follow your doctor’s instruction carefully on which medicines can be combined.
French: Suivez attentivement les instructions de votre médecin quant aux médicaments qui peuvent être associés.
English: This checklist will remind prescribers how to use the medicine safely.
French:


# Test translation

In [None]:
# Test: Tokenize and generate (single prompt)

start = 0        # index of the first prompt to try
num_prompts = 1  # how many consecutive prompts to try
test_prompts = prompts_zero_shot[start:start + num_prompts]
# test_prompts = prompts_one_shot[start:start + num_prompts]
print(*test_prompts, sep="\n", end="\n\n")
print("Translations:\n")

# Generation budget: 4 tokens per whitespace-separated word of the first prompt.
# Count the words, then multiply, instead of building a 4x-repeated list.
test_max_tokens = len(test_prompts[0].split()) * 4

# Greedy search
test_sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    max_tokens=test_max_tokens,
    skip_special_tokens=False,
    stop=["\n"],
    #  stop_token_ids=["\n"],
    logprobs=2,
    #  prompt_logprobs=1
)

outputs = llm.generate(test_prompts, test_sampling_params)

# Print the first candidate of each output.
for output in outputs:
    generated_text = output.outputs[0].text.strip()
    print(generated_text)

# Translation - full test dataset

In [None]:
# @title ✳️ Set prompts (zero-shot, one-shot)

prompts = prompts_zero_shot
# prompts = prompts_one_shot

print(prompts[0])

In [None]:
length_multiplier = 4

# Calculate max length
# The second-to-last line of every prompt is the sentence to translate; its
# first space-separated token (the language label) is not counted.
length = []
for prompt in prompts:
    source_line = prompt.split("\n")[-2]
    length.append(len(source_line.split(" ")) - 1)

# Budget for generation: the longest source sentence times the multiplier.
max_len = length_multiplier * max(length)
print(f"Max length: {max_len}")

In [None]:
# Sampling settings for the full test set: temperature 0.0 with top_k=1,
# at most `max_len` new tokens, and "\n" as the stop string.
# Unlike the test cells above, skip_special_tokens is True here.
sampling_params = SamplingParams(
                                 temperature=0.0,
                                 top_p=1,
                                 top_k=1,
                                 max_tokens=max_len,
                                 skip_special_tokens=True,
                                 stop=["\n"]
                                 )

In [None]:
generated_outputs = llm.generate(prompts,
                                 sampling_params)

In [None]:
# Keep the first candidate of every generation, stripped of surrounding whitespace.
translations = [generation.outputs[0].text.strip() for generation in generated_outputs]

len(translations)

In [None]:
print(*translations[:5], sep="\n")

# Save the translations

In [None]:
# !mkdir -p "/workspace/data/en-fr/translations-vllm"
# !mkdir -p "/workspace/data/en-pt/translations-vllm"
# !mkdir -p "/workspace/data/en-sw/translations-vllm"
# !mkdir -p "/workspace/data/sw-en/translations-vllm"

In [None]:
import os

data_path = "/workspace/data/"

# Output directory for the language pair -- keep exactly one line active.
translations_directory = os.path.join(data_path, "en-fr", "translations-vllm")

# translations_directory = os.path.join(data_path, "en-pt", "translations-vllm")

# translations_directory = os.path.join(data_path, "en-sw", "translations-vllm")

# translations_directory = os.path.join(data_path, "sw-en", "translations-vllm")

# Create the directory when it is missing: the shell `mkdir` lines for it are
# commented out, so `os.chdir` would otherwise fail on a fresh run.
os.makedirs(translations_directory, exist_ok=True)

os.chdir(translations_directory)
os.getcwd()

In [None]:
# ✳️ Change translations file name
# The name encodes the test domain, the model, the prompting mode (zero-shot or
# one-shot) and the target-language extension; keep exactly one line active.
# NOTE(review): every name here says "Llama-3.3-70b", while the model-loading
# cell currently activates "mistralai/Mistral-7B-v0.1" -- make sure the name
# matches the model and the prompt set actually used before saving.

# EN-FR - Medical
translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.fr"
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-one-shot.fr"

# EN-PT - Medical
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.pt"
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-one-shot.pt"

# EN-SW - Generic
# translations_file_name = "test-generic-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.sw"
# translations_file_name = "test-generic-translated-Llama-3.3-70b-baseline-vLLM-one-shot.sw"

# EN-SW - Medical
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.sw"
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-one-shot.sw"

# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-zero-shot-temp2.sw"
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-one-shot-temp2.sw"

# SW-EN - Generic
# translations_file_name = "test-generic-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.en"
# translations_file_name = "test-generic-translated-Llama-3.3-70b-baseline-vLLM-one-shot.en"

# SW-EN - Medical
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-zero-shot.en"
# translations_file_name = "test-medical-translated-Llama-3.3-70b-baseline-vLLM-one-shot.en"

In [None]:
# Write one stripped translation per line, so empty translations still occupy a line.
with open(translations_file_name, "w+", encoding="utf-8") as output:
    output.writelines(translation.strip() + "\n" for translation in translations)

In [None]:
# Sanity check on the saved file: count its lines and show the first three.
!wc -l $translations_file_name
!head -n 3 $translations_file_name