In [None]:
# Installing these packages may require a kernel restart, so it is better to run this command in a terminal first.
# !pip3 install vllm datasets sacrebleu unbabel-comet polars

In [None]:
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process before the model libraries are imported.
# NOTE(review): presumably this silences the Hugging Face tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Data

In [None]:
# Full language names (not ISO codes); they are interpolated into the translation prompt.
full_src_lang = "Czech"
full_tgt_lang = "German"

In [None]:
from datasets import load_dataset

# Hub identifier of the sentence-level parallel corpus used for evaluation.
dataset_name = "ymoslem/news-commentary-cs-de"

# Load the full "train" split, shuffle it, then carve out a fixed
# 500-example held-out portion (both steps are seeded).
train_split = load_dataset(dataset_name, split="train")
splits = train_split.shuffle(seed=0).train_test_split(test_size=500, seed=0)

# Only the held-out "test" portion is used from here on.
dataset = splits["test"]

dataset

In [None]:
# Source-side sentences of the evaluation set.
source_sentences = dataset["source"]

# Every prompt has the shape: instruction line, newline, sentence, newline.
prompt = f"Translate the following text from {full_src_lang} to {full_tgt_lang}:"
prompts = ["\n".join((prompt, sent, "")) for sent in source_sentences]

print(prompts[0])

In [None]:
# Reference translations from the "target" column of the same dataset rows as the sources.
references = dataset["target"]
# Display the first reference as a quick sanity check.
references[0]

In [None]:
def define_max_len(sentences, multiplier=2):
    """Estimate a length budget from the longest input sentence.

    Length is measured in whitespace-separated words (``str.split()``), not
    in model tokens; the longest word count is then scaled by ``multiplier``
    to leave headroom.

    Args:
        sentences: Iterable of strings to measure.
        multiplier: Integer factor applied to the longest word count.
            Defaults to 2, the previously hard-coded value.

    Returns:
        A ``(max_len, longest_idx)`` tuple: the scaled word count and the
        index of the longest sentence. When several sentences tie for the
        longest, the highest index among them is returned.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    # Tuples compare by word count first and index second, which is what
    # produces the "highest index wins" tie-breaking documented above.
    longest = max(
        ((len(sent.split()), idx) for idx, sent in enumerate(sentences)),
        default=None,
    )
    if longest is None:
        raise ValueError("define_max_len() requires at least one sentence")

    longest_len, longest_idx = longest
    return longest_len * multiplier, longest_idx

# Derive the length budget from the source sentences; max_len is reused later
# as the generation limit, and longest_idx points at the longest source sentence.
max_len, longest_idx = define_max_len(source_sentences)

print(max_len)

# Model

In [None]:
from vllm import LLM, SamplingParams
import os
import torch


# Only referenced by the commented-out fine-tuned checkpoint name below.
layers = 24  # 16, 20, 24

model_name = "CohereLabs/aya-expanse-8b"
# model_name = f"ymoslem/wmt25-cs-de-{layers}layers-2e-05-100k-news-commentary-sentences"

# Shard across every visible GPU; AWQ checkpoints are recognised purely by name.
num_gpus = torch.cuda.device_count()
awq = "-awq" in model_name.lower()  # verify based on your model
max_model_len = 4096

print(f"Model name: {model_name}")
print(f"Number of GPUs: {num_gpus}")
print(f"Max length: {max_len}")
print(f"AWQ: {awq}\n")

# Arguments shared by both loading paths.
llm_kwargs = dict(
    model=model_name,
    # download_dir=model_directory,
    trust_remote_code=True,
    tensor_parallel_size=num_gpus,
    max_model_len=max_model_len,
)

# AWQ checkpoints are loaded with quantization="awq_marlin";
# everything else is loaded with dtype bfloat16.
if awq:
    llm_kwargs["quantization"] = "awq_marlin"
else:
    llm_kwargs["dtype"] = torch.bfloat16

llm = LLM(**llm_kwargs)

# Translation

In [None]:
from tqdm.auto import tqdm

print(f"Translating {len(prompts)} prompts...")

# Fetch the tokenizer once: it is needed for the EOS id and for every prompt.
tokenizer = llm.get_tokenizer()

# Set up sampling parameters
sampling_params = SamplingParams(
    temperature=0.0,  # Deterministic generation
    max_tokens=max_len,
    stop_token_ids=[tokenizer.eos_token_id],
)

# Wrap each prompt as a single-turn user message and render it with the
# model's chat template (if using instruct model). The loop variable is named
# `user_prompt` so it does not overwrite the notebook-level `prompt`.
# NOTE(review): if the chat template already emits a BOS token, passing the
# rendered string to llm.generate may add a second one — verify for this model.
formatted_prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for user_prompt in tqdm(prompts, desc="Formatting prompts")
]

# Generate all responses at once (vLLM handles batching internally)
print("Generating responses...")
batch_outputs = llm.generate(formatted_prompts, sampling_params)

# Keep the first candidate of each request, stripped of surrounding whitespace
translations = [output.outputs[0].text.strip() for output in batch_outputs]

# The evaluation cells pair translations with references by position.
assert len(translations) == len(prompts), "expected one translation per prompt"

print(f"Generated {len(translations)} responses")

In [None]:
# Display the first translation as a quick sanity check.
translations[0]

In [None]:
# # Optional: Save the translations to a file
# with open("output.txt", "w") as output:
#     for sentence in translations:
#         output.write(sentence.strip() + "\n")

In [None]:
# Release memory

def release_memory(model=None):
    """Run garbage collection and release PyTorch's unused cached CUDA memory.

    Rebinding the ``model`` parameter inside this function cannot free the
    object while the caller still holds a reference to it. Callers must drop
    their own references (e.g. ``llm = None`` or ``del llm``) *before*
    calling this function.

    Args:
        model: Optional; kept for backward compatibility. Only this
            function's own reference to it is dropped.
    """
    import gc

    del model  # drops the local reference only; the caller's name is unaffected
    gc.collect()
    torch.cuda.empty_cache()


# Drop the notebook-level reference first so the engine can actually be
# garbage-collected, then reclaim the memory.
llm = None
release_memory()

# Evaluation

In [None]:
from sacrebleu.metrics import CHRF

# CHRF with word_order=2; reported as "chrF++" in the results table.
chrf = CHRF(word_order=2)

# Corpus-level score of all translations against the single reference set,
# rounded to two decimals.
chrf_score = round(chrf.corpus_score(translations, [references]).score, 2)

# Start the score list afresh with the chrF score as its first entry.
all_scores = [chrf_score]

chrf_score

In [None]:
from comet import download_model, load_from_checkpoint

# COMET checkpoints to evaluate, in the order their scores are appended.
comet_model_names = ["Unbabel/wmt20-comet-da", "Unbabel/wmt22-comet-da"]

# Keep only the first entry (the chrF score) so that re-running this cell
# does not append a second set of COMET scores.
del all_scores[1:]

# The inputs are the same for every COMET model, so build them once.
data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(source_sentences, translations, references)
]

for comet_model_name in comet_model_names:

    # Download the checkpoint and load it onto the GPU
    model_path = download_model(comet_model_name)
    comet_model = load_from_checkpoint(model_path).to("cuda")

    assert comet_model.device.type == "cuda"

    # Calculate COMET scores
    model_output = comet_model.predict(data, batch_size=8, gpus=1)
    comet_scores = model_output.scores  # segment-level scores of the latest model
    comet_corpus_score = round(model_output.system_score * 100, 2)
    all_scores.append(comet_corpus_score)

    # Drop this cell's reference before collecting; passing the model to
    # release_memory() while `comet_model` still points at it frees nothing.
    del comet_model
    release_memory(None)

    print(comet_model_name)
    print(f"Corpus COMET score: {comet_corpus_score}")

In [None]:
import polars as pl

# Show which model the scores below belong to.
print(model_name)

# One row of results; column names follow the order in which the scores were appended.
metric_columns = ["chrF++", "COMET20", "COMET22"]
df = pl.DataFrame([all_scores], schema=metric_columns, orient="row")

df