In [None]:
from transformers.utils import logging

logging.set_verbosity_error()

import numpy as np
import pandas as pd
import sacrebleu
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.model import load_lora_model

tqdm.pandas()

In [None]:
# Load the model and tokenizer through the project helper from the saved checkpoint.
model, tokenizer = load_lora_model("checkpoints/checkpoint-10000")

# Evaluate on a random subset of at most 5000 test rows. The sample is seeded so
# that repeated runs (and different checkpoints) are scored on the same rows, and
# the size is capped at the number of available rows so a smaller test file does
# not make `sample` raise ValueError.
test_data = pd.read_csv('data/processed/test.csv')
test_data = test_data.sample(n=min(5000, len(test_data)), random_state=0)

def translate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    source_text: str,
    source_language: str,
    target_language: str,
    max_new_tokens: int = 256,
) -> str:
    """Translate ``source_text`` by prompting ``model`` with a chat-formatted request.

    A system message carrying the translation instruction and a user message
    carrying the text are rendered with the tokenizer's chat template, the
    model generates a continuation, and only that continuation is decoded.

    Args:
        model: Causal language model used for generation; inputs are moved to
            ``model.device`` before generating.
        tokenizer: Tokenizer providing the chat template, encoding and decoding.
        source_text: Text to translate.
        source_language: Name of the source language, inserted into the instruction.
        target_language: Name of the target language, inserted into the instruction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded newly generated tokens (prompt excluded), with special
        tokens skipped.
    """
    messages = [
        {"role": "system", "content": f"Translate the following text from {source_language} to {target_language}:\n"},
        {"role": "user", "content": source_text},
    ]

    # add_generation_prompt=True makes the template end the prompt with its
    # assistant-turn header, so generation starts a reply instead of
    # continuing the user turn or emitting the role header itself.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # NOTE(review): the rendered template may already contain special tokens
    # (e.g. BOS) that the tokenizer call adds again -- confirm whether
    # add_special_tokens=False is appropriate for this tokenizer/checkpoint.
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)

    output_tokens = model.generate(**tokens, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the model's reply is decoded.
    prompt_length = tokens["input_ids"].shape[1]
    return tokenizer.decode(output_tokens[0, prompt_length:], skip_special_tokens=True)
     
# Translate every sampled row. Gradient tracking is disabled because this is
# inference only; progress_apply shows a tqdm progress bar over the rows.
with torch.no_grad():
    predicted_text = test_data.progress_apply(
        lambda row: translate(
            model,
            tokenizer,
            row["source_text"],
            row["source_language"],
            row["target_language"],
        ),
        axis=1,
    )
predicted_text = predicted_text.to_list()

# Reference translations, in the same row order as the predictions.
target_text = test_data["target_text"].to_list()

# One sentence-level BLEU score per (prediction, reference) pair; each
# prediction is scored against its single reference.
bleu_scores = np.fromiter(
    (
        sacrebleu.sentence_bleu(hypothesis, [reference]).score
        for hypothesis, reference in zip(predicted_text, target_text)
    ),
    dtype=np.float64,
)

# Report the average of the per-sentence scores.
print(f"Mean BLEU score: {bleu_scores.mean()}")

  0%|          | 10/5000 [00:03<26:16,  3.17it/s]