In [1]:
# Identifiers handed to `load_dataset` and to vLLM's `LLM(model=...)` below.
dataset_name: str = "tuanlda78202/leo_summarization_task"
model_name: str = "tuanlda78202/Qwen3-1.7B-Leo-Summarization"

# How many leading rows of the test split are evaluated.
max_evaluation_samples: int = 8

In [None]:
from vllm import LLM

# Build the vLLM engine for the checkpoint named by `model_name`.
# Arguments are grouped by concern; keyword order has no effect on the call.
llm = LLM(
    model=model_name,
    # Weight precision and the bitsandbytes quantized-loading path.
    dtype="float16",
    quantization="bitsandbytes",
    load_format="bitsandbytes",
    # Context-length limit requested from the engine.
    max_model_len=4096,
)

In [None]:
from datasets import load_dataset

# Alpaca-style template with two positional slots: the first receives the
# document to summarize, the second the response text.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights

### Input:
{}

### Response:
{}"""


def format_sample(sample: dict) -> str:
    """Render the inference prompt for one dataset row.

    Args:
        sample: A mapping that must contain an ``"instruction"`` key; its
            value is placed in the template's input slot.

    Returns:
        The filled-in template with an empty response slot, so the prompt
        ends right after the ``### Response:`` header.
    """
    document = sample["instruction"]
    empty_response = ""
    return alpaca_prompt.format(document, empty_response)

In [None]:
# Load the test split of the configured dataset.
dataset = load_dataset(dataset_name, split="test")

# Keep only the leading rows. The count is clamped to the split size because
# `select` rejects out-of-range indices, which would otherwise fail whenever
# the split holds fewer than `max_evaluation_samples` rows.
dataset = dataset.select(range(min(max_evaluation_samples, len(dataset))))

# Attach the rendered prompt for every row under a new "prompt" column.
dataset = dataset.map(lambda sample: {"prompt": format_sample(sample)})

In [None]:
from vllm import SamplingParams

# Decoding configuration. NOTE(review): temperature 0.0 requests greedy
# decoding, so top_p / min_p are presumably inert here — confirm against the
# installed vLLM version before relying on them.
sampling_params = SamplingParams(
    temperature=0.0, top_p=0.95, min_p=0.05, max_tokens=4096
)

# Materialize the prompt column as a plain list of strings before handing it
# to vLLM: depending on the installed `datasets` version, column access may
# return a lazy column object rather than a list.
predictions = llm.generate(list(dataset["prompt"]), sampling_params)

In [None]:
# Display the text of the first candidate generated for the first prompt.
predictions[0].outputs[0].text

In [None]:
# Reduce every generation result to the text of its first candidate,
# preserving the order of `predictions`, then show the first one.
answers = [generation.outputs[0].text for generation in predictions]
answers[0]

## Eval metrics

In [None]:
import evaluate
import numpy as np

# Load the "rouge" metric once; compute_metrics below calls rouge.compute on it.
rouge = evaluate.load("rouge")


def compute_metrics(predictions: list[str], references: list[str]) -> dict[str, float]:
    """Score generated texts against reference texts with ROUGE.

    Args:
        predictions: Generated texts, one per evaluated sample.
        references: Reference texts, aligned index-by-index with
            ``predictions``.

    Returns:
        A dict holding every score returned by ``rouge.compute`` (called with
        stemming enabled) plus ``"mean_len"``, the average length of the
        predictions in characters. All values are built-in floats rounded to
        four decimal places.
    """
    scores = rouge.compute(
        predictions=predictions, references=references, use_stemmer=True
    )
    scores["mean_len"] = np.mean([len(p) for p in predictions])

    # Cast to a built-in float before rounding: round() on a NumPy scalar
    # yields another NumPy scalar, which shows up as `np.float64(...)` when
    # the resulting dict is printed under NumPy 2.
    return {name: round(float(value), 4) for name, value in scores.items()}

In [None]:
# Reference texts for scoring, read from the "answer" column. The column is
# materialized as a plain list so the metric code receives list[str] even when
# the installed `datasets` version returns a lazy column object.
references = list(dataset["answer"])
references[0]

In [None]:
# Score the generated answers against the references and print the result.
validation_metrics = compute_metrics(predictions=answers, references=references)
print(validation_metrics)