In [None]:
# Identifier of the dataset passed to `load_dataset` (presumably a Hugging Face
# Hub repo id — confirm).
dataset_name = "tuanlda78202/leo_summarization_task"
# Identifier of the checkpoint passed to `FastLanguageModel.from_pretrained`.
model_name = "tuanlda78202/Qwen3-1.7B-Leo-Summarization"

In [None]:
from unsloth import FastLanguageModel

# Load the checkpoint named by `model_name` together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    # NOTE(review): 40960 is a magic number — presumably sized so a whole
    # document fits in the prompt; confirm against the model's context limit.
    max_seq_length=40960,
    # Requests 4-bit loading (presumably quantized weights to cut GPU memory
    # use — see the Unsloth docs).
    load_in_4bit=True,
)
# Switch the model into Unsloth's inference mode before any generate() call.
# NOTE(review): as the last expression of the cell, this call's return value
# (if any) is displayed by Jupyter.
FastLanguageModel.for_inference(model)

In [None]:
from datasets import load_dataset

# Only the "test" split is loaded; rows are later accessed by integer index
# and their "instruction" field is fed to the model.
dataset = load_dataset(dataset_name, split="test")

In [None]:
# Prompt template with two positional `{}` placeholders: the first (under
# "### Input:") receives the text to summarize, the second (under
# "### Response:") is filled with "" at inference time so the model writes it.
# The task instruction itself is fixed inside the template.
# NOTE(review): presumably this must match the template used for fine-tuning
# byte-for-byte — do not reword it without confirming.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights

### Input:
{}

### Response:
{}"""

In [None]:
from transformers import TextStreamer

# Shared streamer handed to `model.generate(streamer=...)` when streaming is
# requested. NOTE(review): with default arguments TextStreamer presumably echoes
# the prompt as well as the generated tokens — pass `skip_prompt=True` if only
# the summary should be shown; confirm against the transformers docs.
text_streamer = TextStreamer(tokenizer)


def generate_text(
    instruction,
    streaming: bool = True,
    trim_input_message: bool = False,
    max_new_tokens: int = 256,
):
    """Run the model on one piece of text using the ``alpaca_prompt`` template.

    ``instruction`` is substituted into the first placeholder of
    ``alpaca_prompt`` (the ``### Input:`` slot); the second placeholder (the
    ``### Response:`` slot) is left empty for the model to complete.

    Args:
        instruction: Text inserted into the template's input slot.
            NOTE(review): despite the name, this is not the task instruction
            (that is fixed in the template); the dataset's ``instruction``
            column presumably holds the document to summarize — confirm.
        streaming: If True, pass ``text_streamer`` to ``model.generate`` and
            return the raw result of ``generate`` without decoding.
        trim_input_message: Only used when ``streaming`` is False. If True,
            return just the newly generated text, without the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 256, the value that used to be hard-coded.

    Returns:
        When ``streaming`` is True, whatever ``model.generate`` returns.
        Otherwise a ``str``: the decoded prompt plus completion, or only the
        completion when ``trim_input_message`` is True.
    """
    message = alpaca_prompt.format(instruction, "")

    # Place the inputs on whatever device the model lives on instead of
    # assuming the literal "cuda".
    inputs = tokenizer([message], return_tensors="pt").to(model.device)

    generation_kwargs = dict(max_new_tokens=max_new_tokens, use_cache=True)

    if streaming:
        return model.generate(**inputs, streamer=text_streamer, **generation_kwargs)

    output_tokens = model.generate(**inputs, **generation_kwargs)

    if trim_input_message:
        # generate() returns the prompt tokens followed by the new tokens, so
        # cut at the token boundary. Slicing the decoded string by
        # len(message) is unreliable: decoding with skip_special_tokens=True
        # is not guaranteed to reproduce the prompt character-for-character.
        prompt_length = inputs["input_ids"].shape[1]
        output_tokens = output_tokens[:, prompt_length:]

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

In [None]:
# Stream the model's output for one test example (row 11 — an arbitrary pick).
# The return value is bound to `_` so the cell does not display it.
_ = generate_text(dataset[11]["instruction"], streaming=True)