# Supervised Fine-Tuning (SFT) of Qwen3-1.7B for Document Summarization

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; the saving
# cell later reads HUGGINGFACE_ACCESS_TOKEN through os.getenv.
load_dotenv()

True

## Load LLM

In [3]:
# Maximum sequence length handed to both the model loader and the SFT trainer.
max_seq_length = 40960

In [4]:
import torch
from unsloth import FastLanguageModel

# Checkpoint id of the 4-bit Qwen3-1.7B build hosted under the `unsloth` namespace.
base_checkpoint = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_checkpoint,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-30 03:14:30 [__init__.py:243] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.9.0.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# Projection layers that receive LoRA adapters, grouped by sub-module.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Adapter rank and scaling numerator are kept equal.
lora_rank = 256
lora_alpha = 256

# Wrap the base model with LoRA adapters on all of the projections listed above.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    lora_alpha=lora_alpha,
    target_modules=attention_projections + mlp_projections,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2025.5.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Dataset

In [6]:
# Alpaca-style prompt template with two positional `{}` slots: the first is filled
# with the input text, the second with the response (passed as "" at inference time
# so the model writes the response itself).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights

### Input:
{}

### Response:
{}"""


# End-of-sequence token string of the loaded tokenizer; appended to every training
# sample by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of examples into complete training prompts.

    Args:
        examples: Batched mapping holding parallel sequences under the
            "instruction" and "answer" keys.

    Returns:
        A dict with a single "text" key containing one formatted prompt per
        example, each terminated by ``EOS_TOKEN``.
    """
    source_texts = examples["instruction"]
    target_texts = examples["answer"]
    # The trailing EOS token is required; without it generation never stops.
    rendered = [
        alpaca_prompt.format(source_text, target_text) + EOS_TOKEN
        for source_text, target_text in zip(source_texts, target_texts)
    ]
    return {"text": rendered}

In [7]:
from datasets import load_dataset

# Load the summarization dataset, then run the prompt formatter over it in
# batches so that every split gains the rendered "text" column.
dataset = load_dataset("tuanlda78202/leo_summarization_task").map(
    formatting_prompts_func, batched=True
)

## Training

In [8]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings. Effective batch size is 16 x 4 = 64 sequences per step.
# No warmup steps and no max_steps cap are set; the run lasts exactly one epoch.
training_args = TrainingArguments(
    output_dir="sft_outputs",
    report_to="comet_ml",
    seed=3407,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the rendered "text" column of the train split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=True,  # Can make training 5x faster for short sequences
)

In [9]:
# Record GPU capacity and the memory already reserved before training starts;
# the final report cell subtracts this baseline. Sizes are bytes / 1024**3.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.638 GB.
2.4 GB of memory reserved.


In [10]:
# Run fine-tuning; the returned stats object exposes metrics["train_runtime"],
# which the report cell prints.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 448 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 4 x 1) = 64
 "-____-"     Trainable parameters = 278,921,216/7,000,000,000 (3.98% trained)
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/tuanlda78202/leo/495f593d3ce849bba24201b483b09e61



Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0316
2,1.7388
3,1.6488
4,1.3835
5,1.3175
6,1.1782
7,1.2225


In [11]:
# @title Show final memory and time stats
# Peak reserved memory after training, in absolute terms and relative to the
# pre-training baseline captured in `start_gpu_memory`.
train_seconds = trainer_stats.metrics["train_runtime"]
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

295.978 seconds used for training.
4.93 minutes used for training.
Peak reserved memory = 5.832 GB.
Peak reserved memory for training = 3.432 GB.
Peak reserved memory % of max memory = 50.112 %.
Peak reserved memory for training % of max memory = 29.49 %.


## Inference

In [12]:
from transformers import TextStreamer

# Switch the fine-tuned model to Unsloth's inference path before generating.
FastLanguageModel.for_inference(model)
# Streams decoded text to stdout while tokens are being generated.
text_streamer = TextStreamer(tokenizer)

# NOTE(review): this value is not referenced in this cell, so the prompt is not
# truncated to it; it duplicates `max_seq_length` from the loading section.
max_qwen3_seq_length = 40960
# Upper bound on the number of tokens generated per call.
max_new_tokens = 512


def generate_text(instruction):
    """Generate a summary for one input text, streaming the output as it is produced.

    Args:
        instruction: Text inserted into the input slot of ``alpaca_prompt``; the
            response slot is left empty so the model completes it.

    Returns:
        The tensor of token ids returned by ``model.generate``.
    """
    prompt = alpaca_prompt.format(instruction, "")
    # Place the inputs on whatever device the model lives on instead of assuming "cuda".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    return model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )


generate_text(dataset["validation"][1]["instruction"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights

### Input:
# Code

# SFT

- [https://www.philschmid.de/fine-tune-llms-in-2025](https://www.philschmid.de/fine-tune-llms-in-2025)
	- [https://www.reddit.com/r/LocalLLaMA/comments/1eg0cap/is_lora_finetuning_sometimes_less_effective_than/](https://www.reddit.com/r/LocalLLaMA/comments/1eg0cap/is_lora_finetuning_sometimes_less_effective_than/)
	- [https://www.reddit.com/r/LocalLLaMA/comments/17pw7bv/eternal_question_what_rank_r_and_alpha_to_use_in/](https://www.reddit.com/r/LocalLLaMA/comments/17pw7bv/eternal_question_what_rank_r_and_alpha_to_use_in/)
	- [https://www.youtube.c

TL;DR

Fine-tuning LLMs is a critical step in achieving specific tasks. The key findings from the provided documents highlight the importance of choosing the right approach, such as LORA, QLoRA, or full fine-tuning, depending on the task and computational resources. Additionally, the documents emphasize the need for careful calibration of hyperparameters like rank, alpha, and the use of tools like DeepSpeed-Zero and DeepSpeed-LoRA for efficient fine-tuning. The response provides a concise summary of the most significant insights in markdown format, highlighting the most important findings.
```markdown
# TL;DR

Fine-tuning LLMs is a critical step in achieving specific tasks. The key findings from the provided documents highlight the importance of choosing the right approach, such as LORA, QLoRA, or full fine-tuning, depending on the task and computational resources. Additionally, the documents emphasize the need for careful calibration of hyperparameters like rank, alpha, and the use of

tensor([[38214,   374,   458,  ...,  1159,   592,  4711]], device='cuda:0')

## Saving

In [13]:
import os

model_name = "Qwen3-1.7B-Leo-Summarization"

# Both exports use the merged 16-bit format (float16 for vLLM).
merged_format = "merged_16bit"

# Local export of the merged model and tokenizer.
model.save_pretrained_merged(model_name, tokenizer, save_method=merged_format)

# Hub upload of the merged model; the access token is read from the environment.
model.push_to_hub_merged(
    model_name,
    tokenizer=tokenizer,
    save_method=merged_format,
    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.35 out of 15.49 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 59.57it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: You are pushing to hub, but you passed your HF username = tuanlda78202.
We shall truncate tuanlda78202/Qwen3-1.7B-Leo-Summarization to Qwen3-1.7B-Leo-Summarization


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.05 out of 15.49 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 70.31it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.


  0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/tuanlda78202/Qwen3-1.7B-Leo-Summarization
