# Sample Packing vs. Unpacked Finetuning


This notebook compares Unsloth finetuning with and without TRL's sample packing. We train **unsloth/qwen2.5-0.5b** on a small slice of `yahma/alpaca-cleaned` to keep the run short.

In [7]:
import math
import time
from statistics import mean
from dataclasses import dataclass

import torch
from datasets import load_dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer, SFTConfig
from transformers import TrainerCallback

In [16]:
# Experiment configuration shared by the packed and unpacked runs.
DATASET_NAME = "yahma/alpaca-cleaned"
DATASET_SPLIT = "train[:4096]"  # small slice of the train split to keep the run short
MODEL_NAME = "unsloth/qwen2.5-0.5b"
# Used for model loading, tokenizer truncation, and SFTConfig.max_length.
MAX_SEQ_LENGTH = 2048
# Per-device batch sizes: run_experiment picks one depending on whether
# sample packing is enabled.
BATCH_SIZE_UNPACKED = 8
BATCH_SIZE_PACKED = 1
GRAD_ACCUM = 1  # gradient_accumulation_steps passed to SFTConfig
MAX_STEPS = 50  # both runs stop after this many training steps
LEARNING_RATE = 2e-4

In [17]:
def allow_overlength(module: torch.nn.Module) -> None:
    """Flag every module in the tree that exposes ``max_seq_length``.

    Visits ``module`` and all of its descendants. Each visited module that
    has a ``max_seq_length`` attribute gets ``_unsloth_allow_packed_overlength``
    set to ``True``; modules without that attribute are left untouched.

    Args:
        module: Root of the module tree to walk.
    """
    pending = [module]
    while pending:
        current = pending.pop()
        if hasattr(current, 'max_seq_length'):
            current._unsloth_allow_packed_overlength = True
        # Push children reversed so they are popped in registration order.
        pending.extend(reversed(list(current.children())))

# Prompt template with three positional `{}` placeholders, filled in order
# with the example's instruction, input, and output fields.
ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

@dataclass
class RunStats:
    """Summary of one training run produced by ``run_experiment``."""

    # 'sample_packing' or 'unpacked', depending on the run's mode.
    label: str
    # Wall-clock seconds measured around the trainer.train() call.
    train_runtime: float
    # 'train_steps_per_second' from the trainer metrics (NaN if missing).
    steps_per_second: float
    # 'train_samples_per_second' from the trainer metrics (NaN if missing).
    samples_per_second: float
    # Estimate derived from tokenized sample lengths, not a measured count.
    approx_tokens_per_step: float
    # approx_tokens_per_step * steps_per_second (NaN if the step rate is NaN).
    tokens_per_second: float
    # Estimated share (0-100) of the MAX_SEQ_LENGTH capacity left unfilled.
    padding_percent: float

def _estimate_token_usage(
    lengths: list[int],
    sequences_per_step: int,
    use_sample_packing: bool,
) -> tuple[float, float]:
    """Estimate tokens per optimizer step and padding percent from lengths.

    Args:
        lengths: Token count of every (truncated) training sample.
        sequences_per_step: Sequences consumed by one optimizer step, i.e.
            per-device batch size times gradient accumulation steps.
        use_sample_packing: Whether samples are packed into full windows.

    Returns:
        ``(approx_tokens_per_step, padding_percent)``. Padding is measured
        against a capacity of ``MAX_SEQ_LENGTH`` tokens per sequence, not
        against the longest sample of each batch.
    """
    total_tokens = sum(lengths)
    theoretical_capacity = MAX_SEQ_LENGTH * sequences_per_step

    if use_sample_packing:
        # Idealised packing: assume the corpus fills ceil(total / window) windows.
        packed_units = max(1, math.ceil(total_tokens / MAX_SEQ_LENGTH))
        avg_tokens_per_unit = total_tokens / packed_units
        approx_tokens_per_step = min(
            avg_tokens_per_unit * sequences_per_step, theoretical_capacity
        )
        padding_percent = max(
            0.0, 100.0 * (1.0 - (avg_tokens_per_unit / MAX_SEQ_LENGTH))
        )
    else:
        average_tokens_per_sample = total_tokens / len(lengths)
        approx_tokens_per_step = min(
            average_tokens_per_sample * sequences_per_step, theoretical_capacity
        )
        padding_percent = max(
            0.0, 100.0 * (1.0 - (approx_tokens_per_step / theoretical_capacity))
        )

    return approx_tokens_per_step, padding_percent


def run_experiment(use_sample_packing: bool) -> RunStats:
    """Finetune the model once and report throughput statistics.

    Loads a fresh copy of ``MODEL_NAME``, attaches LoRA adapters, formats the
    dataset slice with ``ALPACA_PROMPT``, and trains for ``MAX_STEPS`` steps
    with TRL's ``SFTTrainer``.

    Args:
        use_sample_packing: If true, train with ``packing=True`` and
            ``BATCH_SIZE_PACKED``; otherwise train unpacked with
            ``BATCH_SIZE_UNPACKED``.

    Returns:
        A ``RunStats`` record. Step and sample rates come from the trainer's
        metrics; token counts and padding are estimates computed from the
        tokenized sample lengths.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=False,
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj',
                        'gate_proj', 'up_proj', 'down_proj'],
        lora_alpha=16,
        lora_dropout=0.0,
        bias='none',
        use_gradient_checkpointing='unsloth',
        use_rslora=False,
        loftq_config=None,
    )
    if use_sample_packing:
        allow_overlength(model)

    raw_dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

    def formatting(example):
        # Render one record into the prompt template and terminate it with EOS.
        text = ALPACA_PROMPT.format(example['instruction'], example['input'], example['output'])
        return {'text': text + tokenizer.eos_token}

    dataset = raw_dataset.map(formatting, remove_columns=raw_dataset.column_names)

    # Tokenize once up front purely to measure sample lengths for the estimates.
    text_samples = [example['text'] for example in dataset]
    encodings = tokenizer(text_samples, truncation=True, max_length=MAX_SEQ_LENGTH, return_length=True)
    lengths = encodings['length']

    batch_size = BATCH_SIZE_PACKED if use_sample_packing else BATCH_SIZE_UNPACKED
    # The trainer's steps-per-second counts optimizer steps, and each optimizer
    # step consumes GRAD_ACCUM micro-batches, so the per-step token estimate
    # must cover all of them.
    sequences_per_step = batch_size * GRAD_ACCUM
    approx_tokens_per_step, padding_percent = _estimate_token_usage(
        lengths, sequences_per_step, use_sample_packing
    )

    training_args = SFTConfig(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=GRAD_ACCUM,
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=5,
        output_dir='outputs/sample-packing-demo',
        report_to='none',
        max_length=MAX_SEQ_LENGTH,
        dataset_num_proc=2,
        packing=use_sample_packing,
    )

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset,
        args=training_args,
    )

    torch.cuda.empty_cache()
    start = time.perf_counter()
    train_output = trainer.train()
    elapsed = time.perf_counter() - start

    steps_per_second = train_output.metrics.get('train_steps_per_second', float('nan'))
    samples_per_second = train_output.metrics.get('train_samples_per_second', float('nan'))

    # Leave the token rate as NaN when the trainer did not report a step rate.
    tokens_per_second = float('nan')
    if not math.isnan(steps_per_second):
        tokens_per_second = approx_tokens_per_step * steps_per_second

    return RunStats(
        label='sample_packing' if use_sample_packing else 'unpacked',
        train_runtime=elapsed,
        steps_per_second=steps_per_second,
        samples_per_second=samples_per_second,
        approx_tokens_per_step=approx_tokens_per_step,
        tokens_per_second=tokens_per_second,
        padding_percent=padding_percent,
    )

In [18]:
# Run the unpacked baseline first, then the packed variant. Each call loads
# its own model and trainer, so the two runs do not share training state.
stats_unpacked = run_experiment(use_sample_packing=False)
stats_packed = run_experiment(use_sample_packing=True)

==((====))==  Unsloth 2025.10.2: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3080. Num GPUs = 1. Max memory: 9.641 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,096 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 8,798,208 of 502,830,976 (1.75% trained)


Step,Training Loss
5,1.7325
10,1.3161
15,1.1729
20,1.1488
25,1.2461
30,1.3541
35,1.0775
40,1.06
45,1.2404
50,1.2035


==((====))==  Unsloth 2025.10.2: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3080. Num GPUs = 1. Max memory: 9.641 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 378 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 8,798,208 of 502,830,976 (1.75% trained)


Step,Training Loss
5,1.1326
10,1.2638
15,1.3252
20,1.2695
25,1.0941
30,1.2077
35,1.1044
40,1.164
45,1.1013
50,1.189


In [19]:
import pandas as pd

# One row per run, unpacked first; columns follow the RunStats field order.
pd.DataFrame([vars(run) for run in (stats_unpacked, stats_packed)])

Unnamed: 0,label,train_runtime,steps_per_second,samples_per_second,approx_tokens_per_step,tokens_per_second,padding_percent
0,unpacked,17.346902,3.081,24.647,1496.746094,4611.474715,90.864587
1,sample_packing,13.674591,3.981,3.981,2043.557333,8135.401744,0.216927


## Observations
- Sample packing reduces the number of optimizer steps needed to cover the dataset.
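- Estimated tokens per second rise sharply (about 4.6k → 8.1k): each packed batch (batch size 1) fills almost the entire `max_seq_length` window, whereas an unpacked batch of 8 carries only about 1.5k real tokens. These figures are derived from average sample lengths, not measured token counts.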
- Training loss remains comparable, which is consistent with TRL's masking keeping packed sequences isolated (a 50-step run is too short to prove it).
