In [1]:
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from peft import LoraConfig

# Release any unused memory held by PyTorch's CUDA caching allocator before the model is loaded
torch.cuda.empty_cache()

In [2]:

# Keep the run small for testing: take only the first 100 examples of the train split
dataset = load_dataset(
    "trl-lib/Capybara",
    split="train[:100]",
)

In [3]:
# LoRA adapter settings handed to the trainer; target_modules is left unset here
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    r=8,                    # rank of the LoRA update matrices
    lora_alpha=16,          # LoRA scaling factor
    lora_dropout=0.05,      # dropout probability applied in the LoRA layers
    bias="none",            # bias training strategy
)

In [4]:
# Supervised fine-tuning settings: how the model is loaded and how training runs
training_args = SFTConfig(
    output_dir="./output",
    # Keyword arguments used when the trainer instantiates the model
    model_init_kwargs={
        "dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",  # flash attention for packing support
    },
    # Sequence handling
    packing=True,
    max_length=512,
    # Optimization schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    # Batch sizing: 1 sequence per device per step, gradients accumulated over 4 steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    auto_find_batch_size=True,
)

In [5]:

# Build the SFT trainer: model given by name, LoRA supplied through peft_config
trainer = SFTTrainer(
    args=training_args,
    model="Qwen/Qwen3-0.6B",
    peft_config=peft_config,
    train_dataset=dataset,
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [6]:

# Report current CUDA memory usage and the LoRA settings in effect
BYTES_PER_GIB = 1024**3  # divisor converting the byte counts below for display
allocated_gib = torch.cuda.memory_allocated() / BYTES_PER_GIB
reserved_gib = torch.cuda.memory_reserved() / BYTES_PER_GIB

print(f"GPU memory allocated: {allocated_gib:.2f} GB")
print(f"GPU memory reserved : {reserved_gib:.2f} GB")
print(f"PEFT config: LoRA with rank={peft_config.r}, alpha={peft_config.lora_alpha}")
print(f"Target modules: {peft_config.target_modules}")

GPU memory allocated: 1.12 GB
GPU memory reserved : 1.41 GB
PEFT config: LoRA with rank=8, alpha=16
Target modules: {'v_proj', 'q_proj'}


In [7]:
# Run fine-tuning; keep the returned summary and show it as the cell's output
train_result = trainer.train()
train_result

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,2.053
20,1.7392
30,1.7274
40,1.7255
50,1.692
60,1.6657


TrainOutput(global_step=66, training_loss=1.754764527985544, metrics={'train_runtime': 74.9947, 'train_samples_per_second': 3.52, 'train_steps_per_second': 0.88, 'total_flos': 350505765568512.0, 'train_loss': 1.754764527985544, 'entropy': 1.6358890558282535, 'num_tokens': 132282.0, 'mean_token_accuracy': 0.6129788483182589, 'epoch': 3.0})