In [None]:
# Wrap the loaded model with LoRA adapters using Unsloth's PEFT helper.
# The adapted modules are listed in two groups and concatenated in order.
attn_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_modules = ["gate_proj", "up_proj", "down_proj"]

lora_settings = dict(
    r=16,  # adapter rank; suggested values: 8, 16, 32, 64, 128
    lora_alpha=32,  # 2 * r here
    target_modules=attn_modules + mlp_modules,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",  # saves memory
    random_state=3407,
    use_rslora=False,  # Rank-Stabilized LoRA switched off
    loftq_config=None,  # no LoftQ configuration supplied
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Hyperparameters for a short demonstration run.
# Mixed precision: prefer bf16 when the GPU reports support, otherwise use fp16.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Output location, reporting and reproducibility
    output_dir="outputs_llama3",
    report_to="none",  # switch to "wandb" or "tensorboard" to enable logging
    logging_steps=1,
    seed=3407,
    # Batching: 2 samples per device x 4 accumulation steps = effective batch of 8
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Run length: max_steps gives a quick test; use num_train_epochs for a full run
    max_steps=100,
    # num_train_epochs=1,  # uncomment for a full training run
    # Optimisation schedule
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    optim="paged_adamw_8bit",  # memory-efficient optimizer
    # Exactly one of the two precision flags is enabled
    bf16=bf16_available,
    fp16=not bf16_available,
)

# Build the supervised fine-tuning trainer from the prepared dataset splits.
train_split = final_dataset["train"]
eval_split = final_dataset["test"]

trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    dataset_text_field="text",  # dataset column holding the formatted prompts
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing disabled (enabling it can speed up training on short sequences)
)

# Run fine-tuning, bracketed by progress banners; keep the returned stats.
print("--- Starting Llama 3.2 Fine-Tuning ---")
trainer_stats = trainer.train()
print("--- Fine-Tuning Complete ---")

In [None]:
# --- Saving the trained LoRA adapter ---
# Write the adapter and the tokenizer side by side into one directory,
# independent of anything the trainer may have written under its output_dir.
for artifact in (model, tokenizer):
    artifact.save_pretrained("llama3_risk_analyst_lora")

# Optional: publish both to the Hugging Face Hub (requires authentication)
# model.push_to_hub("your_hf_username/llama3_risk_analyst_lora", token=HF_TOKEN)
# tokenizer.push_to_hub("your_hf_username/llama3_risk_analyst_lora", token=HF_TOKEN)

# --- Merging the adapter for deployment ---
# To create a standalone model, merge the LoRA weights into the base weights.
# This is useful for inference endpoints that don't support PEFT adapters directly.
#
# The base model must be loaded in 16-bit precision for the merge. Merging into
# bitsandbytes 4-bit layers goes through a dequantize/requantize round trip,
# which adds rounding error and leaves the merged model quantized instead of
# producing standalone 16-bit weights.
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",  # 16-bit weights, not the -bnb-4bit build
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=False,  # keep the base weights unquantized so the merge is exact
)
# NOTE(review): this rebinds `model`/`tokenizer`; the fine-tuned 4-bit model is
# still referenced by `trainer`, so both copies may be resident at once — confirm
# the GPU has room, or restart the kernel before running this cell.

# Attach the saved adapter to the 16-bit base, then fold it into the base weights
from peft import PeftModel
model = PeftModel.from_pretrained(model, "llama3_risk_analyst_lora")
model = model.merge_and_unload()

# Now `model` is a standalone, fine-tuned model.
# You can save this merged model for deployment.
# model.save_pretrained("llama3_risk_analyst_merged")
# tokenizer.save_pretrained("llama3_risk_analyst_merged")