In [None]:
!pip install torch unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loader settings (also read by the merge cell further down) ---
# Quantize to 4 bits to cut memory usage.
load_in_4bit = True
# None = auto-detect: Float16 for T4/V100, Bfloat16 for Ampere+.
dtype = None
# Free to choose; RoPE scaling is supported automatically.
max_seq_length = 2048

# --- Llama 3.2 3B Instruct, from unsloth's repository ---
# The pre-quantized checkpoint downloads faster and avoids OOM errors.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
)

In [None]:
# Imports for this cell, grouped at the top instead of scattered between statements.
# unsloth is imported first, matching the import order used earlier in the notebook.
from unsloth import FastLanguageModel
from peft import PeftModel

# --- Saving the trained LoRA adapter ---
# NOTE(review): this assumes `model` is the LoRA-wrapped model produced by a
# training step that is not visible in this section — confirm before running,
# otherwise save_pretrained() writes whatever `model` currently is.
# The trainer automatically saves the adapter to the output_dir.
# You can also save it manually:
# NOTE(review): the directory says "gemini2" while the base model and the Hub
# names below say "llama3"; the path is kept unchanged so existing references
# to it keep working — consider renaming everywhere in one go.
ADAPTER_DIR = "gemini2_risk_analyst_lora"
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

# To push to Hugging Face Hub (if authenticated)
# model.push_to_hub("your_hf_username/llama3_risk_analyst_lora", token=HF_TOKEN)
# tokenizer.push_to_hub("your_hf_username/llama3_risk_analyst_lora", token=HF_TOKEN)

# --- Merging the adapter for deployment ---
# To create a standalone model, you can merge the LoRA weights.
# This is useful for inference endpoints that don't support PEFT adapters directly.
# First, load the base model in 16-bit precision. The base must be the same
# architecture the adapter was trained on (Llama 3.2 3B Instruct, loaded earlier
# in its 4-bit form), and 4-bit loading is switched off so the LoRA weights are
# merged into 16-bit weights rather than quantized ones.
BASE_MODEL_16BIT = "unsloth/Llama-3.2-3B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_16BIT,
    max_seq_length=max_seq_length,
    dtype=dtype,  # None -> auto-selects Float16 or Bfloat16 for the GPU
    load_in_4bit=False,
)

# Attach the saved LoRA adapter to the 16-bit base, then fold its weights in.
model = PeftModel.from_pretrained(model, ADAPTER_DIR)
model = model.merge_and_unload()

# Now `model` is a standalone, fine-tuned model.
# You can save this merged model for deployment.
# model.save_pretrained("llama3_risk_analyst_merged")
# tokenizer.save_pretrained("llama3_risk_analyst_merged")