# QLoRA Fine-Tuning for Policy Compliance LLM

Fine-tunes Llama 3.1 8B using QLoRA on policy compliance data.

**Requirements:** T4 GPU, Hugging Face token with access to the gated `meta-llama/Meta-Llama-3.1-8B-Instruct` model, and the training file `training_data_augmented.jsonl`

## 1. Setup

In [None]:
!pip install -q torch transformers accelerate peft bitsandbytes trl datasets sentencepiece

In [None]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
print(f'CUDA: {cuda_ready}')
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f'GPU: {gpu_name}')

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub.
# Called with no arguments, so the token is presumably requested interactively — confirm against huggingface_hub docs.
login()

## 2. Upload Training Data

In [None]:
from google.colab import files
import json

# Open the Colab upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as a bare IndexError
    # on the next line; fail with an actionable message instead.
    raise RuntimeError(
        'No file was uploaded. Re-run this cell and select the training .jsonl file.'
    )
# Use the first uploaded file as the training data source.
DATA_FILE = next(iter(uploaded))
print(f'Uploaded: {DATA_FILE}')

## 3. Load Model (4-bit Quantization)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

MODEL_NAME = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# 4-bit NF4 weights with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer the dedicated fine-tuning pad token over reusing EOS. When pad == EOS,
# padding is indistinguishable from the real end-of-turn token, so a collator
# that ignores pad positions in the loss would also ignore every genuine
# end-of-turn token and the model may not learn when to stop.
_PAD_TOKEN = '<|finetune_right_pad_id|>'
if _PAD_TOKEN in tokenizer.get_vocab():
    tokenizer.pad_token = _PAD_TOKEN
else:
    # Fallback for tokenizers without the dedicated pad token (original behaviour).
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

print('Loading model...')
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch.float16,
)
# Prepare the quantized model for training, then turn on gradient checkpointing.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
print('Model loaded!')

## 4. Configure LoRA

In [None]:
# Adapter targets, split by the two groups of projection layers they name.
attention_projections = ['q_proj', 'k_proj', 'v_proj', 'o_proj']
mlp_projections = ['gate_proj', 'up_proj', 'down_proj']

# Rank-64 LoRA (alpha 128, 5% dropout, no bias terms) for causal language modelling.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    bias='none',
    target_modules=attention_projections + mlp_projections,
)

# Wrap the loaded model with the adapters and report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 5. Prepare Data

In [None]:
from datasets import Dataset

# Parse the uploaded JSON Lines file: one JSON object per non-blank line.
# The context manager closes the handle deterministically, and the explicit
# UTF-8 encoding keeps parsing independent of the platform default.
with open(DATA_FILE, encoding='utf-8') as data_fh:
    data = [json.loads(line) for line in data_fh if line.strip()]
print(f'Loaded {len(data)} examples')

# Llama 3 chat layout for one system/user/assistant exchange.
# The template deliberately does NOT start with <|begin_of_text|>: the tokenizer
# prepends that BOS token itself when it adds special tokens, so hard-coding it
# here would put two BOS tokens at the start of every training sequence.
TEMPLATE = '''<|start_header_id|>system<|end_header_id|>

You are a compliance assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

{q}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{a}<|eot_id|>'''

def fmt(ex):
    """Render one raw example as a single chat-formatted training string.

    Args:
        ex: Mapping for one example. The prompt is read from 'question'
            (falling back to 'instruction'), the reply from 'answer'
            (falling back to 'output'); a missing field becomes ''.

    Returns:
        A dict with a single 'text' key holding the filled-in TEMPLATE.
    """
    q = ex.get('question', ex.get('instruction', ''))
    a = ex.get('answer', ex.get('output', ''))
    return {'text': TEMPLATE.format(q=q, a=a)}

# Format every raw example, then hold out 10% for evaluation with a fixed seed.
formatted_rows = list(map(fmt, data))
dataset = Dataset.from_list(formatted_rows)
split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split['train'], split['test']
print(f'Train: {len(train_dataset)}, Eval: {len(eval_dataset)}')

## 6. Train

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Hyperparameters grouped by concern and merged into one TrainingArguments.

# Batch of 2 per device with 8 accumulation steps.
batch_settings = dict(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
)

# Three epochs, cosine schedule with 3% warmup, paged 32-bit AdamW, clipped gradients.
optimizer_settings = dict(
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    optim='paged_adamw_32bit',
    max_grad_norm=0.3,
)

# Evaluate every 50 steps, checkpoint every 100 (keeping 2), and request the best model at the end.
# NOTE(review): `evaluation_strategy` is a version-specific argument name — confirm the installed transformers release accepts it.
checkpoint_settings = dict(
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Logging, precision and reproducibility switches.
runtime_settings = dict(
    logging_steps=10,
    report_to='none',
    fp16=True,
    gradient_checkpointing=True,
    seed=42,
    remove_unused_columns=False,
)

training_args = TrainingArguments(
    output_dir='./policy-llama-qlora',
    **batch_settings,
    **optimizer_settings,
    **checkpoint_settings,
    **runtime_settings,
)

# Supervised fine-tuning options: train on the 'text' column, cap sequences at 2048, no packing.
# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` as direct SFTTrainer arguments are version-specific — confirm the installed trl release accepts them.
sft_options = {
    'dataset_text_field': 'text',
    'max_seq_length': 2048,
    'packing': False,
}

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    **sft_options,
)
print('Ready to train!')

In [None]:
# Start the fine-tuning run; left as a bare expression so the notebook displays the call's return value.
trainer.train()

In [None]:
# Write the trained model (through the trainer) and the tokenizer into the same directory.
final_dir = './policy-llama-qlora/final'
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(final_dir)
print('Saved!')

## 7. Test

In [None]:
def test(prompt):
    """Generate a sampled reply to `prompt` using the fine-tuning chat layout.

    Args:
        prompt: The user question to place in the user turn.

    Returns:
        The decoded text of the newly generated tokens only (special tokens
        stripped); the prompt itself is not echoed back.
    """
    chat = (
        '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
        'You are a compliance assistant.<|eot_id|>'
        '<|start_header_id|>user<|end_header_id|>\n\n'
        f'{prompt}<|eot_id|>'
        '<|start_header_id|>assistant<|end_header_id|>\n\n'
    )
    # The string already begins with <|begin_of_text|>; stop the tokenizer from
    # prepending a second BOS token.
    inputs = tokenizer(chat, return_tensors='pt', add_special_tokens=False).to(model.device)
    # Switch to inference mode so adapter dropout is not applied while sampling.
    model.eval()
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=256, temperature=0.7, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; keep only the continuation so the
    # result is not prefixed with the prompt and its role labels.
    prompt_len = inputs['input_ids'].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

print(test('What are the key data privacy requirements?'))

## 8. Export & Download

In [None]:
from peft import PeftModel

# Where the adapter was saved, and where the merged model is written.
adapter_dir = './policy-llama-qlora/final'
merged_dir = './policy-llama-merged'

print('Merging adapter...')
# Reload the base model in float16 (not quantized) so the adapter can be folded into it.
# NOTE(review): this loads a second copy of the model while the training model may still be resident — confirm it fits in memory on the target runtime.
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    torch_dtype=torch.float16,
)
# Attach the saved adapter, then merge it into the base weights and drop the wrapper.
merged = PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()
merged.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print('Merged!')

In [None]:
# Archive the merged model directory recursively into a single zip file.
!zip -r policy-llama-merged.zip ./policy-llama-merged
# Hand the archive to the Colab download helper (`files` is imported in the upload cell).
files.download('policy-llama-merged.zip')

## 9. Convert to GGUF (for Ollama)

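After downloading, unzip `policy-llama-merged.zip` and convert the model locally using llama.cpp. Run the first command from your llama.cpp checkout; for the second, create a `Modelfile` that contains a `FROM ./policy-llm.gguf` line: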
```bash
python convert_hf_to_gguf.py ./policy-llama-merged --outfile policy-llm.gguf
ollama create policy-compliance -f Modelfile
```