# Fine-tune a small LLM on public legal data with LoRA (Transformers Trainer)

This notebook shows a minimal end-to-end fine-tuning using Hugging Face Transformers + PEFT (LoRA/QLoRA) on a tiny slice of public legal text (Pile of Law). It follows the Trainer flow described in the Transformers Training docs:

- Docs: https://huggingface.co/docs/transformers/en/training

Notes
- On Linux with CUDA, this will use 4-bit quantization (QLoRA) if bitsandbytes is available.
- On macOS or CPU-only, it falls back to standard LoRA without 4-bit quantization.
- For quick runs, we train on a small subset of documents. Increase `TRAIN_DOCS` (set in the configuration cell below) for better results.

In [None]:
# If running first time, uncomment to install dependencies
# %pip install -U "transformers>=4.41.0" datasets peft accelerate bitsandbytes sentencepiece
# %pip install -U huggingface_hub

import platform
import sys

# Report the host platform and interpreter version, one per line, so the
# run's environment is visible in the notebook output.
for _env_line in (platform.platform(), sys.version):
    print(_env_line)


In [None]:
import os, math
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType

# Run configuration: which checkpoint to adapt, which corpus to train on,
# and where artifacts are written.
SEED = 42
MODEL_ID = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
DATASET_ID = "pile-of-law/pile-of-law"  # public legal corpus
OUTPUT_DIR = "outputs/legal-qlora"

# Sequence length of each packed training example (may be lowered later to
# respect the tokenizer's maximum length).
BLOCK_SIZE = 1024
# Number of documents sampled from the corpus; small for demo, increase for
# better results.
TRAIN_DOCS = 3000

# Later cells branch on this flag to choose quantization and precision.
use_cuda = torch.cuda.is_available()
print(f"CUDA available: {use_cuda}")


In [None]:
# Tokenizer
from huggingface_hub import login

# Optional: login for gated models or to push results (not needed for public TinyLlama)
# login()

# Load the fast tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

# Some checkpoints ship without a pad token; reuse EOS in that case so that
# batching/padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load a small subset of Pile of Law for a quick demo.
# Pile of Law is published as several named subsets, and the loader expects one
# of them to be chosen explicitly; calling load_dataset without a subset name
# is expected to fail with a "config name is missing" error.
# NOTE(review): "r_legaladvice" is the example subset from the dataset card --
# confirm the name there and swap in another subset (or "all") as needed.
DATASET_CONFIG = "r_legaladvice"
raw_ds = load_dataset(
    DATASET_ID,
    DATASET_CONFIG,
    split="train",
    trust_remote_code=True,
)
# Deterministically sample at most TRAIN_DOCS documents.
raw_ds = raw_ds.shuffle(seed=SEED).select(range(min(TRAIN_DOCS, len(raw_ds))))

# Tokenize
def tokenize_fn(examples):
    """Tokenize the ``"text"`` column of a batch without special tokens.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch of texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, add_special_tokens=False)
    return encoded

# Tokenize every document in batches and drop the original columns so that
# only the tokenizer outputs remain.
tokenized = raw_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=raw_ds.column_names,
)

# Pack into fixed-length sequences for Causal LM.
# Clamp BLOCK_SIZE to the tokenizer's maximum length, but only when that
# value is set and below 100_000; otherwise BLOCK_SIZE is left unchanged.
_tok_max_len = tokenizer.model_max_length
if _tok_max_len and _tok_max_len < 100_000:
    BLOCK_SIZE = min(BLOCK_SIZE, _tok_max_len)

def group_texts(examples, block_size=None):
    """Concatenate tokenized sequences and split them into fixed-size blocks.

    Every column in ``examples`` is flattened into one long list and then cut
    into consecutive chunks of ``block_size`` items. The tail that does not
    fill a whole block is dropped. A ``"labels"`` column is added that mirrors
    ``"input_ids"``.

    Args:
        examples: Batch mapping of column name -> list of per-document lists.
            Must contain an ``"input_ids"`` column.
        block_size: Length of each output block. Defaults to the module-level
            ``BLOCK_SIZE`` when omitted.

    Returns:
        A dict with the same columns as ``examples`` (each a list of blocks)
        plus ``"labels"``. The lists are empty when the batch holds fewer than
        ``block_size`` tokens.
    """
    if block_size is None:
        block_size = BLOCK_SIZE

    # Flatten with a nested comprehension: linear in the number of tokens,
    # unlike sum(list_of_lists, []), which re-copies the accumulator each step.
    concatenated = {
        key: [token for seq in examples[key] for token in seq]
        for key in examples
    }

    # Round down to a whole number of blocks; the remainder is discarded.
    total_len = (len(concatenated["input_ids"]) // block_size) * block_size

    result = {
        key: [tokens[i:i + block_size] for i in range(0, total_len, block_size)]
        for key, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Run the packer over batches of tokenized documents to obtain the
# fixed-length training examples, then show the resulting dataset summary.
lm_ds = tokenized.map(
    group_texts,
    batched=True,
)
print(lm_ds)


In [None]:
# Model: optionally use 4-bit quantization (QLoRA) when CUDA+bitsandbytes available.
# Quantization is enabled only if bitsandbytes imports cleanly AND CUDA was
# detected; any failure along the way disables it.
try:
    import bitsandbytes as bnb  # noqa: F401
    use_bnb = use_cuda
except Exception:
    use_bnb = False

# 4-bit quantization settings; stays None when quantization is disabled so
# the model is loaded unquantized.
bnb_config = None
if use_bnb:
    # use_bnb implies CUDA is available, so "is CUDA available?" cannot pick
    # between bf16 and fp16 here. Use the same compute-capability test as the
    # training precision flags (major >= 8), so the quantized matmuls and the
    # Trainer's mixed-precision mode agree on older GPUs.
    _bnb_compute_dtype = (
        torch.bfloat16
        if torch.cuda.get_device_capability(0)[0] >= 8
        else torch.float16
    )
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=_bnb_compute_dtype,
    )

# Load the base causal LM. quantization_config is None when 4-bit loading is
# disabled, in which case the checkpoint is loaded without quantization.
_model_load_kwargs = {
    "device_map": "auto",
    "quantization_config": bnb_config,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_load_kwargs)

from peft import prepare_model_for_kbit_training

# LoRA target modules for LLaMA-like models: adapters are attached to the
# attention projections only.
peft_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare the base model before wrapping it with adapters.
# - Quantized (QLoRA) path: PEFT's k-bit preparation helper is the documented
#   step for training on top of a 4-bit base model.
# - Unquantized path: training runs with gradient checkpointing enabled while
#   the base weights are frozen, so make the embedding outputs require grad;
#   otherwise the checkpointed segments may have no input that carries
#   gradients back to the adapters.
if use_bnb:
    model = prepare_model_for_kbit_training(model)
else:
    model.enable_input_require_grads()

model = get_peft_model(model, peft_cfg)
# Print how many parameters are trainable vs. frozen as a sanity check.
model.print_trainable_parameters()


In [None]:
# Causal-LM collator (mlm=False disables masked-language-model masking).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Use bf16 only when a CUDA device is present and device 0 reports a
# compute-capability major version of at least 8.
bf16 = False
if torch.cuda.is_available():
    _cc_major = torch.cuda.get_device_capability(0)[0]
    bf16 = _cc_major >= 8

# Training hyperparameters.
# Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 8 = 16.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    weight_decay=0.0,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    # Mixed precision: prefer bf16 when supported, otherwise fp16 on CUDA,
    # and full precision when no GPU is present.
    bf16=bf16,
    fp16=(not bf16) and torch.cuda.is_available(),
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
    report_to=["none"],
    # Wire the notebook-wide SEED into training so that changing it affects
    # the training run and not only the dataset shuffle.
    seed=SEED,
)

# Assemble the Trainer from the pieces built in the previous cells.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_ds,
)

# Run training, show the returned summary, then write the trained model to
# the output directory.
train_result = trainer.train()
print(train_result)
trainer.save_model(OUTPUT_DIR)


In [None]:
# Quick inference to verify training worked
from transformers import pipeline

# Switch to evaluation mode before generating. The adapters were configured
# with lora_dropout=0.05, and dropout layers stay active for as long as the
# module is in training mode, which would add noise to the sampled text.
model.eval()

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)

# Sample a continuation for a fixed legal-style prompt and print the text of
# the first returned candidate.
prompt = "You are a legal assistant. Summarize the following clause: The party of the first part agrees to indemnify and hold harmless..."
_generation_kwargs = {
    "max_new_tokens": 128,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
}
output = gen_pipe(prompt, **_generation_kwargs)
print(output[0]["generated_text"])

In [None]:
# Save adapters only (PEFT) and tokenizer.
# Both are written into the same sub-directory of the training output folder.
adapter_dir = os.path.join(OUTPUT_DIR, "lora-adapter")
os.makedirs(adapter_dir, exist_ok=True)
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(adapter_dir)
print(f"Saved LoRA adapter to: {adapter_dir}")

# Optionally push to Hub (requires login and write access)
# model.push_to_hub("<your-username>/tinyllama-legal-lora")
# tokenizer.push_to_hub("<your-username>/tinyllama-legal-lora")