In [1]:
import os
from getpass import getpass

# Directory where all Hugging Face files (models, tokenizers, caches, etc.)
# should live. Leave it empty to keep the library's default location: an empty
# value is NOT exported, because an empty HF_HOME is not a usable cache path.
HF_HOME_DIR = ""
if HF_HOME_DIR:
    # Exported here, in the first cell, so it is already in place when the
    # Hugging Face libraries are imported by the following cells.
    os.environ["HF_HOME"] = HF_HOME_DIR

# SECURITY: never hard-code an access token in a notebook — it ends up in git
# history and in every shared copy. Take it from the environment when present,
# otherwise ask for it interactively (the input is not echoed or stored in the
# notebook).
# NOTE(review): the token that used to be embedded in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
#!/usr/bin/env python
"""
Finetune Gemma-3-12B-IT on AG-News with QLoRA, TRL ≥ 0.15, 4 × RTX 3090
"""

# ────────── 1. imports ──────────────────────────────────────────────
import os, torch, random, gc
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer

# ────────── 2. constants ────────────────────────────────────────────
MODEL_ID  = "google/gemma-3-12b-it"
OUT_DIR   = "gemma3-agnews-lora"
RAND_SEED = 42
MAX_LEN   = 512                 # truncate articles
# Class names, indexed by the dataset's integer `label` (see to_chatml).
LABELS    = ["World", "Sports", "Business", "Sci/Tech"]

# NOTE(review): torch is already imported at this point; this assignment is
# expected to take effect because no CUDA call has been made yet in this cell
# (the first one is torch.cuda.device_count() below) — confirm on a fresh kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"      # four 3090s
torch.manual_seed(RAND_SEED)

# ────────── 3. load 4-bit base model on 4 GPUs ─────────────────────
# Per-GPU memory cap handed to the device-map planner, one entry per visible GPU.
max_memory = {i: "23GiB" for i in range(torch.cuda.device_count())}

# QLoRA-style quantization: NF4 weights, double quantization, bf16 compute.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit              = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type       = "nf4",
    bnb_4bit_compute_dtype    = torch.bfloat16,
)

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; it is presumably unnecessary for an architecture that
# transformers supports natively — consider dropping it after confirming.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map          = "balanced_low_0",   # spread over the GPUs, keeping GPU 0 lightly loaded
    max_memory          = max_memory,
    quantization_config = bnb_cfg,
    trust_remote_code   = True,
)

model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()          # more VRAM savings

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Only fall back to EOS when the tokenizer ships without a pad token.
# Unconditionally aliasing pad to EOS would replace a dedicated pad token, and
# collators that mask pad positions in the labels would then also mask every
# EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ────────── 4. attach LoRA adapter ─────────────────────────────────-
# Rank-16 adapters on every module whose name matches one of the attention or
# MLP projection names listed below.
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj",
                    "o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)

# ────────── 5. prepare dataset ─────────────────────────────────────
def to_chatml(sample, labels=None):
    """Convert one AG-News row into a two-turn chat example.

    Args:
        sample: Mapping with a ``text`` field (the article) and a ``label``
            field (anything ``int()`` accepts) that indexes into the label
            names.
        labels: Optional sequence of class names. Defaults to the module-level
            ``LABELS`` so existing ``ds.map(to_chatml)`` calls are unchanged.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn asks
        for a classification of the article and the assistant turn holds the
        gold class name.

    Raises:
        IndexError: If the label is outside the range of the label names.
    """
    label_names = LABELS if labels is None else labels
    # Build the option list from the same names used for the answer, so the
    # prompt and the target can never drift apart.
    user = (f"Classify the following news article into one of "
            f"[{', '.join(label_names)}]:\n{sample['text']}")
    return {"messages": [
        {"role": "user",      "content": user},
        {"role": "assistant", "content": label_names[int(sample['label'])]},
    ]}

# Shuffle deterministically, keep a 40k-row subset, and turn every row into a
# `messages` conversation (the original columns are dropped).
ds = load_dataset("fancyzhx/ag_news", split="train").shuffle(seed=RAND_SEED)
ds = ds.select(range(40_000))                        # quick demo size
train_ds = ds.map(to_chatml, remove_columns=ds.column_names)

# ────────── 6. SFTConfig (supersedes TrainingArguments) ────────────
# NOTE(review): `max_length` is the name used by recent TRL releases; older
# releases call this option `max_seq_length` — confirm against the installed
# TRL version.
sft_cfg = SFTConfig(
    output_dir                 = OUT_DIR,
    max_length                 = MAX_LEN,
    per_device_train_batch_size= 1,      # 1×3090 ≈ 5.8 GiB
    gradient_accumulation_steps= 16,     # effective batch = 16
    num_train_epochs           = 2,
    learning_rate              = 2e-4,
    bf16                       = True,
    optim                      = "paged_adamw_8bit",
    warmup_ratio               = 0.05,
    lr_scheduler_type          = "cosine",
    logging_steps              = 50,
    save_total_limit           = 2,
    report_to                  = "tensorboard",
    packing                    = True,
    seed                       = RAND_SEED,
)

# ────────── 7. launch training ─────────────────────────────────────
# Hand the trainer the tokenizer configured above. Without it the trainer
# builds its own from the model id, so the tokenizer used for training could
# differ from the one saved next to the adapter below.
trainer = SFTTrainer(model=model,
                     train_dataset=train_ds,
                     args=sft_cfg,
                     processing_class=tokenizer)
trainer.train()

# ────────── 8. save adapter ────────────────────────────────────────
# `model` is the PEFT-wrapped model, so this writes the adapter; the tokenizer
# is stored in the same directory.
model.save_pretrained(f"{OUT_DIR}-adapter")
tokenizer.save_pretrained(f"{OUT_DIR}-adapter")
print(f"\n✓ Adapter written to → {OUT_DIR}-adapter")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id  = "google/gemma-3-12b-it"
adapter  = "gemma3-agnews-lora-adapter"

tok  = AutoTokenizer.from_pretrained(base_id)

# Quantize the base model exactly as during training (NF4, double quantization,
# bf16 compute). The adapter was fitted on top of those quantized weights, so a
# bare `load_in_4bit=True` with library-default settings would put it on a
# differently quantized base.
quant_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=quant_cfg, device_map="auto", trust_remote_code=True)
mdl  = PeftModel.from_pretrained(base, adapter).eval()

# Same instruction wording as the user turn built for the training data.
prompt = (
    "Classify the following news article into one of "
    "[World, Sports, Business, Sci/Tech]:\n"
    "Nvidia’s quarterly revenue soared 265 % year-on-year thanks to AI demand."
)

# The model was finetuned on chat-formatted `messages`, so the prompt has to be
# wrapped in the same chat template, ending with the generation prompt that
# opens the assistant turn. Feeding the raw string would not match the
# training format.
ids = tok.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(mdl.device)

# Greedy decoding: a classification label should be deterministic.
out = mdl.generate(**ids, max_new_tokens=10, do_sample=False)

# Decode only the newly generated tokens (everything after the prompt).
print(tok.decode(out[0][ids["input_ids"].shape[1]:], skip_special_tokens=True))
# → "Business"