In [1]:
import json
import os
from multiprocessing import cpu_count

from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    GPTQConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# 1. Load your RAFT JSONL directly as an HF Dataset
# ─────────────────────────────────────────────────────────────────────────────
raw_ds = load_dataset(
    "json",
    data_files="raft_train.jsonl",
    split="train"
)

# We only need the two columns:
#  - 'instruction': "<DOCUMENT>…</DOCUMENT>…QUESTION…"
#  - 'cot_answer':   the gold answer text
# Fail here, with a clear message, rather than deep inside tokenization if the
# JSONL does not carry both of them.
missing_cols = {"instruction", "cot_answer"} - set(raw_ds.column_names)
if missing_cols:
    raise KeyError(
        f"raft_train.jsonl is missing required columns: {sorted(missing_cols)}"
    )

ds = raw_ds.remove_columns([
    c for c in raw_ds.column_names
    if c not in ["instruction", "cot_answer"]
])

# Hold out 10% for evaluation. The seed is fixed so that a fresh
# "Restart & Run All" reproduces exactly the same train/eval partition.
splits = ds.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
eval_ds  = splits["test"]

In [3]:
# Display both splits to confirm their columns and row counts.
train_ds, eval_ds

(Dataset({
     features: ['cot_answer', 'instruction'],
     num_rows: 296
 }),
 Dataset({
     features: ['cot_answer', 'instruction'],
     num_rows: 33
 }))

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# 2. Initialize tokenizer & model (with GPTQ quantization)
# ─────────────────────────────────────────────────────────────────────────────
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# Use the default tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    # Only fall back to EOS when the checkpoint defines no pad token, so an
    # existing pad token is never clobbered.
    tokenizer.pad_token = tokenizer.eos_token

# MODEL_ID is a regular (unquantized) checkpoint, so GPTQ has to quantize it on
# load. That calibration pass needs a calibration dataset and the tokenizer.
# The exllama kernels are meant for inference, so they are disabled because
# this model is about to be trained with LoRA adapters.
# NOTE: GPTQ quantization needs a CUDA GPU; on a CPU-only machine the load
# below raises "RuntimeError: GPU is required to quantize or run quantize model."
QUANT_CFG = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
    use_exllama=False,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
    use_cache=False,  # the KV cache is not needed during training
    quantization_config=QUANT_CFG
)

RuntimeError: GPU is required to quantize or run quantize model.

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 3. Tokenize examples
# ─────────────────────────────────────────────────────────────────────────────
def preprocess(batch, tok=None, max_prompt_len=2048, max_answer_len=512):
    """Turn a batch of (instruction, cot_answer) pairs into causal-LM features.

    For a decoder-only model the answer must be part of the input sequence and
    ``labels`` must line up token-for-token with ``input_ids``. Each example is
    therefore encoded as ``prompt + answer + EOS``. The prompt positions in
    ``labels`` are set to -100 so that only the answer (and the closing EOS)
    contribute to the loss.

    Args:
        batch: Mapping with list-valued ``"instruction"`` and ``"cot_answer"``
            columns (what ``Dataset.map(..., batched=True)`` passes in).
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_prompt_len: Maximum number of prompt tokens kept (truncated).
        max_answer_len: Maximum number of answer tokens kept, including the
            appended EOS token.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; the three
        lists have identical lengths for every example. Sequences are not
        padded here; padding is left to the data collator.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss
    tok = tokenizer if tok is None else tok

    prompt_ids = tok(
        batch["instruction"],
        truncation=True,
        max_length=max_prompt_len,
        padding=False,
    )["input_ids"]
    # No special tokens here: the answer continues the prompt, so it must not
    # start with a second BOS. One slot is reserved for the EOS added below.
    answer_ids = tok(
        batch["cot_answer"],
        truncation=True,
        max_length=max_answer_len - 1,
        padding=False,
        add_special_tokens=False,
    )["input_ids"]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, answer in zip(prompt_ids, answer_ids):
        completion = list(answer) + [tok.eos_token_id]
        sequence = list(prompt) + completion
        features["input_ids"].append(sequence)
        features["attention_mask"].append([1] * len(sequence))
        features["labels"].append([ignore_index] * len(prompt) + completion)
    return features

# Run `preprocess` over both splits with identical settings. The original
# text columns are dropped so that only the features returned by `preprocess`
# remain in the tokenized datasets.
train_tkn, eval_tkn = (
    split.map(
        preprocess,
        batched=True,
        remove_columns=split.column_names,
        num_proc=cpu_count(),
    )
    for split in (train_ds, eval_ds)
)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 4. Configure LoRA adapters
# ─────────────────────────────────────────────────────────────────────────────
# LoRA adapter settings: rank-16 adapters with alpha 16 and 5% dropout, no
# bias training, attached to each of the projection modules listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# ─────────────────────────────────────────────────────────────────────────────
# 5. Setup TrainingArguments and SFTTrainer
# ─────────────────────────────────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir="raft-sft-output",
    per_device_train_batch_size=1,    # small batches if quantized
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,    # effective batch size of 8 per device
    learning_rate=2e-5,
    max_steps=60,                    # or set num_train_epochs
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=5,
    seed=42,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)

# The datasets are already tokenized and carry their own `labels` column.
# In TRL releases that accept `tokenizer=`, SFTTrainer otherwise falls back to
# a language-modeling collator that rebuilds `labels` from `input_ids` and masks
# every pad token. Here pad == EOS, so it would discard the dataset's labels.
# DataCollatorForSeq2Seq instead keeps the supplied labels and pads them with
# -100, so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tkn,
    eval_dataset=eval_tkn,
    peft_config=peft_config,
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_text_field=None,  # already tokenized
)


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 6. Run training
# ─────────────────────────────────────────────────────────────────────────────
# Fine-tune with the configured trainer, then write the resulting model to a
# local directory.
trainer.train()
trainer.save_model(output_dir="finetuned_raft_model")