In [None]:
# !pip install -qU transformers trl datasets bitsandbytes

In [None]:
# from google.colab import files
# files.upload()

Saving raft_train.jsonl to raft_train (5).jsonl


{'raft_train (5).jsonl': b'{"id":"seed_task_0","type":"general","question":"How many learners participated in the study?","context":{"sentences":[["Table 4: Local Average Treatment Effects of Credential Sharing: First and Second Stage\\nFirst Stage Second Stage Overall Second Stage Exclude 4 monthsCred. Shared Cred. Shared & Past Not In Scope New Job New Job in Scope New Job in Scope & Past Not in Scope New Job New Job in Scope New Job in Scope & Past Not in Scope\\nCred. Feature 0.028 0.023(0.004) (0.004)Cred. Shared 0.242 0.360 0.287 0.188 0.217 0.166(0.130) (0.117) (0.132) (0.105) (0.090) (0.101)\\nNo. of obs. 36,946 30,607 36,946 36,946 30,607 36,946 36,946 30,607Learners covariates Yes Yes Yes Yes Yes Yes Yes Yes\\nNote: The first two Columns show the results from the first-stage regression. The estimates in Column 1 are based on the entire LinkedIn\\nMatched Sample, in Column 2 we restrict the sample to learners whose past jobs were not in scope. Columns 3 to 8 present results of

Let's setup our notebook!

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer
from peft import LoraConfig

## Load the datasets
Really we only need the instruction and "chain-of-thought" answer to do RAFT. Let's take a look at the columns
> Along with doing our favorite train-test split!

In [4]:
raw_ds = load_dataset(
    "json",
    data_files="raft_train.jsonl",
    split="train"
)

# We only need the two columns:
#  - 'instruction': "<DOCUMENT>…</DOCUMENT>…QUESTION…"
#  - 'cot_answer':   the gold answer text
ds = raw_ds.remove_columns([
    c for c in raw_ds.column_names
    if c not in ["instruction", "cot_answer"]
])

# Optionally: split into train/test
splits = ds.train_test_split(test_size=0.1)
train_ds = splits["train"]
eval_ds  = splits["test"]

The instructions and the chain-of-thought answer columns will be what the LLM relies on to answer the questions - it teaches the LLM how to use the retrieved contexts.

In [11]:
import pandas as pd

df = pd.DataFrame(raw_ds)
df.head()

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,seed_task_0,general,How many learners participated in the study?,{'sentences': [['Table 4: Local Average Treatm...,The value of non-traditional credentials in th...,assistant: To determine how many learners part...,<DOCUMENT>Table 4: Local Average Treatment Eff...
1,seed_task_1,general,What platforms were used to deliver the courses?,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: Step 1: Identify relevant informati...,<DOCUMENT>The value of non-traditional credent...
2,seed_task_2,general,What was the main finding regarding credential...,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: ### Step-by-Step Reasoning:\n1. **I...,<DOCUMENT>The value of non-traditional credent...
3,seed_task_3,general,Did the intervention have a significant impact...,{'sentences': [['The value of non-traditional ...,The value of non-traditional credentials in th...,assistant: Step-by-step reasoning:\n\n1. The q...,<DOCUMENT>The value of non-traditional credent...
4,seed_task_4,general,How did the effect vary among different groups...,{'sentences': [['13 p.p.) and 36 p.p. (S.E. 12...,The value of non-traditional credentials in th...,assistant: Step-by-step reasoning:\n\n1. The s...,<DOCUMENT>13 p.p.) and 36 p.p. (S.E. 12 p.p.) ...


In [5]:
train_ds, eval_ds

(Dataset({
     features: ['cot_answer', 'instruction'],
     num_rows: 296
 }),
 Dataset({
     features: ['cot_answer', 'instruction'],
     num_rows: 33
 }))

In [None]:
## Just in case you are using a gated LLM.

# from huggingface_hub import login
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load our fav tiny llama model
We load the quantized version just to speed things up

In [None]:
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_cfg,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token


Prepare our dataset for training!

In [None]:
from multiprocessing import cpu_count

def preprocess(batch):
    # batch["instruction"] and batch["cot_answer"] are strings
    inputs = tokenizer(
        batch["instruction"],
        truncation=True,
        max_length=2048,
        padding=False,
    )
    # target labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["cot_answer"],
            truncation=True,
            max_length=512,
            padding=False,
        )
    inputs["labels"] = labels["input_ids"]
    return inputs

train_tkn = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=train_ds.column_names,
    num_proc=cpu_count(),
)
eval_tkn  = eval_ds.map(
    preprocess,
    batched=True,
    remove_columns=eval_ds.column_names,
    num_proc=cpu_count(),
)

Map (num_proc=2):   0%|          | 0/296 [00:00<?, ? examples/s]



Map (num_proc=2):   0%|          | 0/33 [00:00<?, ? examples/s]



Prepare our training configurations!

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
    ],
    task_type="CAUSAL_LM"
)

# ─────────────────────────────────────────────────────────────────────────────
# 5. Setup TrainingArguments and SFTTrainer
# ─────────────────────────────────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir="raft-sft-output", #This will also be used as your huggingfacehub model id name
    report_to="wandb", #Leave this to be blank if you don't want to use wandb
    run_name="RAFT_SFT_Take2",
    eval_steps=5,
    eval_strategy="steps",
    per_device_train_batch_size=1,    # small batches if quantized
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=5,
    # max_steps=60,                    # or set num_train_epochs
    save_strategy="no",
    fp16=True,
    gradient_checkpointing=True,
    logging_strategy="steps",
    logging_steps=5,
    seed=42,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tkn,
    eval_dataset=eval_tkn,
    peft_config=peft_config,
)


Truncating train dataset:   0%|          | 0/296 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/33 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Train!!

In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtituslhy[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,2.7443,2.967905
10,2.6435,2.935272
15,2.8414,2.900691
20,2.6085,2.865374
25,2.6066,2.829698
30,2.4753,2.793829
35,2.5184,2.759644
40,2.4702,2.727984
45,2.4667,2.69669
50,2.5068,2.665173


TrainOutput(global_step=185, training_loss=2.302635703215728, metrics={'train_runtime': 1252.0526, 'train_samples_per_second': 1.182, 'train_steps_per_second': 0.148, 'total_flos': 5973991698739200.0, 'train_loss': 2.302635703215728})

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file

In [None]:
trainer.push_to_hub(token=os.environ('HUGGINGFACE_ACCESS_TOKEN'))



adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1747635475.5b9aaa27fe98.3987.0:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

events.out.tfevents.1747636110.5b9aaa27fe98.7292.0:   0%|          | 0.00/8.86k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tituslhy/raft-sft-output/commit/cd9dd616b3ecf2333645be9195438519be150422', commit_message='tituslhy/raft-llama32-1bn-finetuned', commit_description='', oid='cd9dd616b3ecf2333645be9195438519be150422', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tituslhy/raft-sft-output', endpoint='https://huggingface.co', repo_type='model', repo_id='tituslhy/raft-sft-output'), pr_revision=None, pr_num=None)