# 01: SFT on GSM8K
Fine-tune Qwen2.5-0.5B-Instruct on GSM8K with answer format.
Run once, push to HF, then use in GRPO experiments.

Install dependencies

In [None]:
!pip install -qUU git+https://github.com/tripathysagar/rlhf-gsm8k.git

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/515.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Import libraries

In [2]:
import torch
import regex as re
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig

Set training hyperparameters and HF repo target

In [3]:
config = dict(
    model_name="Qwen/Qwen2.5-0.5B-Instruct",
    lora_r=16,
    lora_alpha=16,
    epochs=3,
    lr=2e-4,
    batch_size=8,
    grad_accum=4,
    train_size=1024,
    hf_repo="tripathysagar/Qwen2.5-0.5B-GSM8K-SFT",  # change to your repo
)

Load tokenizer, model (bfloat16), and set up LoRA on all linear layers

In [4]:
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
model = AutoModelForCausalLM.from_pretrained(
    config["model_name"], torch_dtype=torch.bfloat16, device_map="auto"
)

lora_config = LoraConfig(
    r=config["lora_r"],
    lora_alpha=config["lora_alpha"],
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Define the system prompt and helper functions to convert GSM8K examples into the chat format with a structured `The answer is: {number}.` ending

In [5]:
SYSTEM_PROMPT = (
    "You are a helpful math assistant. Solve the problem step by step, "
    "then give your final answer as a single number on the last line in exact format"
    """\nThe answer is: {number}."""
)

def clean_gold_answer(answer_text):
    """Strip <<...>> annotations and reformat with 'The answer is: N.' ending."""
    parts = answer_text.split("####")
    reasoning = parts[0].strip()
    final_num = parts[1].strip().replace(",", "") if len(parts) > 1 else ""
    reasoning = re.sub(r'<<.*?>>', '', reasoning)
    return f"{reasoning}\nThe answer is: {final_num}."


def make_sft_example(ex):
    cleaned = clean_gold_answer(ex['answer'])
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": ex["question"]},
        {"role": "assistant", "content": cleaned},
    ]
    return {"messages": messages}

Load GSM8K training split, shuffle, take a subset, and format as chat messages

In [6]:
ds = load_dataset("openai/gsm8k", "main", split="train")
ds = ds.shuffle(seed=42).select(range(config["train_size"]))
ds = ds.map(make_sft_example, remove_columns=ds.column_names)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Configure and run SFT training with LoRA

In [7]:
sft_config = SFTConfig(
    output_dir="sft_qwen_gsm8k",
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    gradient_accumulation_steps=config["grad_accum"],
    learning_rate=config["lr"],
    logging_steps=10,
    save_steps=10,
    report_to='none',
    max_length=256,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    fp16=False,
    dataset_text_field=None,
    save_total_limit=2,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds,
    args=sft_config,
    peft_config=lora_config,
)

trainer.train()

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


Tokenizing train dataset:   0%|          | 0/1024 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1024 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,1.445153
20,0.570653
30,0.368143
40,0.326823
50,0.306867
60,0.306866
70,0.303008
80,0.289909
90,0.283406


TrainOutput(global_step=96, training_loss=0.45599458118279773, metrics={'train_runtime': 186.1496, 'train_samples_per_second': 16.503, 'train_steps_per_second': 0.516, 'total_flos': 1695756329349120.0, 'train_loss': 0.45599458118279773})

Merge LoRA adapters back into the base model

In [8]:
model = trainer.model.merge_and_unload()

In [10]:
model.gradient_checkpointing_disable()

## Quick Eval

Formmating function for applaying the chat template.

In [12]:
def format_prompt(question):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [35]:
test_ds = load_dataset("openai/gsm8k", "main", split="test")
test_ds = test_ds.train_test_split(test_size=0.5, seed=42)
test_ds = test_ds["test"]
test_ds

Dataset({
    features: ['question', 'answer'],
    num_rows: 660
})

In [36]:
def extract_gold(ex):
    return {"answer": ex["answer"].split("####")[-1].strip()}

test_ds = test_ds.map(extract_gold, remove_columns=["answer"])
test_ds = test_ds.map(lambda x: {"question": format_prompt(x["question"])})
test_ds


Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 660
})

In [37]:
print(test_ds[0]['question'])
test_ds[0]['answer']

<|im_start|>system
You are a helpful math assistant. Solve the problem step by step, then give your final answer as a single number on the last line in exact format
The answer is: {number}.<|im_end|>
<|im_start|>user
Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.<|im_end|>
<|im_start|>assistant



'109'

Fetching the answer from the response.

In [38]:
def extract_answer(response):
    try:
        matches = re.findall(r'The answer is[:\s]*(\-?\d[\d,]*\.?\d*)', response, re.IGNORECASE)
        if matches:
            return int(float(matches[-1].replace(",", ""))), True
        nums = re.findall(r'\-?\d[\d,]*\.?\d*', response)
        if nums:
            return int(float(nums[-1].replace(",", ""))), False
    except Exception as e:
        print(f"[extract_answer error] response={response[:80]!r} err={e}")
    return None, False


In [39]:
assert extract_answer("The answer is: 72.") == (72, True)
assert extract_answer("ans : 42") == (42, False)
assert extract_answer("no numbers here") == (None, False)

In [40]:
from tqdm import tqdm

def eval_math_accuracy (model, tokenizer, test_ds, batch_size=64):
    model.eval()
    model.gradient_checkpointing_disable()
    tokenizer.padding_side = "left"

    correct, total = 0, 0
    table_rows = []
    test_data = list(test_ds)

    for i in tqdm(range(0, len(test_data), batch_size), desc="Evaluating"):
        batch = test_data[i:i+batch_size]
        prompts = [format_prompt(ex["question"]) for ex in batch]
        golds = [int(ex["answer"].split("####")[-1].strip().replace(",", "")) for ex in batch]

        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id)

        for ids, gold_int in zip(out, golds):
            response = tokenizer.decode(ids[inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
            ans, has_fmt = extract_answer(response)
            is_correct = (ans == gold_int)
            correct += int(is_correct)
            total += 1
            table_rows.append(f"| {total} | {gold_int} | {ans} | {'✅' if is_correct else '❌'} | {response[:80].replace(chr(10), ' ')} |")

    print(f"\nAccuracy: {correct}/{total} = {correct/total:.2%}")

    model.train()
    tokenizer.padding_side = "right"
    return table_rows


In [41]:
perf_log = eval_math_accuracy (model, tokenizer, test_ds)

Evaluating: 100%|██████████| 11/11 [01:59<00:00, 10.85s/it]


Accuracy: 172/660 = 26.06%





In [None]:
#from IPython.display import Markdown, display

#header = "| # | Gold | Predicted | ✓ | Response |\n|---|---|---|---|---|"
#display(Markdown(header + "\n" + "\n".join(perf_log)))

## Merge & Push to HuggingFace

In [43]:
model_card = f"""---
tags:
- math
- gsm8k
- sft
- lora
- qwen2.5
datasets:
- openai/gsm8k
base_model: {config["model_name"]}
---

# {config["hf_repo"].split("/")[-1]}

Fine-tuned **{config["model_name"]}** on [GSM8K](https://huggingface.co/datasets/openai/gsm8k) for math reasoning with a structured answer format.

## Training Details

| Parameter | Value |
|---|---|
| Base model | `{config["model_name"]}` |
| Method | SFT + LoRA |
| LoRA r / alpha | {config["lora_r"]} / {config["lora_alpha"]} |
| LoRA targets | all-linear |
| Epochs | {config["epochs"]} |
| Learning rate | {config["lr"]} |
| Batch size | {config["batch_size"]} × {config["grad_accum"]} (grad accum) |
| Training examples | {config["train_size"]} |
| Precision | bf16 |

## Answer Format

The model is trained to end responses with:
```
The answer is: {{number}}.
```

## System Prompt

```
{SYSTEM_PROMPT}
```
"""

from huggingface_hub import ModelCard, create_repo

create_repo(config["hf_repo"], exist_ok=True)

ModelCard(model_card).push_to_hub(config["hf_repo"])
model.push_to_hub(config["hf_repo"])
tokenizer.push_to_hub(config["hf_repo"])
print(f"Pushed to https://huggingface.co/{config['hf_repo']}")


README.md:   0%|          | 0.00/900 [00:00<?, ?B/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nwrkgxh/model.safetensors:   5%|5         | 50.3MB /  988MB            

README.md:   0%|          | 0.00/906 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpeg26zut5/tokenizer.json:  73%|#######2  | 8.30MB / 11.4MB            

Pushed to https://huggingface.co/tripathysagar/Qwen2.5-0.5B-GSM8K-SFT
