<a href="https://colab.research.google.com/github/ywangumichigan/EECS595-Project/blob/main/SFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes sentencepiece tiktoken tqdm

In [9]:
from datasets import get_dataset_config_names, load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from tqdm import tqdm
import re

In [3]:
configs = get_dataset_config_names("EleutherAI/hendrycks_math")
print(f"Found {len(configs)} configs: {configs}")

def load_all(split):
    parts = [load_dataset("EleutherAI/hendrycks_math", c, split=split) for c in configs]
    return concatenate_datasets(parts)

train_ds = load_all("train")
test_ds  = load_all("test")
print(f"Train: {len(train_ds)}, Test: {len(test_ds)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Found 7 configs: ['algebra', 'counting_and_probability', 'geometry', 'intermediate_algebra', 'number_theory', 'prealgebra', 'precalculus']
Train: 7500, Test: 5000


In [4]:
SYSTEM_PROMPT = "You are a math assistant. Solve the problem step by step, explain your reasoning, and box the final answer using \\boxed{}."

def add_messages(example):
    example["messages"] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example["problem"]},
        {"role": "assistant", "content": example["solution"]}
    ]
    return example

train_ds = train_ds.map(add_messages)
test_ds  = test_ds.map(add_messages)

In [5]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.float16
)

lora_config = LoraConfig(r=64, lora_alpha=16, target_modules=["q_proj", "v_proj"])
model = get_peft_model(model, lora_config)

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
def preprocess(examples):
    texts = [
        f"{msg['content']}" for msg_list in examples["messages"] for msg in msg_list
    ]
    inputs = tokenizer(texts, max_length=2048, truncation=True, padding="max_length")
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

tokenized_train = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [7]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./qwen-sft-math-colab",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_steps=100,
        report_to="none",
    ),
    train_dataset=tokenized_train,
)
trainer.train()

# 保存
model.save_pretrained("qwen-sft-math-colab")
tokenizer.save_pretrained("qwen-sft-math-colab")

Step,Training Loss
10,5.9819
20,1.0122
30,0.133
40,0.1062
50,0.0927
60,0.128
70,0.1251
80,0.0909
90,0.0832
100,0.0925


KeyboardInterrupt: 

In [11]:
# 加载你的 SFT 模型
model_name = "./tinyllama-sft"  # 或 GCS 路径
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

# 正则提取 \boxed{}
def extract_boxed(text):
    if not text:
        return ""
    m = re.search(r"\\boxed\{([^}]*)\}", text, re.DOTALL)
    return m.group(1).strip() if m else ""

# 标准化答案
def normalize_answer(s):
    if not s:
        return ""
    s = s.strip()
    s = re.sub(r'\s+', '', s)  # 去空格
    s = re.sub(r'\\boxed\{([^}]*)\}', r'\1', s)
    s = re.sub(r'[^0-9a-zA-Z\.\-\+\/]', '', s)
    return s.lower()

# 评估函数
def evaluate_exact_match(model, tokenizer, dataset, num_samples=100):
    correct = 0
    total = 0

    for ex in tqdm(dataset.select(range(num_samples))):
        problem = ex["problem"]
        gold_solution = ex["solution"]
        gold_ans = normalize_answer(extract_boxed(gold_solution))

        # 模型生成
        prompt = f"<|system|>\nYou are a math assistant. Solve step by step and box the final answer with \\boxed{{}}.\n<|user|>\n{problem}\n<|assistant|>"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                temperature=0.0,
                pad_token_id=tokenizer.eos_token_id
            )

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # 提取 assistant 部分
        if "<|assistant|>" in pred_text:
            pred_text = pred_text.split("<|assistant|>")[-1]
        pred_ans = normalize_answer(extract_boxed(pred_text))

        if pred_ans == gold_ans:
            correct += 1
        total += 1

    return correct / total

# 运行评估
em_score = evaluate_exact_match(model, tokenizer, test_ds, num_samples=100)
print(f"Normalized Exact Match: {em_score:.1%}")

100%|██████████| 5000/5000 [00:00<00:00, 10930.98it/s]

Exact Match: 100.0%



