In [2]:
%%capture
import os

# Detect whether we are running on Google Colab and choose the install strategy accordingly.
# Colab ships with pre-installed packages, so a plain install there can cause version conflicts.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: install unsloth and vllm together with all of their dependencies.
    !pip install unsloth vllm
else:
    # On Colab: --no-deps stops pip from reinstalling dependencies that are already present,
    # which avoids version conflicts with Colab's pre-installed libraries (e.g. transformers, numpy).
    # NOTE: this install variant is for Colab only; use the standard command everywhere else.
    !pip install --no-deps unsloth vllm

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os

# Detect Google Colab by looking for "COLAB_" in the concatenated
# names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: install unsloth and vllm directly.
    !pip install unsloth vllm
else:
    # Colab needs special handling to avoid dependency conflicts.
    !pip install --no-deps unsloth vllm

    # NOTE: everything below runs on Colab only; elsewhere use [[pip install unsloth vllm]].
    # Remove every already-imported module whose name contains "PIL" or "google"
    # from sys.modules so that it is imported afresh later (works around stale
    # cached modules and the resulting version conflicts on Colab).
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

    # Core libraries needed for training/inference.
    # --no-deps avoids reinstalling dependencies that already exist, reducing the risk of conflicts.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo

    # Data-processing and model-hub tooling.
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer

    # vLLM special case: installing vLLM normally reinstalls numpy, which crashes Colab.
    # Download vLLM's common requirements list and filter out the conflicting packages.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        # Drop every requirement line that mentions transformers, numpy or xformers.
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))

    # Install the filtered vLLM requirements.
    !pip install -r vllm_requirements.txt

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length; increase it to allow longer reasoning traces.
max_seq_length = 2048

# LoRA rank: a larger rank gives the adapter more capacity but makes things slower.
lora_rank = 32

# Load the pretrained model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Base",  # name of the pretrained checkpoint to load
    max_seq_length = max_seq_length,  # maximum sequence length
    load_in_4bit = False,  # False = do not load 4-bit quantized weights; LoRA runs on 16-bit weights
    fast_inference = True,  # enable vLLM fast inference
    max_lora_rank = lora_rank,  # maximum LoRA rank
    gpu_memory_utilization = 0.7,  # fraction of GPU memory to use; lower it if you run out of memory
)

# Attach LoRA adapters for parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model,  # base model
    r = lora_rank,  # LoRA rank; any value > 0 works, suggested: 8, 16, 32, 64, 128
    target_modules = [  # names of the modules that receive LoRA adapters
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",  # MLP projections
    ],
    lora_alpha = lora_rank*2,  # LoRA scaling factor; the notebook uses 2x the rank to speed up training
    use_gradient_checkpointing = "unsloth",  # gradient checkpointing mode, used to reduce memory usage
    random_state = 3407,  # random seed for reproducibility
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-15 11:36:55 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-15 11:36:55 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.3: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen3-4B-Base with actual GPU utilization = 69.34%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

INFO 05-15 11:37:30 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-15 11:37:30 [cuda.py:289] Using XFormers backend.
INFO 05-15 11:37:31 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 05-15 11:37:31 [model_runner.py:1108] Starting to load model unsloth/Qwen3-4B-Base...
INFO 05-15 11:37:31 [weight_utils.py:265] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

INFO 05-15 11:39:20 [weight_utils.py:281] Time spent downloading weights for unsloth/Qwen3-4B-Base: 108.931651 seconds


model.safetensors.index.json:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-15 11:39:52 [loader.py:458] Loading weights took 31.46 seconds
INFO 05-15 11:39:52 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 05-15 11:39:53 [model_runner.py:1140] Model loading took 7.6334 GiB and 141.653974 seconds
INFO 05-15 11:40:03 [worker.py:287] Memory profiling takes 9.54 seconds
INFO 05-15 11:40:03 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.69) = 10.22GiB
INFO 05-15 11:40:03 [worker.py:287] model weights take 7.63GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.88GiB; the rest of the memory reserved for KV Cache is 1.68GiB.
INFO 05-15 11:40:03 [executor_base.py:112] # cuda blocks: 764, # CPU blocks: 0
INFO 05-15 11:40:03 [executor_base.py:117] Maximum concurrency for 2048 tokens per request: 5.97x
INFO 05-15 11:40:04 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mod

Capturing CUDA graph shapes:   0%|          | 0/23 [00:00<?, ?it/s]

INFO 05-15 11:40:47 [model_runner.py:1592] Graph capturing finished in 43 secs, took 0.34 GiB
INFO 05-15 11:40:47 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 53.66 seconds
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'pre_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'pre_feedforward_layernorm']


tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.5.3 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [6]:
# Reasoning markers: delimit the intermediate working-out the model generates.
reasoning_start = "<start_working_out>" # opens the reasoning section (plays the role of a <think> tag)
reasoning_end   = "<end_working_out>"   # closes the reasoning section (plays the role of a </think> tag)

# Solution markers: delimit the final answer in the model's output.
solution_start  = "<SOLUTION>"           # opens the final answer
solution_end    = "</SOLUTION>"          # closes the final answer

# System prompt that tells the model how to structure its output.
system_prompt = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""

# Display the rendered system prompt for inspection.
system_prompt

'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>'

In [11]:
# Chat template (Jinja2 syntax) used to turn a list of messages into the model's input text.
# Layout produced by the template:
#   - the system message (or the default system prompt if the first message is
#     not a system message), followed by the EOS token;
#   - each user message verbatim;
#   - each assistant message followed by the EOS token;
#   - when add_generation_prompt is set, the reasoning-start marker at the very end.

chat_template = \
    "{% if messages[0]['role'] == 'system' %}"\
        "{{ messages[0]['content'] + eos_token }}"\
        "{% set loop_messages = messages[1:] %}"\
    "{% else %}"\
        "{{ '{system_prompt}' + eos_token }}"\
        "{% set loop_messages = messages %}"\
    "{% endif %}"\
    "{% for message in loop_messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ message['content'] }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ message['content'] + eos_token }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}{{ '{reasoning_start}' }}"\
    "{% endif %}"

# Substitute the quoted placeholders with the actual values of system_prompt
# and reasoning_start (inserted as single-quoted Jinja string literals).
chat_template = chat_template\
    .replace("'{system_prompt}'",   f"'{system_prompt}'")\
    .replace("'{reasoning_start}'", f"'{reasoning_start}'")
tokenizer.chat_template = chat_template

Let's see how our chat template behaves on an example:

In [12]:
# Render a short conversation with the chat template set above.
# Arguments:
# - tokenize=False: return the rendered text rather than a list of token IDs
# - add_generation_prompt=True: append the reasoning-start marker so the model continues from it
tokenizer.apply_chat_template(
    [
        # First turn: the user asks a question.
        {"role": "user", "content": "What is 1+1?"},
        # Assistant reply containing the structured reasoning and solution sections.
        {"role": "assistant", "content": f"{reasoning_start}I think it's 2.{reasoning_end}{solution_start}2{solution_end}"},
        # Second turn: the user asks a new question.
        {"role": "user", "content": "What is 2+2?"},
    ],
    tokenize=False,           # return text, not token IDs
    add_generation_prompt=True # append <start_working_out> to cue the model to start reasoning
)

"You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION><|endoftext|>What is 1+1?<start_working_out>I think it's 2.<end_working_out><SOLUTION>2</SOLUTION><|endoftext|>What is 2+2?<start_working_out>"

### Pre fine-tuning for formatting
We now use a subset of NVIDIA's [Open Math Reasoning dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning) which was filtered to only include high quality DeepSeek R1 traces.

We'll keep only about 59 examples, which is enough to first "prime" (pre fine-tune) the model so that it understands our custom GRPO formatting.

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Load the math-reasoning dataset from the Hugging Face Hub.
# We use the "cot" split of OpenMathReasoning-mini.
dataset = load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")

# Convert to a pandas DataFrame and keep only the columns we need:
# expected_answer: the reference answer
# problem: the problem statement
# generated_solution: the generated solution (reasoning trace)
dataset = dataset.to_pandas()[
    ["expected_answer", "problem", "generated_solution"]
]

# Data cleaning: try to parse each expected answer as a number.
# With errors="coerce", values that cannot be parsed become NaN instead of raising,
# so .notnull() yields True exactly for the answers that are numeric.
is_number = pd.to_numeric(pd.Series(dataset["expected_answer"]), errors = "coerce").notnull()

# Keep only the rows whose expected answer parsed as a number.
# np.where(is_number)[0] gives the positional indices of the True entries.
dataset = dataset.iloc[np.where(is_number)[0]]

# Display the filtered dataset.
dataset

Unnamed: 0,expected_answer,problem,generated_solution
0,14,Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$...,"<think>\nOkay, let's see. I need to solve the ..."
6,-2,Find the value of the parameter $a$ for which ...,"<think>\nOkay, so I need to find the value of ..."
9,18,What is the sum of all real numbers $x$ for wh...,"<think>\nOkay, so I need to solve the equation..."
13,2,Evaluate the sum \(\sum_{n=1}^\infty \frac{\ph...,"<think>\nOkay, so I need to evaluate the infin..."
17,30,What is the largest positive integer that divi...,"<think>\nAlright, so I need to find the larges..."
...,...,...,...
19243,244,"Let \( p \), \( q \), and \( r \) be the disti...","<think>\nOkay, so I need to find the value of ..."
19245,1,A bug is on the $0$ of a number line. At any p...,"<think>\nOkay, so I have this problem where a ..."
19247,4,A bus left point X for point Y. Two hours late...,"<think>\nOkay, let's tackle this problem step ..."
19248,18,Each interior angle of a regular n-gon measure...,"<think>\nOkay, let's see. I need to find the n..."




We have to format the dataset to follow our GRPO style formatting:

In [16]:
def format_dataset(x):
    """Turn one dataset row into a system/user/assistant message list.

    The row's "generated_solution" has its "<think>" / "</think>" tags removed
    and surrounding whitespace stripped; it is then wrapped in the notebook's
    reasoning markers, followed by the row's "expected_answer" wrapped in the
    solution markers.

    Args:
        x: Mapping-like row with the keys "expected_answer", "problem" and
            "generated_solution" (all strings).

    Returns:
        A list of three message dicts (system, user, assistant), each with
        "role" and "content" keys.
    """
    answer_text = x["expected_answer"]
    question_text = x["problem"]

    # Drop the original <think> tags: the custom working-out markers replace them.
    trace = x["generated_solution"]
    for tag in ("<think>", "</think>"):
        trace = trace.replace(tag, "")
    trace = trace.strip()

    # reasoning section first, then the reference answer as the solution section
    assistant_reply = "".join([
        reasoning_start, trace, reasoning_end,
        solution_start, answer_text, solution_end,
    ])

    messages = []
    for role, content in (
        ("system", system_prompt),
        ("user", question_text),
        ("assistant", assistant_reply),
    ):
        messages.append({"role" : role, "content" : content})
    return messages

# Apply format_dataset to every row (axis = 1) and store the resulting
# message lists in a new "Messages" column.
dataset["Messages"] = dataset.apply(format_dataset, axis = 1)

Check to see if it worked:

In [17]:
# Render the first conversation with the tokenizer's chat template.
# dataset["Messages"][0] is a list holding the system, user and assistant messages.
# tokenize=False returns the formatted text instead of token IDs.
tokenizer.apply_chat_template(
    dataset["Messages"][0],  # first conversation (an already structured message list)
    tokenize=False           # return text rather than a list of token IDs
)

"You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Given $\\sqrt{x^2+165}-\\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of $x$.<start_working_out>Okay, let's see. I need to solve the equation √(x² + 165) - √(x² - 52) = 7, and find all positive values of x. Hmm, radicals can be tricky, but maybe if I can eliminate the square roots by squaring both sides. Let me try that.\n\nFirst, let me write down the equation again to make sure I have it right:\n\n√(x² + 165) - √(x² - 52) = 7.\n\nOkay, so the idea is to isolate one of the radicals and then square both sides. Let me try moving the second radical to the other side:\n\n√(x² + 165) = 7 + √(x² - 52).\n\nNow, if I square both sides, maybe I can get rid of the square roots. Let's do that:\n\n(√(x² + 165))² = (7 + √(x² - 52))².\n\nSimplifying the left side:\n\nx² + 

Let's truncate the pre fine-tuning dataset to `max_seq_length/2` since we don't want too long reasoning traces.

Note this might take 2 minutes!

In [18]:
# Count the tokens of every conversation and store the count in a new column "N".
# tokenizer.apply_chat_template(x) renders the messages with the chat template;
# with the default tokenize=True it returns token IDs, so len(...) is the token count.
dataset["N"] = dataset["Messages"].apply(lambda x: len(tokenizer.apply_chat_template(x)))

# Drop samples longer than half of max_seq_length
# (max_seq_length is the sequence-length budget set when the model was loaded, 2048 in this notebook).
# Keeping only N <= max_seq_length/2 leaves a safety margin and avoids overly long reasoning traces.
dataset = dataset.loc[dataset["N"] <= max_seq_length/2].copy()

# Show the shape (rows, columns) of the filtered dataset.
dataset.shape

(59, 5)

We then tokenize the messages and convert it to a Hugging Face compatible dataset format:

In [19]:
from datasets import Dataset

# Render every structured conversation into plain training text.
# dataset["Messages"].values.tolist() is the list of all conversations;
# tokenize=False returns text strings rather than token IDs.
# The result goes into a new "text" column; per the chat template each entry is:
#   [system prompt]<eos_token>
#   [user question]
#   [assistant reply]<eos_token>
dataset["text"] = tokenizer.apply_chat_template(
    dataset["Messages"].values.tolist(),
    tokenize=False
)

# Convert the pandas DataFrame into a Hugging Face Dataset object,
# which is the format the trainer below consumes.
dataset = Dataset.from_pandas(dataset)

# Display the dataset structure.
dataset

Dataset({
    features: ['expected_answer', 'problem', 'generated_solution', 'Messages', 'N', 'text', '__index_level_0__'],
    num_rows: 59
})

Let's now pre fine-tune the model so it follows our custom GRPO formatting!

In [20]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning (SFT) trainer used to teach the model the output format.
trainer = SFTTrainer(
    model=model,                      # model to fine-tune (with the LoRA adapters attached)
    tokenizer=tokenizer,              # tokenizer used to process the text
    train_dataset=dataset,            # training set; contains the formatted "text" column

    # Training configuration
    args=SFTConfig(
        dataset_text_field="text",    # name of the column holding the training text
        per_device_train_batch_size=1, # training batch size per device
        gradient_accumulation_steps=1, # gradient accumulation steps (raise to emulate a larger batch)

        # Learning-rate schedule
        warmup_steps=5,                # number of warm-up steps
        num_train_epochs=2,            # number of training epochs
        learning_rate=2e-4,            # initial learning rate; lower to 2e-5 for long runs
        lr_scheduler_type="linear",    # linear learning-rate decay

        # Optimizer and regularization
        optim="adamw_8bit",            # 8-bit AdamW optimizer to save GPU memory
        weight_decay=0.01,             # weight decay, guards against overfitting

        # Logging and randomness
        logging_steps=5,               # log training metrics every 5 steps
        seed=3407,                     # fixed random seed for reproducibility
        report_to="none",              # do not report to external tools (e.g. WandB)
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/59 [00:00<?, ? examples/s]

In [21]:
# Start training!
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 59 | Num Epochs = 2 | Total steps = 118
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 66,060,288/4,088,528,384 (1.62% trained)


Step,Training Loss
5,0.6449
10,0.6396
15,0.4192
20,0.3896
25,0.4224
30,0.4484
35,0.4751
40,0.4193
45,0.4454
50,0.3282


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=118, training_loss=0.3486055499416287, metrics={'train_runtime': 177.3531, 'train_samples_per_second': 0.665, 'train_steps_per_second': 0.665, 'total_flos': 2374193075607552.0, 'train_loss': 0.3486055499416287})

Let's check if the model has learnt to follow the custom format:

In [22]:
# Build the model input: render the conversation history with the chat template.
# dataset[0]["Messages"][:2] takes the first two messages of the first sample (system prompt + user question).
# add_generation_prompt=True appends <start_working_out> so the model starts reasoning.
text = tokenizer.apply_chat_template(
    dataset[0]["Messages"][:2],
    tokenize=False,
    add_generation_prompt=True,  # required when generating
)

from transformers import TextStreamer

# Run generation with the model.
# tokenizer(...) turns the text into input tensors for the model
# return_tensors="pt" returns PyTorch tensors
# temperature=0 is intended to give deterministic (greedy) decoding
# max_new_tokens=1024 caps the number of generated tokens
# streamer prints the output while it is being generated
_ = model.generate(
    **tokenizer(text, return_tensors="pt").to("cuda"),
    temperature=0,
    max_new_tokens=1024,
    streamer=TextStreamer(tokenizer, skip_prompt=False),  # stream the text (prompt included) as it is generated
)

You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Jenifer has 82 cents in pennies and nickels. Her younger brother mistook all her nickels for dimes and counted the total as $1.47. How many pennies does Jenifer have?<start_working_out>Okay, let's see. Jenifer has 82 cents in pennies and nickels. Her brother thought all the nickels were dimes and counted the total as $1.47. I need to find out how many pennies she has. Hmm, let's break this down.

First, I need to set up some equations. Let's say the number of pennies is P and the number of nickels is N. Since pennies are worth 1 cent each and nickels are 5 cents each, the total value is 1P + 5N = 82 cents. That's the first equation.

Now, her brother thought all the nickels were dimes. Dimes are 10 cents each. So, he counted the total as $1.47, which is 147 cents. So, the equation

Yes it did follow the formatting! Great! Let's remove some items before the GRPO step

In [23]:
# Release the memory held by the pre-fine-tuning dataset.
del dataset  # drop the name bound to the Hugging Face Dataset built for SFT

# Clear PyTorch's GPU cache.
torch.cuda.empty_cache()  # release unused cached GPU memory before the next training stage

# Trigger Python's garbage collector.
import gc
gc.collect()  # force collection of objects that are no longer referenced

0

### Data Prep
<a name="Data"></a>

We're using Hugging Face's [Open R1 Math dataset](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed). You can also utilize OpenAI's famous [GSM8K dataset](https://huggingface.co/datasets/openai/gsm8k)

In [24]:
from datasets import load_dataset

# Load the math problem dataset from the Hugging Face Hub.
# "open-r1/DAPO-Math-17k-Processed": dataset name (math problems with solutions)
# "en": configuration name, selects the English version
# split="train": load the training split
dataset = load_dataset("open-r1/DAPO-Math-17k-Processed", "en", split = "train")

# Display the dataset structure (row count, column names).
dataset

README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

en/train-00000-of-00001.parquet:   0%|          | 0.00/5.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14116 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'solution', 'data_source', 'source_prompt', 'ability', 'reward_model', 'extra_info'],
    num_rows: 14116
})

Let's look at the first row:

In [25]:
dataset[0]["prompt"]

'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$ and $\\angle BDC = 90^\\circ$. Suppose that $AD = 1$ and that $\\frac{BD}{CD} = \\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'

In [26]:
dataset[0]["solution"]

'34'

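In GSM8K, we notice that every answer ends with a `####` marker followed by the final result, so we would extract the text after that marker. For the Open R1 dataset the solution is already the bare answer, so we can skip the extraction below.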

In [28]:
def extract_hash_answer(text, delimiter=None):
    """Return the reference answer contained in a dataset solution string.

    With the default ``delimiter=None`` the text is returned unchanged, which
    is what the Open R1 / DAPO dataset needs because its "solution" field is
    already the bare answer.  For GSM8K-style solutions, where the final
    answer follows a "####" marker, pass ``delimiter="####"``.

    Args:
        text: The solution string taken from the dataset.
        delimiter: Optional marker that precedes the final answer.  When
            None (the default) no extraction is performed.

    Returns:
        ``text`` itself when ``delimiter`` is None; otherwise the stripped
        text that follows the first occurrence of ``delimiter``, or None if
        the delimiter does not occur in ``text``.
    """
    if delimiter is None:
        # Default path: the solution already is the answer.
        return text
    if delimiter not in text:
        return None
    return text.split(delimiter)[1].strip()

# Sanity check: run the helper on the solution of the first dataset sample.
extract_hash_answer(dataset[0]["solution"])

'34'

Let's map the dataset! and see the first row:

In [29]:
# Reformat every sample with map: "prompt" becomes a chat-style message list
# and a new "answer" column holds the reference answer.
dataset = dataset.map(lambda x: {
    "prompt": [
        {"role": "system", "content": system_prompt},  # system prompt
        {"role": "user", "content": x["prompt"]},      # the user's question
    ],
    "answer": extract_hash_answer(x["solution"]),  # reference answer taken from the solution field
})

# Show the first processed sample to verify the format.
dataset[0]

Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

{'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>',
   'role': 'system'},
  {'content': 'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$ and $\\angle BDC = 90^\\circ$. Suppose that $AD = 1$ and that $\\frac{BD}{CD} = \\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.',
   'role': 'user'}],
 'solution': '34',
 'data_source': 'math_dapo',
 'source_prompt': [{'content': 'Solve the following math problem step by step. The last line of your response should be of the form Answer: $Answer (without quotes) where $Answer is the answer to the problem.\n\nIn triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $

We create a regex format to match the reasoning sections and answers:

In [32]:
import re

# Regular expression that checks the format of a model completion and captures the answer.
# It matches: <end_working_out>, any text, <SOLUTION>answer</SOLUTION>, optional whitespace,
# an optional EOS token and optional whitespace up to the end of a line or of the text.
# The reasoning-start marker is not part of the pattern.

# Pattern for the closing solution tag:
# allows optional whitespace and an optional EOS token after </SOLUTION>.
solution_end_regex = r"</SOLUTION>[\s]{0,}" + \
    "(?:" + re.escape(tokenizer.eos_token) + ")?"

# Group 1 captures the text between the solution tags.
# re.DOTALL lets "." span newlines; re.MULTILINE lets "$" match at the end of any line.
match_format = re.compile(
    rf"{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end_regex}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)
match_format

re.compile(r'<end_working_out>.*?<SOLUTION>(.+?)</SOLUTION>[\s]{0,}(?:<\|endoftext\|>)?[\s]{0,}$',
re.MULTILINE|re.DOTALL|re.UNICODE)

We verify it works:

In [33]:
match_format.findall(
    "Let me think!<end_working_out>"\
    f"<SOLUTION>\n2\n</SOLUTION>",
)

['\n2\n']

In [34]:
match_format.findall(
    "<start_working_out>Let me think!<end_working_out>"\
    f"<SOLUTION>  2  </SOLUTION>\n\n",
)

['  2  ']

We now want to create a reward function to match the format exactly - we reward it with 3 points if it succeeds:

In [39]:
def match_format_exactly(completions, **kwargs):
    """Reward completions that follow the expected output format exactly.

    Args:
        completions: List of completions, each a list of message dicts; the
            text that is scored is ``completion[0]["content"]``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        One score per completion: 3.0 when the module-level ``match_format``
        pattern is found in the text, otherwise 0.
    """
    return [
        3.0 if match_format.search(candidate[0]["content"]) is not None else 0
        for candidate in completions
    ]

If it fails, we want to reward the model if it at least follows the format partially, by counting each symbol:

In [40]:
def match_format_approximately(completions, **kwargs):
    """Give partial credit for completions that roughly follow the format.

    Each of the three markers ``reasoning_end``, ``solution_start`` and
    ``solution_end`` contributes +0.5 when it occurs exactly once in the
    text and -1.0 otherwise.  ``reasoning_start`` is deliberately not scored
    because the generation prompt already supplies it.

    Args:
        completions: List of completions, each a list of message dicts; the
            text that is scored is ``completion[0]["content"]``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        One score per completion, between -3.0 and 1.5.
    """
    def marker_score(text):
        # Sum the per-marker contributions in a fixed order.
        total = 0
        for marker in (reasoning_end, solution_start, solution_end):
            if text.count(marker) == 1:
                total += 0.5
            else:
                total += -1.0
        return total

    return [marker_score(candidate[0]["content"]) for candidate in completions]

Finally, we want to extract the generated answer, and reward or penalize it! We also reward it based on how close the answer is to the true one via ratios:

In [42]:
def check_answer(prompts, completions, answer, **kwargs):
    """Score each completion by how well its extracted answer matches the reference.

    The answer is the text captured by the module-level ``match_format``
    pattern (the content between the solution tags).  Scoring per completion:

    * -2.0  no answer could be extracted (format not matched)
    * +5.0  extracted text equals the reference exactly
    * +3.5  equal after stripping surrounding whitespace
    * +2.0  both parse as numbers and guess/reference is within [0.9, 1.1]
            (for a reference of exactly 0: the guess is exactly 0 as well)
    * +1.5  ratio within [0.8, 1.2]
    * -2.5  numeric but further away
    * -4.5  the guess or the reference cannot be parsed as a number

    Args:
        prompts: List of prompts; accepted for interface compatibility and
            not used for scoring.
        completions: List of completions, each a list of message dicts; the
            scored text is ``completion[0]["content"]``.
        answer: List of reference answer strings, aligned with ``completions``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        A list with one numeric score per completion.
    """
    responses = [completion[0]["content"] for completion in completions]

    # Pull the answer text out of every response; None when the format is not matched.
    extracted_responses = [
        guess.group(1) if (guess := match_format.search(r)) is not None else None
        for r in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        # Case 1: nothing could be extracted.
        if guess is None:
            scores.append(-2.0)
            continue

        # Case 2: exact match, whitespace included.
        if guess == true_answer:
            scores.append(5.0)
            continue

        # Case 3: match once surrounding whitespace is ignored.
        if guess.strip() == true_answer.strip():
            scores.append(3.5)
            continue

        # Case 4: fall back to a numeric comparison.
        try:
            guess_value = float(guess)
            true_value = float(true_answer)
        except (TypeError, ValueError):
            # Only parse failures count as "not a number"; other errors
            # (and KeyboardInterrupt) are allowed to propagate.
            scores.append(-4.5)
            continue

        if true_value == 0:
            # A ratio is undefined for a zero reference, so compare directly
            # instead of letting the division fail.
            scores.append(2.0 if guess_value == 0 else -2.5)
            continue

        ratio = guess_value / true_value
        if 0.9 <= ratio <= 1.1:
            scores.append(2.0)   # within 10%
        elif 0.8 <= ratio <= 1.2:
            scores.append(1.5)   # within 20%
        else:
            scores.append(-2.5)  # too far off
    return scores

Also sometimes it might not be 1 number as the answer, but like a sentence for example "The solution is $20" -> we extract 20.

We also remove possible commas for example as in 123,456

In [43]:
# Regular expression that captures the first number-like token after the <SOLUTION> tag.
# The captured group allows an optional leading minus sign followed by digits,
# decimal points and thousands-separator commas.
match_numbers = re.compile(
    solution_start + r".*?[\s]{0,}([-]?[\d\.\,]{1,})",
    flags = re.MULTILINE | re.DOTALL
)

# Try the pattern on a few number formats.
print(match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>"))    # decimal
print(match_numbers.findall("<SOLUTION>  123,456  </SOLUTION>"))  # thousands separator
print(match_numbers.findall("<SOLUTION>  -0.234  </SOLUTION>"))   # negative number
print(match_numbers.findall("<SOLUTION>17</SOLUTION>"))           # integer

['0.34']
['123,456']
['-0.234']
['17']


We now prepare our main function which will print out the generated responses and the true answer, along with another reward function which converts text to float via `float` and sees if it's the same.

In [44]:
# Module-level counters that throttle the debug print inside check_numbers.
PRINTED_TIMES = 0       # how many times check_numbers has been called so far
PRINT_EVERY_STEPS = 5   # print one sample every this many calls

def check_numbers(prompts, completions, answer, **kwargs):
    """Score completions by strict numeric equality with the reference answer.

    The number is taken from each response with the module-level
    ``match_numbers`` pattern (first number-like token after the solution
    start tag).  Commas are removed from the guess before conversion, so
    "123,456" is read as 123456.

    Scoring per completion:

    * -2.5  no number could be extracted
    * +3.5  guess equals the reference as a float
    * -1.5  guess is numeric but different
    *  0    the guess or the reference cannot be converted to a float

    Side effect: every ``PRINT_EVERY_STEPS``-th call prints the question, the
    reference answer, the first response and its extracted number.

    Args:
        prompts: List of prompts (message lists); the last message of the
            first prompt is printed as the question.
        completions: List of completions, each a list of message dicts; the
            scored text is ``completion[0]["content"]``.
        answer: List of reference answer strings, aligned with ``completions``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        A list with one numeric score per completion.
    """
    global PRINTED_TIMES

    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    # Extract the number-like token from every response; None when nothing matches.
    extracted_responses = [
        guess.group(1) if (guess := match_numbers.search(r)) is not None else None
        for r in responses
    ]

    # Print a debug sample only every PRINT_EVERY_STEPS calls.
    if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
        print(
            '*'*20 +
            f"\nQuestion:\n{question}" +
            f"\nAnswer:\n{answer[0]}" +
            f"\nResponse:\n{responses[0]}" +
            f"\nExtracted:\n{extracted_responses[0]}"
        )
    PRINTED_TIMES += 1

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        # Case 1: no number could be extracted.
        if guess is None:
            scores.append(-2.5)
            continue

        # Case 2: convert both sides to float and compare.
        try:
            true_value = float(true_answer.strip())
            guess_value = float(guess.strip().replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            # Case 3: conversion failed (not a number).  Only conversion
            # errors are handled here so that interrupts and unrelated
            # failures are not silently swallowed.
            scores.append(0)
            continue

        # Strict comparison: exact equality earns 3.5, anything else costs 1.5.
        scores.append(3.5 if guess_value == true_value else -1.5)

    return scores

Compute the 90th percentile of the prompt lengths (in tokens) so that we don't accidentally truncate prompts!

I.e. we keep the prompts at or below that length and remove the longest ~10% of prompts.

In [45]:
# Tokenize every prompt by rendering it with the chat template.
# add_generation_prompt=True: append the generation marker (<start_working_out>)
# tokenize=True: return token IDs rather than text
tokenized = dataset.map(
    lambda x: {"tokens": tokenizer.apply_chat_template(
        x["prompt"],
        add_generation_prompt=True,
        tokenize=True
    )},
    batched=True,  # process the samples in batches for efficiency
)

# Sanity check: decode and print the tokens of the first sample.
print(tokenizer.decode(tokenized[0]["tokens"]))

# Compute the token length of every sample.
tokenized = tokenized.map(lambda x: {"L": len(x["tokens"])})

# Compute the 90% quantile of the token lengths,
# i.e. 90% of the samples are no longer than this value.
import numpy as np
maximum_length = int(np.quantile(tokenized["L"], 0.9))
print("Max Length = ", maximum_length)

# Keep only the samples whose prompt length is within the 90% quantile,
# so that the remaining prompts are not truncated during training.
dataset = dataset.select(np.where(np.array(tokenized["L"]) <= maximum_length)[0])

# Release the memory held by the tokenized copy.
del tokenized

Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle DAC$ and $\angle BDC = 90^\circ$. Suppose that $AD = 1$ and that $\frac{BD}{CD} = \frac{3}{2}$. If $AB + AC$ can be expressed in the form $\frac{a\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.<start_working_out>


Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

Max Length =  201


<a name="Train"></a>
### Train the model

Now set up GRPO Trainer and all configurations!

In [46]:
# Maximum prompt length, based on the 90% quantile computed earlier.
max_prompt_length = maximum_length + 1  # +1 as a safety margin

# Maximum completion length, chosen so prompt + completion never exceeds max_seq_length.
max_completion_length = max_seq_length - max_prompt_length

from vllm import SamplingParams

# vLLM sampling parameters (control how completions are generated).
vllm_sampling_params = SamplingParams(
    min_p = 0.1,                # minimum-probability threshold; filters out very unlikely tokens
    top_p = 1.0,                # nucleus sampling disabled (use the full distribution)
    top_k = -1,                 # top-k sampling disabled (-1 = no limit)
    seed = 3407,                # fixed random seed for reproducibility
    stop = [tokenizer.eos_token],  # stop generating at the EOS token
    include_stop_str_in_output = True,  # keep the stop string in the returned text
)

from trl import GRPOConfig, GRPOTrainer

# Training configuration for GRPO (Group Relative Policy Optimization).
training_args = GRPOConfig(
    vllm_sampling_params = vllm_sampling_params,  # sampling strategy defined above
    temperature = 1.0,            # sampling temperature (1.0 = leave the distribution unscaled)
    learning_rate = 5e-6,         # learning rate (kept small for this fine-tuning stage)
    weight_decay = 0.01,          # weight decay
    warmup_ratio = 0.1,           # fraction of steps used for learning-rate warm-up
    lr_scheduler_type = "linear", # linear learning-rate decay
    optim = "adamw_8bit",         # 8-bit AdamW optimizer to save GPU memory
    logging_steps = 1,            # log every step
    per_device_train_batch_size = 1,  # batch size per device (Unsloth raises it to num_generations, see the output below)
    gradient_accumulation_steps = 1,  # gradient accumulation steps (increase to 4 for smoother training)
    num_generations = 4,          # completions generated per prompt (decrease if out of memory)
    max_prompt_length = max_prompt_length,  # maximum prompt length
    max_completion_length = max_completion_length,  # maximum completion length
    max_steps = 100,              # maximum number of training steps
    save_steps = 100,             # save a checkpoint every 100 steps
    report_to = "none",           # do not report to external tools
    output_dir = "outputs",       # output directory

    # Optional training + evaluation settings (disabled)
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!

You might have to wait 150 to 200 steps for any action. You'll probably get 0 reward for the first 100 steps. Please be patient!

| Step | Training Loss | reward    | reward_std | completion_length | kl       |
|------|---------------|-----------|------------|-------------------|----------|
| 1    | 0.000000      | 0.125000  | 0.000000   | 200.000000        | 0.000000 |
| 2    | 0.000000      | 0.072375  | 0.248112   | 200.000000        | 0.000000 |
| 3    | 0.000000      | -0.079000 | 0.163776   | 182.500000        | 0.000005 |


In [None]:
# Reward functions used to score every sampled completion. They are defined
# earlier in the notebook; each one is reported as its own "rewards / <name>"
# column in the training log.
reward_functions = [
    match_format_exactly,        # reward for following the expected output format exactly
    match_format_approximately,  # partial credit based on the format markers
    check_answer,                # compares the extracted answer with the reference
    check_numbers,               # compares the extracted number with the reference
]

# Build the GRPO (Group Relative Policy Optimization) trainer.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = reward_functions,
    args = training_args,
    train_dataset = dataset,

    # Optional training + evaluation setup (disabled):
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)

# Run training; the combined reward from the functions above is the signal
# being optimised.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,709 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 66,060,288/4,088,528,384 (1.62% trained)


********************
Question:
Compute the number of positive integers that divide at least two of the integers in the set $\{1^1,2^2,3^3,4^4,5^5,6^6,7^7,8^8,9^9,10^{10}\}$.
Answer:
22
Response:
Okay, so I need to find the number of positive integers that divide at least two of the numbers in the set {1^1, 2^2, 3^3, ..., 10^10}. Hmm, let's start by understanding what the problem is asking. We have a set where each element is a number raised to its own power, from 1 to 10. The goal is to find how many positive integers are divisors of at least two different numbers in this set.

First, I should probably list out the elements of the set to see what they are. Let me compute each one:

1. 1^1 = 1
2. 2^2 = 4
3. 3^3 = 27
4. 4^4 = 256
5. 5^5 = 3125
6. 6^6 = 46656
7. 7^7 = 823543
8. 8^8 = 16777216
9. 9^9 = 387420489
10. 10^10 = 10000000000

Wait, those are all the elements. Now, I need to find all positive integers that divide at least two of these numbers. The key here is that a number can di

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.0055,-7.5,0.0,1846.0,0.136793,0.0,-3.0,-2.0,-2.5
2,0.0061,-3.5,4.618802,1703.25,0.153148,1.5,-0.75,-2.25,-2.0
3,0.0058,-7.5,0.0,1846.0,0.145551,0.0,-3.0,-2.0,-2.5
4,0.0048,-7.5,0.0,1846.0,0.119565,0.0,-3.0,-2.0,-2.5
5,0.0054,13.0,0.0,1381.0,0.135236,3.0,1.5,5.0,3.5
6,0.0046,-7.5,0.0,1846.0,0.115762,0.0,-3.0,-2.0,-2.5
7,0.0053,-3.5,4.618802,1690.5,0.132705,1.5,-0.75,-2.25,-2.0
8,0.0047,-2.375,10.25,1838.0,0.117301,0.75,-1.875,-0.25,-1.0
9,0.0059,-5.5,4.0,1700.25,0.14769,0.75,-1.875,-2.125,-2.25


********************
Question:
The sum of $\lfloor x \rfloor$ for all real numbers $x$ satisfying the equation $16 + 15x + 15x^2 = \lfloor x \rfloor^3$ is:
Answer:
33
Response:
Okay, let's try to solve this problem. So the equation given is 16 + 15x + 15x² = ⌊x⌋³. I need to find all real numbers x that satisfy this equation and then compute the sum of ⌊x⌋ for those solutions. Hmm, OK.

First, I remember that ⌊x⌋ is the floor function, which gives the greatest integer less than or equal to x. So maybe I should consider different integer values for ⌊x⌋ and see if they work. Let me denote k = ⌊x⌋, where k is an integer. Then the equation becomes 16 + 15x + 15x² = k³. But since k ≤ x < k+1, maybe I can use this to find possible values of k.

Let me rearrange the equation to express x in terms of k. Starting with 15x² + 15x + 16 - k³ = 0. Wait, maybe it's better to keep it as is and think about possible integer values of k that make sense here.

Since k is the floor of x, x is between k and

<a name="Inference"></a>
### Inference
Now let's try the model we just trained! First, let's try the model without any GRPO training:

In [None]:
from vllm import SamplingParams

# Baseline run: send the raw question (no chat template, no LoRA adapter) to
# the model to see how it answers before GRPO training is applied.
text = "What is the sqrt of 101?"

sampling_params = SamplingParams(
    max_tokens = 1024,   # upper bound on the number of generated tokens
    temperature = 1.0,   # sample stochastically rather than greedily
    top_k = 50,          # sample only from the 50 most likely tokens
)

# fast_generate takes a batch of prompts; a single-element batch is used here.
baseline_results = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,  # no LoRA adapter
)

# First prompt, first sampled completion.
output = baseline_results[0].outputs[0].text

# Display the generated text as the cell result.
output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

" - Answers\nMath and Arithmetic\nGeometry\nNumbers\nSquare Roots\nWhat is the sqrt of 101?\nWiki User\n∙ 2011-05-12 02:55:35\nStudy now\nBest Answer\nCopy\n101 squared is equal to 10201, which is itself a perfectly good\nnumber\nWiki User\n∙ 2011-05-12 02:55:35\nThis answer is:\n🙏\n0\n🤨\n0\n😮\n0\nStudy guides\nAlgebra\n20 cards\nA polynomial of degree zero is a constant term\nThe grouping method of factoring can still be used when only some of the terms share a common factor A True B False\nA number a power of a variable or a product of the two is a monomial while a polynomial is the of monomials\n3.71 (14)\n☆★☆★☆★☆★☆★\n208 Reviews\nHow do you square a number on scientific calculator?\nWhen you want to take the square of 126.45, for example, you\nwould press the numbers, 1 2 6 . 4 5 = x2. If you are using the\ntraditional form of calculator, which may have a square root sign\non it, the square roots would be found by pressing the numbers 1 2\n6 . 4 5 = &radic;\nHow do I find the squar

And now with the LoRA we just trained with GRPO - we save the LoRA first!

In [None]:
# Save the LoRA (Low-Rank Adaptation) adapter weights to the directory
# "grpo_saved_lora". Only the adapter is written, not the base model.
model.save_lora("grpo_saved_lora")

# What ends up on disk:
# - The verification cell that follows opens
#   "grpo_saved_lora/adapter_model.safetensors", so the adapter weights are
#   expected in safetensors format.
# - Presumably an adapter_config.json is written next to it, as is usual for
#   PEFT-style adapters - TODO confirm.

# How the adapter is used afterwards in this notebook:
# 1. Keep the base model loaded.
# 2. Load the adapter with model.load_lora("grpo_saved_lora").
# 3. Pass the result as `lora_request` when generating.

# Example (mirrors the inference cell further below):
# output = model.fast_generate(
#     text,
#     sampling_params = sampling_params,
#     lora_request = model.load_lora("grpo_saved_lora"),
# )

Verify LoRA is actually trained!

In [None]:
from safetensors import safe_open

# Optional container for the loaded tensors (see the commented-out line at the
# end of the loop); nothing below reads it.
tensors = {}

# Sanity-check the saved adapter: every tensor stored in the LoRA checkpoint
# must contain at least one non-zero value. An all-zero tensor would suggest
# that training never updated that weight.
with safe_open("grpo_saved_lora/adapter_model.safetensors", framework="pt") as f:
    for key in f.keys():
        tensor = f.get_tensor(key)

        # Count the non-zero entries and compare against zero. Comparing a
        # zero *fraction* (which is at most 1.0) with numel() would never fail
        # for any real weight matrix, so the check is done on counts instead.
        n_nonzero = int((tensor != 0).sum())
        assert n_nonzero > 0, (
            f"LoRA tensor {key!r} is entirely zero - the adapter does not look trained"
        )

        # Optional: keep the tensor around for later inspection.
        # tensors[key] = tensor

Now we load the LoRA and test:

In [None]:
from vllm import SamplingParams

# Conversation to send: the system prompt describing the expected reasoning
# format, followed by the user's question.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "What is the sqrt of 101?"},
]

# Render the conversation to a plain prompt string (tokenize = False) and
# append the generation prompt so the model continues from the point where its
# own answer should start.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
)

sampling_params = SamplingParams(
    max_tokens = 2048,   # cap on generated tokens; NOTE(review): prompt + 2048 can exceed max_seq_length
    temperature = 1.0,   # sample stochastically rather than greedily
    top_k = 50,          # sample only from the 50 most likely tokens
)

# Load the adapter saved earlier and generate with it applied.
grpo_lora = model.load_lora("grpo_saved_lora")
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = grpo_lora,
)[0].outputs[0].text  # first prompt, first sampled completion

# Display the generated answer as the cell result.
output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

"Okay, so I need to find the square root of 101. Hmm, I remember that square roots are about finding a number that, when multiplied by itself, gives the original number. But 101 isn't a perfect square, right? Let me think. The squares around 101 would be 100 (which is 10 squared) and 121 (which is 11 squared). So the square root of 101 is somewhere between 10 and 11. \n\nBut the problem is asking for the square root, which might be an irrational number. So maybe it's a decimal? Let me try to estimate. I need to find the number that's closer to 10.5. Let me test 10.05. If I square that, it's (10 + 0.05)*(10 + 0.05) = 100 + 0.05*10 + 0.05*10 + 0.05*0.05 = 100 + 0.5 + 0.5 + 0.0025 = 101.0025. That's very close to 101. So maybe the square root is 10.05 plus or minus a little bit. \n\nBut wait, maybe there's a better way. Since 10.05 squared is 101.0025, which is slightly above 101, the actual square root should be slightly less than 10.05. Let me try 10.04. Squaring 10.04: (10 + 0.04)*(10 

Our reasoning model is much better - it's not always correct, since we only trained it for an hour or so - it'll be better if we extend the sequence length and train for longer!

<a name="Save"></a>
### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Export options for the fine-tuned model. Every option is disabled with
# `if False:` so that nothing is saved or uploaded by accident; change the
# condition to True for the option you want.
# The empty `token = ""` arguments are placeholders for a Hugging Face access
# token - supply your own when enabling an upload, and avoid committing it.

# Option 1: merge the LoRA weights into the base model and save as float16.
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
    # Optional: upload the merged model to the Hugging Face Hub.
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Option 2: merge the LoRA weights into the base model and save as int4.
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit")
    # Optional: upload the merged model to the Hugging Face Hub.
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Option 3: save only the LoRA adapter (nothing is merged into the base model).
# The adapter has to be used together with the base model.
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "lora")
    # Optional: upload the adapter to the Hugging Face Hub.
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [None]:
# GGUF / llama.cpp export options. Every option is disabled with `if False:`
# so that nothing is converted or uploaded by accident; change the condition
# to True for the option you want.
# The empty `token = ""` arguments are placeholders for a Hugging Face access
# token - supply your own when enabling an upload, and avoid committing it.

# Option 1: GGUF with the default quantization (q8_0 according to the notes
# above this cell).
if False:
    model.save_pretrained_gguf("model", tokenizer)
    # Optional: upload the GGUF file to the Hugging Face Hub.
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Option 2: GGUF in 16-bit floating point ("f16"), i.e. without quantizing to
# a lower bit width.
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Option 3: GGUF quantized with q4_k_m (listed as recommended above: Q6_K for
# half of the attention.wv and feed_forward.w2 tensors, Q4_K for the rest).
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Option 4: upload several quantizations in one call by passing a list of
# quantization methods.
if False:
    model.push_to_hub_gguf(
        "hf/model",  # replace with "<your HF username>/<model name>"
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m"],  # all three formats are exported
        token = "",  # Hugging Face access token
    )