In [15]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import trl
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, get_peft_model, TaskType


In [16]:
# Display the installed PyTorch build; the recorded output of this run is '2.3.0+cu121'.
# NOTE(review): `torch` is already imported in the first cell, so this import is redundant.
import torch
torch.__version__

'2.3.0+cu121'

In [17]:
# System prompt placed in front of every training example by process_data().
# The Chinese sentence means "Generate using the following format:" and is followed by a
# <think>...</think> / <answer>...</answer> template. The format reward functions in this
# cell check completions against these same tags.
SYSTEM_PROMPT = """
按照如下格式生成：
<think>
...
</think>
<answer>
...
</answer>
"""
def process_data(data):
    """Attach a chat-style prompt and the reference answer to every record.

    Args:
        data: An object exposing ``map(fn)`` whose records carry the keys
            ``'question_zh-cn'`` and ``'answer_only'``.

    Returns:
        The result of ``data.map``. Each record gains a ``'prompt'`` field
        (a system turn holding ``SYSTEM_PROMPT`` followed by a user turn
        holding the Chinese question) and an ``'answer'`` field copied from
        ``'answer_only'``.
    """
    def build_example(row):
        system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_turn = {'role': 'user', 'content': row['question_zh-cn']}
        return {'prompt': [system_turn, user_turn], 'answer': row['answer_only']}

    return data.map(build_example)
def extract_answer(text):
    """Return the stripped text found inside the last ``<answer>`` block.

    Everything after the final ``<answer>`` tag is kept, then cut at the first
    ``</answer>`` that follows. When a tag is absent the corresponding cut is
    skipped, so a string without any tags comes back unchanged apart from
    surrounding whitespace being removed.
    """
    after_open_tag = text.rpartition("<answer>")[2]
    inner = after_open_tag.partition("</answer>")[0]
    return inner.strip()

def mark_num(text):
    """Score how many of the four format markers occur exactly once in ``text``.

    Each of ``"<think>\\n"``, ``"</think>\\n"``, ``"<answer>\\n"`` and
    ``"</answer>\\n"`` (tag immediately followed by a newline) contributes
    0.125 when it appears exactly one time, so the result lies between 0 and 0.5.
    """
    expected_markers = ("<think>\n", "</think>\n", "<answer>\n", "</answer>\n")
    return sum(0.125 for marker in expected_markers if text.count(marker) == 1)

# Reward for producing the correct final answer.
def correctness_reward(prompts, completions, answer, **kwargs):
    """Give 2.0 to every completion whose extracted answer equals the reference.

    Args:
        prompts: Sequence of chat prompts; only the content of the last turn
            of the first prompt is read, for logging.
        completions: Sequence where each item is a list whose first element
            is a dict with a ``'content'`` string.
        answer: Reference answers, aligned with ``completions``; each one is
            compared after conversion with ``str``.
        **kwargs: Ignored.

    Returns:
        A list with 2.0 for an exact string match and 0.0 otherwise.
    """
    responses = []
    extracted_responses = []
    for completion in completions:
        raw_text = completion[0]['content']
        responses.append(raw_text)
        extracted_responses.append(extract_answer(raw_text))

    # Log the first sample of the batch so training progress can be eyeballed.
    print(
        f"问题:\n{prompts[0][-1]['content']}",
        f"\n答案:\n{answer[0]}",
        f"\n模型输出:\n{responses[0]}",
        f"\n提取后的答案:\n{extracted_responses[0]}",
    )

    rewards = []
    for predicted, reference in zip(extracted_responses, answer):
        rewards.append(2.0 if predicted == str(reference) else 0.0)
    return rewards
# Reward for answering with digits. Rewarding only exact correctness is a strict
# criterion that makes the signal sparse and convergence hard, so a smaller reward
# is also given when the extracted answer is at least made of digits (these are
# math problems), even if the value itself is wrong.
def digit_reward(completions, **kwargs):
    """Give 0.5 to every completion whose extracted answer passes ``str.isdigit``.

    Args:
        completions: Sequence where each item is a list whose first element
            is a dict with a ``'content'`` string.
        **kwargs: Ignored.

    Returns:
        A list with 0.5 where the extracted answer consists only of digit
        characters, and 0.0 otherwise (including an empty answer).
    """
    rewards = []
    for completion in completions:
        candidate = extract_answer(completion[0]['content'])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

# Strict format reward.
def hard_format_reward(completions, **kwargs):
    """Give 0.5 to completions that follow the exact line layout of the template.

    The whole completion must be::

        <think>\\n ... \\n</think>\\n<answer>\\n ... \\n</answer>\\n

    Args:
        completions: Sequence where each item is a list whose first element
            is a dict with a ``"content"`` string.
        **kwargs: Ignored.

    Returns:
        A list with 0.5 for each completion matching the layout, else 0.0.
    """
    # The newline before </think> must be escaped ("\n", not a literal "n"),
    # and re.DOTALL is required so that ".*?" can span multi-line reasoning.
    pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    return [
        0.5 if re.match(pattern, response, flags=re.DOTALL) else 0.0
        for response in responses
    ]
# Lenient format reward.
def soft_format_reward(completions, **kwargs):
    """Give 0.5 to completions that start with a think block followed by an answer block.

    Unlike the strict variant, only the tag order is checked: any whitespace
    is allowed between ``</think>`` and ``<answer>``, and text after
    ``</answer>`` is tolerated.

    Args:
        completions: Sequence where each item is a list whose first element
            is a dict with a ``"content"`` string.
        **kwargs: Ignored.

    Returns:
        A list with 0.5 for each completion matching the pattern, else 0.0.
    """
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets ".*?" cross newlines; without it, completions that put the
    # tag contents on their own lines (as the system prompt asks) never match.
    return [
        0.5 if re.match(pattern, response, flags=re.DOTALL) else 0.0
        for response in responses
    ]
# Marker reward (softens the sparsity of the format rewards).
def mark_reward(completions, **kwargs):
    """Score each completion with ``mark_num`` for partially correct tag usage.

    Args:
        completions: Sequence where each item is a list whose first element
            is a dict with a ``"content"`` string.
        **kwargs: Ignored.

    Returns:
        A list holding the ``mark_num`` score of every completion's text.
    """
    rewards = []
    for completion in completions:
        rewards.append(mark_num(completion[0]["content"]))
    return rewards

In [18]:
import os

# Point the Hugging Face cache at a local data directory and route Hub traffic through a mirror.
# NOTE(review): this cell runs after transformers/datasets were imported in the first cell;
# huggingface_hub presumably reads these variables at import time, so setting them here may
# only affect child processes (such as the `!env` check in the next cell) — confirm, or set
# them before the imports.
os.environ['HF_HOME'] = "/root/data/.cache/huggingface_hub"
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"

In [19]:
# Sanity check: confirm HF_HOME is visible in the environment of a spawned shell.
!env | grep HF_HOME

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


HF_HOME=/root/data/.cache/huggingface_hub


In [20]:
# Local snapshot directory of Qwen2.5-0.5B-Instruct inside the Hugging Face cache.
model_name = "/root/data/.cache/huggingface_hub/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/7ae557604adf67be50417f59c2c2f167def9a775"

# Load the model that is fine-tuned below and move it to the GPU. `model.cuda()` is the
# last expression of the cell, so the notebook shows the module repr as the cell output.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.cuda()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [21]:
# Tokenizer loaded from the same local snapshot directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
# Load the local snapshot of the gsm8k_chinese dataset, then add the chat-style `prompt`
# and the `answer` fields to its train split via process_data().
ds = load_dataset('/root/data/.cache/huggingface_hub/hub/datasets--swulling--gsm8k_chinese/snapshots/961c39528fe28d424672b44e768d72d5bf089ca2')
data = process_data(ds['train'])

In [23]:
# Inspect one processed record to confirm the `prompt` and `answer` fields are present.
data[0]


{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 72,
 'question_zh-cn': '纳塔利娅在 4 月份向 48 个朋友出售了视频片段，然后在 5 月份售出了一半的视频片段。娜塔莉亚在四月和五月总共卖出了多少个视频？',
 'answer_only': 72,
 'prompt': [{'content': '\n按照如下格式生成：\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': '纳塔利娅在 4 月份向 48 个朋友出售了视频片段，然后在 5 月份售出了一半的视频片段。娜塔莉亚在四月和五月总共卖出了多少个视频？',
   'role': 'user'}]}

In [24]:
# GRPO training configuration.
# - Checkpoints are written under ./output every 100 steps; the inference section further
#   down loads ./output/checkpoint-100/.
# - per_device_train_batch_size=1 with gradient_accumulation_steps=4, and num_generations=8.
#   NOTE(review): some TRL releases presumably require the effective batch size to be
#   divisible by num_generations — confirm against the installed TRL version.
# - max_completion_length=200: the sample completions printed in the training log end
#   mid-sentence, presumably because of this cap — TODO confirm. A completion cut off
#   before <answer> cannot earn the format or correctness rewards.
# - Metrics are logged every step (logging_steps=1) and reported to TensorBoard.
training_args = GRPOConfig(
        output_dir='./output',
        learning_rate=5e-6,
        adam_beta1 = 0.9,
        adam_beta2 = 0.99,
        weight_decay = 0.1,
        warmup_ratio = 0.1,
        lr_scheduler_type='cosine',
        logging_steps=1,
        bf16=True,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_generations=8,
        max_prompt_length=256,
        max_completion_length=200,
        num_train_epochs=1,
        save_steps=100,
        max_grad_norm=0.1,
        log_on_each_node=False,
        use_vllm=False,
        report_to="tensorboard"
    )

In [25]:
# Build the GRPO trainer from the model, tokenizer, processed dataset and the five reward
# functions defined above. Their maximum per-completion values are 0.5 (mark), 0.5 (soft
# format), 0.5 (hard format), 0.5 (digit) and 2.0 (correctness).
# NOTE(review): presumably TRL combines the individual rewards by summing them — confirm
# against the installed TRL version.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        mark_reward,
        soft_format_reward,
        hard_format_reward,
        digit_reward,
        correctness_reward
        ],
    args = training_args,
    train_dataset=data
)



Detected kernel version 4.18.7, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [26]:
# Run GRPO training, then save the final model to ./output (the same directory that is
# configured as `output_dir` for checkpoints). The question/answer/model-output text
# printed below comes from the print() call inside correctness_reward.
trainer.train()
trainer.save_model('./output')

问题:
艾哈迈德和艾米丽正在进行一场比赛，看谁能获得班上最好的成绩。共有 9 项作业，艾哈迈德在课堂上得了 91 分。 Emily 的得分为 92。最终作业的价值与所有其他作业的价值相同。艾米丽的期末作业得了 90 分。如果所有成绩均为整数，艾哈迈德击败艾米丽所需的最低成绩是多少？ 
答案:
100 
模型输出:
让我们首先计算所有作业的整体价值，并根据这些价值得出艾哈迈德和艾米丽在各个部分的表现。由于所有作业的价值相同，我们可以将它们视为一个数组，然后找到这个数组中，与艾哈迈德和艾米丽在各个部分得分相等的部分。接着，我们需要找出艾哈迈德得分低于这个值的那一项作业。

以下是具体步骤：

1. **计算总和**：艾哈迈德在所有作业上的得分加上艾米丽在所有作业上的得分，也就是183分（91+92+90+91）。
2. **确定总和中与艾哈迈德和艾米丽得分相等的部分**：我们需要找到一个部分，使得这个部分的总分与艾哈迈德和艾米丽得分相等。
   - 相等部分的得分是170分（183/2）。
3. **确定其他作业 
提取后的答案:
让我们首先计算所有作业的整体价值，并根据这些价值得出艾哈迈德和艾米丽在各个部分的表现。由于所有作业的价值相同，我们可以将它们视为一个数组，然后找到这个数组中，与艾哈迈德和艾米丽在各个部分得分相等的部分。接着，我们需要找出艾哈迈德得分低于这个值的那一项作业。

以下是具体步骤：

1. **计算总和**：艾哈迈德在所有作业上的得分加上艾米丽在所有作业上的得分，也就是183分（91+92+90+91）。
2. **确定总和中与艾哈迈德和艾米丽得分相等的部分**：我们需要找到一个部分，使得这个部分的总分与艾哈迈德和艾米丽得分相等。
   - 相等部分的得分是170分（183/2）。
3. **确定其他作业
问题:
水箱上的水表显示水箱已充满 1/3 的水。为了填充水箱，添加了 16 加仑的水。水箱满后可容纳多少加仑的水？ 
答案:
24 
模型输出:
为了解决这个问题，我们首先需要理解水箱已有的水量和添加的水量。

1. 已有的水量是1/3水箱容量的水量是 \( \frac{1}{3} \times \text{水箱容量} \)。
2. 添加的水量是16加仑。

因此，我们可以通过以下公式计算水箱满后可容纳的水量：

\[ \text{满后水量} 

In [None]:
# 'prompt': [{'content': '\n按照如下格式生成：\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
#    'role': 'system'},
#   {'content': '纳塔利娅在 4 月份向 48 个朋友出售了视频片段，然后在 5 月份售出了一半的视频片段。娜塔莉亚在四月和五月总共卖出了多少个视频？',
#    'role': 'user'}]}

In [1]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import trl
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, get_peft_model, TaskType


In [3]:
# 加载权重
# Load the fine-tuned weights and the tokenizer from the ./output/checkpoint-100/ directory.
model = AutoModelForCausalLM.from_pretrained("./output/checkpoint-100/")
tokenizer = AutoTokenizer.from_pretrained('./output/checkpoint-100/')

In [4]:
# Check which device the freshly loaded model is on (the recorded output shows the CPU).
model.device

device(type='cpu')

In [5]:
# Move the fine-tuned model to the GPU; the module repr is displayed as the cell output.
model.cuda()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [18]:
def inference(model, prompt):
    """Generate one reply to ``prompt`` using the training-time system prompt.

    The module-level ``tokenizer`` is used both to render the chat template
    and to decode the output.

    Args:
        model: Causal language model exposing ``device`` and ``generate``.
        prompt: User question text.

    Returns:
        The decoded completion (prompt tokens and special tokens removed).
    """
    system_turn = {
        "role": "system",
        "content": "\n按照如下格式生成：\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n",
    }
    user_turn = {"role": "user", "content": prompt}

    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn], tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer([chat_text], return_tensors='pt').to(model.device)

    # NOTE(review): max_length presumably bounds prompt plus generated tokens together — confirm.
    full_sequences = model.generate(**encoded, max_length=512)

    # Drop the echoed prompt tokens so that only the newly generated part is decoded.
    completions_only = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        completions_only.append(sequence[len(prompt_ids):])

    return tokenizer.batch_decode(completions_only, skip_special_tokens=True)[0]

In [16]:
# Load the original Qwen2.5-0.5B-Instruct snapshot (the same path used as the starting
# point for training) as a baseline for comparison, and move it to the GPU.
model_ori = AutoModelForCausalLM.from_pretrained("/root/data/.cache/huggingface_hub/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/7ae557604adf67be50417f59c2c2f167def9a775")
model_ori.cuda()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [26]:
# Ask both models the same question and print the replies side by side.
# res1 is the fine-tuned model's reply and res2 the original model's. They are printed in
# the opposite order: "微调前" (before fine-tuning) shows res2, "微调后" (after) shows res1.
prompt = "我每天有6元，一周之后还有多少钱？我忘记说了，星期一我会花一掉块钱"
res1, res2 = inference(model, prompt), inference(model_ori, prompt)

print(f"问题:{prompt}")
print(f"微调前：{res2}")
print(f"微调后：{res1}")

问题:我每天有6元，一周之后还有多少钱？我忘记说了，星期一我会花一掉块钱
微调前：```
think
答案：-6元

think
```
微调后：```markdown
<think>
一天后剩下 \( 6 \times 7 = 42 \) 元。
星期一花掉1元，所以剩下 \( 42 - 1 = 41 \) 元。
</think>
<answer>
41
</answer>
```
