# Preprocessing

In [2]:
from datasets import load_dataset

dataset_id = "AI-MO/NuminaMath-TIR"
train_dataset, test_dataset = load_dataset(dataset_id, split=["train[:5%]", "test[:5%]"])

print(train_dataset)

print(train_dataset[0])

Dataset({
    features: ['problem', 'solution', 'messages'],
    num_rows: 3622
})
{'problem': 'What is the coefficient of $x^2y^6$ in the expansion of $\\left(\\frac{3}{5}x-\\frac{y}{2}\\right)^8$?  Express your answer as a common fraction.', 'solution': "To determine the coefficient of \\(x^2y^6\\) in the expansion of \\(\\left(\\frac{3}{5}x - \\frac{y}{2}\\right)^8\\), we can use the binomial theorem.\n\nThe binomial theorem states:\n\\[\n(a + b)^n = \\sum_{k=0}^{n} \\binom{n}{k} a^{n-k} b^k\n\\]\n\nIn this case, \\(a = \\frac{3}{5}x\\), \\(b = -\\frac{y}{2}\\), and \\(n = 8\\).\n\nWe are interested in the term that contains \\(x^2y^6\\). In the general term of the binomial expansion:\n\\[\n\\binom{8}{k} \\left(\\frac{3}{5}x\\right)^{8-k} \\left(-\\frac{y}{2}\\right)^k\n\\]\n\nTo get \\(x^2\\), we need \\(8 - k = 2\\), thus \\(k = 6\\).\n\nSubstituting \\(k = 6\\) into the expression:\n\\[\n\\binom{8}{6} \\left(\\frac{3}{5}x\\right)^{8-6} \\left(-\\frac{y}{2}\\right)^6 = \\binom{8}{

In [3]:
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
    "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
)


def make_conversation(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": example["problem"]},
        ],
    }


train_dataset = train_dataset.map(make_conversation)
test_dataset = test_dataset.map(make_conversation)

In [None]:
print(train_dataset[0]["prompt"])
train_dataset = train_dataset.remove_columns(["messages", "problem"])

[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>', 'role': 'system'}, {'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\left(\\frac{3}{5}x-\\frac{y}{2}\\right)^8$?  Express your answer as a common fraction.', 'role': 'user'}]


ValueError: Column name ['problem', 'messages'] not in the dataset. Current columns in the dataset: ['solution', 'prompt']

In [9]:
from pprint import pprint
print(train_dataset)
pprint(test_dataset[0])

Dataset({
    features: ['solution', 'prompt'],
    num_rows: 3622
})
{'messages': [{'content': "In 1988, a person's age was equal to the sum of the "
                          'digits of their birth year. How old was this '
                          'person?',
               'role': 'user'},
              {'content': "To solve this problem, let's break it down "
                          'step-by-step:\n'
                          '\n'
                          "1. Let the person's birth year be \\( Y \\).\n"
                          "2. In 1988, the person's age would be \\( 1988 - Y "
                          '\\).\n'
                          '3. The sum of the digits of \\( Y \\) should be '
                          'equal to their age in 1988.\n'
                          '\n'
                          'Therefore, we need to find a year \\( Y \\) such '
                          'that:\n'
                          '\n'
                          '\\[ 1988 - Y = \\text{sum of th

# Training

In [5]:
import torch
from transformers import AutoModelForCausalLM

model_id = "Qwen/Qwen2-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

## LoRAで学習

In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [7]:
import re


def format_reward(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, content) for content in completion_contents]
    rewards_list = [1.0 if match else 0.0 for match in matches]
    return [1.0 if match else 0.0 for match in matches]

In [None]:
from math_verify import LatexExtractionConfig, parse, verify


def accuracy_reward(completions, **kwargs):
    """Reward function that checks if the completion is the same as the ground truth."""
    solutions = kwargs["solution"]
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, solution in zip(completion_contents, solutions):
        gold_parsed = parse(solution, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
        answer_parsed = parse(content, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
        if len(gold_parsed) != 0:
            try:
                rewards.append(float(verify(answer_parsed, gold_parsed)))
            except Exception:
                rewards.append(0.0)
        else:
            rewards.append(1.0)
    return rewards

Collecting math_verify
  Downloading math_verify-0.8.0-py3-none-any.whl.metadata (1.6 kB)
Collecting latex2sympy2_extended==1.10.2 (from math_verify)
  Downloading latex2sympy2_extended-1.10.2-py3-none-any.whl.metadata (5.3 kB)
Collecting antlr4-python3-runtime<=4.13.2,>=4.9.3 (from latex2sympy2_extended==1.10.2->math_verify)
  Downloading antlr4_python3_runtime-4.13.2-py3-none-any.whl.metadata (304 bytes)
Downloading math_verify-0.8.0-py3-none-any.whl (29 kB)
Downloading latex2sympy2_extended-1.10.2-py3-none-any.whl (207 kB)
Downloading antlr4_python3_runtime-4.13.2-py3-none-any.whl (144 kB)
Installing collected packages: antlr4-python3-runtime, latex2sympy2_extended, math_verify
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [math_verify][0m [latex2sympy2_extended]
[1A[2KSuccessfully installed antlr4-python3-runtime-4.13.2 latex2sympy2_extended-1.10.2 math_verify-0.8.0


In [None]:
from trl import GRPOConfig
from pathlib import Path

base_output_dir = "outputs/"

# Configure training arguments using GRPOConfig
training_args = GRPOConfig(
    output_dir=base_output_dir / "Qwen2-0.5B-GRPO-test",
    learning_rate=1e-5,
    remove_unused_columns=False,  # to access the solution column in accuracy_reward
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    bf16=True,
    # Parameters that control de data preprocessing
    max_completion_length=64,  # default: 256
    num_generations=4,  # default: 8
    max_prompt_length=128,  # default: 512
    # Parameters related to reporting and saving
    report_to=["wandb"],
    logging_steps=10,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=10,
)

In [12]:
from trl import GRPOTrainer

trainer = GRPOTrainer(
    model=model, reward_funcs=[format_reward, accuracy_reward], args=training_args, train_dataset=train_dataset
)
trainer.train()
trainer.save_model(training_args.output_dir)


INFO 08-01 00:25:09 [__init__.py:244] Automatically detected platform cuda.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Currently logged in as: [33mtorotoki0329soft[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.0045
20,0.0027
30,0.0046
40,0.0039
50,0.0068
60,0.0109
70,0.0175
80,0.0107
90,0.0114
100,0.0199


Timeout during comparison
Timeout during comparison
Timeout during comparison
Timeout during comparison


# Model Performance Evaluation

In [17]:
from transformers import AutoTokenizer

model_id = "sergiopaniego/Qwen2-0.5B-GRPO"
trained_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)
trained_tokenizer = AutoTokenizer.from_pretrained(model_id)
print(test_dataset["prompt"][0])

adapter_config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>', 'role': 'system'}, {'content': "In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?", 'role': 'user'}]


In [24]:
import re

def extract_tag(text: str, tag: str) -> str | None:
    #pattern = fr"<{tag}>.*?</{tag}>"
    pattern = fr".*?<{tag}>(.*?)</{tag}>.*?"
    matches = re.match(pattern, text)
    return matches.group(1)

print(extract_tag("This is <answer>35</answer>!", "answer"))

35


In [None]:
def generate_with_reasoning(prompt):
    # Build the prompt from the dataset
    prompt = " ".join(entry["content"] for entry in prompt)

    # Tokenize and move to the same device as the model
    inputs = trained_tokenizer(prompt, return_tensors="pt").to(trained_model.device)

    # Generate text without gradients
    start_time = time.time()
    with torch.no_grad():
        output_ids = trained_model.generate(**inputs, max_length=500)
    end_time = time.time()

    # Decode and extract model response
    generated_text = trained_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Get inference time
    inference_duration = end_time - start_time

    # Get number of generated tokens
    num_input_tokens = inputs["input_ids"].shape[1]
    num_generated_tokens = output_ids.shape[1] - num_input_tokens

    return generated_text, inference_duration, num_generated_tokens


In [19]:
prompt = test_dataset["prompt"][0]
generated_text, inference_duration, num_generated_tokens = generate_with_reasoning(prompt)
print(generated_text)

A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer> In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?<think> 20 years old </think><answer> 35 </answer>

The reasoning process is: If the sum of the digits of the person's birth year is equal to the person's current age, then that person must be 20 years old. The answer is: 35.


In [20]:
print(f"Inference time: {inference_duration:.2f} seconds")
print(f"Generated tokens: {num_generated_tokens}")
prompt_text = " ".join(entry["content"] for entry in prompt)
response_text = generated_text[len(prompt_text) :].strip()
print(response_text)

Inference time: 2.23 seconds
Generated tokens: 65
<think> 20 years old </think><answer> 35 </answer>

The reasoning process is: If the sum of the digits of the person's birth year is equal to the person's current age, then that person must be 20 years old. The answer is: 35.


In [34]:
def extract_answer_numinamath_tir(text: str) -> str | None:
    pattern = fr".*?```output(.*?)```.*?"
    matches = re.match(pattern, text)
    print(matches)
    return matches.group(1)

print(test_dataset['messages'][0])
#print(test_dataset['solution'][0])
extract_answer_numinamath_tir(test_dataset['solution'][0])

# correct = 0
# for example in test_dataset:
#     prompt = example["prompt"]
#     generated_text, inference_duration, num_generated_tokens = generate_with_reasoning(prompt)
#     predict = extract_tag(generated_text, "answer")
#     example[""]

[{'content': "In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?", 'role': 'user'}, {'content': 'To solve this problem, let\'s break it down step-by-step:\n\n1. Let the person\'s birth year be \\( Y \\).\n2. In 1988, the person\'s age would be \\( 1988 - Y \\).\n3. The sum of the digits of \\( Y \\) should be equal to their age in 1988.\n\nTherefore, we need to find a year \\( Y \\) such that:\n\n\\[ 1988 - Y = \\text{sum of the digits of } Y \\]\n\nWe can solve this by iterating through possible values for \\( Y \\) and checking if the condition holds.\n\nLet\'s write a Python script to find the correct birth year \\( Y \\).\n```python\ndef digit_sum(year):\n    """Calculate the sum of the digits of a year."""\n    return sum(int(digit) for digit in str(year))\n\ndef find_birth_year():\nprint((    for year in range(1900, 1989):))  # Reasonable range given the\n```\n```output\nCell In[210], line 6\n    for year in range(1900, 1989):

AttributeError: 'NoneType' object has no attribute 'group'