# installation

In [1]:
%%capture
# Install Unsloth and vLLM; the %%capture cell magic silences the installer output.
import os
# Colab is detected by looking for "COLAB_" in the names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # --no-deps installs the two packages without pulling in their dependencies.
    !pip install --no-deps unsloth vllm

In [2]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Installs the dependencies by hand when running in Colab; elsewhere a plain pip install is used.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    # (done by dropping every already-imported module whose name contains "PIL" or "google")
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    # Packages installed without their dependencies, then the ones installed normally
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Download vLLM's common requirements list from its main branch ...
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        # ... and cut out the transformers / numpy / xformers entries (from the package
        # name to the end of the line) before installing what remains.
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

# Model

In [3]:
# NOTE(review): unsloth is imported before torch here; presumably so its patches
# are applied first - keep this order unless confirmed otherwise.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

# Load the base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Base",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

# Attach LoRA adapters of rank `lora_rank` to the listed projection modules
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407, # Fixed seed for reproducibility
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-19 12:02:58 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-19 12:02:58 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.6: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen3-4B-Base with actual GPU utilization = 69.29%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 22.16 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

INFO 05-19 12:03:29 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='unsloth/Qwen3-4B-Base', speculative_config=None, tokenizer='unsloth/Qwen3-4B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Qwen3-4B-Base, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_asyn

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

INFO 05-19 12:04:22 [weight_utils.py:281] Time spent downloading weights for unsloth/Qwen3-4B-Base: 49.987338 seconds


model.safetensors.index.json:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-19 12:04:24 [loader.py:458] Loading weights took 2.35 seconds
INFO 05-19 12:04:24 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 05-19 12:04:25 [gpu_model_runner.py:1347] Model loading took 7.6334 GiB and 53.152834 seconds
INFO 05-19 12:04:48 [backends.py:420] Using cache directory: /root/.cache/vllm/torch_compile_cache/958479050b/rank_0_0 for vLLM's torch.compile
INFO 05-19 12:04:48 [backends.py:430] Dynamo bytecode transform time: 22.57 s


Inductor Compilation: 100%|██████████| 6/6 [00:01<00:00,  4.68it/s, triton_poi_fused_add_mul_sub_5]

INFO 05-19 12:04:54 [backends.py:136] Cache the graph of shape None for later use



Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 10.88it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 119.95it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 111.44it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 104.65it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 112.27it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 11.60it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 113.62it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 108.48it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 113.91it/s, triton_poi_fused_add_mul_sub_9]
Inductor Compilation: 100%|██████████| 10/10 [00:00<00:00, 112.20it/s, tri

INFO 05-19 12:05:57 [backends.py:148] Compiling a graph for general shape takes 66.38 s
INFO 05-19 12:09:12 [monitor.py:33] torch.compile takes 88.95 s in total
INFO 05-19 12:09:16 [kv_cache_utils.py:634] GPU KV cache size: 44,928 tokens
INFO 05-19 12:09:16 [kv_cache_utils.py:637] Maximum concurrency for 2,048 tokens per request: 21.94x
INFO 05-19 12:10:40 [gpu_model_runner.py:1686] Graph capturing finished in 84 secs, took 0.82 GiB
INFO 05-19 12:10:41 [core.py:159] init engine (profile, create kv cache, warmup model) took 375.76 seconds
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm']


tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.5.6 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


# GRPO chat template

### GRPO chat template
Since we're using a base model, we should set a chat template. You can make your own chat template as well!
1. DeepSeek uses `<think>` and `</think>`, but this is **not** necessary - you can customize it however you like!
2. A `system_prompt` is recommended to at least guide the model's responses.

In [4]:
# Custom tags the model is trained to put around its reasoning and its final answer
reasoning_start = "<start_working_out>" # Plays the role of <think>
reasoning_end   = "<end_working_out>"   # Plays the role of </think>
solution_start  = "<SOLUTION>"
solution_end    = "</SOLUTION>"

# Default system prompt: one instruction per line, joined with newlines
system_prompt = "\n".join([
    "You are given a problem.",
    "Think about the problem and provide your working out.",
    f"Place it between {reasoning_start} and {reasoning_end}.",
    f"Then, provide your solution between {solution_start}{solution_end}",
])
system_prompt

'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>'

In [5]:
# Jinja chat template for the base model:
#   * a leading system message (or, if absent, the default system prompt) comes first, followed by EOS
#   * user turns are emitted as-is; assistant turns are followed by EOS
#   * with add_generation_prompt, the reasoning start tag is appended at the end
# '{system_prompt}' and '{reasoning_start}' are placeholders that are filled in below.
chat_template = (
    "{% if messages[0]['role'] == 'system' %}"
        "{{ messages[0]['content'] + eos_token }}"
        "{% set loop_messages = messages[1:] %}"
    "{% else %}"
        "{{ '{system_prompt}' + eos_token }}"
        "{% set loop_messages = messages %}"
    "{% endif %}"
    "{% for message in loop_messages %}"
        "{% if message['role'] == 'user' %}"
            "{{ message['content'] }}"
        "{% elif message['role'] == 'assistant' %}"
            "{{ message['content'] + eos_token }}"
        "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '{reasoning_start}' }}"
    "{% endif %}"
)

# Replace the placeholders with our specific values.
# NOTE(review): the values end up inside single-quoted Jinja string literals, so a
# system prompt containing a single quote would presumably break the template.
chat_template = chat_template.replace("'{system_prompt}'",   f"'{system_prompt}'")
chat_template = chat_template.replace("'{reasoning_start}'", f"'{reasoning_start}'")
tokenizer.chat_template = chat_template

chat_template

"{% if messages[0]['role'] == 'system' %}{{ messages[0]['content'] + eos_token }}{% set loop_messages = messages[1:] %}{% else %}{{ 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>' + eos_token }}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% if message['role'] == 'user' %}{{ message['content'] }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<start_working_out>' }}{% endif %}"

In [6]:
tokenizer.apply_chat_template([
    {"role" : "user", "content" : "What is 1+1?"},
    {"role" : "assistant", "content" : f"{reasoning_start}I think it's 2.{reasoning_end}{solution_start}2{solution_end}"},
    {"role" : "user", "content" : "What is 2+2?"},
], tokenize = False, add_generation_prompt = True)

"You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION><|endoftext|>What is 1+1?<start_working_out>I think it's 2.<end_working_out><SOLUTION>2</SOLUTION><|endoftext|>What is 2+2?<start_working_out>"

# Pre fine-tuning for formatting

We now use a subset of NVIDIA's [Open Math Reasoning dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning) which was filtered to only include high quality DeepSeek R1 traces.

We'll keep only ~59 examples to first "prime" / pre fine-tune the model so it understands our custom GRPO formatting.

In [7]:
from datasets import load_dataset
import pandas as pd
import numpy as np

In [8]:
# Load the "cot" split as a pandas frame and keep the three columns used below
dataset = (
    load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")
    .to_pandas()
    .loc[:, ["expected_answer", "problem", "generated_solution"]]
)

# Answers that cannot be converted to a number become NaN and are flagged False
is_number = pd.to_numeric(dataset["expected_answer"], errors = "coerce").notna()
# Keep only the rows whose expected answer is numeric
dataset = dataset[is_number]

dataset

README.md:   0%|          | 0.00/603 [00:00<?, ?B/s]

data/cot-00000-of-00001.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

Generating cot split:   0%|          | 0/19252 [00:00<?, ? examples/s]

Unnamed: 0,expected_answer,problem,generated_solution
0,14,Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$...,"<think>\nOkay, let's see. I need to solve the ..."
6,-2,Find the value of the parameter $a$ for which ...,"<think>\nOkay, so I need to find the value of ..."
9,18,What is the sum of all real numbers $x$ for wh...,"<think>\nOkay, so I need to solve the equation..."
13,2,Evaluate the sum \(\sum_{n=1}^\infty \frac{\ph...,"<think>\nOkay, so I need to evaluate the infin..."
17,30,What is the largest positive integer that divi...,"<think>\nAlright, so I need to find the larges..."
...,...,...,...
19243,244,"Let \( p \), \( q \), and \( r \) be the disti...","<think>\nOkay, so I need to find the value of ..."
19245,1,A bug is on the $0$ of a number line. At any p...,"<think>\nOkay, so I have this problem where a ..."
19247,4,A bus left point X for point Y. Two hours late...,"<think>\nOkay, let's tackle this problem step ..."
19248,18,Each interior angle of a regular n-gon measure...,"<think>\nOkay, let's see. I need to find the n..."


#### Format the dataset to follow our GRPO style formatting:

In [9]:
def format_dataset(x):
    """Build the system / user / assistant chat messages for one dataset row.

    `x` is indexed by column name and must provide "expected_answer",
    "problem" and "generated_solution". The assistant message is the
    generated solution (with the "<think>" / "</think>" markers removed and
    surrounding whitespace stripped) wrapped in the custom reasoning tags,
    followed by the expected answer wrapped in the solution tags.
    """
    expected_answer = x["expected_answer"]
    problem = x["problem"]

    # Drop the generated <think> / </think> markers, then trim both ends
    thoughts = x["generated_solution"]
    for marker in ("<think>", "</think>"):
        thoughts = thoughts.replace(marker, "")
    thoughts = thoughts.strip()

    # Wrap the reasoning and the answer in our custom tags
    final_prompt = "".join((
        reasoning_start, thoughts, reasoning_end,
        solution_start, expected_answer, solution_end,
    ))

    roles    = ("system", "user", "assistant")
    contents = (system_prompt, problem, final_prompt)
    return [
        {"role" : role, "content" : content}
        for role, content in zip(roles, contents)
    ]

dataset["Messages"] = dataset.apply(format_dataset, axis = 1)

In [10]:
tokenizer.apply_chat_template(dataset["Messages"][0], tokenize = False)

"You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Given $\\sqrt{x^2+165}-\\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of $x$.<start_working_out>Okay, let's see. I need to solve the equation √(x² + 165) - √(x² - 52) = 7, and find all positive values of x. Hmm, radicals can be tricky, but maybe if I can eliminate the square roots by squaring both sides. Let me try that.\n\nFirst, let me write down the equation again to make sure I have it right:\n\n√(x² + 165) - √(x² - 52) = 7.\n\nOkay, so the idea is to isolate one of the radicals and then square both sides. Let me try moving the second radical to the other side:\n\n√(x² + 165) = 7 + √(x² - 52).\n\nNow, if I square both sides, maybe I can get rid of the square roots. Let's do that:\n\n(√(x² + 165))² = (7 + √(x² - 52))².\n\nSimplifying the left side:\n\nx² + 

In [11]:
# Raw string: the text below is full of LaTeX backslashes (\sqrt, \boxed, \[ ...) that
# must be printed literally. In a normal string "\b" in "\boxed" would be read as a
# backspace character and "\s", "\(", "\[" are invalid escape sequences.
print(r"""
You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of $x$.<start_working_out>Okay, let's see. I need to solve the equation √(x² + 165) - √(x² - 52) = 7, and find all positive values of x. Hmm, radicals can be tricky, but maybe if I can eliminate the square roots by squaring both sides. Let me try that.

First, let me write down the equation again to make sure I have it right:

√(x² + 165) - √(x² - 52) = 7.

Okay, so the idea is to isolate one of the radicals and then square both sides. Let me try moving the second radical to the other side:

√(x² + 165) = 7 + √(x² - 52).

Now, if I square both sides, maybe I can get rid of the square roots. Let's do that:

(√(x² + 165))² = (7 + √(x² - 52))².

Simplifying the left side:

x² + 165 = 49 + 14√(x² - 52) + (√(x² - 52))².

The right side is expanded using the formula (a + b)² = a² + 2ab + b². So the right side becomes 7² + 2*7*√(x² - 52) + (√(x² - 52))², which is 49 + 14√(x² - 52) + (x² - 52).

So putting it all together:

x² + 165 = 49 + 14√(x² - 52) + x² - 52.

Hmm, let's simplify the right side. The x² terms will cancel out, right? Let's subtract x² from both sides:

165 = 49 + 14√(x² - 52) - 52.

Simplify the constants on the right:

49 - 52 is -3, so:

165 = -3 + 14√(x² - 52).

Now, add 3 to both sides to isolate the radical term:

165 + 3 = 14√(x² - 52).

So 168 = 14√(x² - 52).

Divide both sides by 14:

168 / 14 = √(x² - 52).

12 = √(x² - 52).

Now, square both sides again to eliminate the square root:

12² = x² - 52.

144 = x² - 52.

Add 52 to both sides:

144 + 52 = x².

196 = x².

So x = √196 = 14.

But wait, since the problem states that x is positive, we only take the positive root. So x = 14.

But hold on, when dealing with squaring equations, sometimes extraneous solutions can come up. I should check if this solution actually satisfies the original equation.

Let's plug x = 14 back into the original equation:

√(14² + 165) - √(14² - 52) = ?

Calculate each term:

14² is 196.

So first radical: √(196 + 165) = √361 = 19.

Second radical: √(196 - 52) = √144 = 12.

So 19 - 12 = 7, which is exactly the right-hand side. So yes, it checks out.

Therefore, the only solution is x = 14. Since the problem says x is positive, we don't have to consider negative roots. So I think that's the answer.
To solve the equation \(\sqrt{x^2 + 165} - \sqrt{x^2 - 52} = 7\) for positive \(x\), we proceed as follows:

1. Start with the given equation:
   \[
   \sqrt{x^2 + 165} - \sqrt{x^2 - 52} = 7
   \]

2. Isolate one of the square roots by moving \(\sqrt{x^2 - 52}\) to the right side:
   \[
   \sqrt{x^2 + 165} = 7 + \sqrt{x^2 - 52}
   \]

3. Square both sides to eliminate the square root on the left:
   \[
   (\sqrt{x^2 + 165})^2 = (7 + \sqrt{x^2 - 52})^2
   \]
   Simplifying both sides, we get:
   \[
   x^2 + 165 = 49 + 14\sqrt{x^2 - 52} + (x^2 - 52)
   \]

4. Combine like terms on the right side:
   \[
   x^2 + 165 = x^2 - 52 + 49 + 14\sqrt{x^2 - 52}
   \]
   Simplifying further:
   \[
   x^2 + 165 = x^2 - 3 + 14\sqrt{x^2 - 52}
   \]

5. Subtract \(x^2\) from both sides:
   \[
   165 = -3 + 14\sqrt{x^2 - 52}
   \]

6. Add 3 to both sides to isolate the term with the square root:
   \[
   168 = 14\sqrt{x^2 - 52}
   \]

7. Divide both sides by 14:
   \[
   12 = \sqrt{x^2 - 52}
   \]

8. Square both sides again to eliminate the square root:
   \[
   12^2 = x^2 - 52
   \]
   Simplifying:
   \[
   144 = x^2 - 52
   \]

9. Add 52 to both sides to solve for \(x^2\):
   \[
   196 = x^2
   \]

10. Take the positive square root (since \(x\) is positive):
    \[
    x = \sqrt{196} = 14
    \]

11. Verify the solution by substituting \(x = 14\) back into the original equation:
    \[
    \sqrt{14^2 + 165} - \sqrt{14^2 - 52} = \sqrt{196 + 165} - \sqrt{196 - 52} = \sqrt{361} - \sqrt{144} = 19 - 12 = 7
    \]
    The solution checks out.

Thus, the only positive solution is:
\[
\boxed{14}
\]<end_working_out><SOLUTION>14</SOLUTION><|endoftext|>
""")


You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Given $\sqrt{x^2+165}-\sqrt{x^2-52}=7$ and $x$ is positive, find all possible values of $x$.<start_working_out>Okay, let's see. I need to solve the equation √(x² + 165) - √(x² - 52) = 7, and find all positive values of x. Hmm, radicals can be tricky, but maybe if I can eliminate the square roots by squaring both sides. Let me try that.

First, let me write down the equation again to make sure I have it right:

√(x² + 165) - √(x² - 52) = 7.

Okay, so the idea is to isolate one of the radicals and then square both sides. Let me try moving the second radical to the other side:

√(x² + 165) = 7 + √(x² - 52).

Now, if I square both sides, maybe I can get rid of the square roots. Let's do that:

(√(x² + 165))² = (7 + √(x² - 52))².

Simplifying the left side:

x² + 165 = 49 + 14√(x² - 5

Let's keep only the pre fine-tuning examples that are at most `max_seq_length/2` tokens long, since we don't want reasoning traces that are too long.

Note this might take 2 minutes!

Tokenize the messages and convert them to a Hugging Face compatible dataset format:

In [12]:
# Number of tokens of every conversation once the chat template is applied
dataset["N"] = [
    len(tokenizer.apply_chat_template(messages))
    for messages in dataset["Messages"]
]

# Keep the conversations that use at most half of max_seq_length tokens
dataset = dataset[dataset["N"] <= max_seq_length/2].copy()
dataset.shape

(59, 5)

In [13]:
from datasets import Dataset

# Render every conversation into one training string using the chat template
dataset["text"] = tokenizer.apply_chat_template(
    dataset["Messages"].tolist(),
    tokenize = False,
)
# Turn the pandas frame into a Hugging Face Dataset
dataset = Dataset.from_pandas(dataset)
dataset

Dataset({
    features: ['expected_answer', 'problem', 'generated_solution', 'Messages', 'N', 'text', '__index_level_0__'],
    num_rows: 59
})

Let's now pre fine-tune the model so it follows our custom GRPO formatting!

In [14]:
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning on the small formatting dataset built above
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = SFTConfig(
        dataset_text_field = "text", # Column holding the chat-templated training strings
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 2, # Two passes over the formatting dataset
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/59 [00:00<?, ? examples/s]

In [15]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 59 | Num Epochs = 2 | Total steps = 118
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 66,060,288/4,088,528,384 (1.62% trained)


Step,Training Loss
5,0.6544
10,0.656
15,0.4238
20,0.3914
25,0.4236
30,0.4505
35,0.4745
40,0.4193
45,0.4455
50,0.333


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=118, training_loss=0.3504662084377418, metrics={'train_runtime': 86.2379, 'train_samples_per_second': 1.368, 'train_steps_per_second': 1.368, 'total_flos': 2374193075607552.0, 'train_loss': 0.3504662084377418})

Let's check if the model has learnt to follow the custom format:

In [16]:
text = tokenizer.apply_chat_template(
    dataset[0]["Messages"][:2],
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
)

text

'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Jenifer has 82 cents in pennies and nickels. Her younger brother mistook all her nickels for dimes and counted the total as $1.47. How many pennies does Jenifer have?<start_working_out>'

In [17]:
from transformers import TextStreamer
# Generate up to 1024 new tokens for `text` on the GPU, streaming the output as it is produced.
# The return value is discarded (`_`); skip_prompt = False makes the streamer echo the prompt too.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    temperature = 0,
    max_new_tokens = 1024,
    streamer = TextStreamer(tokenizer, skip_prompt = False),
)

You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>Jenifer has 82 cents in pennies and nickels. Her younger brother mistook all her nickels for dimes and counted the total as $1.47. How many pennies does Jenifer have?<start_working_out>Okay, let's see. Jenifer has 82 cents in pennies and nickels. Her brother thought all the nickels were dimes and counted the total as $1.47. I need to find out how many pennies she has. Hmm, let's break this down.

First, I need to set up some equations. Let's say the number of pennies is P and the number of nickels is N. Since pennies are worth 1 cent each and nickels are 5 cents each, the total value in cents is 1P + 5N = 82. That's the first equation.

Now, her brother thought all the nickels were dimes. Dimes are 10 cents each. So, he counted the total as $1.47, which is 147 cents. So, the equat

Yes it did follow the formatting! Great! Let's remove some items before the GRPO step

In [18]:
del dataset
torch.cuda.empty_cache()
import gc
gc.collect()

0

### Data Prep
<a name="Data"></a>

We're using Hugging Face's [Open R1 Math dataset](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed). You can also utilize OpenAI's famous [GSM8K dataset](https://huggingface.co/datasets/openai/gsm8k)

In [19]:
from datasets import load_dataset
dataset = load_dataset("open-r1/DAPO-Math-17k-Processed", "en", split = "train")
dataset

README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

en/train-00000-of-00001.parquet:   0%|          | 0.00/5.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14116 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'solution', 'data_source', 'source_prompt', 'ability', 'reward_model', 'extra_info'],
    num_rows: 14116
})

Let's look at the first row:

In [22]:
print(dataset[0]["prompt"])
print("---Solution----")
print(dataset[0]["solution"])

In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle DAC$ and $\angle BDC = 90^\circ$. Suppose that $AD = 1$ and that $\frac{BD}{CD} = \frac{3}{2}$. If $AB + AC$ can be expressed in the form $\frac{a\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.
---Solution----
34


In GSM8K, every answer has a `####` marker in front of the final result, so we would extract the text after it. For the Open R1 dataset, we can skip this step.

In [23]:
def extract_hash_answer(text):
    """Return the final answer held in a dataset solution string.

    Solutions that contain a "####" marker (the GSM8K convention mentioned in
    this notebook) yield the text following the first marker, with surrounding
    whitespace stripped. Solutions without the marker, such as the Open R1
    answers used here, are returned unchanged.
    """
    if "####" not in text:
        return text
    return text.split("####")[1].strip()
extract_hash_answer(dataset[0]["solution"])

'34'

Let's map the dataset and look at the first row:

In [24]:
dataset = dataset.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": x["prompt"]},
    ],
    "answer": extract_hash_answer(x["solution"]),
})
dataset[0]

Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

{'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>',
   'role': 'system'},
  {'content': 'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$ and $\\angle BDC = 90^\\circ$. Suppose that $AD = 1$ and that $\\frac{BD}{CD} = \\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.',
   'role': 'user'}],
 'solution': '34',
 'data_source': 'math_dapo',
 'source_prompt': [{'content': 'Solve the following math problem step by step. The last line of your response should be of the form Answer: $Answer (without quotes) where $Answer is the answer to the problem.\n\nIn triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $

We create a regex format to match the reasoning sections and answers:

In [25]:
import re

# Closing solution tag, optional whitespace, then an optional EOS token.
# All tags go through re.escape so that custom tags containing regex
# metacharacters are still matched literally, and the closing tag now comes
# from `solution_end` instead of a hard-coded "</SOLUTION>".
solution_end_regex = re.escape(solution_end) + r"[\s]{0,}" + \
    "(?:" + re.escape(tokenizer.eos_token) + ")?"

# End of reasoning, anything, then the tagged solution (captured) at the end of a line / of the text
match_format = re.compile(
    rf"{re.escape(reasoning_end)}.*?"\
    rf"{re.escape(solution_start)}(.+?){solution_end_regex}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)
match_format

re.compile(r'<end_working_out>.*?<SOLUTION>(.+?)</SOLUTION>[\s]{0,}(?:<\|endoftext\|>)?[\s]{0,}$',
re.MULTILINE|re.DOTALL|re.UNICODE)

In [26]:
match_format.findall(
    "Let me think!<end_working_out>"\
    f"<SOLUTION>\n2\n</SOLUTION>",
)

['\n2\n']

In [27]:
match_format.findall(
    "<start_working_out>Let me think!<end_working_out>"\
    f"<SOLUTION>  2  </SOLUTION>\n\n",
)

['  2  ']

We now want to create a reward function to match the format exactly - we reward it with 3 points if it succeeds:

In [28]:
def match_format_exactly(completions, **kwargs):
    """Reward completions that follow the expected output format.

    Only the content of each completion's first message is inspected.
    Returns one score per completion: 3.0 when `match_format` finds a match,
    otherwise 0.
    """
    responses = (completion[0]["content"] for completion in completions)
    return [
        3.0 if match_format.search(response) is not None else 0
        for response in responses
    ]

If it fails, we want to reward the model if it at least follows the format partially, by counting each symbol:

In [29]:
def match_format_approximately(completions, **kwargs):
    """Give partial credit for each format tag that is used exactly once.

    For the content of each completion's first message, every tag in
    (reasoning_end, solution_start, solution_end) adds 0.5 when it occurs
    exactly once and subtracts 1.0 otherwise (missing or repeated).
    Returns one score per completion.
    """
    # <start_working_out> is not scored since we always prepend it!
    scored_tags = (reasoning_end, solution_start, solution_end)

    scores = []
    for completion in completions:
        response = completion[0]["content"]
        scores.append(sum(
            0.5 if response.count(tag) == 1 else -1.0
            for tag in scored_tags
        ))
    return scores

Finally, we want to extract the generated answer, and reward or penalize it! We also reward it based on how close the answer is to the true one via ratios:

In [30]:
def check_answer(prompts, completions, answer, **kwargs):
    """Score the answer extracted from each completion against the true answer.

    Args:
        prompts: Not used by this function; kept for interface compatibility.
        completions: Completions to score; the text inspected is
            ``completion[0]["content"]``.
        answer: True answers (strings), paired with `completions` by position.
        **kwargs: Ignored.

    Returns:
        A list with one score per (completion, answer) pair:
          -2.0  the response does not match `match_format`
           5.0  the extracted answer equals the true answer exactly
           3.5  they are equal once surrounding whitespace is stripped
           2.0  guess / true answer lies within [0.9, 1.1]
           1.5  guess / true answer lies within [0.8, 1.2]
          -2.5  both are numbers but the ratio is outside those ranges
          -4.5  the ratio cannot be computed (non-numeric text, or a true answer of 0)
    """
    responses = [completion[0]["content"] for completion in completions]

    scores = []
    for response, true_answer in zip(responses, answer):
        found = match_format.search(response)
        if found is None:
            scores.append(-2.0)
            continue
        guess = found.group(1)

        # Correct answer gets 5 points!
        if guess == true_answer:
            scores.append(5.0)
            continue
        # Match if spaces are seen, but less reward
        if guess.strip() == true_answer.strip():
            scores.append(3.5)
            continue

        # We also reward it if the answer is close via ratios!
        # Only the errors a failed numeric comparison can raise are caught, so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            ratio = float(guess) / float(true_answer)
        except (ValueError, TypeError, ArithmeticError):
            scores.append(-4.5) # Penalize
            continue

        if   0.9 <= ratio <= 1.1: scores.append(2.0)
        elif 0.8 <= ratio <= 1.2: scores.append(1.5)
        else: scores.append(-2.5) # Penalize wrong answers
    return scores

Sometimes the answer is not a single number but a sentence, for example "The solution is $20" - in that case we extract 20.

We also remove possible commas for example as in 123,456

In [31]:
# First number-like token (optional minus sign, then digits / dots / commas) found
# after the solution start tag. The tag goes through re.escape so that a custom tag
# containing regex metacharacters is still matched literally.
match_numbers = re.compile(
    re.escape(solution_start) + r".*?[\s]{0,}([-]?[\d\.\,]{1,})",
    flags = re.MULTILINE | re.DOTALL
)
print(match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>"))
print(match_numbers.findall("<SOLUTION>  123,456  </SOLUTION>"))
print(match_numbers.findall("<SOLUTION>  -0.234  </SOLUTION>"))
print(match_numbers.findall("<SOLUTION>17</SOLUTION>"))

['0.34']
['123,456']
['-0.234']
['17']


We now prepare our main function which will print out the generated responses and the true answer, along with another reward function which converts text to float via `float` and sees if it's the same.

In [32]:
# Number of check_numbers calls so far; a sample is printed on every
# PRINT_EVERY_STEPS-th call (starting with the first one).
PRINTED_TIMES = 0
PRINT_EVERY_STEPS = 5

def check_numbers(prompts, completions, answer, **kwargs):
    """Score the first number found in each completion's solution against the true answer.

    Args:
        prompts: Prompts of the batch; the content of the last message of the
            first prompt is printed as the question.
        completions: Completions to score; the text inspected is
            ``completion[0]["content"]``.
        answer: True answers (strings), paired with `completions` by position.
        **kwargs: Ignored.

    Returns:
        A list with one score per (completion, answer) pair:
          -2.5  `match_numbers` finds no number in the response
           3.5  the number equals the true answer after conversion to float
          -1.5  both convert to float but differ
           0    either value cannot be converted to float

    Side effects:
        Increments the module-level PRINTED_TIMES counter and, on every
        PRINT_EVERY_STEPS-th call, prints the question, the first true answer,
        the first response and the first extracted number.
    """
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_numbers.search(r)) is not None else None
        for r in responses
    ]

    # Print only every few steps
    global PRINTED_TIMES
    if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
        print(
            '*'*20 + f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}"
        )
    PRINTED_TIMES += 1

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(-2.5)
            continue
        # Convert to numbers. Only conversion failures are caught, so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            true_value  = float(true_answer.strip())
            # Remove commas like in 123,456
            guess_value = float(guess.strip().replace(",", ""))
        except (ValueError, TypeError, AttributeError):
            scores.append(0)
            continue
        scores.append(3.5 if guess_value == true_value else -1.5)
    return scores

Compute the 90th-percentile prompt length and use it as the maximum prompt length, so that the prompts we keep are never accidentally truncated.

I.e. we drop the longest 10% of prompts.

In [33]:
import numpy as np

# Tokenize every prompt through the chat template (generation prompt included)
tokenized = dataset.map(
    lambda batch: {
        "tokens" : tokenizer.apply_chat_template(
            batch["prompt"],
            add_generation_prompt = True,
            tokenize = True,
        ),
    },
    batched = True,
)
# Show the first prompt as decoded text
print(tokenizer.decode(tokenized[0]["tokens"]))

# Token count of each prompt, stored in column "L"
tokenized = tokenized.map(lambda row: {"L" : len(row["tokens"])})
prompt_lengths = np.array(tokenized["L"])

# The 90% quantile of the prompt lengths becomes the maximum prompt length
maximum_length = int(np.quantile(prompt_lengths, 0.9))
print("Max Length = ", maximum_length)

# Keep only the samples whose prompt is no longer than that
dataset = dataset.select(np.where(prompt_lengths <= maximum_length)[0])
del tokenized, prompt_lengths

Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION><|endoftext|>In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle DAC$ and $\angle BDC = 90^\circ$. Suppose that $AD = 1$ and that $\frac{BD}{CD} = \frac{3}{2}$. If $AB + AC$ can be expressed in the form $\frac{a\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.<start_working_out>


Map:   0%|          | 0/14116 [00:00<?, ? examples/s]

Max Length =  201


# Train the model

#### Now set up the GRPO Trainer and all configurations

In [34]:
from vllm import SamplingParams
from trl import GRPOConfig, GRPOTrainer

# Split the sequence budget: the prompt gets its maximum length,
# whatever is left of max_seq_length goes to the completion.
max_prompt_length = maximum_length + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length

# Sampling settings handed to the trainer for generation
vllm_sampling_params = SamplingParams(
    seed = 3407,
    min_p = 0.1,
    top_p = 1.0,
    top_k = -1,
    stop = [tokenizer.eos_token],
    include_stop_str_in_output = True,
)

training_args = GRPOConfig(
    # --- Generation ---
    vllm_sampling_params = vllm_sampling_params,
    temperature = 1.0,
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,

    # --- Optimisation ---
    optim = "adamw_8bit",
    learning_rate = 5e-6,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",

    # --- Batching ---
    # Unsloth expects the batch size to be a multiple of num_generations and
    # reports that it raises a value of 1 to num_generations by itself.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training

    # --- Run length, checkpoints, logging ---
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 100,
    save_steps = 100,
    logging_steps = 1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",

    # --- Optional: training + evaluation ---
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [35]:
# Optional: hold out a small evaluation split
# new_dataset = dataset.train_test_split(test_size = 0.01)

# Build the GRPO trainer from the model, the tokenizer, the training
# arguments and the four reward functions.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    args = training_args,
    train_dataset = dataset,
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],

    # Optional: when using the evaluation split, pass these instead
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)

In [36]:
# Run training with the trainer built above; the returned training summary
# is displayed as the cell's output.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,709 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 66,060,288/4,088,528,384 (1.62% trained)


********************Question:
Compute the number of positive integers that divide at least two of the integers in the set $\{1^1,2^2,3^3,4^4,5^5,6^6,7^7,8^8,9^9,10^{10}\}$. 
Answer:
22 
Response:
Okay, so I need to find how many positive integers divide at least two of the numbers in the set {1^1, 2^2, 3^3, ..., 10^10}. Hmm, let me think. When they say "divides at least two," that means I need to count the common divisors of every pair in this set. Wait, but that would be a lot of pairs. There are 10 numbers, so the number of pairs is 10 choose 2, which is 45 pairs. But calculating the greatest common divisor (GCD) for each pair would be time-consuming. Maybe there's a smarter way.

Let me think differently. Instead of looking at all pairs, maybe I can look at the prime factors of each number in the set and then find common prime factors that appear in multiple numbers. For example, gcd of two numbers is 1 if they share no common prime factors. So, if I can identify which prime factors

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.005,-7.5,0.0,1846.0,0.124792,0.0,-3.0,-2.0,-2.5
2,0.0072,-3.5,4.618802,1484.0,0.179148,1.5,-0.75,-2.25,-2.0
3,0.0071,-7.5,0.0,1846.0,0.176605,0.0,-3.0,-2.0,-2.5
4,0.0054,-7.5,0.0,1846.0,0.133847,0.0,-3.0,-2.0,-2.5
5,0.0084,13.0,0.0,1216.75,0.210825,3.0,1.5,5.0,3.5
6,0.0056,-7.5,0.0,1846.0,0.140002,0.0,-3.0,-2.0,-2.5
7,0.0058,-3.5,4.618802,1704.5,0.143868,1.5,-0.75,-2.25,-2.0
8,0.0039,-0.375,9.681382,1744.25,0.098248,1.5,-0.75,-0.375,-0.75
9,0.0062,-5.5,4.0,1752.0,0.155865,0.75,-1.875,-2.125,-2.25
10,0.0043,-2.375,10.25,1673.25,0.108539,0.75,-1.875,-0.25,-1.0


********************Question:
The sum of $\lfloor x \rfloor$ for all real numbers $x$ satisfying the equation $16 + 15x + 15x^2 = \lfloor x \rfloor^3$ is: 
Answer:
33 
Response:
Okay, so I need to find the sum of the floor function of x for all real numbers x that satisfy the equation 16 + 15x + 15x^2 = floor(x)^3. Hmm, let's break this down.

First, I recall that the floor function, denoted as ⌊x⌋, gives the greatest integer less than or equal to x. So, if I let k = ⌊x⌋, then k is an integer, and x is in the interval [k, k+1). The equation then becomes 16 + 15x + 15x^2 = k^3.

I need to find all real x in [k, k+1) that satisfy this equation for some integer k. Then, sum all those floor values, which are all k's in this case since the floor of x is k. So the problem reduces to finding all integer k where there exists an x in [k, k+1) such that 15x^2 +15x +16 - k^3 = 0.

Wait, let me check the equation again. The original equation is 16 +15x +15x^2 = k^3. Rearranged, that becomes 15x^2 

TrainOutput(global_step=100, training_loss=0.005393937814515084, metrics={'train_runtime': 8147.401, 'train_samples_per_second': 0.049, 'train_steps_per_second': 0.012, 'total_flos': 0.0, 'train_loss': 0.005393937814515084})

# inference

Now let's try the model we just trained! First, let's see how the model responds without the GRPO-trained LoRA:

In [37]:
from vllm import SamplingParams

# Plain question, passed as-is (no chat template, no system prompt)
text = "What is the sqrt of 101?"

sampling_params = SamplingParams(temperature = 1.0, top_k = 50, max_tokens = 1024)

# lora_request = None: no LoRA adapter is passed to the generation call.
# Take the text of the first output for the first (and only) prompt.
output = model.fast_generate(
    [text],
    lora_request = None,
    sampling_params = sampling_params,
)[0].outputs[0].text

output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

" - Answers\nMath and Arithmetic\nAlgebra\nThe sqrt of a number\nWhat is the sqrt of 101?\nWiki User\n∙ 2009-08-04 00:08:50\nStudy now\nSee Answer\nBest Answer\nCopy\nIt is approx. 10.0499\n10.04987562112089\nWiki User\n∙ 2009-08-04 00:08:50\nThis answer is:\n👍\n🙏\n0\n🤨\n0\n😮\n0\nAdd a Comment\nStudy guides\nAlgebra\n20 cards\nA polynomial of degree zero is a constant term\nThe grouping method of factoring can still be used when only some of the terms share a common factor A True B False\nThe sum or difference of p and q is the of the x-term in the trinomial\nA number a power of a variable or a product of the two is a monomial while a polynomial is the of monomials\nSee all cards\nStudy now\nJ's study guide\n1 card\nWhat is the name of Steve on minecraft's name\nSee all cards\nStudy now\nSteel Tip Darts Out Chart\n96 cards\n170\n169\n168\n167\nSee all cards\nStudy now\nAdd your answer:\nEarn +20 pts\nQ: What is the sqrt of 101?\nSubmit\nHow do you solve x to the 2nd power plus x-101 eq

In [38]:
# Pretty-print the generation from the previous cell so that the "\n" escapes
# are rendered as real line breaks. Printing the live `output` variable
# (instead of a pasted copy of an earlier run) keeps this cell in sync with
# whatever the model actually produced.
print(output)


 - Answers
Math and Arithmetic
Algebra
The sqrt of a number
What is the sqrt of 101?
Wiki User
∙ 2009-08-04 00:08:50
Study now
See Answer
Best Answer
Copy
It is approx. 10.0499
10.04987562112089
Wiki User
∙ 2009-08-04 00:08:50
This answer is:
👍
🙏
0
🤨
0
😮
0
Add a Comment
Study guides
Algebra
20 cards
A polynomial of degree zero is a constant term
The grouping method of factoring can still be used when only some of the terms share a common factor A True B False
The sum or difference of p and q is the of the x-term in the trinomial
A number a power of a variable or a product of the two is a monomial while a polynomial is the of monomials
See all cards
Study now
J's study guide
1 card
What is the name of Steve on minecraft's name
See all cards
Study now
Steel Tip Darts Out Chart
96 cards
170
169
168
167
See all cards
Study now
Add your answer:
Earn +20 pts
Q: What is the sqrt of 101?
Submit
How do you solve x to the 2nd power plus x-101 equals zero?
x&sup2;+x-101 = 0Solve using the quadra

And now with the LoRA we just trained with GRPO - but first we save the LoRA!

In [39]:
# Save the LoRA under the name "grpo_saved_lora"; the next cell loads it
# again by the same name.
model.save_lora("grpo_saved_lora")

Now we load the LoRA and test:

In [40]:
from vllm import SamplingParams

# Same question as before, this time as a chat with the system prompt
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "What is the sqrt of 101?"},
]

# Render the chat to a prompt string (not token ids)
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
)

sampling_params = SamplingParams(temperature = 1.0, top_k = 50, max_tokens = 2048)

# Generate with the LoRA that was saved as "grpo_saved_lora" and keep the
# text of the first output for the prompt.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text

output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

"Okay, let's see. I need to find the square root of 101. Hmm, I remember that the square root of a number is a value that, when multiplied by itself, gives the original number. So, sqrt(101) is the number that, when you multiply it by itself, equals 101. Let me think...\n\nFirst, I should recall some perfect squares close to 101. The square root of 100 is 10, so 10 × 10 = 100. That's just 1 less than 101. So the square root of 101 must be slightly more than 10. Let me try 10.05. Wait, maybe I can use a calculator for this part.\n\nIf I use a calculator, sqrt(101) is approximately 10.05. But the problem might expect an exact answer if it's a simplifiable square root. Wait, 101 is a prime number, right? Let me check if 101 is prime. To be prime, it should not be divisible by any number other than 1 and itself. I can test divisibility by small prime numbers like 2, 3, 5, 7, 11, etc.\n\n101 divided by 2 is 50.5, not an integer. Divided by 3: 33.666..., not integer. Divided by 5: 20.2, not 

In [43]:
# Pretty-print the generation from the previous cell so that the "\n" escapes
# are rendered as real line breaks. Printing the live `output` variable
# (instead of a pasted copy of an earlier run) keeps this cell in sync with
# whatever the model actually produced.
print(output)


Okay, let's see. I need to find the square root of 101. Hmm, I remember that the square root of a number is a value that, when multiplied by itself, gives the original number. So, sqrt(101) is the number that, when you multiply it by itself, equals 101. Let me think...

First, I should recall some perfect squares close to 101. The square root of 100 is 10, so 10 × 10 = 100. That's just 1 less than 101. So the square root of 101 must be slightly more than 10. Let me try 10.05. Wait, maybe I can use a calculator for this part.

If I use a calculator, sqrt(101) is approximately 10.05. But the problem might expect an exact answer if it's a simplifiable square root. Wait, 101 is a prime number, right? Let me check if 101 is prime. To be prime, it should not be divisible by any number other than 1 and itself. I can test divisibility by small prime numbers like 2, 3, 5, 7, 11, etc.

101 divided by 2 is 50.5, not an integer. Divided by 3: 33.666..., not integer. Divided by 5: 20.2, not intege

# Saving to float16 for vLLM

In [41]:
# Export templates. Every line is disabled by `if False:`; change it to
# `if True:` on the line you want to run. `save_pretrained_merged` is called
# with a local name ("model"), `push_to_hub_merged` with a repo id
# ("hf/model") and a `token`, which is an empty placeholder here.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

# GGUF / llama.cpp Conversion

To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and save to `q8_0` by default. All other methods, such as `q4_k_m`, are allowed too. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [42]:
# GGUF export templates. Every block is disabled by `if False:`; change it to
# `if True:` on the one you want to run. The `token = ""` arguments are empty
# placeholders.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
# Here `quantization_method` is given as a list instead of a single string.
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )

# download model

In [44]:
import os
from google.colab import files

In [45]:
folder_path = "/content/grpo_saved_lora" # Replace with the path to your folder

zip_file_name = f"{folder_path}.zip"
!zip -r "{zip_file_name}" "{folder_path}"

  adding: content/grpo_saved_lora/ (stored 0%)
  adding: content/grpo_saved_lora/adapter_model.safetensors (deflated 21%)
  adding: content/grpo_saved_lora/adapter_config.json (deflated 57%)
  adding: content/grpo_saved_lora/README.md (deflated 66%)


In [46]:
# Hand the zip archive created in the previous cell to `google.colab.files`
# for download.
files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>