In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-14B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.184 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/202k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unsloth 2025.4.7 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [4]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [5]:
from trl import GRPOConfig, GRPOTrainer

output_name="qwen25_14b_grpo_take1"

training_args = GRPOConfig(
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "wandb", # Can use Weights & Biases
    run_name=output_name,
    output_dir = output_name,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


Training for 250 steps took around 3hrs

In [6]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 275,251,200/14,000,000,000 (1.97% trained)
[34m[1mwandb[0m: Currently logged in as: [33mtituslhy[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
First, calculate the cost for the first 10 tickets, which are not discounted. Each ticket costs $40, so for 10 tickets, the cost is $40 * 10. Then, calculate the cost for the remaining 2 tickets, each with a 5% discount on the original price. The discounted price for each additional ticket is $40 - ($40 * 0.05). Multiply this discounted price by 2 (since he bought 2 tickets that had a discount).

Alternatively, you can also calculate the total cost by multiplying $40 * 12 first and then subtracting the total discount amount on the additional 2 tickets. The discount amount for each additional ticket is $40 * 0.05, and for 2 tickets, the total discount is 2 * ($40 * 0.05).
</reasoning>
<answer>
The cost for the first  
Extracted:
The cost for the first


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
1,0.0,-0.1045,0.213288,196.625,0.0,-0.1045,0.0,0.0,0.0,0.0
2,0.0,0.125,0.0,200.0,0.0,0.125,0.0,0.0,0.0,0.0
3,0.0,-0.065625,0.264471,200.0,0.000394,-0.065625,0.0,0.0,0.0,0.0
4,0.0,0.492875,1.212036,179.75,0.000452,-0.132125,0.0,0.0,0.125,0.5
5,0.0,-0.266125,0.027503,200.0,0.000595,-0.266125,0.0,0.0,0.0,0.0
6,0.0,-0.043375,0.232565,200.0,0.000579,-0.043375,0.0,0.0,0.0,0.0
7,0.0,0.125,0.0,200.0,0.000307,0.125,0.0,0.0,0.0,0.0
8,0.0,-0.02175,0.270273,189.375,0.000653,-0.02175,0.0,0.0,0.0,0.0
9,0.0,0.840375,1.319984,183.25,0.000806,-0.097125,0.0,0.0,0.1875,0.75
10,0.0,0.125,0.0,200.0,0.00039,0.125,0.0,0.0,0.0,0.0


-------------------- Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
<reasoning>
To solve this problem, we will need to calculate the monthly payments for both the house and the trailer assuming the loans have the same interest rate and the same 20-year payoff period. However, since the problem does not specify an interest rate, for simplicity and to derive a solution, we'll assume a standard annual interest rate often used in mortgage calculations, such as 4% (or 0.04 as a decimal), which is commonly applied for long-term loans. The formula for the monthly payment on a loan is:

\[ M = P \frac{r(1+r)^n}{(1+r)^n - 1} \]

where
- \( M \) is the monthly payment,
- \( P \) is the principal loan amount,
- \( r \) is the monthly interest rate (annual rate 

TrainOutput(global_step=250, training_loss=0.01682951555186065, metrics={'train_runtime': 8548.0744, 'train_samples_per_second': 0.234, 'train_steps_per_second': 0.029, 'total_flos': 0.0, 'train_loss': 0.01682951555186065})

## Push to Hub

PS: This step took 17m!

In [7]:
import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

model.push_to_hub_merged(
    "tituslhy/qwen25_14b_grpo_take1", 
    tokenizer, 
    save_method = "merged_16bit", 
    token = os.environ["HUGGINGFACE_ACCESS_TOKEN"]
)

Unsloth: You are pushing to hub, but you passed your HF username = tituslhy.
We shall truncate tituslhy/qwen25_14b_grpo_take1 to qwen25_14b_grpo_take1
Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.43 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 15%|█▍        | 7/48 [00:00<00:02, 14.27it/s]
We will save to Disk and not RAM now.
100%|██████████| 48/48 [02:43<00:00,  3.40s/it]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

 Done.
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00001-of-00006.bin...
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00002-of-00006.bin...
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00003-of-00006.bin...
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00004-of-00006.bin...
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00005-of-00006.bin...
Unsloth: Saving qwen25_14b_grpo_take1/pytorch_model-00006-of-00006.bin...


README.md:   0%|          | 0.00/611 [00:00<?, ?B/s]

  0%|          | 0/6 [00:00<?, ?it/s]

pytorch_model-00003-of-00006.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00001-of-00006.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00006.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00005-of-00006.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00004-of-00006.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00006-of-00006.bin:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/tituslhy/qwen25_14b_grpo_take1


## Quantization!

In [2]:
# ① Point at the real binary in build/bin
real_q = os.path.expanduser("~/llama.cpp/build/bin/llama-quantize")
assert os.path.exists(real_q), f"{real_q} not found!"

# ② Make a local 'llama.cpp' folder in your notebook working directory
cwd = os.getcwd()
local_pack = os.path.join(cwd, "llama.cpp")
os.makedirs(local_pack, exist_ok=True)

# ③ Symlink it as 'llama-quantize' and also as 'quantize'
for name in ("llama-quantize", "quantize"):
    link = os.path.join(local_pack, name)
    if os.path.exists(link) or os.path.islink(link):
        os.remove(link)
    os.symlink(real_q, link)

# ④ Verify
print("Notebook sees:", os.listdir(local_pack))

Notebook sees: ['.github', 'CODEOWNERS', 'pyproject.toml', 'README.md', 'gguf-py', 'ggml', '.clang-tidy', '.pre-commit-config.yaml', 'examples', 'tests', 'convert_llama_ggml_to_gguf.py', 'cmake', '.gitignore', 'CMakeLists.txt', 'build-xcframework.sh', 'scripts', 'Makefile', 'pocs', 'pyrightconfig.json', 'poetry.lock', 'convert_hf_to_gguf_update.py', 'src', 'docs', 'convert_hf_to_gguf.py', 'mypy.ini', 'llama-quantize', 'CONTRIBUTING.md', 'models', '.git', '.dockerignore', 'AUTHORS', 'requirements.txt', 'licenses', '.clang-format', 'flake.nix', 'prompts', 'tools', '.ecrc', '.flake8', 'grammars', '.devops', 'media', '.editorconfig', 'SECURITY.md', 'LICENSE', 'include', 'requirements', 'flake.lock', 'CMakePresets.json', 'ci', 'build', 'common', '.gitmodules', 'convert_lora_to_gguf.py', 'quantize']


In [None]:
model.push_to_hub_gguf(
    "tituslhy/qwen25_14b_grpo_take1", # Change hf to your username!
    tokenizer,
    quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
    token = os.environ["HUGGINGFACE_ACCESS_TOKEN"], # Get a token at https://huggingface.co/settings/tokens
)

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.53 out of 15.42 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  0%|          | 0/48 [00:00<?, ?it/s]
We will save to Disk and not RAM now.
100%|██████████| 48/48 [01:53<00:00,  2.37s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00001-of-00006.bin...
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00002-of-00006.bin...
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00003-of-00006.bin...
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00004-of-00006.bin...
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00005-of-00006.bin...
Unsloth: Saving tituslhy/qwen25_14b_grpo_take1/pytorch_model-00006-of-00006.bin...
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at tituslhy/qwen25_14b_grpo_take1 into bf16 GGUF format.
The output location will be /home/ubuntu/ideal-palm-tree/notebooks/tituslhy/qwen25_14b_grpo_take1/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: qwen25_14b_grpo_take1
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_m

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/8.99G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/tituslhy/qwen25_14b_grpo_take1
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/15.7G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/qwen25_14b_grpo_take1
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q5_K_M.gguf:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/tituslhy/qwen25_14b_grpo_take1


: 

## Taking out our LLM for a spin!

Serve the LLM via vLLM

```
ollama pull hf.co/tituslhy/qwen25_14b_grpo_take1:Q4_K_M
```