In [1]:
from typing import Union

from datasets import Dataset, load_dataset
from peft import LoraConfig  # type: ignore
from pydantic import BaseModel, Field

import xverify as xv

In [2]:
class Tools(BaseModel):
    """
    Run a tool.
    """

    tool_use: xv.XMLToolUse[xv.calculator, xv.search] = Field(
        ..., description="The tool call to use"
    )


class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """

    answer: int = Field(..., description="Final answer to the question")


class Reason_and_Act(BaseModel):
    scratchpad: str = Field(
        ...,
        description="Information from the Observation useful to answer the question",
    )
    reasoning: str = Field(
        ...,
        description="It describes your thoughts about the question you have been asked",
    )
    # response: Tools | FinalAnswer # this still doesn't work
    response: Union[Tools, FinalAnswer]


def tool_response_func(model: Reason_and_Act) -> dict | None:
    return xv.run_tools(model.response)


guided_schema = xv.GuidedSchema(
    Reason_and_Act,
    schema="xml",
    tool_response_func=tool_response_func,
)

In [3]:
def format_prompt(
    prompt: str, system_prompt: str | None = None
) -> list[dict[str, str]]:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return messages


def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

max_steps = 10

SYSTEM_PROMPT = f"""\
You are a helpful assistant, responding in XML structured output.

- Think step by step using the scratchpad and reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer only once your are absolutely sure you have the answer.

Respond with a XML object, following the schema below:

{guided_schema.doc}
"""

SYSTEM_PROMPT = """\
TEST PROMPT
"""

dataset: Dataset = load_dataset("openai/gsm8k", "main", split="train")  # type: ignore
dataset = dataset.map(lambda x: {
    'prompt': [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': x['question']}
    ],
    'answer': extract_hash_answer(x['answer'])
})

In [4]:
def exact_answer_reward_func(completions, answer, **_) -> list[float]:
    """Reward function that checks if the final answer matches the expected answer."""

    def _check_answer(trajectory: list[dict[str, str]], answer: int) -> float:
        """Extract the last answer from a trajectory."""
        last_message = trajectory[-1]
        assert last_message["role"] == "assistant", "should be assistant"
        parsed: Reason_and_Act | None = guided_schema.parse(last_message["content"])  # type: ignore
        if parsed is None or not isinstance(parsed.response, FinalAnswer):
            return 0.0
        return 1.0 if parsed.response.answer == answer else 0.0

    return [_check_answer(c, a) for c, a in zip(completions, answer)]


model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model, tokenizer = xv.get_model_and_tokenizer(model_name)


run_name = "gsm8k-calculator-peft_" + model_name.split("/")[-1].lower()
training_args = xv.get_default_grpo_config(run_name, num_gpus=1)

peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj"],
)


trainer = xv.GRPOGuidedTrainer(
    guided_schema=guided_schema,
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    reward_funcs=[exact_answer_reward_func],
)
trainer.train()

Using Liger kernel
Applied Liger kernels to Qwen2


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


INFO 03-13 15:15:52 __init__.py:207] Automatically detected platform cuda.
INFO 03-13 15:15:57 config.py:549] This model supports multiple tasks: {'embed', 'generate', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 03-13 15:15:57 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_e

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 15:15:59 model_runner.py:1115] Loading model weights took 0.9266 GB
INFO 03-13 15:16:00 worker.py:267] Memory profiling takes 0.83 seconds
INFO 03-13 15:16:00 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.30) = 7.07GiB
INFO 03-13 15:16:00 worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 4.65GiB.
INFO 03-13 15:16:00 executor_base.py:111] # cuda blocks: 25399, # CPU blocks: 21845
INFO 03-13 15:16:00 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 12.40x
INFO 03-13 15:16:04 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_ut

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.35it/s]

INFO 03-13 15:16:19 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.15 GiB
INFO 03-13 15:16:19 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 19.58 seconds



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtompollak[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO 03-13 15:16:21 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
1: received env response: None
trajectory: [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': 'Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers?', 'role': 'user'}, {'role': 'assistant', 'content': "<Reason_and_Act>\n<scratchpad>\n  \nIn order to determine the minimum grade Ahmed needs to beat Emily in the contest, we first need to calculate the total points that Ahmed would have if Emily had also completed the final assignment. \n\nEmily’s grades: 92  \nEmily's final assignment grade: 90  \n\nThe total 



1: received env response: None
trajectory: [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': 'The gauge on a water tank shows that the tank is 1/3 full of water. To fill the tank, 16 gallons of water are added. How many gallons of water does the tank hold when full?', 'role': 'user'}, {'role': 'assistant', 'content': "<Reason_and_Act>\n<scratchpad>\n  1. This problem involves calculating the total capacity of a water tank given the current and the amount of water it holds.\n  2. To solve this, we begin with the fact that the tank is currently 1/3 full.\n  3. If 16 gallons of water are added to fill the tank to its current state, this means the tank must have more water than the current amount to hold.\n  4. Let the total capacity of the tank be \\( C \\) gallons.\n  5. The current amount of water in the tank is \\( \\frac{1}{3}C \\).\n  6. If 16 gallons are added to fill the tank, the total amount of water becomes \\( \\frac{1}{3}C + 16 \\).\n  7. To find the total capacity,

1: received env response: None
trajectory: [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': "There are 15 tables in the school's cafeteria. Each table can seat 10 people. Usually, only 1/10 of the seats are left unseated. How many seats are usually taken?", 'role': 'user'}, {'role': 'assistant', 'content': "<Reason_and_Act>\n<scratchpad>\n15 tables * 10 seats/table = 150 seats\n150 seats * 1/10 = 15 seats usually taken\n</scratchpad>\n<reasoning>\n  \nThere are 15 tables in the school's cafeteria. Each table can seat 10 people. So, the total number of seats is 15 tables * 10 seats/table = 150 seats. Normally, 1/10 of the seats are left unseated. So, the number of seats usually taken is 150 seats * 1/10 = 15 seats usually taken.\n\n</reasoning>\n<response>\n<FinalAnswer>\n<answer>\n150\n</answer>\n</FinalAnswer>\n</response>\n</Reason_and_Act>"}]
reached_max_steps: 1 >= 10
1: === completed ===
1: [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': "There are 15 tabl

[2025-03-13 15:16:46] INFO lib.rs:450: response: https://html.duckduckgo.com/html 200
[2025-03-13 15:16:47] INFO lib.rs:450: response: https://html.duckduckgo.com/html 200


0: received env response: {'role': 'user', 'content': {'tool_use': {'search': "• A Guide To Cafeteria/Lunchroom Seating In Schools\n  In high school, the cafeteria often serves as a study hall or after-hours meeting place for students.\n\n• Cafeteria Design Guide - Architecture Student Chronicles\n  Here is the list of some of the important spaces of a cafeteria that you might see around: Seating area - enclosed space; Seating area - semi-enclosed space; ... Seating area = 6m x 10m = 60sqm.\n\n• [FREE] A school has 17 tables in the cafeteria. Each table seats 12 ...\n  The school has a total of 17 tables. Each table can seat up to 12 students.\n\n• School Cafeteria Seating Guide | SchoolOutlet.com\n  297 Kingsbury Grade Suite 133 Mailing: PO Box 4470 Stateline, NV 89449-4470.\n\n• Should Students Have Assigned Seats in the Cafeteria?\n  Schools have started assigning seats in the cafeteria to curb the chaos and ensure every student is well-fed. But does this truly help in behavior mana

[2025-03-13 15:16:49] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200


0: received env response: {'role': 'user', 'content': {'tool_use': {'search': "• School Cafeteria Design and Best Practices\n  This article explores key aspects of the school cafeteria, including the challenges of catering to an international student body. ... consider the number of seats and the space needed for movement.\n\n• A school cafeteria has tables that can seat 12 students each. How many ...\n  Calculate the total number of tables needed. Since the 20 tables can seat 240 students, and there are 10 students left, you need one more table for those 10 students.\n\n• A Guide To Cafeteria/Lunchroom Seating In Schools\n  In high school, the cafeteria often serves as a study hall or after-hours meeting place for students.\n\n• Cafeteria Design Guide - Architecture Student Chronicles\n  Here is the list of some of the important spaces of a cafeteria that you might see around: Seating area - enclosed space; Seating area - semi-enclosed space; ... Seating area = 6m x 10m = 60sqm.\n\n• 

ValueError: Completion mask and completion ids are not the same length for state 0