In [1]:
from typing import Union

from datasets import Dataset, load_dataset
from peft import LoraConfig  # type: ignore
from pydantic import BaseModel, Field

import xverify as xv

In [2]:
# Response variant chosen when the model wants to call a tool instead of answering.
# NOTE(review): the class docstring and the Field description are presumably
# rendered into `guided_schema.doc` (and therefore into the system prompt), so
# treat their wording as runtime text rather than as free-form documentation.
class Tools(BaseModel):
    """
    Run a tool.
    """

    # Exactly one tool invocation; the allowed tools are xv.calculator and xv.search.
    tool_use: xv.XMLToolUse[xv.calculator, xv.search] = Field(
        ..., description="The tool call to use"
    )


# Response variant chosen when the model commits to an answer.
# NOTE(review): the docstring/description are presumably part of the generated
# schema doc shown to the model; keep their wording stable.
class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """

    # Typed as int, so only integer answers can be expressed; the reward function
    # compares str(answer) against the dataset's reference answer string.
    answer: int = Field(..., description="Final answer to the question")


# One ReAct-style step: free-text notes and reasoning followed by either a tool
# call (Tools) or a final answer (FinalAnswer). This is the model handed to
# xv.GuidedSchema below, so its field order is the order of the structured output.
class Reason_and_Act(BaseModel):
    # Notes carried over from previous tool observations.
    scratchpad: str = Field(
        ...,
        description="Information from the Observation useful to answer the question",
    )
    # The model's thinking for the current step.
    reasoning: str = Field(
        ...,
        description="It describes your thoughts about the question you have been asked",
    )
    # Either a tool invocation or the final answer; required, no Field description.
    response: Tools | FinalAnswer
    # NOTE(review): declared as a bare PEP 604 union rather than Union[...] with a Field(...).


def tool_response_func(model: Reason_and_Act) -> dict | None:
    """Execute the tool call carried by a parsed step.

    Args:
        model: The parsed Reason_and_Act step produced by the policy.

    Returns:
        Whatever ``xv.run_tools`` returns for ``model.response``
        (annotated as a dict or None).
    """
    step_response = model.response
    return xv.run_tools(step_response)


# Schema object shared by the prompt (guided_schema.doc), the reward function
# (guided_schema.parse) and the trainer. Output format is XML, and
# tool_response_func is registered as the callback for parsed steps.
# NOTE(review): exact callback timing/semantics live in xverify — confirm there.
guided_schema = xv.GuidedSchema(
    Reason_and_Act,
    schema="xml",
    tool_response_func=tool_response_func,
)

In [3]:
def format_prompt(
    prompt: str, system_prompt: str | None = None
) -> list[dict[str, str]]:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return messages


def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# Step budget advertised to the model in the system prompt below.
# NOTE(review): in the visible code this value is only interpolated into the
# prompt; confirm the rollout loop enforces the same limit elsewhere.
max_steps = 10

# System prompt sent with every question. It embeds the schema documentation
# generated by guided_schema so the model knows the expected XML layout.
# NOTE(review): the prompt text contains typos ("once your are", "a XML object");
# left untouched here because changing the wording changes what the model sees.
SYSTEM_PROMPT = f"""\
You are a helpful assistant, responding in XML structured output.

- Think step by step using the scratchpad and reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer only once your are absolutely sure you have the answer.

Respond with a XML object, following the schema below:

{guided_schema.doc}
"""

def _to_training_example(row: dict) -> dict:
    """Turn one GSM8K row into a chat prompt plus its reference answer."""
    return {
        "prompt": format_prompt(row["question"], SYSTEM_PROMPT),
        "answer": extract_hash_answer(row["answer"]),
    }


# GSM8K training split; every row gains a 'prompt' (system + user messages)
# and has 'answer' replaced by the text following the "####" marker.
dataset: Dataset = load_dataset("openai/gsm8k", "main", split="train")  # type: ignore
dataset = dataset.map(_to_training_example)

In [None]:
def exact_answer_reward_func(completions, answer, **_) -> list[float]:
    """Score each trajectory by exact match of its final answer.

    Args:
        completions: One message list (trajectory) per sample; the last
            message must come from the assistant.
        answer: Reference answer strings, aligned with ``completions``.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        One float per trajectory: 1.0 when the last assistant message parses
        to a FinalAnswer whose stringified value equals the reference, else 0.0.
    """

    def _score(messages: list[dict[str, str]], expected: str) -> float:
        """Grade a single trajectory against its reference answer."""
        final_turn = messages[-1]
        assert final_turn["role"] == "assistant", "should be assistant"
        step = guided_schema.parse(final_turn["content"])  # type: ignore
        # Only a successfully parsed FinalAnswer can earn the reward.
        if step is not None and isinstance(step.response, FinalAnswer):
            if str(step.response.answer) == expected:
                return 1.0
        return 0.0

    rewards: list[float] = []
    for trajectory, expected in zip(completions, answer):
        rewards.append(_score(trajectory, expected))
    return rewards


# Base policy model; xverify returns the model together with its tokenizer.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model, tokenizer = xv.get_model_and_tokenizer(model_name)


# Run name is derived from the model id, e.g. "gsm8k-calculator-peft_qwen2.5-1.5b-instruct".
run_name = "gsm8k-calculator-peft_" + model_name.split("/")[-1].lower()
# NOTE(review): the max_steps below is the number of optimizer steps and is
# unrelated to the module-level `max_steps` used in the system prompt.
training_args = xv.get_default_grpo_config(
    run_name,
    num_gpus=1,
    ### GRPO params ###
    learning_rate=1e-6,
    num_generations=8,
    per_device_train_batch_size=16, # 16 completions / 8 generations = 2 prompts per device batch
    gradient_accumulation_steps=2, # NOTE(review): original note said "1 prompt per forward" — confirm intent
    logging_steps=10,
    num_iterations=2, # 1 on-policy + 1 off-policy
    max_steps=250,
    # NOTE(review): SYSTEM_PROMPT embeds the full schema doc; confirm the prompt
    # plus question fits in 200 tokens and is not being truncated.
    max_prompt_length=200,
    max_completion_length=256,
    vllm_gpu_memory_utilization=0.3,
)

# LoRA adapter settings: rank 16 with alpha 64, 5% dropout, applied only to
# the attention query/key/value projections.
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj"],
)


# Wire the schema, model, config, data, LoRA settings and reward together,
# then start training. exact_answer_reward_func is the only reward signal.
# NOTE(review): `tokenizer` from get_model_and_tokenizer is not passed here —
# confirm the trainer loads its own.
trainer = xv.GRPOGuidedTrainer(
    guided_schema=guided_schema,
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    reward_funcs=[exact_answer_reward_func],
)
trainer.train()

Using Liger kernel
Applied Liger kernels to Qwen2


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


INFO 03-17 16:56:51 __init__.py:207] Automatically detected platform cuda.
INFO 03-17 16:56:56 config.py:549] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed', 'score'}. Defaulting to 'generate'.
INFO 03-17 16:56:56 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_e

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-17 16:56:58 model_runner.py:1115] Loading model weights took 2.8875 GB
INFO 03-17 16:57:00 worker.py:267] Memory profiling takes 1.72 seconds
INFO 03-17 16:57:00 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.30) = 7.07GiB
INFO 03-17 16:57:00 worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 2.11GiB.
INFO 03-17 16:57:00 executor_base.py:111] # cuda blocks: 4929, # CPU blocks: 9362
INFO 03-17 16:57:00 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 2.41x
INFO 03-17 16:57:04 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:16<00:00,  2.14it/s]

INFO 03-17 16:57:21 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.20 GiB
INFO 03-17 16:57:21 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 22.42 seconds



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtompollak[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO 03-17 16:57:22 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.




[2025-03-17 16:58:11] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200
[2025-03-17 16:58:12] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200


Step,Training Loss


[2025-03-17 16:58:44] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200
