In [1]:
from typing import Union

from datasets import Dataset, load_dataset
from peft import LoraConfig  # type: ignore
from pydantic import BaseModel, Field

import xverify as xv

In [2]:
class Tools(BaseModel):
    """
    Run a tool.
    """

    tool_use: xv.XMLToolUse[xv.calculator, xv.search] = Field(
        ..., description="The tool call to use"
    )


class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """

    answer: int = Field(..., description="Final answer to the question")


class Reason_and_Act(BaseModel):
    scratchpad: str = Field(
        ...,
        description="Information from the Observation useful to answer the question",
    )
    reasoning: str = Field(
        ...,
        description="It describes your thoughts about the question you have been asked",
    )
    # response: Tools | FinalAnswer # this still doesn't work
    response: Union[Tools, FinalAnswer]


def tool_response_func(model: Reason_and_Act) -> dict | None:
    return xv.run_tools(model.response)


guided_schema = xv.GuidedSchema(
    Reason_and_Act,
    schema="xml",
    tool_response_func=tool_response_func,
)

In [3]:
def format_prompt(
    prompt: str, system_prompt: str | None = None
) -> list[dict[str, str]]:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return messages


def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

max_steps = 10

SYSTEM_PROMPT = f"""\
You are a helpful assistant, responding in XML structured output.

- Think step by step using the scratchpad and reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer only once your are absolutely sure you have the answer.

Respond with a XML object, following the schema below:

{guided_schema.doc}
"""

SYSTEM_PROMPT = """\
TEST PROMPT
"""

dataset: Dataset = load_dataset("openai/gsm8k", "main", split="train")  # type: ignore
dataset = dataset.map(lambda x: {
    'prompt': [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': x['question']}
    ],
    'answer': extract_hash_answer(x['answer'])
})

In [4]:
def exact_answer_reward_func(completions, answer, **_) -> list[float]:
    """Reward function that checks if the final answer matches the expected answer."""

    def _check_answer(trajectory: list[dict[str, str]], answer: int) -> float:
        """Extract the last answer from a trajectory."""
        last_message = trajectory[-1]
        assert last_message["role"] == "assistant", "should be assistant"
        parsed: Reason_and_Act | None = guided_schema.parse(last_message["content"])  # type: ignore
        if parsed is None or not isinstance(parsed.response, FinalAnswer):
            return 0.0
        return 1.0 if parsed.response.answer == answer else 0.0

    return [_check_answer(c, a) for c, a in zip(completions, answer)]


model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model, tokenizer = xv.get_model_and_tokenizer(model_name)


run_name = "gsm8k-calculator-peft_" + model_name.split("/")[-1].lower()
training_args = xv.get_default_grpo_config(run_name, num_gpus=1)

peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj"],
)


trainer = xv.GRPOGuidedTrainer(
    guided_schema=guided_schema,
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    reward_funcs=[exact_answer_reward_func],
)
trainer.train()

Using Liger kernel
Applied Liger kernels to Qwen2


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


INFO 03-13 15:43:09 __init__.py:207] Automatically detected platform cuda.
INFO 03-13 15:43:14 config.py:549] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 03-13 15:43:14 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_e

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 15:43:16 model_runner.py:1115] Loading model weights took 0.9266 GB
INFO 03-13 15:43:17 worker.py:267] Memory profiling takes 0.83 seconds
INFO 03-13 15:43:17 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.30) = 7.07GiB
INFO 03-13 15:43:17 worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 4.65GiB.
INFO 03-13 15:43:17 executor_base.py:111] # cuda blocks: 25399, # CPU blocks: 21845
INFO 03-13 15:43:17 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 12.40x
INFO 03-13 15:43:21 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_ut

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.34it/s]

INFO 03-13 15:43:36 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.15 GiB
INFO 03-13 15:43:36 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 19.66 seconds



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtompollak[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO 03-13 15:43:38 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.




[2025-03-13 15:44:03] INFO lib.rs:450: response: https://html.duckduckgo.com/html 200
[2025-03-13 15:44:06] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200


[2025-03-13 15:44:24] INFO lib.rs:450: response: https://lite.duckduckgo.com/lite/ 200
[2025-03-13 15:44:27] INFO lib.rs:450: response: https://html.duckduckgo.com/html 200


state: False
messages: state['messages']=[{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': 'In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard?', 'role': 'user'}, {'role': 'assistant', 'content': '<Reason_and_Act>\n<scratchpad>\n  \n  3\n\n</scratchpad>\n<reasoning>\nIn the graveyard, there are 20 skeletons. This number is evenly split between adult men and children. Therefore, the number of adult men is half of the total number of skeletons.\n\nEach adult man has 5 more bones than an adult woman, since they have the same number of bones per skeleton. Thus, the number of bones an adult man has is \\(5 + 20 = 25\\).\n\nEach child has half as many bones as an adult woman, so the number of bones each child has is 

ValueError: Completion mask and completion ids are not the same length for state 1

In [None]:
messages = [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': 'In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard?', 'role': 'user'}, {'role': 'assistant', 'content': '<Reason_and_Act>\n<scratchpad>\nYou were not able to solve the problem using the provided code. I will create a function to calculate the total number of bones in the graveyard based on the information given. Please let me know if you need any function or any other kind of help. Let\'s get started.  \n</scratchpad>\n<reasoning>\nThe problem is straightforward, but let\'s break it down step by step:\n\n1. First, we know there are 20 skeletons in total in the graveyard.\n2. Half of these skeletons are adult women, which means 20/2 = 10 adult women.\n3. The remaining skeletons are the rest of the children and adult men, so there are 20 - 10 = 10 adult men and 10 - 10 = 0 child skeletons.\n\nWe need to calculate the total number of bones based on the given information:\n- An adult woman has 20 bones.\n- A male has 5 more than a woman, so a male has 20 + 5 = 25 bones.\n- A child has half the number of an adult woman, so a child has 20/2 = 10 bones.\n\nHowever, there seems to be a misunderstanding here. The task seems to focus on calculating the bones based on the skeletons and their proportion within the graveyard, but the provided solution directly points out an error in the logic. \n\nI will rerun the verification, ensuring it adheres to the problem description, and then correct it for the final solution.\n</reasoning>\n<response>\n<Tools>\n<tool_use>\n<calculator>\n<expression>\n"total_bones = (adult_women * bones_per_adult_woman) + (adult_men * bones_per_adult_men) + (children * bones_per_child)"\n</expression>\n</calculator>\n</tool_use>\n</Tools>\n</response>\n</Reason_and_Act>'}, {'role': 'user', 'content': '{\n  "tool_use": {\n    "calculator": "Error: Invalid characters in expression"\n  }\n}'}, {'role': 'assistant', 'content': '<Reason_and_Act>\n<scratchpad>\nVerification process confirmed that the provided function is not valid. This is incorrect, as the bones can be calculated based on the skeletons\' proportions rather than being directly computed from the total and their proportions. The total number of bones in the graveyard should be 20, which is the sum of adult women, adult men, and children. The proportion of each group in terms of bones is correct for the adults, but the calculation must be balanced to sum to the total number of bones. This calculation needs to use the provided skeletons\' proportions correctly.\n\nLet me recompute the bones for each group in terms of their proportions:\n\n1. **Adult Women**: 10 bones (1/2 of 20).\n2. **Adult Men**: 0 bones since there are no adult men.\n3. **Children**: 10 bones (10/2 = 5 bones).\n\nAs per the problem statement:\n- An adult woman has 20 bones.\n- An adult man has 5 more than a woman, thus 25 bones.\n- A child has half as many as an adult woman, hence 5 bones.\n\nBased on the proportions, the total number of bones in the graveyard should be the sum of these bone totals:\n- 10 women + 0 men + 10 children = 20 bones in total.\n\nI will now correct the function and compute the bones accurately as per the correct logic.\n</scratchpad>\n<reasoning>\nError in the function was due to the incorrect application of the problem\'s logic and proportions. As mentioned, the correct bones are calculated based on the proportions of adults, men, and children in the graveyard. We know there are 20 skeletons in total. \n\nThe correct bones are:\n- 10 adults (1/2 of 20 in terms of adults).\n- 0 men.\n- 10 children (10/2 = 5 in terms of children).\n\nSumming these up, the total number of bones in the graveyard is 35.\n\nLet me correct the function to directly compute the bones based on these proportions:\n</reasoning>\n<response>\n<Tools>\n<tool_use>\n<calculator>\n<expression>\n"total_bones = (adults * bones_per_adult) + (men * bones_per_men) + (children * bones_per_child)"\n</expression>\n</calculator>\n</tool_use>\n</Tools>\n</response>\n</Reason_and_Act>'}, {'role': 'user', 'content': '{\n  "tool_use": {\n    "calculator": "Error: Invalid characters in expression"\n  }\n}'}]
completion_mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
completion_ids = [27, 25139, 8378, 46672, 397, 27, 54055, 13242, 397, 2610, 1033, 537, 2952, 311, 11625, 279, 3491, 1667, 279, 3897, 2038, 13, 358, 686, 1855, 264, 729, 311, 11047, 279, 2790, 1372, 315, 24854, 304, 279, 71833, 3118, 389, 279, 1995, 2661, 13, 5209, 1077, 752, 1414, 421, 498, 1184, 894, 729, 476, 894, 1008, 3093, 315, 1492, 13, 6771, 594, 633, 3855, 13, 2303, 522, 54055, 13242, 397, 27, 19895, 287, 397, 785, 3491, 374, 30339, 11, 714, 1077, 594, 1438, 432, 1495, 3019, 553, 3019, 1447, 16, 13, 5512, 11, 582, 1414, 1052, 525, 220, 17, 15, 74307, 304, 2790, 304, 279, 71833, 624, 17, 13, 25839, 315, 1493, 74307, 525, 6683, 3198, 11, 892, 3363, 220, 17, 15, 14, 17, 284, 220, 16, 15, 6683, 3198, 624, 18, 13, 576, 9664, 74307, 525, 279, 2732, 315, 279, 2841, 323, 6683, 2953, 11, 773, 1052, 525, 220, 17, 15, 481, 220, 16, 15, 284, 220, 16, 15, 6683, 2953, 323, 220, 16, 15, 481, 220, 16, 15, 284, 220, 15, 1682, 74307, 382, 1654, 1184, 311, 11047, 279, 2790, 1372, 315, 24854, 3118, 389, 279, 2661, 1995, 510, 12, 1527, 6683, 5220, 702, 220, 17, 15, 24854, 624, 12, 362, 8593, 702, 220, 20, 803, 1091, 264, 5220, 11, 773, 264, 8593, 702, 220, 17, 15, 488, 220, 20, 284, 220, 17, 20, 24854, 624, 12, 362, 1682, 702, 4279, 279, 1372, 315, 458, 6683, 5220, 11, 773, 264, 1682, 702, 220, 17, 15, 14, 17, 284, 220, 16, 15, 24854, 382, 11209, 11, 1052, 4977, 311, 387, 264, 69747, 1588, 13, 576, 3383, 4977, 311, 5244, 389, 37614, 279, 24854, 3118, 389, 279, 74307, 323, 862, 21117, 2878, 279, 71833, 11, 714, 279, 3897, 6291, 5961, 3501, 700, 458, 1465, 304, 279, 12218, 13, 4710, 40, 686, 48547, 359, 279, 22901, 11, 22573, 432, 34951, 288, 311, 279, 3491, 4008, 11, 323, 1221, 4396, 432, 369, 279, 1590, 6291, 624, 522, 19895, 287, 397, 27, 2322, 397, 27, 16583, 397, 27, 14172, 15951, 397, 27, 88821, 397, 27, 28099, 397, 1, 5035, 880, 3154, 284, 320, 67856, 1670, 6297, 353, 24854, 5678, 10027, 494, 1670, 6908, 8, 488, 320, 67856, 717, 268, 353, 24854, 5678, 10027, 494, 717, 268, 8, 488, 320, 5864, 353, 24854, 5678, 17268, 12954, 522, 28099, 397, 522, 88821, 397, 522, 14172, 15951, 397, 522, 16583, 397, 522, 2322, 397, 522, 25139, 8378, 46672, 29, 151645, 198, 151644, 872, 198, 515, 220, 330, 14172, 15951, 788, 341, 262, 330, 88821, 788, 330, 1454, 25, 13882, 5766, 304, 7493, 698, 220, 456, 92, 151645, 198, 151644, 77091, 198, 27, 25139, 8378, 46672, 397, 27, 54055, 13242, 397, 62339, 1882, 10774, 429, 279, 3897, 729, 374, 537, 2697, 13, 1096, 374, 15114, 11, 438, 279, 24854, 646, 387, 16588, 3118, 389, 279, 74307, 6, 48792, 4751, 1091, 1660, 5961, 24182, 504, 279, 2790, 323, 862, 48792, 13, 576, 2790, 1372, 315, 24854, 304, 279, 71833, 1265, 387, 220, 17, 15, 11, 892, 374, 279, 2629, 315, 6683, 3198, 11, 6683, 2953, 11, 323, 2841, 13, 576, 21117, 315, 1817, 1874, 304, 3793, 315, 24854, 374, 4396, 369, 279, 12598, 11, 714, 279, 21937, 1969, 387, 23831, 311, 2629, 311, 279, 2790, 1372, 315, 24854, 13, 1096, 21937, 3880, 311, 990, 279, 3897, 74307, 6, 48792, 12440, 382, 10061, 752, 312, 27706, 279, 24854, 369, 1817, 1874, 304, 3793, 315, 862, 48792, 1447, 16, 13, 3070, 52030, 10973, 95518, 220, 16, 15, 24854, 320, 16, 14, 17, 315, 220, 17, 15, 4292, 17, 13, 3070, 52030, 11012, 95518, 220, 15, 24854, 2474, 1052, 525, 902, 6683, 2953, 624, 18, 13, 3070, 11539, 95518, 220, 16, 15, 24854, 320, 16, 15, 14, 17, 284, 220, 20, 24854, 3593, 2121, 817, 279, 3491, 5114, 510, 12, 1527, 6683, 5220, 702, 220, 17, 15, 24854, 624, 12, 1527, 6683, 883, 702, 220, 20, 803, 1091, 264, 5220, 11, 8450, 220, 17, 20, 24854, 624, 12, 362, 1682, 702, 4279, 438, 1657, 438, 458, 6683, 5220, 11, 16085, 220, 20, 24854, 382, 28715, 389, 279, 48792, 11, 279, 2790, 1372, 315, 24854, 304, 279, 71833, 1265, 387, 279, 2629, 315, 1493, 17240, 43645, 510, 12, 220, 16, 15, 3198, 488, 220, 15, 2953, 488, 220, 16, 15, 2841, 284, 220, 17, 15, 24854, 304, 2790, 382, 40, 686, 1431, 4396, 279, 729, 323, 12564, 279, 24854, 29257, 438, 817, 279, 4396, 12218, 624, 522, 54055, 13242, 397, 27, 19895, 287, 397, 1454, 304, 279, 729, 572, 4152, 311, 279, 15114, 3766, 315, 279, 3491, 594, 12218, 323, 48792, 13, 1634, 9733, 11, 279, 4396, 24854, 525, 16588, 3118, 389, 279, 48792, 315, 12598, 11, 2953, 11, 323, 2841, 304, 279, 71833, 13, 1205, 1414, 1052, 525, 220, 17, 15, 74307, 304, 2790, 13, 4710, 785, 4396, 24854, 525, 510, 12, 220, 16, 15, 12598, 320, 16, 14, 17, 315, 220, 17, 15, 304, 3793, 315, 12598, 4292, 12, 220, 15, 2953, 624, 12, 220, 16, 15, 2841, 320, 16, 15, 14, 17, 284, 220, 20, 304, 3793, 315, 2841, 3593, 9190, 5311, 1493, 705, 11, 279, 2790, 1372, 315, 24854, 304, 279, 71833, 374, 220, 18, 20, 382, 10061, 752, 4396, 279, 729, 311, 5961, 12564, 279, 24854, 3118, 389, 1493, 48792, 510, 522, 19895, 287, 397, 27, 2322, 397, 27, 16583, 397, 27, 14172, 15951, 397, 27, 88821, 397, 27, 28099, 397, 1, 5035, 880, 3154, 284, 320, 67856, 82, 353, 24854, 5678, 10027, 494, 8, 488, 320, 5676, 353, 24854, 5678, 717, 268, 8, 488, 320, 5864, 353, 24854, 5678, 17268, 12954, 522, 28099, 397, 522, 88821, 397, 522, 14172, 15951, 397, 522, 16583, 397, 522, 2322, 29, 198, 522, 25139, 8378, 46672, 29, 151645]

In [7]:
len(completion_ids), len(completion_mask)

(947, 956)

In [8]:
messages

[{'content': 'TEST PROMPT\n', 'role': 'system'},
 {'content': 'In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard?',
  'role': 'user'},
 {'role': 'assistant',
  'content': '<Reason_and_Act>\n<scratchpad>\nYou were not able to solve the problem using the provided code. I will create a function to calculate the total number of bones in the graveyard based on the information given. Please let me know if you need any function or any other kind of help. Let\'s get started.  \n</scratchpad>\n<reasoning>\nThe problem is straightforward, but let\'s break it down step by step:\n\n1. First, we know there are 20 skeletons in total in the graveyard.\n2. Half of these skeletons are adult women, which means 20/2 = 10 adult women.\n3. T

In [None]:
messages = [{'content': 'TEST PROMPT\n', 'role': 'system'}, {'content': 'In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard?', 'role': 'user'}, {'role': 'assistant', 'content': '<Reason_and_Act>\n<scratchpad>\nThe total number of bones in the graveyard is 300.\n</scratchpad>\n<reasoning>\n1. Adult women have 20 bones each.\n2. There are half of the 20 bones attributed to adult women, which is 100 bones in total.\n3. The remaining, adult men and children, are split evenly between man and woman. Therefore, there would be 100 bones attributed to adult men as well.\n4. An adult man has 5 more bones than an adult woman, meaning each adult man has 25 bones.\n5. There are 100 bones attributed to adult men, so the total number of bones attributed to adult men is 100 * 25 = 2500 bones.\n6. A child has half as many bones as an adult woman, meaning each child has 10 bones.\n7. There are 100 bones attributed to adult women, so the total number of bones attributed to children is 100 * 10 = 1000 bones.\n8. The total number of bones in the graveyard is the sum of the bones attributed to adult men, adult women, and children: 2500 bones + 1000 bones = 3000 bones.\n\nTherefore, the answer is 300. \n\n</reasoning>\n<response>\n<Tools>\n<tool_use>\n<calculator>\n<expression>\n300\n</expression>\n</calculator>\n</tool_use>\n</Tools>\n</response>\n</Reason_and_Act>'}, {'role': 'user', 'content': '{\n  "tool_use": {\n    "calculator": "300"\n  }\n}'}, {'role': 'assistant', 'content': '<Reason_and_Act>\n<scratchpad>\nThe total number of bones in the graveyard is 300.\n</scratchpad>\n<reasoning>\nThe given tool is a calculator, which allowed us to get the total number of bones in the graveyard.\n</reasoning>\n<response>\n<Tools>\n<tool_use>\n<calculator>\n<expression>\n300\n</expression>\n</calculator>\n</tool_use>\n</Tools>\n</response>\n</Reason_and_Act>'}, {'role': 'user', 'content': '{\n  "tool_use": {\n    "calculator": "300"\n  }\n}'}]
completion_mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
completion_ids = [27, 25139, 8378, 46672, 397, 27, 54055, 13242, 397, 785, 2790, 1372, 315, 24854, 304, 279, 71833, 374, 220, 18, 15, 15, 624, 522, 54055, 13242, 397, 27, 19895, 287, 397, 16, 13, 22133, 3198, 614, 220, 17, 15, 24854, 1817, 624, 17, 13, 2619, 525, 4279, 315, 279, 220, 17, 15, 24854, 29606, 311, 6683, 3198, 11, 892, 374, 220, 16, 15, 15, 24854, 304, 2790, 624, 18, 13, 576, 9664, 11, 6683, 2953, 323, 2841, 11, 525, 6718, 41047, 1948, 883, 323, 5220, 13, 15277, 11, 1052, 1035, 387, 220, 16, 15, 15, 24854, 29606, 311, 6683, 2953, 438, 1632, 624, 19, 13, 1527, 6683, 883, 702, 220, 20, 803, 24854, 1091, 458, 6683, 5220, 11, 7290, 1817, 6683, 883, 702, 220, 17, 20, 24854, 624, 20, 13, 2619, 525, 220, 16, 15, 15, 24854, 29606, 311, 6683, 2953, 11, 773, 279, 2790, 1372, 315, 24854, 29606, 311, 6683, 2953, 374, 220, 16, 15, 15, 353, 220, 17, 20, 284, 220, 17, 20, 15, 15, 24854, 624, 21, 13, 362, 1682, 702, 4279, 438, 1657, 24854, 438, 458, 6683, 5220, 11, 7290, 1817, 1682, 702, 220, 16, 15, 24854, 624, 22, 13, 2619, 525, 220, 16, 15, 15, 24854, 29606, 311, 6683, 3198, 11, 773, 279, 2790, 1372, 315, 24854, 29606, 311, 2841, 374, 220, 16, 15, 15, 353, 220, 16, 15, 284, 220, 16, 15, 15, 15, 24854, 624, 23, 13, 576, 2790, 1372, 315, 24854, 304, 279, 71833, 374, 279, 2629, 315, 279, 24854, 29606, 311, 6683, 2953, 11, 6683, 3198, 11, 323, 2841, 25, 220, 17, 20, 15, 15, 24854, 488, 220, 16, 15, 15, 15, 24854, 284, 220, 18, 15, 15, 15, 24854, 382, 54815, 11, 279, 4226, 374, 220, 18, 15, 15, 13, 4710, 522, 19895, 287, 397, 27, 2322, 397, 27, 16583, 397, 27, 14172, 15951, 397, 27, 88821, 397, 27, 28099, 397, 18, 15, 15, 198, 522, 28099, 397, 522, 88821, 397, 522, 14172, 15951, 397, 522, 16583, 397, 522, 2322, 397, 522, 25139, 8378, 46672, 29, 151645, 198, 151644, 872, 198, 515, 220, 330, 14172, 15951, 788, 341, 262, 330, 88821, 788, 330, 18, 15, 15, 698, 220, 456, 92, 151645, 198, 151644, 77091, 198, 27, 25139, 8378, 46672, 397, 27, 54055, 13242, 397, 785, 2790, 1372, 315, 24854, 304, 279, 71833, 374, 220, 18, 15, 15, 624, 522, 54055, 13242, 397, 27, 19895, 287, 397, 785, 2661, 5392, 374, 264, 29952, 11, 892, 5420, 601, 311, 633, 279, 2790, 1372, 315, 24854, 304, 279, 71833, 624, 522, 19895, 287, 397, 27, 2322, 397, 27, 7740, 82, 397, 27, 14172, 15951, 397, 27, 88821, 397, 27, 28099, 397, 18, 15, 15, 198, 522, 28099, 397, 522, 88821, 397, 522, 14172, 15951, 397, 522, 7740, 82, 397, 522, 2322, 29, 198, 522, 25139, 8378, 46672, 29, 151645]

In [12]:
len(completion_mask)

469

In [13]:
len(completion_ids)

466