# Structured Output with Tools

In [3]:
import json
from pprint import pprint
from pydantic import BaseModel, Field

from xverify import XMLToolUse, JSONToolUse, run_tools, Env
from xverify.tools import calculator, search
from pydantic import ValidationError

> Often when running multi-step reasoning, we want to use tools to help us.

However, not many libraries natively support this. Pydantic for instance is optimized for a static declarative schema, which isn't well suited to ad-hoc tool use.

Here we can see two examples of tools:
- `calculator`: essentially a wrapper around the `eval` function
- `search`: uses duckduckgo to search the web

In [4]:
# Call each tool directly; the `=` f-string specifier echoes the call next to its result.
calculator_demo = f"{calculator(expression='3 + 4 * (6 ** 7)')=}"
print(calculator_demo, "\n---\n", sep="\n")
search_demo = f"{search(query='What is the capital of France?', num_results=1)=}"
print(search_demo)

calculator(expression='3 + 4 * (6 ** 7)')='1119747'

---

search(query='What is the capital of France?', num_results=1)='• Paris - Wikipedia\n  Paris is a global city of culture, finance, diplomacy, and tourism, with an estimated population of 2 million residents in 2025.'


The problem is we can't (natively) include a tool call in a Pydantic model (due to the static declarative schema).

However, we can use the new `ToolUse` class to handle tool calls.

In [15]:
# Pydantic model pairing a free-text rationale with a single tool call.
class ReasoningTool(BaseModel):
    """The result of a reasoning tool"""

    # Free-text explanation accompanying the tool call.
    reasoning: str
    # One call to either `calculator` or `search`. In this notebook it is validated from a
    # dict holding `tool_name` plus that tool's arguments.
    tool_use: JSONToolUse[calculator, search]


# Validate a plain dict: `tool_name` selects the tool, the remaining keys are its arguments.
calc_payload = {
    "reasoning": "Let's add two numbers",
    "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
}
calc_2_2 = ReasoningTool.model_validate(calc_payload)
print(calc_2_2)

# Calling run_tool() on the validated tool-use object runs the tool and returns its result.
print(f"{calc_2_2.tool_use.run_tool()=}")

reasoning="Let's add two numbers" tool_use=calculator(tool_name='calculator', expression='2 + 2')
calc_2_2.tool_use.run_tool()='4'


This is nice because we can easily validate that any arbitrary schema and its tool use are correct without any ad-hoc parsing (and we'll be able to enforce that the LLM output is correct with guided decoding).

In [16]:
# Each entry pairs the message to print with a malformed `tool_use` payload.
# In the recorded run all four payloads (including the extra-argument one) are rejected.
invalid_tool_calls = [
    ("tool not found!", {"tool_name": "none_existing_tool", "expression": "2 + 2"}),
    ("wrong argument!", {"tool_name": "calculator", "wrong_arg": "2 + 2"}),
    # `2 + 2` is deliberately an int here, not the string the calculator expects.
    ("wrong argument type!", {"tool_name": "calculator", "expression": 2 + 2}),
    (
        "extra argument!",
        {"tool_name": "calculator", "expression": "2 + 2", "extra_arg": "extra_arg"},
    ),
]

for failure_label, bad_tool_use in invalid_tool_calls:
    try:
        ReasoningTool.model_validate({"reasoning": "", "tool_use": bad_tool_use})
    except ValidationError:
        print(failure_label)

tool not found!
wrong argument!
wrong argument type!
extra argument!


We can implement a ReACT loop with tools really easily:

In [23]:

from typing import Literal, Union

# "Call a tool" branch of the response union.
# NOTE: the docstring and the Field descriptions below show up in the printed `env.doc`
# schema text, so they are prompt content and are left untouched here.
class Tools(BaseModel):
    """
    Run a tool.
    """
    # Fixed literal used as the discriminator value for this branch.
    action: Literal["tool_use"] = Field(..., description="Action discriminator")
    # The tool call itself: either `calculator` or `search`.
    tool_use: XMLToolUse[calculator, search] = Field(..., description="The tool call to use")

# "Final answer" branch of the response union.
# NOTE: the docstring and the Field descriptions below show up in the printed `env.doc`
# schema text, so they are prompt content and are left untouched here.
class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """
    # Fixed literal used as the discriminator value for this branch.
    action: Literal["final_answer"] = Field(..., description="Action discriminator")
    # Free-text answer returned to the user.
    answer: str = Field(..., description="The final answer to the question")

# One step of the reason-then-act loop: notes, reasoning, then either a tool call or the final answer.
# (Documented with `#` comments only: the other models' docstrings appear in `env.doc`,
# so adding one here could change the generated prompt text.)
class Reason_and_Act(BaseModel):
    # Notes carried over from earlier observations.
    scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    # The model's thoughts about the question.
    reasoning: str = Field(..., description="It describes your thoughts about the question you have been asked")
    # Discriminated union: the `action` literal selects `Tools` or `FinalAnswer`.
    response: Union[Tools, FinalAnswer] = Field(..., description="Final output: choose between the tool call or the final answer", discriminator="action")


# `tool_name` is deliberately left out of the tool call below; in the recorded output the
# payload still validates as a `calculator` call.
react_payload = {
    "scratchpad": "the question is 2 + 2",
    "reasoning": "we should use the calculator tool!",
    "response": {
        "action": "tool_use",
        "tool_use": {"expression": "2 + 2"},
    },
}
res = Reason_and_Act.model_validate(react_payload)
res

Reason_and_Act(scratchpad='the question is 2 + 2', reasoning='we should use the calculator tool!', response=Tools(action='tool_use', tool_use=calculator(expression='2 + 2')))

And in case we just want to run all the tools in a response:

In [25]:
# Run every tool call found inside `res`; the displayed result follows the model's field nesting.
run_tools(res)

{'response': {'tool_use': {'calculator': '4'}}}

This will return `None` where no tools were called, which is useful for checking for the end of the loop.

In [26]:
# A response holding only a final answer contains no tool call to execute.
final_answer_step = Reason_and_Act(
    scratchpad="",
    reasoning="",
    response=FinalAnswer(action="final_answer", answer="42"),
)
print(run_tools(final_answer_step))

None


If you just want the output, call `run_tool()` on the instantiated `ToolUse` object itself:

In [27]:
# Tool-call responses are executed; final-answer responses are shown as-is.
response_summary = (
    f"{res.response.tool_use.run_tool()=}"
    if isinstance(res.response, Tools)
    else f"{res.response.answer=}"
)
print(response_summary)

res.response.tool_use.run_tool()='4'


And of course we can do multiple tool calls in a single response:

In [28]:
# Model holding several tool calls in a single response.
class MultiToolUse(BaseModel):
    # Ordered list of calls; each element is a `calculator` or a `search` call.
    tool_use: list[XMLToolUse[calculator, search]]


# Three calls in one payload: two calculator expressions around a web search.
tool_calls = [
    {"tool_name": "calculator", "expression": "2 + 2"},
    {
        "tool_name": "search",
        "query": "What is the radius of the moon?",
        "num_results": 2,
    },
    {"tool_name": "calculator", "expression": "3424 * 432432"},
]
res = MultiToolUse.model_validate({"tool_use": tool_calls})

# run_tools returns one result per call, in the order the calls were given.
multi_tool_results = run_tools(res)
pprint(multi_tool_results)

{'tool_use': [{'calculator': '4'},
              {'search': '• Moon Fact Sheet - NSSDCA\n'
                         '  Equatorial radius (km) 1738.1: 6378.1: 0.2725: '
                         'Polar radius (km) 1736.0: 6356.8: 0.2731: Volumetric '
                         'mean radius (km) 1737.4: 6371.0: 0.2727: Ellipticity '
                         '(Flattening) ...\n'
                         '\n'
                         '• Moon - Wikipedia\n'
                         '  The Moon has a solid iron-rich inner core with a '
                         'radius possibly as small as 240 kilometres (150 mi) '
                         'and a fluid outer core primarily made of liquid iron '
                         'with a radius of roughly 300 kilometres (190 m.'},
              {'calculator': '1480647168'}]}


So this is cool, kinda, but now that we have a structured schema, we can **enforce** that the LLM output is correct with guided decoding.

Let's go back to our ReACT loop, and use `Env` to enforce the tool calls are correct.

In [None]:
from vllm import LLM

# Loading the model is slow, so reuse an `llm` left in the kernel by an earlier run of this cell.
if "llm" not in globals():  # interactive use
    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=2000)

# `Env` wraps the response model. This notebook uses it for sampling params, output parsing,
# the schema text (`env.doc`) and the grammar (`env.gbnf`).
env = Env(Reason_and_Act)

# Sampling configuration for each generation step.
# NOTE(review): presumably these params carry the guided-decoding constraint — confirm in xverify.
# (A stray `env.sampling_params()` call whose result was discarded has been removed.)
sampling_params = env.sampling_params(
    max_tokens=500,
    n=1,
    temperature=1.0,
)

# Upper bound on generation rounds; the prompt tells the model it has one fewer to think.
max_steps = 5

# System prompt: behavioural rules followed by the schema description from `env.doc`.
system_prompt = f"""\
You are a helpful assistant, responding in XML structured output.

- Think step by step using the scratchpad and reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer only once your are absolutely sure you have the answer.

Respond with a XML object, following the schema below:

{env.doc}

Use the tools!
"""
user_question = "What is the distance from the moon to the sun?"

# Conversation history; the loop appends assistant turns and tool results to it.
messages: list[dict] = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_question},
]

# ReAct loop: generate a step, parse it, run any requested tools, and feed the results back
# until the model produces a final answer or `max_steps` rounds are used up.
for _ in range(max_steps):
    outp = llm.chat(  # type: ignore
        messages=messages,  # type: ignore
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    text = outp[0].outputs[0].text
    print("=" * 80)
    print("Assistant:", text)

    struct_res = env.parse(text)
    if not struct_res:
        # Nothing is appended to the history, so the next round resamples the same prompt.
        print("*** Invalid response, skipping ***")
        continue

    # Chat message `content` must be text, not a dict: replay the model's own XML output
    # rather than `struct_res.model_dump()`.
    messages.append({"role": "assistant", "content": text})

    tool_outp = run_tools(struct_res)
    if not tool_outp:
        # No tool call in this step means the model gave its final answer.
        break

    print("=" * 80)
    print("Tool output:", tool_outp)
    # Serialize the tool results to a string for the same reason. `ensure_ascii=False`
    # keeps non-ASCII characters readable; `default=str` guards against values JSON
    # cannot encode.
    messages.append(
        {
            "role": "user",
            "content": json.dumps(tool_outp, ensure_ascii=False, default=str),
        }
    )

INFO 03-12 13:19:02 __init__.py:207] Automatically detected platform cuda.
INFO 03-12 13:19:07 config.py:549] This model supports multiple tasks: {'score', 'generate', 'embed', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 03-12 13:19:07 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_exec

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-12 13:19:11 model_runner.py:1115] Loading model weights took 2.8875 GB
INFO 03-12 13:19:11 worker.py:267] Memory profiling takes 0.55 seconds
INFO 03-12 13:19:11 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.90) = 21.22GiB
INFO 03-12 13:19:11 worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 16.89GiB.
INFO 03-12 13:19:12 executor_base.py:111] # cuda blocks: 39529, # CPU blocks: 9362
INFO 03-12 13:19:12 executor_base.py:116] Maximum concurrency for 2000 tokens per request: 316.23x
INFO 03-12 13:19:16 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes:   6%|▌         | 2/35 [00:00<00:16,  2.03it/s]

In [19]:
# Sample response text for debugging: note that the <FinalAnswer> element carries the
# action value `tool_use` rather than `final_answer`.
txt = """
<Reason_and_Act>

<scratchpad>
Using the tool "search" to find information about the distance from the moon to the sun.
</scratchpad>
<reasoning>
The distance from the moon to the sun is a physical fact that is not obtained through direct calculation. However, we can use the tool "search" to find out the typical distance per year for recreational purposes.
</reasoning>
<response>
<FinalAnswer>

<action>
tool_use
</action>
<answer>
495028.5095009445
</answer>
</FinalAnswer>
</response>
</Reason_and_Act>
"""

import xmltodict
from xverify.xml.parser import _get_model_names, _squeeze_model_keys

# Parse the `txt` sample defined at the top of this cell. The original parsed `text`, a
# variable leaked from the generation loop, so the cell depended on hidden kernel state.
# Container item tags are forced to lists so that single-element containers keep their shape.
parsed = xmltodict.parse(
    txt, force_list=("list-item", "set-item", "key-item", "value-item")
)

# Tag names to collapse: the generic container tags plus every model name reachable
# from Reason_and_Act.
model_names = {
    "list",
    "set",
    "dict",
    "list-item",
    "set-item",
    "key-item",
    "value-item",
    *(_get_model_names(Reason_and_Act)),
}

# Squeeze the model wrapper tags out of the parsed tree (no dict key/value
# post-processing is applied in this cell).
squeezed = _squeeze_model_keys(parsed, model_names)

# Validation is expected to fail for this sample. Catch and print the error, as the
# earlier validation cell does, so that "Run All" can continue past this cell.
try:
    validated = Reason_and_Act.model_validate(squeezed)
except ValidationError as err:
    print(f"{type(err).__name__}: {err}")
else:
    print(validated)

ValidationError: 1 validation error for Reason_and_Act
response.tool_use.tool_use
  Field required [type=missing, input_value={'action': 'tool_use', 'a...r': '495028.5095009445'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing

In [20]:
# Inspect the squeezed dict that was passed to validation: `response` holds
# action='tool_use' together with an `answer` key.
squeezed

{'scratchpad': 'Using the tool "search" to find information about the distance from the moon to the sun.',
 'reasoning': 'The distance from the moon to the sun is a physical fact that is not obtained through direct calculation. However, we can use the tool "search" to find out the typical distance per year for recreational purposes.',
 'response': {'action': 'tool_use', 'answer': '495028.5095009445'}}

In [14]:
# Dump the grammar text exposed by `env.gbnf` for inspection.
# NOTE(review): in the recorded output a single `action-literal` rule (allowing only
# "tool_use") is referenced by both `Tools` and `FinalAnswer`, which would explain a
# <FinalAnswer> carrying action=tool_use — confirm against the grammar generator.
print(env.gbnf)

root ::= grammar-models
string ::= nl [^\n<] ([^<])*
boolean ::= nl "true" | nl "false"
integer ::= nl [0-9]+
float ::= nl "-"? [0-9]+ ("." [0-9]+)?
null ::= nl "null"
nl ::= "\n"
action-literal ::= nl "tool_use"
tool_name-literal ::= nl "calculator"
calculator ::= nl "<calculator>" nl nl "<tool_name>" tool_name-literal nl "</tool_name>" nl "<expression>" string nl "</expression>" nl "</calculator>"
search ::= nl "<search>" nl nl "<tool_name>" tool_name-literal nl "</tool_name>" nl "<query>" string nl "</query>" nl "<num_results>" integer nl "</num_results>" nl "</search>"
tool_use-union ::= calculator | search
Tools ::= nl "<Tools>" nl nl "<action>" action-literal nl "</action>" nl "<tool_use>" tool_use-union nl "</tool_use>" nl "</Tools>"
FinalAnswer ::= nl "<FinalAnswer>" nl nl "<action>" action-literal nl "</action>" nl "<answer>" string nl "</answer>" nl "</FinalAnswer>"
response-union ::= Tools | FinalAnswer
Reason_and_Act ::= nl "<Reason_and_Act>" nl nl "<scratchpad>" string nl 

In [None]:
# Show the schema description that the system prompt interpolates as `{env.doc}`.
print(env.doc)

Output Model: Reason_and_Act
  Output Fields:
    scratchpad (str):
        Description: Information from the Observation useful to answer the question
    reasoning (str):
        Description: It describes your thoughts about the question you have been asked
    response (Tools or FinalAnswer):
        Description: Final output: choose between the tool call or the final answer

Model: Tools
  Description: Run a tool.
  Fields:
    action (Literal['tool_use']):
        Description: Action discriminator
    tool_use (calculator or search):
        Description: The tool call to use

Model: FinalAnswer
  Description: Return a final answer.
  Fields:
    action (Literal['final_answer']):
        Description: Action discriminator
    answer (str):
        Description: The final answer to the question

Model: calculator
  Description: Evaluates a single line of Python math expression. No imports or variables allowed.
    
        Examples:
            <expression>
            2 + 2
         

And the cool thing is, this is a Qwen 1.5B model, not even trained on function calling!