# Structured Output with Tools

In [1]:
import json
from pprint import pprint

from pydantic import BaseModel, Field, ValidationError

from xverify import GuidedSchema, JSONToolUse, XMLToolUse, run_tools
from xverify.tools import calculator, search


> Often when running multi-step reasoning, we want to use tools to help us.

However, not many libraries natively support this. Pydantic for instance is optimized for a static declarative schema, which isn't well suited to ad-hoc tool use.

Here we can see two examples of tools:
- `calculator`: essentially a wrapper around the `eval` function
- `search`: uses duckduckgo to search the web

In [2]:
# Call each tool directly; the `=` f-string specifier echoes the call text next to its result.
calculator_demo = f"{calculator(expression='3 + 4 * (6 ** 7)')=}"
print(calculator_demo, "\n---\n", sep="\n")
search_demo = f"{search(query='What is the capital of France?', num_results=1)=}"
print(search_demo)

calculator(expression='3 + 4 * (6 ** 7)')='1119747'

---

search(query='What is the capital of France?', num_results=1)='• Paris - Wikipedia\n  Paris is a global city of culture, finance, diplomacy, and tourism, with an estimated population of 2 million residents in 2025.'


The problem is we can't (natively) include a tool call in a Pydantic model (due to the static declarative schema).

However, we can use the new `ToolUse` class to handle tool calls.

In [3]:
class ReasoningTool(BaseModel):
    """The result of a reasoning tool"""

    # Free-form text accompanying the tool call.
    reasoning: str
    # A single tool call restricted to `calculator` or `search`. It is validated
    # from a dict holding "tool_name" plus that tool's keyword arguments.
    tool_use: JSONToolUse[calculator, search]


# Raw payload: the tool call is a plain dict naming the tool and its arguments.
addition_payload = {
    "reasoning": "Let's add two numbers",
    "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
}
calc_2_2 = ReasoningTool.model_validate(addition_payload)
print(calc_2_2)
# A validated ToolUse object exposes run_tool(), which runs the tool and returns its result.
print(f"{calc_2_2.tool_use.run_tool()=}")

reasoning="Let's add two numbers" tool_use=calculator(tool_name='calculator', expression='2 + 2')
calc_2_2.tool_use.run_tool()='4'


This is nice because we can easily validate that any arbitrary schema and tool use is correct without any ad-hoc parsing (and we'll be able to enforce that the LLM output is correct with guided decoding).

In [4]:
# Malformed tool calls paired with the message to print when validation rejects them.
invalid_tool_calls = [
    ("tool not found!", {"tool_name": "none_existing_tool", "expression": "2 + 2"}),
    ("wrong argument!", {"tool_name": "calculator", "wrong_arg": "2 + 2"}),
    # `2 + 2` is deliberately an int expression here, not the string "2 + 2".
    ("wrong argument type!", {"tool_name": "calculator", "expression": 2 + 2}),
]

for failure_message, bad_tool_use in invalid_tool_calls:
    try:
        ReasoningTool.model_validate({"reasoning": "", "tool_use": bad_tool_use})
    except ValidationError:
        print(failure_message)

tool not found!
wrong argument!
wrong argument type!


We can implement a ReACT loop with tools really easily:

In [5]:
from typing import Literal, Union


class Tools(BaseModel):
    """
    Run a tool.
    """

    # Required field: one tool call, limited to `calculator` or `search`,
    # declared with the XML variant of ToolUse.
    tool_use: XMLToolUse[calculator, search] = Field(
        ..., description="The tool call to use"
    )


class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """

    # Required field: the answer text. This model carries no tool call.
    answer: str = Field(..., description="The final answer to the question")


class Reason_and_Act(BaseModel):
    # One step of the reason-and-act loop: notes, reasoning, then either a
    # tool call or a final answer. All three fields are required.

    # Notes carried over from earlier observations.
    scratchpad: str = Field(
        ...,
        description="Information from the Observation useful to answer the question",
    )
    # The model's thoughts about the question.
    reasoning: str = Field(
        ...,
        description="It describes your thoughts about the question you have been asked",
    )
    # Either a `Tools` step (run a tool) or a `FinalAnswer` step (stop).
    response: Union[Tools, FinalAnswer]


# A tool-call step. "tool_name" is not supplied in this payload.
# NOTE(review): the recorded run_tools output for `res` shows the call still
# resolving to the calculator, presumably from the argument names - confirm.
tool_step_payload = {
    "scratchpad": "the question is 2 + 2",
    "reasoning": "we should use the calculator tool!",
    "response": {"tool_use": {"expression": "2 + 2"}},
}
res = Reason_and_Act.model_validate(tool_step_payload)

# A final-answer step with the same scratchpad and reasoning. It stays the
# cell's last expression so the notebook displays the validated model.
final_step_payload = {**tool_step_payload, "response": {"answer": "42"}}
Reason_and_Act.model_validate(final_step_payload)

Reason_and_Act(scratchpad='the question is 2 + 2', reasoning='we should use the calculator tool!', response=FinalAnswer(answer='42'))

And in case we just want to run all the tools in a response:

In [6]:
# Run the tool call held in `res`; the recorded result is a nested dict that
# follows the model's field names down to a {tool name: output} entry.
run_tools(res)

{'response': {'tool_use': {'calculator': '4'}}}

This will return `None` when no tools were called, which is useful for detecting the end of the loop.

In [7]:
# A step whose response is a final answer, so it contains no tool call.
answer_only_step = Reason_and_Act(
    scratchpad="",
    reasoning="",
    response=FinalAnswer(answer="42"),
)
print(run_tools(answer_only_step))

None


If you just want the output, call `run_tool()` on the instantiated `ToolUse` object itself:

In [8]:
# Branch on which member of the `response` union was parsed.
if isinstance(res.response, Tools):
    # Tool step: run the single tool call and show the call text with its result.
    print(f"{res.response.tool_use.run_tool()=}")
else:
    # Otherwise the response is expected to be a FinalAnswer; show its `answer`.
    print(f"{res.response.answer=}")

res.response.tool_use.run_tool()='4'


And of course we can do multiple tool calls in a single response:

In [9]:
class MultiToolUse(BaseModel):
    # A list of tool calls, each restricted to `calculator` or `search`.
    tool_use: list[XMLToolUse[calculator, search]]


# Three tool calls in one payload: a web search between two calculator calls.
moon_radius_search = {
    "tool_name": "search",
    "query": "What is the radius of the moon?",
    "num_results": 2,
}
batched_calls = [
    {"tool_name": "calculator", "expression": "2 + 2"},
    moon_radius_search,
    {"tool_name": "calculator", "expression": "3424 * 432432"},
]
res = MultiToolUse.model_validate({"tool_use": batched_calls})

# run_tools returns one result entry per call, in the order the calls were given.
batched_results = run_tools(res)
pprint(batched_results)

{'tool_use': [{'calculator': '4'},
              {'search': '• Moon Fact Sheet - NSSDCA\n'
                         '  Equatorial radius (km) 1738.1: 6378.1: 0.2725: '
                         'Polar radius (km) 1736.0: 6356.8: 0.2731: Volumetric '
                         'mean radius (km) 1737.4: 6371.0: 0.2727: Ellipticity '
                         '(Flattening) ...\n'
                         '\n'
                         '• Moon - Wikipedia\n'
                         '  The Moon has a solid iron-rich inner core with a '
                         'radius possibly as small as 240 kilometres (150 mi) '
                         'and a fluid outer core primarily made of liquid iron '
                         'with a radius of roughly 300 kilometres (190 m.'},
              {'calculator': '1480647168'}]}


So this is cool, kinda, but now that we have a structured schema, we can **enforce** that the LLM output is correct with guided decoding.

Let's go back to our ReACT loop, and use `GuidedSchema` to enforce the tool calls are correct.

In [13]:


# Wrap the Reason_and_Act model in a GuidedSchema and print the grammar text
# it exposes through its `gbnf` attribute.
env = GuidedSchema(Reason_and_Act)
print(env.gbnf)

root ::= grammar-models
string ::= nl [^\n<] ([^<])*
boolean ::= nl "true" | nl "false"
integer ::= nl [0-9]+
float ::= nl "-"? [0-9]+ ("." [0-9]+)?
null ::= nl "null"
nl ::= "\n"
calculator ::= nl "<calculator>" nl "<expression>" string nl "</expression>" nl "</calculator>"
search ::= nl "<search>" nl "<query>" string nl "</query>" nl "<num_results>" integer nl "</num_results>" nl "</search>"
tool_use-union ::= calculator | search
Tools ::= nl "<Tools>" nl "<tool_use>" tool_use-union nl "</tool_use>" nl "</Tools>"
FinalAnswer ::= nl "<FinalAnswer>" nl "<answer>" string nl "</answer>" nl "</FinalAnswer>"
response-union ::= Tools | FinalAnswer
Reason_and_Act ::=  "<Reason_and_Act>" nl "<scratchpad>" string nl "</scratchpad>" nl "<reasoning>" string nl "</reasoning>" nl "<response>" response-union nl "</response>" nl "</Reason_and_Act>"
grammar-models ::= Reason_and_Act


In [10]:
from vllm import LLM

# Load the model only once, so re-running this cell does not allocate a second copy.
if "llm" not in globals():  # interactive use
    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=2000)

env = GuidedSchema(Reason_and_Act)

# Sampling parameters are built by the schema wrapper, with these overrides applied.
sampling_params = env.sampling_params(
    max_tokens=500,
    n=1,
    temperature=1.0,
)

# Upper bound on generation rounds; the system prompt tells the model it has
# `max_steps - 1` thinking steps before it must answer.
max_steps = 5
messages: list[dict] = [
    {
        "role": "system",
        "content": f"""\
You are a helpful assistant, responding in XML structured output.

- Think step by step using the scratchpad and reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer only once your are absolutely sure you have the answer.

Respond with a XML object, following the schema below:

{env.doc}

Use the tools!
""",
    },
    {"role": "user", "content": "What is the distance from the moon to the sun?"},
]

for _ in range(max_steps):
    outp = llm.chat(  # type: ignore
        messages=messages,  # type: ignore
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    text = outp[0].outputs[0].text
    print("=" * 80)
    print(f"Assistant:\n{text}")

    struct_res = env.parse(text)
    if not struct_res:
        # Nothing is appended to the history, so the next round re-samples
        # from the same conversation.
        print("*** Invalid response, skipping ***")
        continue

    # Chat message content has to be text, not a dict. Replay the raw XML the
    # model produced so the history matches the format the system prompt asks for.
    messages.append({"role": "assistant", "content": text})

    tool_outp = run_tools(struct_res)
    if not tool_outp:
        # No tool was called, i.e. the model gave its final answer.
        break

    print("=" * 80)
    print(f"Tool output:\n{tool_outp}")
    # Serialise the tool results to text before feeding them back as the next
    # user turn. NOTE(review): assumes tool results are JSON-serialisable, as
    # the nested dicts/lists of strings recorded in this notebook are.
    messages.append(
        {"role": "user", "content": json.dumps(tool_outp, ensure_ascii=False)}
    )

INFO 03-20 10:44:58 __init__.py:207] Automatically detected platform cuda.
INFO 03-20 10:45:04 config.py:549] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 03-20 10:45:04 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_exec

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-20 10:45:12 model_runner.py:1115] Loading model weights took 2.8875 GB
INFO 03-20 10:45:12 worker.py:267] Memory profiling takes 0.56 seconds
INFO 03-20 10:45:12 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.90) = 21.22GiB
INFO 03-20 10:45:12 worker.py:267] model weights take 2.89GiB; non_torch_memory takes 3.65GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 13.30GiB.
INFO 03-20 10:45:13 executor_base.py:111] # cuda blocks: 31127, # CPU blocks: 9362
INFO 03-20 10:45:13 executor_base.py:116] Maximum concurrency for 2000 tokens per request: 249.02x


OutOfMemoryError: CUDA out of memory. Tried to allocate 488.00 MiB. GPU 0 has a total capacity of 23.58 GiB of which 245.56 MiB is free. Process 26922 has 16.59 GiB memory in use. Including non-PyTorch memory, this process has 6.69 GiB memory in use. Of the allocated memory 6.22 GiB is allocated by PyTorch, and 166.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

NameError: name 'env' is not defined