# Structured Output with Tools

In [1]:
import json
from pprint import pprint
from pydantic import BaseModel, Field

from xverify import ToolUse, run_tools, Env
from xverify.tools import calculator, search
from pydantic import ValidationError

> Often when running multi-step reasoning, we want to use tools to help us.

However, not many libraries natively support this. Pydantic for instance is optimized for a static declarative schema, which isn't well suited to ad-hoc tool use.

Here we can see two examples of tools:
- `calculator`: essentially a wrapper around the `eval` function
- `search`: uses duckduckgo to search the web

In [2]:
# Call each tool directly as a plain function. The `=` f-string specifier
# echoes the call expression next to its result.
print(f"{calculator(expression='3 + 4 * (6 ** 7)')=}")
print("\n---\n")
# NOTE(review): `search` is described above as a DuckDuckGo web search, so
# this result presumably varies between runs.
print(f"{search(query='What is the capital of France?', num_results=1)=}")

calculator(expression='3 + 4 * (6 ** 7)')='1119747'

---

search(query='What is the capital of France?', num_results=1)='• Paris - Wikipedia\n  Paris is a global city of culture, finance, diplomacy, and tourism, with an estimated population of 2 million residents in 2025.'


The problem is we can't (natively) include a tool call in a Pydantic model (due to the static declarative schema).

However, we can use the new `ToolUse` class to handle tool calls.

In [3]:
# Pydantic model pairing free-text reasoning with a single tool call.
# NOTE(review): the class name is misspelled ("Reasoining"); it is kept as-is
# because the cells below refer to it by this name.
class ReasoiningTool(BaseModel):
    """The result of a reasoning tool"""

    # Free-text explanation accompanying the tool call.
    reasoning: str
    # One tool call, restricted to the `calculator` and `search` tools.
    tool_use: ToolUse[calculator, search]


# Validate a plain dict into the model. Judging by the printed result,
# `tool_name` picks the tool and the remaining keys become its arguments.
calc_2_2 = ReasoiningTool.model_validate(
    {
        "reasoning": "Let's add two numbers",
        "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
    }
)
print(calc_2_2)
# On a ToolUse object, run_tool() runs the tool and returns its result.
print(f"{calc_2_2.tool_use.run_tool()=}")

reasoning="Let's add two numbers" tool_use=calculator(expression='2 + 2', tool_name='calculator')
calc_2_2.tool_use.run_tool()='4'


This is nice because we can easily validate that any arbitrary schema and its tool use are correct without any ad-hoc parsing (and we'll be able to enforce that the LLM output is correct with guided decoding).

In [4]:
# Malformed `tool_use` payloads, each paired with the message that is printed
# when `ReasoiningTool` rejects it with a ValidationError.
rejected_payloads = [
    (
        {"tool_name": "none_existing_tool", "expression": "2 + 2"},
        "tool not found!",
    ),
    (
        {"tool_name": "calculator", "wrong_arg": "2 + 2"},
        "wrong argument!",
    ),
    (
        # `2 + 2` is deliberately an int here rather than a string.
        {"tool_name": "calculator", "expression": 2 + 2},
        "wrong argument type!",
    ),
    (
        # TODO: this should be a validation error
        {"tool_name": "calculator", "expression": "2 + 2", "extra_arg": "extra_arg"},
        "extra argument!",
    ),
]

for bad_tool_use, failure_note in rejected_payloads:
    try:
        ReasoiningTool.model_validate({"reasoning": "", "tool_use": bad_tool_use})
    except ValidationError:
        print(failure_note)

tool not found!
wrong argument!
wrong argument type!


We can implement a ReACT loop with tools really easily:

In [5]:

from typing import Literal, Union

# One branch of the `Reason_and_Act.output` union: the model asks for a tool
# to be run. The docstring and field descriptions below are echoed by
# `env.doc` further down, so they double as prompt text.
class Tools(BaseModel):
    """
    Run a tool.
    """
    # Literal tag identifying this branch of the union.
    action: Literal["tool_use"] = Field(..., description="Action discriminator")
    # The requested call, restricted to the `calculator` and `search` tools.
    tool_use: ToolUse[calculator, search] = Field(..., description="The tool call to use")

# The other branch of the `Reason_and_Act.output` union: the model stops and
# returns its answer. The docstring and field descriptions below are echoed
# by `env.doc` further down, so they double as prompt text.
class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """
    # Literal tag identifying this branch of the union.
    action: Literal["final_answer"] = Field(..., description="Action discriminator")
    # Free-text answer returned to the user.
    answer: str = Field(..., description="The final answer to the question")

# Top-level schema for one step of the loop: notes, reasoning, then either a
# tool call (`Tools`) or a final answer (`FinalAnswer`).
class Reason_and_Act(BaseModel):
    # Notes carried over from earlier observations.
    scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    # The model's thoughts about the question.
    reasoning: str = Field(..., description="It describes your thoughts about the question you have been asked")
    # Tagged union: `discriminator="action"` chooses the branch from the
    # `action` literal of the incoming data.
    output: Union[Tools, FinalAnswer] = Field(..., description="Final output: choose between the tool call or the final answer", discriminator="action")


# Validate a hand-written step that asks for the calculator tool;
# `"action": "tool_use"` routes `output` to the `Tools` branch.
res = Reason_and_Act.model_validate(
    {
        "scratchpad": "the question is 2 + 2",
        "reasoning": "we should use the calculator tool!",
        "output": {
            "action": "tool_use",
            "tool_use": {
                "tool_name": "calculator",
                "expression": "2 + 2",
            }
        }
    },
)
# Bare expression so the notebook displays the validated model.
res

Reason_and_Act(scratchpad='the question is 2 + 2', reasoning='we should use the calculator tool!', output=Tools(action='tool_use', tool_use=calculator(expression='2 + 2', tool_name='calculator')))

And in case we just want to run all the tools in a response:

In [6]:
# Run every tool call found in `res`; the displayed result mirrors the
# model's field nesting (see the output below).
run_tools(res)

{'output': {'tool_use': {'calculator': '4'}}}

This will return `None` where no tools were called, which is useful for checking for the end of the loop.

In [7]:
# A step whose output is a FinalAnswer contains no tool call, so this prints None.
print(run_tools(Reason_and_Act(scratchpad="", reasoning="", output=FinalAnswer(action="final_answer", answer="42"))))

None


If you just want the output, call `run_tool()` on the instantiated `ToolUse` object itself:

In [8]:
# Report the tool's result for a tool-call step, otherwise the final answer.
# Only the selected f-string is evaluated, so the tool runs at most once.
print(
    f"{res.output.tool_use.run_tool()=}"
    if isinstance(res.output, Tools)
    else f"{res.output.answer=}"
)

res.output.tool_use.run_tool()='4'


And of course we can do multiple tool calls in a single response:

In [9]:
# Schema holding several tool calls in a single response.
class MultiToolUse(BaseModel):
    # Ordered list of calls, each restricted to `calculator` or `search`.
    tool_use: list[ToolUse[calculator, search]]


# Validate three calls at once (calculator, search, calculator). This rebinds
# `res`, replacing the Reason_and_Act instance from the earlier cell.
res = MultiToolUse.model_validate(
    {
        "tool_use": [
            {"tool_name": "calculator", "expression": "2 + 2"},
            {
                "tool_name": "search",
                "query": "What is the radius of the moon?",
                "num_results": 2,
            },
            {"tool_name": "calculator", "expression": "3424 * 432432"},
        ]
    }
)
# Run all three calls and pretty-print the results.
pprint(run_tools(res))

{'tool_use': [{'calculator': '4'},
              {'search': '• Moon Fact Sheet - NSSDCA\n'
                         '  Equatorial radius (km) 1738.1: 6378.1: 0.2725: '
                         'Polar radius (km) 1736.0: 6356.8: 0.2731: Volumetric '
                         'mean radius (km) 1737.4: 6371.0: 0.2727: Ellipticity '
                         '(Flattening) ...\n'
                         '\n'
                         '• What is the Radius of Moon? - GeeksforGeeks\n'
                         "  The Moon's radius is about 27% of Earth's radius, "
                         'and its gravitational force is about 1/6th of '
                         "Earth's gravity."},
              {'calculator': '1480647168'}]}


So this is cool, kinda, but now that we have a structured schema, we can **enforce** that the LLM output is correct with guided decoding.

Let's go back to our ReACT loop, and use `Env` to enforce the tool calls are correct.

In [10]:

# Wrap the ReACT schema in an Env and print the GBNF grammar it exposes.
env = Env(Reason_and_Act)
print(env.gbnf)

root ::= grammar-models
grammar-models ::= Reason_and_Act
Reason_and_Act ::= "<Reason_and_Act>" nl "<scratchpad>" nl string nl "</scratchpad>" nl "<reasoning>" nl string nl "</reasoning>" nl "<output>" nl Reason_and_Act-output-union nl "</output>" nl nl "</Reason_and_Act>" nl
Tools ::= "<Tools>" nl "<action>" nl Toolsaction nl "</action>" nl "<tool_use>" nl Tools-tool_use-union nl "</tool_use>" nl nl "</Tools>" nl
Toolsaction ::= "tool_use" 
calculator ::= "<calculator>" nl "<expression>" nl string nl "</expression>" nl "<tool_name>" nl calculatortool_name nl "</tool_name>" nl nl "</calculator>" nl
calculatortool_name ::= "calculator" 
search ::= "<search>" nl "<query>" nl string nl "</query>" nl "<num_results>" nl integer nl "</num_results>" nl "<tool_name>" nl searchtool_name nl "</tool_name>" nl nl "</search>" nl
searchtool_name ::= "search" 
Tools-tool_use-union ::= calculator | search
FinalAnswer ::= "<FinalAnswer>" nl "<action>" nl FinalAnsweraction nl "</action>" nl "<answer>" n

In [11]:
# Print the human-readable description of the schema and its nested models.
print(env.doc)

Output Model: Reason_and_Act
  Output Fields:
    scratchpad (str):
        Description: Information from the Observation useful to answer the question
    reasoning (str):
        Description: It describes your thoughts about the question you have been asked
    output (Tools or FinalAnswer):
        Description: Final output: choose between the tool call or the final answer

Model: Tools
  Description: Run a tool.
  Fields:
    action (Literal):
        Description: Action discriminator
    tool_use (calculator or search):
        Description: The tool call to use

Model: FinalAnswer
  Description: Return a final answer.
  Fields:
    action (Literal):
        Description: Action discriminator
    answer (str):
        Description: The final answer to the question

Model: calculator
  Description: Evaluates a single line of Python math expression. No imports or variables allowed.

    Examples:
        {"expression": "2 + 2"} -> "4"
        {"expression": "3 * (17 + 4)"} -> "63"
    

In [55]:
from vllm import LLM
from vllm.sampling_params import RequestOutputKind

# Load the model only once so that re-running this cell reuses the instance.
if "llm" not in globals():  # interactive use
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", max_model_len=2000)

# Rebuild the environment and print its grammar between `***` markers.
env = Env(Reason_and_Act)
print("***")
print(env.gbnf)
print("***")


# Sampling settings are obtained through `env`.
# NOTE(review): presumably this attaches the grammar printed above as the
# guided-decoding constraint — confirm against `Env.sampling_params`.
sampling_params = env.sampling_params(
    max_tokens=500,
    guided_decoding=dict(whitespace_pattern=r""),
    n=1,
    temperature=1.,
    output_kind=RequestOutputKind.FINAL_ONLY,
)

# Step budget. With the loop below commented out, it is only used to fill in
# the "steps to think" count in the system prompt.
max_steps = 5
# Conversation seed: a system prompt embedding `env.doc`, then the question.
# NOTE(review): the prompt asks for JSON, while the grammar printed above
# emits XML-style tags (e.g. "<Reason_and_Act>") — confirm which is intended.
messages: list[dict] = [
    {
        "role": "system",
        "content": f"""\
You are a helpful assistant, responding via JSON structured output. Tool responses from the user are also in JSON.
- Think step by step using the scratchpad, reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer once your are sure you have the answer.

Respond with a JSON object, following the schema below:

{env.doc}

""",
    },
    {"role": "user", "content": "What is the radius of the moon?"},
]

# Single generation only: no parsing and no tool execution happen here.
outp = llm.chat(  # type: ignore
    messages=messages,  # type: ignore
    sampling_params=sampling_params,
    use_tqdm=False,
)
# Print the raw text of the first completion of the first request.
print(outp[0].outputs[0].text)


# for _ in range(max_steps):
#     outp = llm.chat(  # type: ignore
#         messages=messages,  # type: ignore
#         sampling_params=sampling_params,
#         use_tqdm=False,
#     )
#     struct_res = env.parse(outp[0].outputs[0].text)
#     messages.append({"role": "assistant", "content": struct_res.model_dump()})
#     tool_outp = run_tools(struct_res)
#     if tool_outp:
#         messages.append({"role": "user", "content": tool_outp})
#     else:
#         break

# print(json.dumps(messages, indent=2))

***
root ::= grammar-models
grammar-models ::= Reason_and_Act
Reason_and_Act ::= nl "<Reason_and_Act>" nl "<scratchpad>" string nl "</scratchpad>" nl "<reasoning>" string nl "</reasoning>" nl "<output>" Reason_and_Act-output-union nl "</output>" nl "</Reason_and_Act>"
Tools ::= nl "<Tools>" nl "<action>" Toolsaction nl "</action>" nl "<tool_use>" Tools-tool_use-union nl "</tool_use>" nl "</Tools>"
Toolsaction ::= nl "tool_use" 
calculator ::= nl "<calculator>" nl "<expression>" string nl "</expression>" nl "<tool_name>" calculatortool_name nl "</tool_name>" nl "</calculator>"
calculatortool_name ::= nl "calculator" 
search ::= nl "<search>" nl "<query>" string nl "</query>" nl "<num_results>" integer nl "</num_results>" nl "<tool_name>" searchtool_name nl "</tool_name>" nl "</search>"
searchtool_name ::= nl "search" 
Tools-tool_use-union ::= calculator | search
FinalAnswer ::= nl "<FinalAnswer>" nl "<action>" FinalAnsweraction nl "</action>" nl "<answer>" string nl "</answer>" nl "</Fi

In [57]:
# Print the schema description again; this is the text embedded in the system prompt.
print(env.doc)

Output Model: Reason_and_Act
  Output Fields:
    scratchpad (str):
        Description: Information from the Observation useful to answer the question
    reasoning (str):
        Description: It describes your thoughts about the question you have been asked
    output (Tools or FinalAnswer):
        Description: Final output: choose between the tool call or the final answer

Model: Tools
  Description: Run a tool.
  Fields:
    action (Literal):
        Description: Action discriminator
    tool_use (calculator or search):
        Description: The tool call to use

Model: FinalAnswer
  Description: Return a final answer.
  Fields:
    action (Literal):
        Description: Action discriminator
    answer (str):
        Description: The final answer to the question

Model: calculator
  Description: Evaluates a single line of Python math expression. No imports or variables allowed.

    Examples:
        {"expression": "2 + 2"} -> "4"
        {"expression": "3 * (17 + 4)"} -> "63"
    

In [52]:
# Bare expression: the notebook shows the repr of the first completion's raw text.
outp[0].outputs[0].text

"\n<Reason_and_Act>\n<scratchpad>\nTo answer this question, I'll need information about the Earth-Moon system and its known configuration. The Moon's distance from the Earth (perigee) and apogee is about 384, 404 kilometers apart.\nI also need estimates of the Moon's radius at perigee and apogee:\n- Perigee radius: about 3,474 kilometers\n- Apogee radius: about 3,844 kilometers\nExpectedly, these would give me an estimate for the radius of the Moon once these individual extents were assumed. The median and error estimates are round numbers, so this doesn't help address the specific, easily-supported criteria for the radius.\nWould you like to scale this into kilometers? Since the usual range for lunar rags is in the tens of kilometers, I'll give it up for now.\n</scratchpad>\n<reasoning>\nThe Moon's radius at apogee is approximately 3,844 kilometers, as mentioned in my initial explanation above. However, the current scope of the question doesn't provide direct access to these numbers o

In [14]:
# Display the full list returned by `llm.chat`, including the rendered prompt.
outp

[RequestOutput(request_id=0, prompt='<|im_start|>system\nYou are a helpful assistant, responding via JSON structured output. Tool responses from the user are also in JSON.\n- Think step by step using the scratchpad, reasoning outputs. You have 4 steps to think before responding.\n- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.\n- Respond with a final answer once your are sure you have the answer.\n\nRespond with a JSON object, following the schema below:\n\nOutput Model: Reason_and_Act\n  Output Fields:\n    scratchpad (str):\n        Description: Information from the Observation useful to answer the question\n    reasoning (str):\n        Description: It describes your thoughts about the question you have been asked\n    output (Tools or FinalAnswer):\n        Description: Final output: choose between the tool call or the final answer\n\nModel: Tools\n  Description: Run a tool.\n  Fields:\n    action (Literal):\n        Description: Ac

In [17]:
# Print the raw text of the first completion.
print(outp[0].outputs[0].text)

nl <Reason_and_Act>
nl <scratchpad>
nl {"expression": "pi * r^2 = 1"} -> "The radius of the moon is approximately 1.7425698479799956"
nl {"expression": "r = 1.7425698479799956"} -> "The radius of the moon is approximately 1.7425698479799956" 
nl {"expression": "pi * 1.7425698479799956^2 = 1"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "r = 1.7425698479799956"} -> "True" 
nl {"expression": "pi * r^2 = 1"} -> "True" 
nl {"expression": "


And the cool thing is, this is a Qwen 0.5B model, not even trained on function calling!