# Structured Output with Tools

In [1]:
import json
from pprint import pprint
from pydantic import BaseModel, Field

from xverify import ToolUse, run_tools, Env
from xverify.tools import calculator, search
from pydantic import ValidationError

> Often when running multi-step reasoning, we want to use tools to help us.

However, not many libraries natively support this. Pydantic for instance is optimized for a static declarative schema, which isn't well suited to ad-hoc tool use.

Here we can see two examples of tools:
- `calculator`: essentially a wrapper around the `eval` function
- `search`: uses duckduckgo to search the web

In [2]:
# Call each tool directly as a plain function. The `=` f-string specifier echoes
# the call expression next to its result, which is why the output repeats the call text.
print(f"{calculator(expression='3 + 4 * (6 ** 7)')=}")
print("\n---\n")
# NOTE(review): the notebook describes `search` as DuckDuckGo-backed, so this
# presumably performs a live web query and the printed result can differ between runs.
print(f"{search(query='What is the capital of France?', num_results=1)=}")

calculator(expression='3 + 4 * (6 ** 7)')='1119747'

---

search(query='What is the capital of France?', num_results=1)='• Paris - Wikipedia\n  Paris is a global city of culture, finance, diplomacy, and tourism, with an estimated population of 2 million residents in 2025.'


The problem is we can't (natively) include a tool call in a Pydantic model (due to the static declarative schema).

However, we can use the new `ToolUse` class to handle tool calls.

In [3]:
class ReasoiningTool(BaseModel):
    """The result of a reasoning tool"""

    # NOTE: the class name is misspelled ("Reasoining"). It is left as-is because
    # the validation cells further down refer to the model by this exact name.

    # Free-text explanation that accompanies the tool call.
    reasoning: str
    # Exactly one tool call. In the payload, `tool_name` selects `calculator` or
    # `search`, and the remaining keys are that tool's arguments.
    tool_use: ToolUse[calculator, search]


# Plain-dict payload, shaped the way an LLM's JSON output would be: `tool_name`
# picks the tool and the other keys in `tool_use` are its arguments.
add_payload = {
    "reasoning": "Let's add two numbers",
    "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
}
calc_2_2 = ReasoiningTool.model_validate(add_payload)

print(calc_2_2)
# A validated ToolUse field exposes `run_tool()`, which executes the call and
# returns the tool's result.
print(f"{calc_2_2.tool_use.run_tool()=}")

reasoning="Let's add two numbers" tool_use=calculator(expression='2 + 2', tool_name='calculator')
calc_2_2.tool_use.run_tool()='4'


This is nice because we can easily validate that any arbitrary schema and tool use is correct without any ad-hoc parsing (and we'll be able to enforce that the LLM output is correct with guided decoding).

In [None]:
# Each entry pairs the message to print on rejection with a malformed `tool_use`
# payload. Every payload is validated inside the same wrapper, so only the
# `tool_use` part differs between cases.
invalid_tool_calls = [
    ("tool not found!", {"tool_name": "none_existing_tool", "expression": "2 + 2"}),
    ("wrong argument!", {"tool_name": "calculator", "wrong_arg": "2 + 2"}),
    # `2 + 2` is evaluated by Python first, so the tool receives the int 4, not a string.
    ("wrong argument type!", {"tool_name": "calculator", "expression": 2 + 2}),
    # TODO: this should be a validation error
    (
        "extra argument!",
        {"tool_name": "calculator", "expression": "2 + 2", "extra_arg": "extra_arg"},
    ),
]

for rejection_message, bad_tool_use in invalid_tool_calls:
    try:
        ReasoiningTool.model_validate({"reasoning": "", "tool_use": bad_tool_use})
    except ValidationError:
        # Nothing is printed when validation unexpectedly succeeds (see the TODO case).
        print(rejection_message)

tool not found!
wrong argument!
wrong argument type!


We can implement a ReAct loop with tools really easily:

In [48]:

from typing import Literal, Union

class Tools(BaseModel):
    """
    Run a tool.
    """
    # The docstring above is kept verbatim: pydantic copies it into the JSON
    # schema `description`, which is later embedded in the LLM system prompt.

    # Discriminator for the `output` union in `Reason_and_Act`; always "tool_use" here.
    action: Literal["tool_use"] = Field(..., description="Action discriminator")
    # The tool call itself, restricted to the `calculator` and `search` tools.
    tool_use: ToolUse[calculator, search] = Field(..., description="The tool call to use")

class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """
    # The docstring above is kept verbatim: pydantic copies it into the JSON
    # schema `description`, which is later embedded in the LLM system prompt.

    # Discriminator for the `output` union in `Reason_and_Act`; always "final_answer" here.
    action: Literal["final_answer"] = Field(..., description="Action discriminator")
    # Free-text answer; this branch carries no tool call.
    answer: str = Field(..., description="The final answer to the question")

# One step of the reason-and-act loop: two free-text fields followed by either a
# tool call or a final answer. No class docstring is added on purpose, since
# pydantic would surface it as a schema `description`.
class Reason_and_Act(BaseModel):
    scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    reasoning: str = Field(..., description="It describes your thoughts about the question you have been asked")
    # Tagged union: the `action` literal on each member tells pydantic which
    # branch (`Tools` or `FinalAnswer`) to validate against.
    output: Union[Tools, FinalAnswer] = Field(..., description="Final output: choose between the tool call or the final answer", discriminator="action")


# Hand-written stand-in for one model step in which the calculator is chosen.
calculator_step = {
    "scratchpad": "the question is 2 + 2",
    "reasoning": "we should use the calculator tool!",
    "output": {
        # `action` selects the `Tools` branch of the discriminated union.
        "action": "tool_use",
        "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
    },
}

res = Reason_and_Act.model_validate(calculator_step)
res

Reason_and_Act(scratchpad='the question is 2 + 2', reasoning='we should use the calculator tool!', output=Tools(action='tool_use', tool_use=calculator(expression='2 + 2', tool_name='calculator')))

And in case we just want to run all the tools in a response:

In [19]:
# Run every tool call contained in `res`. The recorded output shows the result
# nested under the same field names as the model (`output` -> `tool_use`).
run_tools(res)

{'output': {'tool_use': {'calculator': '4'}}}

This will return `None` where no tools were called, which is useful for checking for the end of the loop.

In [21]:
# A step whose output is a final answer contains no tool call at all.
answer_only_step = Reason_and_Act(
    scratchpad="",
    reasoning="",
    output=FinalAnswer(action="final_answer", answer="42"),
)
print(run_tools(answer_only_step))

None


If you just want the output, call `run_tool()` on the instantiated `ToolUse` object itself:

In [26]:
# `res.output` is either a `Tools` or a `FinalAnswer`; only the former carries a
# tool call to execute, so branch on the concrete type before touching `tool_use`.
if isinstance(res.output, Tools):
    print(f"{res.output.tool_use.run_tool()=}")
else:
    print(f"{res.output.answer=}")

res.output.tool_use.run_tool()='4'


And of course we can do multiple tool calls in a single response:

In [27]:
# Model holding several tool calls at once. No class docstring is added on
# purpose, since pydantic would surface it as a schema `description`.
class MultiToolUse(BaseModel):
    # Each list element is validated independently as a `calculator` or `search` call.
    tool_use: list[ToolUse[calculator, search]]


# Three independent calls in a single payload, mixing both tools.
batched_calls = [
    {"tool_name": "calculator", "expression": "2 + 2"},
    {
        "tool_name": "search",
        "query": "What is the radius of the moon?",
        "num_results": 2,
    },
    {"tool_name": "calculator", "expression": "3424 * 432432"},
]

# NOTE: this rebinds `res`, which earlier cells used for a `Reason_and_Act` instance.
res = MultiToolUse.model_validate({"tool_use": batched_calls})
pprint(run_tools(res))

{'tool_use': [{'calculator': '4'},
              {'search': '• Moon Fact Sheet - NSSDCA\n'
                         '  Equatorial radius (km) 1738.1: 6378.1: 0.2725: '
                         'Polar radius (km) 1736.0: 6356.8: 0.2731: Volumetric '
                         'mean radius (km) 1737.4: 6371.0: 0.2727: Ellipticity '
                         '(Flattening) ...\n'
                         '\n'
                         '• Moon - Wikipedia\n'
                         '  The Moon has a solid iron-rich inner core with a '
                         'radius possibly as small as 240 kilometres (150 mi) '
                         'and a fluid outer core primarily made of liquid iron '
                         'with a radius of roughly 300 kilometres (190 m.'},
              {'calculator': '1480647168'}]}


So this is cool, kinda, but now that we have a structured schema, we can **enforce** that the LLM output is correct with guided decoding.

Let's go back to our ReAct loop, and use `Env` to enforce that the tool calls are correct.

In [28]:
from vllm import LLM

# Reuse an existing `llm` when this cell is re-run in the same kernel instead of
# constructing the model again.
if "llm" not in globals():  # interactive use
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", max_model_len=2000)

# `Env` wraps the schema; it supplies the sampling parameters here and parses the
# raw model text in the loop that follows.
# NOTE(review): `guided_decoding` presumably constrains generation to the schema,
# with `whitespace_pattern` limiting inter-token whitespace; confirm against xverify.
env = Env(Reason_and_Act)
sampling_params = env.sampling_params(
    n=1,
    temperature=0.5,
    max_tokens=500,
    guided_decoding={"whitespace_pattern": r"[\n ]?"},
)

max_steps = 5

# The prompt embeds the full JSON schema so the model can see the expected shape.
system_prompt = f"""\
You are a helpful assistant, responding via JSON structured output. Tool responses from the user are also in JSON.
- Think step by step using the scratchpad, reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer once your are sure you have the answer.

Respond with a JSON object, following the schema below:

{json.dumps(env.model.model_json_schema(), indent=2)}

"""

messages: list[dict] = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "What is the radius of the moon?"},
]

# ReAct loop: generate one structured step, record it, run any requested tools,
# and feed the tool results back as the next user turn. The loop stops as soon as
# a step contains no tool call (i.e. a final answer) or after `max_steps` steps.
for _ in range(max_steps):
    outp = llm.chat(  # type: ignore
        messages=messages,  # type: ignore
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    # Only the first completion is used (`n=1` in the sampling parameters).
    struct_res = env.parse(outp[0].outputs[0].text)

    # Chat message `content` is expected to be text, and the system prompt states
    # that both sides exchange JSON. Serialise explicitly instead of appending a
    # raw Python dict, whose rendering would be left to the chat-template layer.
    messages.append(
        {"role": "assistant", "content": json.dumps(struct_res.model_dump())}
    )

    tool_outp = run_tools(struct_res)
    if not tool_outp:
        # No tool was called, so the model produced its final answer.
        break
    messages.append({"role": "user", "content": json.dumps(tool_outp)})

print(json.dumps(messages, indent=2))

[
  {
    "role": "system",
    "content": "You are a helpful assistant, responding via JSON structured output. Tool responses from the user are also in JSON.\n- Think step by step using the scratchpad, reasoning outputs. You have 4 steps to think before responding.\n- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.\n- Respond with a final answer once your are sure you have the answer.\n\nRespond with a JSON object, following the schema below:\n\n{\n  \"$defs\": {\n    \"FinalAnswer\": {\n      \"properties\": {\n        \"action\": {\n          \"const\": \"final_answer\",\n          \"title\": \"Action\",\n          \"type\": \"string\"\n        },\n        \"answer\": {\n          \"description\": \"The final answer to the question\",\n          \"title\": \"Answer\",\n          \"type\": \"string\"\n        }\n      },\n      \"required\": [\n        \"action\",\n        \"answer\"\n      ],\n      \"title\": \"FinalAnswer\",\n      \

In [30]:
# `env.model` is the schema class the `Env` was constructed with
# (the recorded output shows `Reason_and_Act`).
env.model

__main__.Reason_and_Act

In [49]:
from pydantic_gbnf_grammar_generator import generate_gbnf_grammar_and_documentation

# Derive a GBNF grammar plus human-readable documentation from the pydantic model.
# `outer_object_name` is intentionally not passed.
# NOTE: in the recorded output the `Literal` and `ToolUse` fields are emitted as
# `unknown`, i.e. the generator does not resolve them for this model.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
    [Reason_and_Act], documentation_with_field_description=True
)

# Grammar, an 80-dash rule, then the documentation, each on its own line.
print(gbnf_grammar, "-" * 80, documentation, sep="\n")

root ::= (" "| "\n") grammar-models
grammar-models ::= reason-and--act
reason-and--act ::= "{" "\n"  ws "\"scratchpad\"" ":" ws string "," "\n"  ws "\"reasoning\"" ":" ws string "," "\n"  ws "\"output\"" ":" ws reason-and--act-output-union "\n" ws "}"
tools ::= "{" "\n"  ws "\"action\"" ":" ws unknown "," "\n"  ws "\"tool_use\"" ":" ws unknown "\n" ws "}"
final-answer ::= "{" "\n"  ws "\"action\"" ":" ws unknown "," "\n"  ws "\"answer\"" ":" ws string "\n" ws "}"
reason-and--act-output-union ::= tools | final-answer
boolean ::= "true" | "false"
null ::= "null"
string ::= "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" ws
ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+
--------------------------------------------------------------------------------
Output Model: Reason_and_Act
  Output Fields:
    scratchpad (str):
        Description: Information 

In [57]:
# Run the generator on the tool type alone. Only the documentation is printed
# here; `gbnf_grammar` is still rebound by this call.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
    [ToolUse[calculator]],
)

print(documentation)

Output Model: calculator
  Description: Evaluates a single line of Python math expression. No imports or variables allowed.

    Examples:
        {"expression": "2 + 2"} -> "4"
        {"expression": "3 * (17 + 4)"} -> "63"
        {"expression": "100 / 5"} -> "20.0"
    
Returns: str - The result of the calculation or an error message
  Output Fields:
    expression (str):
        Description: A mathematical expression using only numbers and basic operators (+,-,*,/,**,())
    tool_name (literal):
        Description: Function to call




And the cool thing is, this is a Qwen2.5 0.5B model, not even trained on function calling!

In [68]:
class Tools(BaseModel):
    """
    Run a tool.
    """
    # NOTE: this redefines the `Tools` model declared earlier in the notebook;
    # from this cell on, the name refers to this calculator-only variant.

    # `json_schema_extra` adds an `example` entry to this field's JSON schema.
    action: Literal["tool_use"] = Field(..., description="Action discriminator", json_schema_extra={"example": "{'action': 'tool_use', 'tool_use': {'tool_name': 'calculator', 'expression': '2 + 2'}}"})
    # Only `calculator` is accepted here, and no Field description is attached.
    tool_use: ToolUse[calculator]

class FinalAnswer(BaseModel):
    """
    Return a final answer.
    """
    # NOTE: identical to the `FinalAnswer` declared earlier in the notebook; this
    # definition rebinds the name to a new class object.

    # Discriminator for the `output` union in `Reason_and_Act`; always "final_answer" here.
    action: Literal["final_answer"] = Field(..., description="Action discriminator")
    # Free-text answer; this branch carries no tool call.
    answer: str = Field(..., description="The final answer to the question")

# Re-declared so that `output` refers to the `Tools` / `FinalAnswer` classes
# defined just above rather than to the earlier definitions of those names.
class Reason_and_Act(BaseModel):
    scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    reasoning: str = Field(..., description="It describes your thoughts about the question you have been asked")
    # Tagged union: the `action` literal on each member tells pydantic which
    # branch (`Tools` or `FinalAnswer`) to validate against.
    output: Union[Tools, FinalAnswer] = Field(..., description="Final output: choose between the tool call or the final answer", discriminator="action")

from pydantic_gbnf_grammar_generator import generate_gbnf_grammar_and_documentation

# Regenerate the grammar and documentation for the redefined model.
# `outer_object_name` is intentionally not passed.
# NOTE: in the recorded output `tool_use` now resolves to a `calculator` rule,
# while the `Literal` fields are still emitted as `unknown`.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
    [Reason_and_Act], documentation_with_field_description=True
)

# Grammar, an 80-dash rule, then the documentation, each on its own line.
print(gbnf_grammar, "-" * 80, documentation, sep="\n")

root ::= (" "| "\n") grammar-models
grammar-models ::= reason-and--act
reason-and--act ::= "{" "\n"  ws "\"scratchpad\"" ":" ws string "," "\n"  ws "\"reasoning\"" ":" ws string "," "\n"  ws "\"output\"" ":" ws reason-and--act-output-union "\n" ws "}"
tools ::= "{" "\n"  ws "\"action\"" ":" ws unknown "," "\n"  ws "\"tool_use\"" ":" ws calculator "\n" ws "}"
calculator ::= "{" "\n"  ws "\"expression\"" ":" ws string "," "\n"  ws "\"tool_name\"" ":" ws unknown "\n" ws "}"
final-answer ::= "{" "\n"  ws "\"action\"" ":" ws unknown "," "\n"  ws "\"answer\"" ":" ws string "\n" ws "}"
reason-and--act-output-union ::= tools | final-answer
boolean ::= "true" | "false"
null ::= "null"
string ::= "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" ws
ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+
----------------------------------------------------------------

In [67]:
# Show the JSON schema pydantic derives from the model; class docstrings and
# Field descriptions appear in it as `description` entries.
Reason_and_Act.model_json_schema()

{'$defs': {'FinalAnswer': {'description': 'Return a final answer.',
   'properties': {'action': {'const': 'final_answer',
     'description': 'Action discriminator',
     'title': 'Action',
     'type': 'string'},
    'answer': {'description': 'The final answer to the question',
     'title': 'Answer',
     'type': 'string'}},
   'required': ['action', 'answer'],
   'title': 'FinalAnswer',
   'type': 'object'},
  'Tools': {'description': 'Run a tool.',
   'properties': {'action': {'const': 'tool_use',
     'description': 'Action discriminator',
     'example': "{'action': 'tool_use', 'tool_use': {'tool_name': 'calculator', 'expression': '2 + 2'}}",
     'title': 'Action',
     'type': 'string'},
    'tool_use': {'$ref': '#/$defs/calculator',
     'description': 'The tool call to use'}},
   'required': ['action', 'tool_use'],
   'title': 'Tools',
   'type': 'object'},
  'calculator': {'description': 'Evaluates a single line of Python math expression. No imports or variables allowed.\n\n