# Structured Output with Tools

In [1]:
import json
from pprint import pprint
from textwrap import dedent
from pydantic import BaseModel, Field

from xverify import ToolUse, run_tools, Env
from xverify.tools import calculator, search
from pydantic import ValidationError

  from .autonotebook import tqdm as notebook_tqdm
2025-03-10 20:03:48,727	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


> Often when running multi-step reasoning, we want to use tools to help us.

However, not many libraries natively support this. Pydantic for instance is optimized for a static declarative schema, which isn't well suited to ad-hoc tool use.

Here we can see two examples of tools:
- `calculator`: essentially a wrapper around the `eval` function
- `search`: uses duckduckgo to search the web

In [2]:
print(f"{calculator(expression='3 + 4 * (6 ** 7)')=}")
print("\n---\n")
print(f"{search(query='What is the capital of France?', num_results=1)=}")

calculator(expression='3 + 4 * (6 ** 7)')='1119747'

---

search(query='What is the capital of France?', num_results=1)='• Paris - Wikipedia\n  Paris (French pronunciation: ⓘ) is the capital and largest city of France.'


The problem is we can't (natively) include a tool call in a Pydantic model (due to the static declarative schema).

However, we can use the new `ToolUse` class to handle tool calls.

In [3]:
class ReasoiningToolResult(BaseModel):
    """The result of a reasoning tool"""

    reasoning: str
    tool_use: ToolUse[calculator, search]


calc_2_2 = ReasoiningToolResult.model_validate(
    {
        "reasoning": "Let's add two numbers",
        "tool_use": {"tool_name": "calculator", "expression": "2 + 2"},
    }
)
print(calc_2_2)
print(f"{calc_2_2.tool_use.run_tool()=}") # on a ToolUse object, we can call run_tool() to run the tool and get the result

reasoning="Let's add two numbers" tool_use=calculator(expression='2 + 2', tool_name='calculator')
calc_2_2.tool_use.run_tool()='4'


This is nice because if we can easily validate any arbitary schema and tool use is correct without any ad-hoc parsing (and we'll be able to enforce the LLM output is correct with guided decoding).

In [4]:
try:
    ReasoiningToolResult.model_validate(
        {
            "reasoning": "",
            "tool_use": {"tool_name": "none_existing_tool", "expression": "2 + 2"},
        }
    )
except ValidationError:
    print("tool not found!")
try:
    ReasoiningToolResult.model_validate(
        {
            "reasoning": "",
            "tool_use": {"tool_name": "calculator", "wrong_arg": "2 + 2"},
        },
    )
except ValidationError:
    print("wrong argument!")

try:
    ReasoiningToolResult.model_validate(
        {
            "reasoning": "",
            "tool_use": {"tool_name": "calculator", "expression": 2 + 2},
        },
    )
except ValidationError:
    print("wrong argument type!")

try:
    # TODO: this should be a validation error
    ReasoiningToolResult.model_validate(
        {
            "reasoning": "",
            "tool_use": {"tool_name": "calculator", "expression": "2 + 2", "extra_arg": "extra_arg"},
        },
    )
except ValidationError:
    print("extra argument!")

tool not found!
wrong argument!
wrong argument type!


We can implement a ReACT loop with tools really easily:

In [5]:
class Reason_and_Act(BaseModel):
    scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    reasoning: str = Field(..., description="It describes your thoughts about the question you have been asked")
    tool_use: ToolUse[calculator, search] = Field(..., description="The tool call to use")

class FinalAnswer(BaseModel):
    answer: str = Field(..., description="The final answer to the question")

class Response(BaseModel):
    decision: Reason_and_Act | FinalAnswer


res = Response.model_validate(
    {
        "decision": {
            "scratchpad": "the question is 2 + 2",
            "reasoning": "we should use the calculator tool!",
            "tool_use": {
                "tool_name": "calculator",
                "expression": "2 + 2",
            },
        },
    }
)
res

Response(decision=Reason_and_Act(scratchpad='the question is 2 + 2', reasoning='we should use the calculator tool!', tool_use=calculator(expression='2 + 2', tool_name='calculator')))

And in case we just want to run all the tools in a response:

In [6]:
run_tools(res)

{'decision': {'tool_use': {'calculator': '4'}}}

This will return `None` where no tools were called, which is useful for checking for the end of the loop.

In [7]:
print(run_tools(Response(decision=FinalAnswer(answer="42"))))

None


If you just want the output, run `run_tools` on the instantiated `ToolUse` object itself:

In [8]:
if isinstance(res.decision, Reason_and_Act):
    print(f"{res.decision.tool_use.run_tool()=}")
else:
    print(f"{res.decision.answer=}")

res.decision.tool_use.run_tool()='4'


And of course we can do multiple tool calls in a single response:

In [9]:
class MultiToolUse(BaseModel):
    tool_use: list[ToolUse[calculator, search]]


res = MultiToolUse.model_validate(
    {
        "tool_use": [
            {"tool_name": "calculator", "expression": "2 + 2"},
            {
                "tool_name": "search",
                "query": "What is the radius of the moon?",
                "num_results": 2,
            },
            {"tool_name": "calculator", "expression": "3424 * 432432"},
        ]
    }
)
pprint(run_tools(res))

{'tool_use': [{'calculator': '4'},
              {'search': '• Moon Fact Sheet - NSSDCA\n'
                         '  Equatorial radius (km) 1738.1: 6378.1: 0.2725: '
                         'Polar radius (km) 1736.0: 6356.8: 0.2731: Volumetric '
                         'mean radius (km) 1737.4: 6371.0: 0.2727: Ellipticity '
                         '(Flattening) ...\n'
                         '\n'
                         '• Moon - Wikipedia\n'
                         '  The Moon has a solid iron-rich inner core with a '
                         'radius possibly as small as 240 kilometres (150 mi) '
                         'and a fluid outer core primarily made of liquid iron '
                         'with a radius of roughly 300 kilometres (190 m.'},
              {'calculator': '1480647168'}]}


So this is cool, kinda, but now we have structed schema, we can **enforce** the LLM output is correct with guided decoding.

Let's go back to our ReACT loop, and use `Env` to enforce the tool calls are correct.

In [10]:
from vllm import LLM

if "llm" not in globals():  # interactive use
    llm = LLM(model="Qwen/Qwen2.5-14B-Instruct", max_model_len=2000)

env = Env(Response)
sampling_params = env.sampling_params(
    max_tokens=500,
    guided_decoding=dict(whitespace_pattern=r"[\n ]?"),
    n=1,
    temperature=0.5,
)

max_steps = 5
messages: list[dict] = [
    {
        "role": "system",
        "content": f"""\
You are a helpful assistant, responding via JSON structured output. Tool responses from the user are also in JSON.
- Think step by step using the scratchpad, reasoning outputs. You have {max_steps - 1} steps to think before responding.
- Use the tools provided. DO NOT rely on your own knowledge when a tool is available to help you.
- Respond with a final answer once your are sure you have the answer.

Respond with a JSON object, following the schema below:

{json.dumps(env.model.model_json_schema(), indent=2)}

""",
    },
    {"role": "user", "content": "What is the radius of the moon?"},
]

for _ in range(max_steps):
    outp = llm.chat(  # type: ignore
        messages=messages,  # type: ignore
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    struct_res = env.parse(outp[0].outputs[0].text)
    messages.append({"role": "assistant", "content": struct_res.model_dump()})
    tool_outp = run_tools(struct_res)
    if tool_outp:
        messages.append({"role": "user", "content": tool_outp})
    else:
        break

print(json.dumps(messages, indent=2))

INFO 03-10 20:03:52 __init__.py:207] Automatically detected platform cuda.
INFO 03-10 20:03:57 config.py:549] This model supports multiple tasks: {'score', 'classify', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 03-10 20:03:57 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-14B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-14B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execut

OutOfMemoryError: CUDA out of memory. Tried to allocate 270.00 MiB. GPU 0 has a total capacity of 23.58 GiB of which 86.88 MiB is free. Including non-PyTorch memory, this process has 23.45 GiB memory in use. Of the allocated memory 23.15 GiB is allocated by PyTorch, and 12.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [14]:
env = Env(Response)
print(json.dumps(env.model.model_json_schema(), indent=2))

{
  "$defs": {
    "FinalAnswer": {
      "properties": {
        "answer": {
          "description": "The final answer to the question",
          "title": "Answer",
          "type": "string"
        }
      },
      "required": [
        "answer"
      ],
      "title": "FinalAnswer",
      "type": "object"
    },
    "Reason_and_Act": {
      "properties": {
        "scratchpad": {
          "description": "Information from the Observation useful to answer the question",
          "title": "Scratchpad",
          "type": "string"
        },
        "reasoning": {
          "description": "It describes your thoughts about the question you have been asked",
          "title": "Reasoning",
          "type": "string"
        },
        "tool_use": {
          "description": "The tool call to use",
          "discriminator": {
            "mapping": {
              "calculator": "#/$defs/calculator",
              "search": "#/$defs/search"
            },
            "propertyName": "t

In [15]:
import json_schema_for_humans
json_schema_for_humans.generate(env.model.model_json_schema())

AttributeError: module 'json_schema_for_humans' has no attribute 'generate'

In [25]:
import io
import json

# Create a file-like object to mimic file I/O
schema_file = io.StringIO(json.dumps(env.model.model_json_schema()))
result_file = io.StringIO()

# Add name attribute to StringIO objects
schema_file.name = "schema.json"
result_file.name = "result.html"
schema_file.seek(0)
result_file.seek(0)


# Generate the schema documentation using the file-like objects
generate.generate_from_file_object(schema_file, result_file)

# Print the generated documentation
print(result_file.getvalue())



== Generating result.html ==


Exception: Cannot generate documentation since root schema could not be loaded

In [22]:
import io
from json_schema_for_humans import generate

# Create a file-like object to mimic file I/O
schema_file = io.StringIO(json.dumps(env.model.model_json_schema()))
result_file = io.StringIO()

# Generate the schema documentation using the file-like objects
generate.generate_from_file_object(schema_file, result_file)

# Print the generated documentation
print(result_file.getvalue())


AttributeError: '_io.StringIO' object has no attribute 'name'

In [None]:

generate.generate_from_schema()

ValueError: Unable to find a schema to render from 

And the cool thing is, this is a Qwen 1.5B model, not even trained on function calling!