From bcb80dee4e1c2eb8bc39d92f64bbc08537090a1e Mon Sep 17 00:00:00 2001 From: Gabor Szabo Date: Mon, 1 Jun 2026 04:58:03 +0200 Subject: [PATCH] fix(agents): stop read-only tool-call loop in experiment guard (#349) --- app/features/agents/agents/base.py | 10 ++++++++ .../agents/tests/test_read_only_guard.py | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/app/features/agents/agents/base.py b/app/features/agents/agents/base.py index d098b567..c69b944e 100644 --- a/app/features/agents/agents/base.py +++ b/app/features/agents/agents/base.py @@ -403,6 +403,16 @@ def requires_approval(action_name: str) -> bool: to create, save, promote, archive, run a backtest, or run an experiment. - Answer directly in the ExperimentReport `summary` field, grounded in tool output. +FINISH IN ONE PASS — do not loop: +- Call each read-only tool AT MOST ONCE per question. +- The MOMENT a read tool returns, STOP calling tools and write your + ExperimentReport `summary` from what it returned — you already have the answer. +- NEVER call a tool again that has already returned. Re-running the same tool + (e.g. tool_list_runs twice) is the most common failure: it burns the retry + budget until the run is killed. Use the data you already received. +- If a read tool returns an EMPTY result, say so in the `summary` (e.g. "No model + runs found.") — do NOT retry the tool hoping for different data. + OUTPUT-FORMAT RETRIES: - If your previous reply failed schema validation (e.g. "summary: Field required"), DO NOT call any new tool. Only reformat the data you already obtained into a diff --git a/app/features/agents/tests/test_read_only_guard.py b/app/features/agents/tests/test_read_only_guard.py index c59e232d..6c1a3f4c 100644 --- a/app/features/agents/tests/test_read_only_guard.py +++ b/app/features/agents/tests/test_read_only_guard.py @@ -85,6 +85,29 @@ def test_guard_forbids_new_tools_on_validation_retry() -> None: assert "summary: Field required" in guard +def test_guard_forbids_tool_call_loops() -> None: + """The guard tells the model to finish in one pass and never re-call a tool (#349). + + Regression for the observed failure where a weak 8B model called + ``tool_list_runs`` four times in a row — even though it already had the data — + and blew the output-retry budget (``Exceeded maximum output retries (3)``). + """ + guard = READ_ONLY_INTENT_GUARD + assert "FINISH IN ONE PASS" in guard + assert "AT MOST ONCE" in guard + assert "NEVER call a tool again that has already returned" in guard + assert "STOP calling tools" in guard + + +def test_guard_handles_empty_tool_result() -> None: + """An empty read result is reported in the summary, not retried (#349).""" + guard = READ_ONLY_INTENT_GUARD + assert "EMPTY result" in guard + assert "do NOT retry the tool" in guard + # The wrapped example phrase, newlines/indent collapsed. + assert "No model runs found." in " ".join(guard.split()) + + def test_guard_requires_clarification_for_ambiguous_top_products() -> None: """An ambiguous "top products" ranking gets a clarifying question, not a guess.""" guard = READ_ONLY_INTENT_GUARD