From bcb80dee4e1c2eb8bc39d92f64bbc08537090a1e Mon Sep 17 00:00:00 2001
From: Gabor Szabo <shellsnake@icloud.com>
Date: Mon, 1 Jun 2026 04:58:03 +0200
Subject: [PATCH] fix(agents): stop read-only tool-call loop in experiment
 guard (#349)

---
 app/features/agents/agents/base.py            | 10 ++++++++
 .../agents/tests/test_read_only_guard.py      | 23 +++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/app/features/agents/agents/base.py b/app/features/agents/agents/base.py
index d098b567..c69b944e 100644
--- a/app/features/agents/agents/base.py
+++ b/app/features/agents/agents/base.py
@@ -403,6 +403,16 @@ def requires_approval(action_name: str) -> bool:
   to create, save, promote, archive, run a backtest, or run an experiment.
 - Answer directly in the ExperimentReport `summary` field, grounded in tool output.
 
+FINISH IN ONE PASS — do not loop:
+- Call each read-only tool AT MOST ONCE per question.
+- The MOMENT a read tool returns, STOP calling tools and write your
+  ExperimentReport `summary` from what it returned — you already have the answer.
+- NEVER call a tool again that has already returned. Re-running the same tool
+  (e.g. tool_list_runs twice) is the most common failure: it burns the retry
+  budget until the run is killed. Use the data you already received.
+- If a read tool returns an EMPTY result, say so in the `summary` (e.g. "No model
+  runs found.") — do NOT retry the tool hoping for different data.
+
 OUTPUT-FORMAT RETRIES:
 - If your previous reply failed schema validation (e.g. "summary: Field required"),
   DO NOT call any new tool. Only reformat the data you already obtained into a
diff --git a/app/features/agents/tests/test_read_only_guard.py b/app/features/agents/tests/test_read_only_guard.py
index c59e232d..6c1a3f4c 100644
--- a/app/features/agents/tests/test_read_only_guard.py
+++ b/app/features/agents/tests/test_read_only_guard.py
@@ -85,6 +85,29 @@ def test_guard_forbids_new_tools_on_validation_retry() -> None:
     assert "summary: Field required" in guard
 
 
+def test_guard_forbids_tool_call_loops() -> None:
+    """The guard tells the model to finish in one pass and never re-call a tool (#349).
+
+    Regression for the observed failure where a weak 8B model called
+    ``tool_list_runs`` four times in a row — even though it already had the data —
+    and blew the output-retry budget (``Exceeded maximum output retries (3)``).
+    """
+    guard = READ_ONLY_INTENT_GUARD
+    assert "FINISH IN ONE PASS" in guard
+    assert "AT MOST ONCE" in guard
+    assert "NEVER call a tool again that has already returned" in guard
+    assert "STOP calling tools" in guard
+
+
+def test_guard_handles_empty_tool_result() -> None:
+    """An empty read result is reported in the summary, not retried (#349)."""
+    guard = READ_ONLY_INTENT_GUARD
+    assert "EMPTY result" in guard
+    assert "do NOT retry the tool" in guard
+    # The example phrase wraps across lines in the guard; collapse whitespace to match.
+    assert "No model runs found." in " ".join(guard.split())
+
+
 def test_guard_requires_clarification_for_ambiguous_top_products() -> None:
     """An ambiguous "top products" ranking gets a clarifying question, not a guess."""
     guard = READ_ONLY_INTENT_GUARD