diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py index d45d78f4e..1edbfe902 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py @@ -302,6 +302,24 @@ def _is_context_overflow_error(exc: Exception) -> bool: ) +def _all_mental_models_are_usable_and_fresh(tool_output: dict[str, Any]) -> bool: + """Return whether every retrieved mental model is explicitly fresh and has answerable content. + + Used to decide — without an extra LLM call — whether a forced + ``search_mental_models`` result is trustworthy enough to hand control back + to the agent. A model is usable only when it is explicitly ``is_stale == + False`` (an unknown/missing staleness flag is treated as unsafe) and has + non-empty content. + """ + models = tool_output.get("mental_models") or [] + for model in models: + if model.get("is_stale") is not False: + return False + if not str(model.get("content") or "").strip(): + return False + return True + + async def run_reflect_agent( llm_config: "LLMProvider", bank_id: str, @@ -464,6 +482,11 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): ) consecutive_errors = 0 + # When a forced ``search_mental_models`` returns fresh, usable models on a + # low/mid-budget call, we stop forcing the lower retrieval layers from this + # iteration onward and let the agent answer (or retrieve deeper itself) + # under ``auto`` tool choice. None means the full forced path still applies. + stop_forcing_from_iteration: int | None = None for iteration in range(max_iterations): is_last = iteration == max_iterations - 1 @@ -592,8 +615,11 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): if include_recall: forced_sequence.append("recall") - if iteration < len(forced_sequence): - iter_tool_choice: str | dict = {"type": "function", "function": {"name": forced_sequence[iteration]}} + if stop_forcing_from_iteration is not None and iteration >= stop_forcing_from_iteration: + # A fresh mental model already short-circuited the forced path. + iter_tool_choice: str | dict = "auto" + elif iteration < len(forced_sequence): + iter_tool_choice = {"type": "function", "function": {"name": forced_sequence[iteration]}} else: iter_tool_choice = "auto" @@ -956,6 +982,25 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): for mm in output["mental_models"]: if "id" in mm: available_mental_model_ids.add(mm["id"]) + # Deterministic short-circuit (no extra LLM call): on a + # low/mid-budget call, if every retrieved mental model is + # fresh and has usable content, stop forcing the lower + # retrieval layers. The next iteration runs under ``auto`` + # tool choice, so the agent can answer directly when the + # mental model suffices, or — having just read it — issue a + # targeted ``search_observations``/``recall`` itself. Stale, + # empty, or missing mental models keep the full forced path. + if ( + stop_forcing_from_iteration is None + and (budget or "low").lower() != "high" + and output.get("mental_models") + and _all_mental_models_are_usable_and_fresh(output) + ): + stop_forcing_from_iteration = iteration + 1 + logger.info( + f"[REFLECT {reflect_id}] Fresh mental models sufficient on iteration {iteration + 1}; " + "releasing forced lower-level retrieval to auto." + ) if ( normalized_tool_name == "search_observations" diff --git a/hindsight-api-slim/tests/test_reflect_agent.py b/hindsight-api-slim/tests/test_reflect_agent.py index 6b31919f0..b11d2de66 100644 --- a/hindsight-api-slim/tests/test_reflect_agent.py +++ b/hindsight-api-slim/tests/test_reflect_agent.py @@ -14,6 +14,7 @@ import pytest from hindsight_api.engine.reflect.agent import ( + _all_mental_models_are_usable_and_fresh, _clean_answer_text, _clean_done_answer, _count_messages_tokens, @@ -23,6 +24,7 @@ run_reflect_agent, ) from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage +from tests.llm_judge import assert_meets_criteria class TestCleanAnswerText: @@ -195,6 +197,43 @@ def test_is_done_tool(self): assert _is_done_tool("recall<|channel|>done") is False +class TestMentalModelFreshnessHelper: + """Deterministic freshness/usability guard for short-circuiting forced retrieval.""" + + def test_all_fresh_and_non_empty_is_usable(self): + output = { + "mental_models": [ + {"id": "mm-1", "content": "Fresh content.", "is_stale": False}, + {"id": "mm-2", "content": "More fresh content.", "is_stale": False}, + ] + } + assert _all_mental_models_are_usable_and_fresh(output) is True + + def test_any_stale_model_is_not_usable(self): + output = { + "mental_models": [ + {"id": "mm-1", "content": "Fresh content.", "is_stale": False}, + {"id": "mm-2", "content": "Old content.", "is_stale": True}, + ] + } + assert _all_mental_models_are_usable_and_fresh(output) is False + + def test_missing_staleness_flag_is_not_usable(self): + # An unknown/missing staleness flag must be treated as unsafe. + output = {"mental_models": [{"id": "mm-1", "content": "Fresh content."}]} + assert _all_mental_models_are_usable_and_fresh(output) is False + + def test_blank_content_is_not_usable(self): + output = {"mental_models": [{"id": "mm-1", "content": " ", "is_stale": False}]} + assert _all_mental_models_are_usable_and_fresh(output) is False + + def test_empty_list_is_vacuously_usable(self): + # The caller gates on a non-empty list separately; the helper itself is + # only responsible for freshness/content of the models it is given. + assert _all_mental_models_are_usable_and_fresh({"mental_models": []}) is True + assert _all_mental_models_are_usable_and_fresh({}) is True + + class TestReflectAgentMocked: """Test reflect agent with mocked LLM outputs.""" @@ -219,6 +258,242 @@ def mock_functions(self): "expand_fn": AsyncMock(return_value={"memories": []}), } + @staticmethod + def _mm_call(call_id: str = "1", query: str = "test query") -> LLMToolCallResult: + return LLMToolCallResult( + tool_calls=[ + LLMToolCall(id=call_id, name="search_mental_models", arguments={"reason": "curated", "query": query}) + ], + finish_reason="tool_calls", + ) + + @pytest.mark.asyncio + async def test_fresh_mental_model_releases_forced_retrieval(self, mock_llm, mock_functions): + """A fresh, usable mental model stops forced lower-level retrieval — with no extra LLM call. + + The agent answers on the very next (auto) iteration, so search_observations + and recall are never invoked. + """ + mock_functions["search_mental_models_fn"].return_value = { + "query": "test query", + "mental_models": [ + {"id": "mm-1", "name": "User prefs", "content": "The user prefers concise answers.", "is_stale": False} + ], + } + mock_llm.call_with_tools.side_effect = [ + self._mm_call(), + LLMToolCallResult( + tool_calls=[ + LLMToolCall( + id="2", name="done", arguments={"answer": "Be concise.", "mental_model_ids": ["mm-1"]} + ) + ], + finish_reason="tool_calls", + ), + ] + + result = await run_reflect_agent( + llm_config=mock_llm, + bank_id="test-bank", + query="test query", + bank_profile={"name": "Test", "mission": "Testing"}, + has_mental_models=True, + budget="low", + max_iterations=5, + **mock_functions, + ) + + assert result.text == "Be concise." + # The fix's whole point: no extra LLM round-trip to decide sufficiency. + mock_llm.call.assert_not_called() + mock_functions["search_observations_fn"].assert_not_called() + mock_functions["recall_fn"].assert_not_called() + # First iteration forced mental models; second was released to auto. + first_choice = mock_llm.call_with_tools.await_args_list[0].kwargs["tool_choice"] + assert first_choice == {"type": "function", "function": {"name": "search_mental_models"}} + assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == "auto" + + @pytest.mark.asyncio + async def test_short_circuited_agent_may_still_retrieve_under_auto(self, mock_llm, mock_functions): + """After release, the agent can still choose to retrieve deeper itself (its own query).""" + mock_functions["search_mental_models_fn"].return_value = { + "query": "test query", + "mental_models": [ + {"id": "mm-1", "name": "Status", "content": "Launch was planned for Friday.", "is_stale": False} + ], + } + mock_llm.call_with_tools.side_effect = [ + self._mm_call(), + LLMToolCallResult( + tool_calls=[ + LLMToolCall(id="2", name="recall", arguments={"reason": "verify", "query": "launch completion proof"}) + ], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[ + LLMToolCall(id="3", name="done", arguments={"answer": "Confirmed.", "memory_ids": ["mem-1"]}) + ], + finish_reason="tool_calls", + ), + ] + + result = await run_reflect_agent( + llm_config=mock_llm, + bank_id="test-bank", + query="test query", + bank_profile={"name": "Test", "mission": "Testing"}, + has_mental_models=True, + budget="low", + max_iterations=5, + **mock_functions, + ) + + assert result.text == "Confirmed." + # recall ran because the model chose it under auto, not because it was forced, + # and it used the model's own targeted query (not a forced override). + assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == "auto" + mock_functions["recall_fn"].assert_called_once() + assert mock_functions["recall_fn"].await_args.args[0] == "launch completion proof" + mock_functions["search_observations_fn"].assert_not_called() + + @pytest.mark.asyncio + async def test_stale_mental_model_keeps_forced_retrieval(self, mock_llm, mock_functions): + """A stale mental model must not short-circuit; the full forced path continues.""" + mock_functions["search_mental_models_fn"].return_value = { + "query": "test query", + "mental_models": [ + { + "id": "mm-1", + "name": "Old status", + "content": "Old summary.", + "is_stale": True, + "staleness_reason": "newer facts exist", + } + ], + } + mock_llm.call_with_tools.side_effect = [ + self._mm_call(), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[ + LLMToolCall(id="4", name="done", arguments={"answer": "Verified.", "memory_ids": ["mem-1"]}) + ], + finish_reason="tool_calls", + ), + ] + + result = await run_reflect_agent( + llm_config=mock_llm, + bank_id="test-bank", + query="test query", + bank_profile={"name": "Test", "mission": "Testing"}, + has_mental_models=True, + budget="low", + max_iterations=5, + **mock_functions, + ) + + assert result.text == "Verified." + mock_functions["search_observations_fn"].assert_called_once() + mock_functions["recall_fn"].assert_called_once() + choices = [c.kwargs["tool_choice"] for c in mock_llm.call_with_tools.await_args_list[:3]] + assert choices == [ + {"type": "function", "function": {"name": "search_mental_models"}}, + {"type": "function", "function": {"name": "search_observations"}}, + {"type": "function", "function": {"name": "recall"}}, + ] + + @pytest.mark.asyncio + async def test_high_budget_keeps_forced_path_for_fresh_mental_model(self, mock_llm, mock_functions): + """High budget preserves the full verification path even for fresh mental models.""" + mock_functions["search_mental_models_fn"].return_value = { + "query": "test query", + "mental_models": [ + {"id": "mm-1", "name": "Prefs", "content": "Fresh and directly relevant.", "is_stale": False} + ], + } + mock_llm.call_with_tools.side_effect = [ + self._mm_call(), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[ + LLMToolCall(id="4", name="done", arguments={"answer": "Verified.", "memory_ids": ["mem-1"]}) + ], + finish_reason="tool_calls", + ), + ] + + result = await run_reflect_agent( + llm_config=mock_llm, + bank_id="test-bank", + query="test query", + bank_profile={"name": "Test", "mission": "Testing"}, + has_mental_models=True, + budget="high", + max_iterations=5, + **mock_functions, + ) + + assert result.text == "Verified." + mock_functions["search_observations_fn"].assert_called_once() + mock_functions["recall_fn"].assert_called_once() + assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == { + "type": "function", + "function": {"name": "search_observations"}, + } + + @pytest.mark.asyncio + async def test_no_mental_models_keeps_forced_retrieval(self, mock_llm, mock_functions): + """An empty mental-model result must not short-circuit the forced path.""" + mock_functions["search_mental_models_fn"].return_value = {"query": "test query", "mental_models": []} + mock_llm.call_with_tools.side_effect = [ + self._mm_call(), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})], + finish_reason="tool_calls", + ), + LLMToolCallResult( + tool_calls=[ + LLMToolCall(id="4", name="done", arguments={"answer": "Done.", "memory_ids": ["mem-1"]}) + ], + finish_reason="tool_calls", + ), + ] + + result = await run_reflect_agent( + llm_config=mock_llm, + bank_id="test-bank", + query="test query", + bank_profile={"name": "Test", "mission": "Testing"}, + has_mental_models=True, + budget="low", + max_iterations=5, + **mock_functions, + ) + + assert result.text == "Done." + mock_functions["search_observations_fn"].assert_called_once() + mock_functions["recall_fn"].assert_called_once() + @pytest.mark.asyncio async def test_handles_functions_prefix_in_done(self, mock_llm, mock_functions): """Test that 'functions.done' is handled correctly.""" @@ -767,3 +1042,138 @@ def __getattr__(self, name: str): finally: await memory.delete_bank(bank_id, request_context=request_context) + + +@pytest.mark.hs_llm_core +@pytest.mark.flaky(reruns=3, reruns_delay=2) +class TestMentalModelShortCircuitRealLLM: + """End-to-end, real-LLM coverage for the fresh-mental-model short-circuit. + + The deterministic release-to-auto mechanism is covered by the MockLLM tests + above. What only a real model can verify is the behaviour *after* release: + that a real agent, once it is no longer forced, actually answers off a fresh + sufficient mental model — and, when the model is fresh but incomplete, that + it chooses to retrieve deeper itself with its own targeted query. + + The search functions are stubbed so the mental-model content is controlled, + but ``llm_config`` drives the real agent loop. + """ + + @staticmethod + def _stub_functions(mental_models, recall_memories=None, observations=None): + async def search_mental_models_fn(query, max_results): + return {"query": query, "count": len(mental_models), "mental_models": mental_models} + + return { + "search_mental_models_fn": AsyncMock(side_effect=search_mental_models_fn), + "search_observations_fn": AsyncMock(return_value={"observations": observations or []}), + "recall_fn": AsyncMock(return_value={"memories": recall_memories or []}), + "expand_fn": AsyncMock(return_value={"memories": []}), + } + + @pytest.mark.asyncio + async def test_real_fresh_mental_model_answers_without_lower_retrieval(self, llm_config): + """A fresh, sufficient mental model lets the real agent answer without obs/recall.""" + functions = self._stub_functions( + mental_models=[ + { + "id": "mm-comm", + "name": "Architecture Communication Preference", + "content": ( + "For architecture decisions, the team prefers asynchronous written communication. " + "They use concise ADRs (Architecture Decision Records) and explicitly avoid settling " + "complex design questions in live meetings." + ), + "relevance": 0.94, + "is_stale": False, + } + ] + ) + + result = await run_reflect_agent( + llm_config=llm_config, + bank_id="test-bank", + query="According to what you know, how does the team prefer to communicate about architecture decisions?", + bank_profile={"name": "Test", "mission": "Answer from curated knowledge"}, + has_mental_models=True, + include_observations=True, + include_recall=True, + budget="low", + max_iterations=6, + **functions, + ) + + assert result.text, "agent must return a non-empty answer" + # The core behavioural win: the forced lower-level path was released, and a + # real model answered off the fresh mental model instead of digging deeper. + functions["search_observations_fn"].assert_not_called() + functions["recall_fn"].assert_not_called() + await assert_meets_criteria( + response=result.text, + criteria=( + "The answer states the team prefers asynchronous written communication for architecture " + "decisions (e.g. ADRs) rather than live meetings." + ), + context="The only knowledge available was a mental model describing the team's async/ADR preference.", + ) + + @pytest.mark.asyncio + async def test_real_stale_mental_model_forces_and_grounds_in_deeper_evidence(self, llm_config): + """A stale mental model must NOT short-circuit: the agent is forced deeper and grounds its answer there. + + This is the safety side of the guard. Forcing the lower layers is + deterministic (stale fails the freshness check), so observations/recall + run regardless of model discretion; the real-LLM value is confirming the + agent corrects the stale summary using the freshly retrieved raw fact. + """ + functions = self._stub_functions( + mental_models=[ + { + "id": "mm-aurora", + "name": "Project Aurora Launch Plan", + "content": "Project Aurora's launch is still pending and has not happened yet.", + "relevance": 0.91, + "is_stale": True, + "staleness_reason": "newer deployment facts exist", + } + ], + recall_memories=[ + { + "id": "mem-deploy", + "content": "Project Aurora shipped to production on Friday at 16:00 UTC (deploy log A-1029).", + } + ], + observations=[ + { + "id": "obs-deploy", + "content": "Aurora production deployment confirmed Friday; see deploy log A-1029.", + } + ], + ) + + result = await run_reflect_agent( + llm_config=llm_config, + bank_id="test-bank", + query="What is the current status of Project Aurora, and what concrete fact supports it?", + bank_profile={"name": "Test", "mission": "Answer with verifiable detail"}, + has_mental_models=True, + include_observations=True, + include_recall=True, + budget="low", + max_iterations=6, + **functions, + ) + + assert result.text, "agent must return a non-empty answer" + # Stale mental model → no short-circuit → lower layers are still forced. + assert functions["search_observations_fn"].await_count > 0 + assert functions["recall_fn"].await_count > 0 + await assert_meets_criteria( + response=result.text, + criteria=( + "The answer states Project Aurora has launched/shipped to production (NOT that it is still " + "pending) and cites the Friday production deployment (e.g. deploy log A-1029) as support." + ), + context="A stale mental model claimed the launch was still pending, but the freshly retrieved raw " + "fact (deploy log A-1029) shows it shipped on Friday. The agent should correct the stale summary.", + )