diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
index d45d78f4e..1edbfe902 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -302,6 +302,24 @@ def _is_context_overflow_error(exc: Exception) -> bool:
     )
 
 
+def _all_mental_models_are_usable_and_fresh(tool_output: dict[str, Any]) -> bool:
+    """Return whether every retrieved mental model is explicitly fresh and has answerable content.
+
+    Used to decide — without an extra LLM call — whether a forced
+    ``search_mental_models`` result is trustworthy enough to hand control back
+    to the agent. A model is usable only when it is explicitly ``is_stale ==
+    False`` (an unknown/missing staleness flag is treated as unsafe) and has
+    non-empty content.
+    """
+    models = tool_output.get("mental_models") or []
+    for model in models:
+        if model.get("is_stale") is not False:
+            return False
+        if not str(model.get("content") or "").strip():
+            return False
+    return True
+
+
 async def run_reflect_agent(
     llm_config: "LLMProvider",
     bank_id: str,
@@ -464,6 +482,11 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
         )
 
     consecutive_errors = 0
+    # When a forced ``search_mental_models`` returns fresh, usable models on a
+    # low/mid-budget call, we stop forcing the lower retrieval layers from this
+    # iteration onward and let the agent answer (or retrieve deeper itself)
+    # under ``auto`` tool choice. None means the full forced path still applies.
+    stop_forcing_from_iteration: int | None = None
     for iteration in range(max_iterations):
         is_last = iteration == max_iterations - 1
 
@@ -592,8 +615,11 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
         if include_recall:
             forced_sequence.append("recall")
 
-        if iteration < len(forced_sequence):
-            iter_tool_choice: str | dict = {"type": "function", "function": {"name": forced_sequence[iteration]}}
+        if stop_forcing_from_iteration is not None and iteration >= stop_forcing_from_iteration:
+            # A fresh mental model already short-circuited the forced path.
+            iter_tool_choice: str | dict = "auto"
+        elif iteration < len(forced_sequence):
+            iter_tool_choice = {"type": "function", "function": {"name": forced_sequence[iteration]}}
         else:
             iter_tool_choice = "auto"
 
@@ -956,6 +982,25 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                     for mm in output["mental_models"]:
                         if "id" in mm:
                             available_mental_model_ids.add(mm["id"])
+                    # Deterministic short-circuit (no extra LLM call): on a
+                    # low/mid-budget call, if every retrieved mental model is
+                    # fresh and has usable content, stop forcing the lower
+                    # retrieval layers. The next iteration runs under ``auto``
+                    # tool choice, so the agent can answer directly when the
+                    # mental model suffices, or — having just read it — issue a
+                    # targeted ``search_observations``/``recall`` itself. Stale,
+                    # empty, or missing mental models keep the full forced path.
+                    if (
+                        stop_forcing_from_iteration is None
+                        and (budget or "low").lower() != "high"
+                        and output.get("mental_models")
+                        and _all_mental_models_are_usable_and_fresh(output)
+                    ):
+                        stop_forcing_from_iteration = iteration + 1
+                        logger.info(
+                            f"[REFLECT {reflect_id}] Fresh mental models sufficient on iteration {iteration + 1}; "
+                            "releasing forced lower-level retrieval to auto."
+                        )
 
                 if (
                     normalized_tool_name == "search_observations"
diff --git a/hindsight-api-slim/tests/test_reflect_agent.py b/hindsight-api-slim/tests/test_reflect_agent.py
index 6b31919f0..b11d2de66 100644
--- a/hindsight-api-slim/tests/test_reflect_agent.py
+++ b/hindsight-api-slim/tests/test_reflect_agent.py
@@ -14,6 +14,7 @@
 import pytest
 
 from hindsight_api.engine.reflect.agent import (
+    _all_mental_models_are_usable_and_fresh,
     _clean_answer_text,
     _clean_done_answer,
     _count_messages_tokens,
@@ -23,6 +24,7 @@
     run_reflect_agent,
 )
 from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
+from tests.llm_judge import assert_meets_criteria
 
 
 class TestCleanAnswerText:
@@ -195,6 +197,43 @@ def test_is_done_tool(self):
         assert _is_done_tool("recall<|channel|>done") is False
 
 
+class TestMentalModelFreshnessHelper:
+    """Deterministic freshness/usability guard for short-circuiting forced retrieval."""
+
+    def test_all_fresh_and_non_empty_is_usable(self):
+        output = {
+            "mental_models": [
+                {"id": "mm-1", "content": "Fresh content.", "is_stale": False},
+                {"id": "mm-2", "content": "More fresh content.", "is_stale": False},
+            ]
+        }
+        assert _all_mental_models_are_usable_and_fresh(output) is True
+
+    def test_any_stale_model_is_not_usable(self):
+        output = {
+            "mental_models": [
+                {"id": "mm-1", "content": "Fresh content.", "is_stale": False},
+                {"id": "mm-2", "content": "Old content.", "is_stale": True},
+            ]
+        }
+        assert _all_mental_models_are_usable_and_fresh(output) is False
+
+    def test_missing_staleness_flag_is_not_usable(self):
+        # An unknown/missing staleness flag must be treated as unsafe.
+        output = {"mental_models": [{"id": "mm-1", "content": "Fresh content."}]}
+        assert _all_mental_models_are_usable_and_fresh(output) is False
+
+    def test_blank_content_is_not_usable(self):
+        output = {"mental_models": [{"id": "mm-1", "content": "   ", "is_stale": False}]}
+        assert _all_mental_models_are_usable_and_fresh(output) is False
+
+    def test_empty_list_is_vacuously_usable(self):
+        # The caller gates on a non-empty list separately; the helper itself is
+        # only responsible for freshness/content of the models it is given.
+        assert _all_mental_models_are_usable_and_fresh({"mental_models": []}) is True
+        assert _all_mental_models_are_usable_and_fresh({}) is True
+
+
 class TestReflectAgentMocked:
     """Test reflect agent with mocked LLM outputs."""
 
@@ -219,6 +258,242 @@ def mock_functions(self):
             "expand_fn": AsyncMock(return_value={"memories": []}),
         }
 
+    @staticmethod
+    def _mm_call(call_id: str = "1", query: str = "test query") -> LLMToolCallResult:
+        return LLMToolCallResult(
+            tool_calls=[
+                LLMToolCall(id=call_id, name="search_mental_models", arguments={"reason": "curated", "query": query})
+            ],
+            finish_reason="tool_calls",
+        )
+
+    @pytest.mark.asyncio
+    async def test_fresh_mental_model_releases_forced_retrieval(self, mock_llm, mock_functions):
+        """A fresh, usable mental model stops forced lower-level retrieval — with no extra LLM call.
+
+        The agent answers on the very next (auto) iteration, so search_observations
+        and recall are never invoked.
+        """
+        mock_functions["search_mental_models_fn"].return_value = {
+            "query": "test query",
+            "mental_models": [
+                {"id": "mm-1", "name": "User prefs", "content": "The user prefers concise answers.", "is_stale": False}
+            ],
+        }
+        mock_llm.call_with_tools.side_effect = [
+            self._mm_call(),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(
+                        id="2", name="done", arguments={"answer": "Be concise.", "mental_model_ids": ["mm-1"]}
+                    )
+                ],
+                finish_reason="tool_calls",
+            ),
+        ]
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="test query",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            has_mental_models=True,
+            budget="low",
+            max_iterations=5,
+            **mock_functions,
+        )
+
+        assert result.text == "Be concise."
+        # The fix's whole point: no extra LLM round-trip to decide sufficiency.
+        mock_llm.call.assert_not_called()
+        mock_functions["search_observations_fn"].assert_not_called()
+        mock_functions["recall_fn"].assert_not_called()
+        # First iteration forced mental models; second was released to auto.
+        first_choice = mock_llm.call_with_tools.await_args_list[0].kwargs["tool_choice"]
+        assert first_choice == {"type": "function", "function": {"name": "search_mental_models"}}
+        assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == "auto"
+
+    @pytest.mark.asyncio
+    async def test_short_circuited_agent_may_still_retrieve_under_auto(self, mock_llm, mock_functions):
+        """After release, the agent can still choose to retrieve deeper itself (its own query)."""
+        mock_functions["search_mental_models_fn"].return_value = {
+            "query": "test query",
+            "mental_models": [
+                {"id": "mm-1", "name": "Status", "content": "Launch was planned for Friday.", "is_stale": False}
+            ],
+        }
+        mock_llm.call_with_tools.side_effect = [
+            self._mm_call(),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(id="2", name="recall", arguments={"reason": "verify", "query": "launch completion proof"})
+                ],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(id="3", name="done", arguments={"answer": "Confirmed.", "memory_ids": ["mem-1"]})
+                ],
+                finish_reason="tool_calls",
+            ),
+        ]
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="test query",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            has_mental_models=True,
+            budget="low",
+            max_iterations=5,
+            **mock_functions,
+        )
+
+        assert result.text == "Confirmed."
+        # recall ran because the model chose it under auto, not because it was forced,
+        # and it used the model's own targeted query (not a forced override).
+        assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == "auto"
+        mock_functions["recall_fn"].assert_called_once()
+        assert mock_functions["recall_fn"].await_args.args[0] == "launch completion proof"
+        mock_functions["search_observations_fn"].assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_stale_mental_model_keeps_forced_retrieval(self, mock_llm, mock_functions):
+        """A stale mental model must not short-circuit; the full forced path continues."""
+        mock_functions["search_mental_models_fn"].return_value = {
+            "query": "test query",
+            "mental_models": [
+                {
+                    "id": "mm-1",
+                    "name": "Old status",
+                    "content": "Old summary.",
+                    "is_stale": True,
+                    "staleness_reason": "newer facts exist",
+                }
+            ],
+        }
+        mock_llm.call_with_tools.side_effect = [
+            self._mm_call(),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(id="4", name="done", arguments={"answer": "Verified.", "memory_ids": ["mem-1"]})
+                ],
+                finish_reason="tool_calls",
+            ),
+        ]
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="test query",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            has_mental_models=True,
+            budget="low",
+            max_iterations=5,
+            **mock_functions,
+        )
+
+        assert result.text == "Verified."
+        mock_functions["search_observations_fn"].assert_called_once()
+        mock_functions["recall_fn"].assert_called_once()
+        choices = [c.kwargs["tool_choice"] for c in mock_llm.call_with_tools.await_args_list[:3]]
+        assert choices == [
+            {"type": "function", "function": {"name": "search_mental_models"}},
+            {"type": "function", "function": {"name": "search_observations"}},
+            {"type": "function", "function": {"name": "recall"}},
+        ]
+
+    @pytest.mark.asyncio
+    async def test_high_budget_keeps_forced_path_for_fresh_mental_model(self, mock_llm, mock_functions):
+        """High budget preserves the full verification path even for fresh mental models."""
+        mock_functions["search_mental_models_fn"].return_value = {
+            "query": "test query",
+            "mental_models": [
+                {"id": "mm-1", "name": "Prefs", "content": "Fresh and directly relevant.", "is_stale": False}
+            ],
+        }
+        mock_llm.call_with_tools.side_effect = [
+            self._mm_call(),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(id="4", name="done", arguments={"answer": "Verified.", "memory_ids": ["mem-1"]})
+                ],
+                finish_reason="tool_calls",
+            ),
+        ]
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="test query",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            has_mental_models=True,
+            budget="high",
+            max_iterations=5,
+            **mock_functions,
+        )
+
+        assert result.text == "Verified."
+        mock_functions["search_observations_fn"].assert_called_once()
+        mock_functions["recall_fn"].assert_called_once()
+        assert mock_llm.call_with_tools.await_args_list[1].kwargs["tool_choice"] == {
+            "type": "function",
+            "function": {"name": "search_observations"},
+        }
+
+    @pytest.mark.asyncio
+    async def test_no_mental_models_keeps_forced_retrieval(self, mock_llm, mock_functions):
+        """An empty mental-model result must not short-circuit the forced path."""
+        mock_functions["search_mental_models_fn"].return_value = {"query": "test query", "mental_models": []}
+        mock_llm.call_with_tools.side_effect = [
+            self._mm_call(),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="2", name="search_observations", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[LLMToolCall(id="3", name="recall", arguments={"query": "q"})],
+                finish_reason="tool_calls",
+            ),
+            LLMToolCallResult(
+                tool_calls=[
+                    LLMToolCall(id="4", name="done", arguments={"answer": "Done.", "memory_ids": ["mem-1"]})
+                ],
+                finish_reason="tool_calls",
+            ),
+        ]
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="test query",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            has_mental_models=True,
+            budget="low",
+            max_iterations=5,
+            **mock_functions,
+        )
+
+        assert result.text == "Done."
+        mock_functions["search_observations_fn"].assert_called_once()
+        mock_functions["recall_fn"].assert_called_once()
+
     @pytest.mark.asyncio
     async def test_handles_functions_prefix_in_done(self, mock_llm, mock_functions):
         """Test that 'functions.done' is handled correctly."""
@@ -767,3 +1042,138 @@ def __getattr__(self, name: str):
 
         finally:
             await memory.delete_bank(bank_id, request_context=request_context)
+
+
+@pytest.mark.hs_llm_core
+@pytest.mark.flaky(reruns=3, reruns_delay=2)
+class TestMentalModelShortCircuitRealLLM:
+    """End-to-end, real-LLM coverage for the fresh-mental-model short-circuit.
+
+    The deterministic release-to-auto mechanism is covered by the MockLLM tests
+    above. What only a real model can verify is the behaviour *after* release:
+    that a real agent, once it is no longer forced, actually answers off a fresh
+    sufficient mental model — and, when the model is fresh but incomplete, that
+    it chooses to retrieve deeper itself with its own targeted query.
+
+    The search functions are stubbed so the mental-model content is controlled,
+    but ``llm_config`` drives the real agent loop.
+    """
+
+    @staticmethod
+    def _stub_functions(mental_models, recall_memories=None, observations=None):
+        async def search_mental_models_fn(query, max_results):
+            return {"query": query, "count": len(mental_models), "mental_models": mental_models}
+
+        return {
+            "search_mental_models_fn": AsyncMock(side_effect=search_mental_models_fn),
+            "search_observations_fn": AsyncMock(return_value={"observations": observations or []}),
+            "recall_fn": AsyncMock(return_value={"memories": recall_memories or []}),
+            "expand_fn": AsyncMock(return_value={"memories": []}),
+        }
+
+    @pytest.mark.asyncio
+    async def test_real_fresh_mental_model_answers_without_lower_retrieval(self, llm_config):
+        """A fresh, sufficient mental model lets the real agent answer without obs/recall."""
+        functions = self._stub_functions(
+            mental_models=[
+                {
+                    "id": "mm-comm",
+                    "name": "Architecture Communication Preference",
+                    "content": (
+                        "For architecture decisions, the team prefers asynchronous written communication. "
+                        "They use concise ADRs (Architecture Decision Records) and explicitly avoid settling "
+                        "complex design questions in live meetings."
+                    ),
+                    "relevance": 0.94,
+                    "is_stale": False,
+                }
+            ]
+        )
+
+        result = await run_reflect_agent(
+            llm_config=llm_config,
+            bank_id="test-bank",
+            query="According to what you know, how does the team prefer to communicate about architecture decisions?",
+            bank_profile={"name": "Test", "mission": "Answer from curated knowledge"},
+            has_mental_models=True,
+            include_observations=True,
+            include_recall=True,
+            budget="low",
+            max_iterations=6,
+            **functions,
+        )
+
+        assert result.text, "agent must return a non-empty answer"
+        # The core behavioural win: the forced lower-level path was released, and a
+        # real model answered off the fresh mental model instead of digging deeper.
+        functions["search_observations_fn"].assert_not_called()
+        functions["recall_fn"].assert_not_called()
+        await assert_meets_criteria(
+            response=result.text,
+            criteria=(
+                "The answer states the team prefers asynchronous written communication for architecture "
+                "decisions (e.g. ADRs) rather than live meetings."
+            ),
+            context="The only knowledge available was a mental model describing the team's async/ADR preference.",
+        )
+
+    @pytest.mark.asyncio
+    async def test_real_stale_mental_model_forces_and_grounds_in_deeper_evidence(self, llm_config):
+        """A stale mental model must NOT short-circuit: the agent is forced deeper and grounds its answer there.
+
+        This is the safety side of the guard. Forcing the lower layers is
+        deterministic (stale fails the freshness check), so observations/recall
+        run regardless of model discretion; the real-LLM value is confirming the
+        agent corrects the stale summary using the freshly retrieved raw fact.
+        """
+        functions = self._stub_functions(
+            mental_models=[
+                {
+                    "id": "mm-aurora",
+                    "name": "Project Aurora Launch Plan",
+                    "content": "Project Aurora's launch is still pending and has not happened yet.",
+                    "relevance": 0.91,
+                    "is_stale": True,
+                    "staleness_reason": "newer deployment facts exist",
+                }
+            ],
+            recall_memories=[
+                {
+                    "id": "mem-deploy",
+                    "content": "Project Aurora shipped to production on Friday at 16:00 UTC (deploy log A-1029).",
+                }
+            ],
+            observations=[
+                {
+                    "id": "obs-deploy",
+                    "content": "Aurora production deployment confirmed Friday; see deploy log A-1029.",
+                }
+            ],
+        )
+
+        result = await run_reflect_agent(
+            llm_config=llm_config,
+            bank_id="test-bank",
+            query="What is the current status of Project Aurora, and what concrete fact supports it?",
+            bank_profile={"name": "Test", "mission": "Answer with verifiable detail"},
+            has_mental_models=True,
+            include_observations=True,
+            include_recall=True,
+            budget="low",
+            max_iterations=6,
+            **functions,
+        )
+
+        assert result.text, "agent must return a non-empty answer"
+        # Stale mental model → no short-circuit → lower layers are still forced.
+        assert functions["search_observations_fn"].await_count > 0
+        assert functions["recall_fn"].await_count > 0
+        await assert_meets_criteria(
+            response=result.text,
+            criteria=(
+                "The answer states Project Aurora has launched/shipped to production (NOT that it is still "
+                "pending) and cites the Friday production deployment (e.g. deploy log A-1029) as support."
+            ),
+            context="A stale mental model claimed the launch was still pending, but the freshly retrieved raw "
+            "fact (deploy log A-1029) shows it shipped on Friday. The agent should correct the stale summary.",
+        )