w7-mgfcode · w7-mgfcode · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
@@ -45,4 +45,5 @@ artifacts/
 HANDOFF.md
 
 # Local CI / dogfood logs and screenshots (per-session, never committed)
-.ci-logs/
+.ci-logs/
+docs/manual_hun/
@@ -327,6 +327,22 @@ def _parse_artifact_key(artifact_uri: str) -> str:
     return match.group(1)
 
 
+# Demo artifact keys are 12 hex chars -- the trained-model file stem
+# (``model_{KEY}.joblib``) that ``register`` copies into the registry root.
+# Kept next to ``_parse_artifact_key`` so the producer and parser stay in sync.
+_DEMO_ARTIFACT_KEY_LEN = 12
+
+
+def _format_demo_artifact_key(run_id_raw: str) -> str:
+    """Build a parseable demo artifact key from a registry run id.
+
+    Strips dashes (registry ids may be hyphenated UUIDs) and truncates to
+    ``_DEMO_ARTIFACT_KEY_LEN`` so the result is hex-only and matches the
+    ``_ARTIFACT_KEY_RE`` (``model_([0-9a-f]+)``) parser.
+    """
+    return run_id_raw.replace("-", "")[:_DEMO_ARTIFACT_KEY_LEN]
+
+
 # PRP-40 — curated 5-file user-guide corpus indexed by the knowledge phase.
 # The path_prefix RAG indexing additive contract scopes discovery to this
 # subset (memory anchor: [[rag-runtime-config-and-corpus-state]] — keep the
@@ -1159,15 +1175,22 @@ async def step_scenario_simulate_and_save(ctx: DemoContext, client: _Client) ->
     if ctx.date_end is None:
         return ("fail", "no date_end on ctx (status step did not populate it)", {})
 
-    # (1) Resolve alias -> registry run_id (32-char uuid).
-    alias_body = await client.request(
-        "scenario_simulate_and_save[alias]",
-        "GET",
-        f"/registry/aliases/{DEMO_ALIAS}",
-    )
-    winner_run_id = alias_body.get("run_id")
-    if not isinstance(winner_run_id, str):
-        return ("fail", f"{DEMO_ALIAS} alias has no run_id", {})
+    # (1) Resolve the champion via ctx.winning_run_id (set by step_register), not
+    # the live demo-production alias -- safer_promote_flow swaps that alias to a
+    # worse-WAPE run, which broke replay here (#324). The champion run keeps its
+    # real, parseable artifact_uri. Fall back to the alias only when no champion
+    # was recorded.
+    winner_run_id = ctx.winning_run_id
+    if winner_run_id is None:
+        alias_body = await client.request(
+            "scenario_simulate_and_save[alias]",
+            "GET",
+            f"/registry/aliases/{DEMO_ALIAS}",
+        )
+        alias_run_id = alias_body.get("run_id")
+        if not isinstance(alias_run_id, str):
+            return ("fail", f"{DEMO_ALIAS} alias has no run_id", {})
+        winner_run_id = alias_run_id
 
     # (2) Resolve run -> artifact_uri.
     run_body = await client.request(
@@ -1769,7 +1792,11 @@ async def step_safer_promote_flow(ctx: DemoContext, client: _Client) -> StepResu
         json_body={
             "status": "success",
             "metrics": {"wape": 99.0},
-            "artifact_uri": "demo/safer-promote-placeholder.joblib",
+            # #324 — real-shape, parseable artifact_uri (not a placeholder) so a
+            # downstream ``_parse_artifact_key`` consumer can resolve it.
+            "artifact_uri": (
+                f"demo/seasonal_naive-model_{_format_demo_artifact_key(worse_run_id_raw)}.joblib"
+            ),
             "artifact_hash": "0" * 64,
             "artifact_size_bytes": 1,
         },
@@ -1933,6 +1960,38 @@ async def step_batch_preset(ctx: DemoContext, client: _Client) -> StepResult:
     )
 
 
+async def _restore_demo_alias_after_failure(ctx: DemoContext, client: _Client) -> None:
+    """Best-effort restore of the demo-production alias after a mid-run failure.
+
+    issue #324 — when a step fails the pipeline aborts before the trailing
+    ``cleanup`` row runs, which would otherwise leave ``demo-production``
+    pointing at the ``safer_promote_flow`` worse-WAPE run. This restores the
+    original target captured before the swap. Never raises — a restore failure
+    must not mask the original step failure.
+    """
+    if ctx.original_demo_alias_run_id is None:
+        return
+    try:
+        await client.request(
+            "cleanup[alias_restore_safeguard]",
+            "POST",
+            "/registry/aliases",
+            json_body={
+                "alias_name": DEMO_ALIAS,
+                "run_id": ctx.original_demo_alias_run_id,
+                "description": ("Restored by the showcase pipeline failure safeguard (#324)."),
+            },
+        )
+    except (_StepError, httpx.HTTPError, OSError):
+        # Best-effort — a restore failure must never mask the original failure,
+        # but capture the exception so intermittent restore issues stay debuggable.
+        logger.warning(
+            "demo.cleanup.alias_restore_safeguard_failed",
+            run_id=ctx.original_demo_alias_run_id,
+            exc_info=True,
+        )
+
+
 async def step_cleanup(ctx: DemoContext, client: _Client) -> StepResult:
     """Close the agent session + restore the demo-production alias (PRP-39 R15).
 
@@ -2549,6 +2608,13 @@ async def run_pipeline(app: FastAPI, req: DemoRunRequest) -> AsyncIterator[StepE
             )
             if status == "fail":
                 any_fail = True
+                # issue #324 — guarantee demo-production alias restoration even
+                # when a step fails mid-run. The pipeline aborts here, before the
+                # trailing ``cleanup`` row runs, which would otherwise leave the
+                # alias pointing at the safer_promote_flow worse-WAPE run.
+                # Best-effort; never raises. Skipped if cleanup itself failed.
+                if name != "cleanup":
+                    await _restore_demo_alias_after_failure(ctx, client)
                 break
 
     wall = time.monotonic() - wall_start

@@ -1077,17 +1077,15 @@ def _make_showcase_ctx(scenario: ScenarioPreset = ScenarioPreset.SHOWCASE_RICH)
 
 
 async def test_scenario_simulate_and_save_happy_path():
-    """PRP-40 — happy path: resolves alias -> run -> artifact_key, saves plan."""
-    ctx = _make_showcase_ctx()
+    """PRP-40 + #324 — resolves the champion via ctx.winning_run_id -> run ->
+    artifact_key, saves the plan. Must NOT read the demo-production alias
+    (safer_promote_flow deliberately corrupts it)."""
+    ctx = _make_showcase_ctx()  # winning_run_id = "demo-run-abc123def456"
     client = _RecordingClient(
         None,
         responses={
-            (
-                "GET",
-                "/registry/aliases/demo-production",
-            ): {"alias_name": "demo-production", "run_id": "uuid-32-char"},
-            ("GET", "/registry/runs/uuid-32-char"): {
-                "run_id": "uuid-32-char",
+            ("GET", "/registry/runs/demo-run-abc123def456"): {
+                "run_id": "demo-run-abc123def456",
                 "artifact_uri": "demo/seasonal_naive-model_abc123def456.joblib",
             },
             ("POST", "/scenarios"): {
@@ -1118,11 +1116,15 @@ async def test_scenario_simulate_and_save_happy_path():
     assert body["run_id"] == "abc123def456"
     assert body["assumptions"]["price"]["change_pct"] == -0.10
     assert body["tags"] == ["showcase", "price"]
+    # #324 — the safer-promote-corrupted demo-production alias must NOT be read.
+    assert all(path != "/registry/aliases/demo-production" for _m, path, _b in client.calls)
 
 
-async def test_scenario_simulate_and_save_missing_alias_fails():
-    """PRP-40 — alias missing run_id -> FAIL with clear detail."""
+async def test_scenario_simulate_and_save_missing_champion_falls_back_to_alias():
+    """PRP-40 + #324 — with no champion recorded, fall back to the alias; an
+    alias missing run_id -> FAIL with clear detail."""
     ctx = _make_showcase_ctx()
+    ctx.winning_run_id = None  # force the defensive alias fallback
     client = _RecordingClient(
         None,
         responses={
@@ -1135,20 +1137,119 @@ async def test_scenario_simulate_and_save_missing_alias_fails():
 
 
 async def test_scenario_simulate_and_save_unparseable_artifact_uri_fails():
-    """PRP-40 — artifact_uri the regex can't parse -> FAIL."""
-    ctx = _make_showcase_ctx()
+    """PRP-40 — the champion run's artifact_uri the regex can't parse -> FAIL."""
+    ctx = _make_showcase_ctx()  # winning_run_id = "demo-run-abc123def456"
     client = _RecordingClient(
         None,
         responses={
-            ("GET", "/registry/aliases/demo-production"): {"run_id": "uuid"},
-            ("GET", "/registry/runs/uuid"): {"artifact_uri": "garbage-path.bin"},
+            ("GET", "/registry/runs/demo-run-abc123def456"): {"artifact_uri": "garbage-path.bin"},
         },
     )
     status, detail, _ = await pipeline.step_scenario_simulate_and_save(ctx, _as_client(client))
     assert status == "fail"
     assert "artifact-key" in detail
 
 
+async def test_scenario_simulate_and_save_ignores_corrupted_demo_alias():
+    """#324 regression — the step resolves the champion via ctx.winning_run_id
+    and never consults the safer-promote-corrupted demo-production alias."""
+    ctx = _make_showcase_ctx()  # winning_run_id = "demo-run-abc123def456"
+    client = _RecordingClient(
+        None,
+        responses={
+            ("GET", "/registry/runs/demo-run-abc123def456"): {
+                "artifact_uri": "demo/seasonal_naive-model_abc123def456.joblib",
+            },
+            ("POST", "/scenarios"): {
+                "scenario_id": "scn-001",
+                "comparison": {"method": "heuristic", "units_delta": 1.0, "revenue_delta": 2.0},
+            },
+        },
+    )
+    status, _detail, _data = await pipeline.step_scenario_simulate_and_save(ctx, _as_client(client))
+    assert status == "pass"
+    assert ctx.scenario_artifact_key == "abc123def456"
+    assert all(path != "/registry/aliases/demo-production" for _m, path, _b in client.calls)
+
+
+def test_parse_artifact_key_rejects_safer_promote_placeholder():
+    """#324 regression — the OLD PRP-39 placeholder artifact_uri is unparseable
+    (the exact failure the cascade surfaced); the NEW real-shape safer-promote
+    URI parses cleanly."""
+    import pytest
+
+    with pytest.raises(ValueError, match="Cannot parse artifact-key"):
+        pipeline._parse_artifact_key("demo/safer-promote-placeholder.joblib")
+    assert (
+        pipeline._parse_artifact_key("demo/seasonal_naive-model_abcdef012345.joblib")
+        == "abcdef012345"
+    )
+
+
+def test_format_demo_artifact_key_round_trips_through_parser():
+    """#324 — _format_demo_artifact_key strips dashes + truncates to a hex-only
+    key that round-trips through _parse_artifact_key (producer/parser in sync)."""
+    key = pipeline._format_demo_artifact_key("1234abcd-5678-90ef-dead-beef00112233")
+    assert key == "1234abcd5678"
+    assert len(key) == pipeline._DEMO_ARTIFACT_KEY_LEN
+    uri = f"demo/seasonal_naive-model_{key}.joblib"
+    assert pipeline._parse_artifact_key(uri) == key
+
+
+class _AliasRestoreSpyClient:
+    """Minimal _Client stand-in recording alias-restore POSTs (#324 safeguard)."""
+
+    def __init__(self, *, fail: bool = False) -> None:
+        self.calls: list[tuple[str, str, dict[str, Any] | None]] = []
+        self._fail = fail
+
+    async def request(
+        self,
+        step: str,
+        method: str,
+        path: str,
+        *,
+        json_body: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        self.calls.append((method, path, json_body))
+        if self._fail:
+            raise OSError("simulated transport failure")
+        return {}
+
+
+async def test_restore_demo_alias_after_failure_repoints_to_original():
+    """#324 — a mid-run failure must restore demo-production to the champion."""
+    ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
+    ctx.original_demo_alias_run_id = "champion-run-123"
+    spy = _AliasRestoreSpyClient()
+    await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy))
+    assert len(spy.calls) == 1
+    method, path, body = spy.calls[0]
+    assert method == "POST"
+    assert path == "/registry/aliases"
+    assert body is not None
+    assert body["alias_name"] == pipeline.DEMO_ALIAS
+    assert body["run_id"] == "champion-run-123"
+
+
+async def test_restore_demo_alias_after_failure_noop_without_swap():
+    """#324 — no original alias captured (no swap happened) -> no restore call."""
+    ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
+    ctx.original_demo_alias_run_id = None
+    spy = _AliasRestoreSpyClient()
+    await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy))
+    assert spy.calls == []
+
+
+async def test_restore_demo_alias_after_failure_swallows_errors():
+    """#324 — the safeguard must never raise (must not mask the original fail)."""
+    ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
+    ctx.original_demo_alias_run_id = "champion-run-123"
+    spy = _AliasRestoreSpyClient(fail=True)
+    await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy))  # no raise
+    assert len(spy.calls) == 1
+
+
 async def test_multi_plan_compare_happy_path():
     """PRP-40 — happy path: second-plan save + compare returns ranked list."""
     ctx = _make_showcase_ctx()

@@ -123,7 +123,7 @@ uv run python scripts/run_demo.py --seed 42 --quiet 2>&1 | tee demo.log
 15. **`batch_preset` step shows ⚠️ "batch poll timed out at 90s" (PRP-39, `showcase_rich` only)** — the batch's 18 sub-jobs together exceeded the poll-timeout budget. Cause: a slow-feature-pipeline branch makes each grain×model pair take longer than expected; on a developer laptop with limited CPU 18 jobs can exceed 90 s under load. Fix: visit `/visualize/batch/{batch_id}` to follow the run to completion; the step is `warn` (non-fatal), so the pipeline still goes green.
 16. **`batch_preset` step fails with `HTTP 422 -- Unprocessable Entity` from `/batch/forecasting` (PRP-39, `showcase_rich` only)** — `BatchSubmitRequest` validation rejected the body. Common causes: (a) `BatchScope.kind` casing drift (must be lowercase `"manual"`); (b) `operation` value drift (must be `"train"` / `"predict"` / `"backtest"` / `"train_backtest_register"`, NOT `"forecasting"`); (c) the discovered `store_ids` / `product_ids` list is empty because `step_status` did not seed the grain. Fix: re-tick `Re-seed first`; verify the discovery returns at least 3 stores + 2 products.
 17. **`cleanup` step shows `alias restored=False` in detail (PRP-39 R15, `showcase_rich` only)** — the `POST /registry/aliases` restore call returned non-2xx. Cause: the original alias target was archived between the swap and the cleanup (an `agent_require_approval` archive_run tool fire by an operator during the demo). Fix: re-create the alias manually pointing at the V2 winner. The cleanup step warns and continues so the run still goes green.
-18. **`scenario_simulate_and_save` step fails with `Cannot parse artifact-key from artifact_uri` (PRP-40, `showcase_rich` only)** — the `demo-production` alias's run has an `artifact_uri` the `_parse_artifact_key` regex can't match (`r"model_([0-9a-f]+)(?:\.joblib)?$"`). Causes: a backfilled run with an irregular `artifact_uri`, or a forecasting-slice change to the model-path convention. Fix: inspect the run via `GET /registry/aliases/demo-production` → `GET /registry/runs/{run_id}`, confirm `artifact_uri` matches one of the V1 (`demo/{model_type}-model_{KEY}.joblib`) or V2 (`artifacts/models/model_{KEY}.joblib`) shapes, then either re-run the showcase (the next `register` step rewrites the artifact_uri) or extend `_ARTIFACT_KEY_RE` if a new shape is intentional.
+18. **`scenario_simulate_and_save` step fails with `Cannot parse artifact-key from artifact_uri` (PRP-40, `showcase_rich` only)** — FIXED in #324. The cascade had two root causes: `safer_promote_flow` (PRP-39) swapped the `demo-production` alias to a worse-WAPE run whose placeholder `artifact_uri` (`demo/safer-promote-placeholder.joblib`) the `_parse_artifact_key` regex (`r"model_([0-9a-f]+)(?:\.joblib)?$"`) could not match, and `scenario_simulate_and_save` then resolved that corrupted alias. The fix: the planning step now resolves the champion via `ctx.winning_run_id` (recorded by `register`, never touched by the swap) instead of the live alias, and `safer_promote_flow` writes a real-shape parseable `artifact_uri`. The orchestrator also runs an alias-restore safeguard (`_restore_demo_alias_after_failure`) on any mid-run failure so `demo-production` is never left on the worse-WAPE run. If you still hit this on a forked pipeline, the run's `artifact_uri` is irregular: confirm it matches one of the V1 (`demo/{model_type}-model_{KEY}.joblib`) or V2 (`artifacts/models/model_{KEY}.joblib`) shapes via `GET /registry/runs/{run_id}`, re-run the showcase (the next `register` step rewrites the artifact_uri), or extend `_ARTIFACT_KEY_RE` if a new shape is intentional.
 19. **`multi_plan_compare` step shows ⚠️ with `holiday-plan save failed: ...; price-cut plan still saved` (PRP-40, `showcase_rich` only)** — the second `POST /scenarios` returned 4xx (most likely 422). The price-cut plan was still saved (partial success — R19), so the run keeps going green. Fix: read the RFC 7807 body in the detail; common causes are a horizon out of range or a malformed `holiday.dates` payload. Re-running the showcase regenerates both plans from scratch.
 20. **`embedding_provider_probe` step shows ✅ but `reachable=False` (PRP-40, `showcase_rich` only)** — expected when no embedding provider is configured. The probe always emits PASS so the pipeline still greens; downstream `rag_index_subset` and `rag_retrieve_probe` will emit ⏭️ skip with `detail="embedding provider unreachable"`. Fix only if you want the knowledge phase to run: set `OPENAI_API_KEY` (when `RAG_EMBEDDING_PROVIDER=openai`) or start Ollama on `OLLAMA_BASE_URL` (when `RAG_EMBEDDING_PROVIDER=ollama`), then re-run.
 21. **`rag_index_subset` step fails with `path_prefix escapes the project root` (PRP-40, `showcase_rich` only)** — the demo step hard-codes `path_prefix="docs/user-guide"`, so a real-world hit means `RAGService._base_dir` no longer points at the repo root (e.g. a misconfigured container start). Fix: confirm the backend was started from the repo root (or that `RAGService(base_dir=...)` was constructed with the right path); rerun the showcase. The path-traversal guard is load-bearing security — never relax it.