Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ artifacts/
HANDOFF.md

# Local CI / dogfood logs and screenshots (per-session, never committed)
.ci-logs/
.ci-logs/
docs/manual_hun/
86 changes: 76 additions & 10 deletions app/features/demo/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,22 @@ def _parse_artifact_key(artifact_uri: str) -> str:
return match.group(1)


# Demo artifact keys are 12 hex chars -- the trained-model file stem
# (``model_{KEY}.joblib``) that ``register`` copies into the registry root.
# Kept next to ``_parse_artifact_key`` so the producer and parser stay in sync.
_DEMO_ARTIFACT_KEY_LEN = 12


def _format_demo_artifact_key(run_id_raw: str) -> str:
"""Build a parseable demo artifact key from a registry run id.

Strips dashes (registry ids may be hyphenated UUIDs) and truncates to
``_DEMO_ARTIFACT_KEY_LEN`` so the result is hex-only and matches the
``_ARTIFACT_KEY_RE`` (``model_([0-9a-f]+)``) parser.
"""
return run_id_raw.replace("-", "")[:_DEMO_ARTIFACT_KEY_LEN]


# PRP-40 — curated 5-file user-guide corpus indexed by the knowledge phase.
# The path_prefix RAG indexing additive contract scopes discovery to this
# subset (memory anchor: [[rag-runtime-config-and-corpus-state]] — keep the
Expand Down Expand Up @@ -1159,15 +1175,22 @@ async def step_scenario_simulate_and_save(ctx: DemoContext, client: _Client) ->
if ctx.date_end is None:
return ("fail", "no date_end on ctx (status step did not populate it)", {})

# (1) Resolve alias -> registry run_id (32-char uuid).
alias_body = await client.request(
"scenario_simulate_and_save[alias]",
"GET",
f"/registry/aliases/{DEMO_ALIAS}",
)
winner_run_id = alias_body.get("run_id")
if not isinstance(winner_run_id, str):
return ("fail", f"{DEMO_ALIAS} alias has no run_id", {})
# (1) Resolve the champion via ctx.winning_run_id (set by step_register), not
# the live demo-production alias -- safer_promote_flow swaps that alias to a
# worse-WAPE run, which broke replay here (#324). The champion run keeps its
# real, parseable artifact_uri. Fall back to the alias only when no champion
# was recorded.
winner_run_id = ctx.winning_run_id
if winner_run_id is None:
alias_body = await client.request(
"scenario_simulate_and_save[alias]",
"GET",
f"/registry/aliases/{DEMO_ALIAS}",
)
alias_run_id = alias_body.get("run_id")
if not isinstance(alias_run_id, str):
return ("fail", f"{DEMO_ALIAS} alias has no run_id", {})
winner_run_id = alias_run_id

# (2) Resolve run -> artifact_uri.
run_body = await client.request(
Expand Down Expand Up @@ -1769,7 +1792,11 @@ async def step_safer_promote_flow(ctx: DemoContext, client: _Client) -> StepResu
json_body={
"status": "success",
"metrics": {"wape": 99.0},
"artifact_uri": "demo/safer-promote-placeholder.joblib",
# #324 — real-shape, parseable artifact_uri (not a placeholder) so a
# downstream ``_parse_artifact_key`` consumer can resolve it.
"artifact_uri": (
f"demo/seasonal_naive-model_{_format_demo_artifact_key(worse_run_id_raw)}.joblib"
),
"artifact_hash": "0" * 64,
"artifact_size_bytes": 1,
},
Expand Down Expand Up @@ -1933,6 +1960,38 @@ async def step_batch_preset(ctx: DemoContext, client: _Client) -> StepResult:
)


async def _restore_demo_alias_after_failure(ctx: DemoContext, client: _Client) -> None:
"""Best-effort restore of the demo-production alias after a mid-run failure.

issue #324 — when a step fails the pipeline aborts before the trailing
``cleanup`` row runs, which would otherwise leave ``demo-production``
pointing at the ``safer_promote_flow`` worse-WAPE run. This restores the
original target captured before the swap. Never raises — a restore failure
must not mask the original step failure.
"""
if ctx.original_demo_alias_run_id is None:
return
try:
await client.request(
"cleanup[alias_restore_safeguard]",
"POST",
"/registry/aliases",
json_body={
"alias_name": DEMO_ALIAS,
"run_id": ctx.original_demo_alias_run_id,
"description": ("Restored by the showcase pipeline failure safeguard (#324)."),
},
)
except (_StepError, httpx.HTTPError, OSError):
# Best-effort — a restore failure must never mask the original failure,
# but capture the exception so intermittent restore issues stay debuggable.
logger.warning(
"demo.cleanup.alias_restore_safeguard_failed",
run_id=ctx.original_demo_alias_run_id,
exc_info=True,
)
Comment thread
sourcery-ai[bot] marked this conversation as resolved.


async def step_cleanup(ctx: DemoContext, client: _Client) -> StepResult:
"""Close the agent session + restore the demo-production alias (PRP-39 R15).

Expand Down Expand Up @@ -2549,6 +2608,13 @@ async def run_pipeline(app: FastAPI, req: DemoRunRequest) -> AsyncIterator[StepE
)
if status == "fail":
any_fail = True
# issue #324 — guarantee demo-production alias restoration even
# when a step fails mid-run. The pipeline aborts here, before the
# trailing ``cleanup`` row runs, which would otherwise leave the
# alias pointing at the safer_promote_flow worse-WAPE run.
# Best-effort; never raises. Skipped if cleanup itself failed.
if name != "cleanup":
await _restore_demo_alias_after_failure(ctx, client)
break

wall = time.monotonic() - wall_start
Expand Down
129 changes: 115 additions & 14 deletions app/features/demo/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,17 +1077,15 @@ def _make_showcase_ctx(scenario: ScenarioPreset = ScenarioPreset.SHOWCASE_RICH)


async def test_scenario_simulate_and_save_happy_path():
"""PRP-40 — happy path: resolves alias -> run -> artifact_key, saves plan."""
ctx = _make_showcase_ctx()
"""PRP-40 + #324 — resolves the champion via ctx.winning_run_id -> run ->
artifact_key, saves the plan. Must NOT read the demo-production alias
(safer_promote_flow deliberately corrupts it)."""
ctx = _make_showcase_ctx() # winning_run_id = "demo-run-abc123def456"
client = _RecordingClient(
None,
responses={
(
"GET",
"/registry/aliases/demo-production",
): {"alias_name": "demo-production", "run_id": "uuid-32-char"},
("GET", "/registry/runs/uuid-32-char"): {
"run_id": "uuid-32-char",
("GET", "/registry/runs/demo-run-abc123def456"): {
"run_id": "demo-run-abc123def456",
"artifact_uri": "demo/seasonal_naive-model_abc123def456.joblib",
},
("POST", "/scenarios"): {
Expand Down Expand Up @@ -1118,11 +1116,15 @@ async def test_scenario_simulate_and_save_happy_path():
assert body["run_id"] == "abc123def456"
assert body["assumptions"]["price"]["change_pct"] == -0.10
assert body["tags"] == ["showcase", "price"]
# #324 — the safer-promote-corrupted demo-production alias must NOT be read.
assert all(path != "/registry/aliases/demo-production" for _m, path, _b in client.calls)


async def test_scenario_simulate_and_save_missing_alias_fails():
"""PRP-40 — alias missing run_id -> FAIL with clear detail."""
async def test_scenario_simulate_and_save_missing_champion_falls_back_to_alias():
"""PRP-40 + #324 — with no champion recorded, fall back to the alias; an
alias missing run_id -> FAIL with clear detail."""
ctx = _make_showcase_ctx()
ctx.winning_run_id = None # force the defensive alias fallback
client = _RecordingClient(
None,
responses={
Expand All @@ -1135,20 +1137,119 @@ async def test_scenario_simulate_and_save_missing_alias_fails():


async def test_scenario_simulate_and_save_unparseable_artifact_uri_fails():
"""PRP-40 — artifact_uri the regex can't parse -> FAIL."""
ctx = _make_showcase_ctx()
"""PRP-40 — the champion run's artifact_uri the regex can't parse -> FAIL."""
ctx = _make_showcase_ctx() # winning_run_id = "demo-run-abc123def456"
client = _RecordingClient(
None,
responses={
("GET", "/registry/aliases/demo-production"): {"run_id": "uuid"},
("GET", "/registry/runs/uuid"): {"artifact_uri": "garbage-path.bin"},
("GET", "/registry/runs/demo-run-abc123def456"): {"artifact_uri": "garbage-path.bin"},
},
)
status, detail, _ = await pipeline.step_scenario_simulate_and_save(ctx, _as_client(client))
assert status == "fail"
assert "artifact-key" in detail


async def test_scenario_simulate_and_save_ignores_corrupted_demo_alias():
"""#324 regression — the step resolves the champion via ctx.winning_run_id
and never consults the safer-promote-corrupted demo-production alias."""
ctx = _make_showcase_ctx() # winning_run_id = "demo-run-abc123def456"
client = _RecordingClient(
None,
responses={
("GET", "/registry/runs/demo-run-abc123def456"): {
"artifact_uri": "demo/seasonal_naive-model_abc123def456.joblib",
},
("POST", "/scenarios"): {
"scenario_id": "scn-001",
"comparison": {"method": "heuristic", "units_delta": 1.0, "revenue_delta": 2.0},
},
},
)
status, _detail, _data = await pipeline.step_scenario_simulate_and_save(ctx, _as_client(client))
assert status == "pass"
assert ctx.scenario_artifact_key == "abc123def456"
assert all(path != "/registry/aliases/demo-production" for _m, path, _b in client.calls)


def test_parse_artifact_key_rejects_safer_promote_placeholder():
"""#324 regression — the OLD PRP-39 placeholder artifact_uri is unparseable
(the exact failure the cascade surfaced); the NEW real-shape safer-promote
URI parses cleanly."""
import pytest

with pytest.raises(ValueError, match="Cannot parse artifact-key"):
pipeline._parse_artifact_key("demo/safer-promote-placeholder.joblib")
assert (
pipeline._parse_artifact_key("demo/seasonal_naive-model_abcdef012345.joblib")
== "abcdef012345"
)


def test_format_demo_artifact_key_round_trips_through_parser():
"""#324 — _format_demo_artifact_key strips dashes + truncates to a hex-only
key that round-trips through _parse_artifact_key (producer/parser in sync)."""
key = pipeline._format_demo_artifact_key("1234abcd-5678-90ef-dead-beef00112233")
assert key == "1234abcd5678"
assert len(key) == pipeline._DEMO_ARTIFACT_KEY_LEN
uri = f"demo/seasonal_naive-model_{key}.joblib"
assert pipeline._parse_artifact_key(uri) == key


class _AliasRestoreSpyClient:
"""Minimal _Client stand-in recording alias-restore POSTs (#324 safeguard)."""

def __init__(self, *, fail: bool = False) -> None:
self.calls: list[tuple[str, str, dict[str, Any] | None]] = []
self._fail = fail

async def request(
self,
step: str,
method: str,
path: str,
*,
json_body: dict[str, Any] | None = None,
) -> dict[str, Any]:
self.calls.append((method, path, json_body))
if self._fail:
raise OSError("simulated transport failure")
return {}


async def test_restore_demo_alias_after_failure_repoints_to_original():
"""#324 — a mid-run failure must restore demo-production to the champion."""
ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
ctx.original_demo_alias_run_id = "champion-run-123"
spy = _AliasRestoreSpyClient()
await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy))
assert len(spy.calls) == 1
method, path, body = spy.calls[0]
assert method == "POST"
assert path == "/registry/aliases"
assert body is not None
assert body["alias_name"] == pipeline.DEMO_ALIAS
assert body["run_id"] == "champion-run-123"


async def test_restore_demo_alias_after_failure_noop_without_swap():
"""#324 — no original alias captured (no swap happened) -> no restore call."""
ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
ctx.original_demo_alias_run_id = None
spy = _AliasRestoreSpyClient()
await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy))
assert spy.calls == []


async def test_restore_demo_alias_after_failure_swallows_errors():
"""#324 — the safeguard must never raise (must not mask the original fail)."""
ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False)
ctx.original_demo_alias_run_id = "champion-run-123"
spy = _AliasRestoreSpyClient(fail=True)
await pipeline._restore_demo_alias_after_failure(ctx, cast("pipeline._Client", spy)) # no raise
assert len(spy.calls) == 1


async def test_multi_plan_compare_happy_path():
"""PRP-40 — happy path: second-plan save + compare returns ranked list."""
ctx = _make_showcase_ctx()
Expand Down
2 changes: 1 addition & 1 deletion docs/_base/RUNBOOKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ uv run python scripts/run_demo.py --seed 42 --quiet 2>&1 | tee demo.log
15. **`batch_preset` step shows ⚠️ "batch poll timed out at 90s" (PRP-39, `showcase_rich` only)** — the batch's 18 sub-jobs together exceeded the poll-timeout budget. Cause: a slow-feature-pipeline branch makes each grain×model pair take longer than expected; on a developer laptop with limited CPU 18 jobs can exceed 90 s under load. Fix: visit `/visualize/batch/{batch_id}` to follow the run to completion; the step is `warn` (non-fatal), so the pipeline still goes green.
16. **`batch_preset` step fails with `HTTP 422 -- Unprocessable Entity` from `/batch/forecasting` (PRP-39, `showcase_rich` only)** — `BatchSubmitRequest` validation rejected the body. Common causes: (a) `BatchScope.kind` casing drift (must be lowercase `"manual"`); (b) `operation` value drift (must be `"train"` / `"predict"` / `"backtest"` / `"train_backtest_register"`, NOT `"forecasting"`); (c) the discovered `store_ids` / `product_ids` list is empty because `step_status` did not seed the grain. Fix: re-tick `Re-seed first`; verify the discovery returns at least 3 stores + 2 products.
17. **`cleanup` step shows `alias restored=False` in detail (PRP-39 R15, `showcase_rich` only)** — the `POST /registry/aliases` restore call returned non-2xx. Cause: the original alias target was archived between the swap and the cleanup (an `agent_require_approval` archive_run tool fire by an operator during the demo). Fix: re-create the alias manually pointing at the V2 winner. The cleanup step warns and continues so the run still goes green.
18. **`scenario_simulate_and_save` step fails with `Cannot parse artifact-key from artifact_uri` (PRP-40, `showcase_rich` only)** — the `demo-production` alias's run has an `artifact_uri` the `_parse_artifact_key` regex can't match (`r"model_([0-9a-f]+)(?:\.joblib)?$"`). Causes: a backfilled run with an irregular `artifact_uri`, or a forecasting-slice change to the model-path convention. Fix: inspect the run via `GET /registry/aliases/demo-production` → `GET /registry/runs/{run_id}`, confirm `artifact_uri` matches one of the V1 (`demo/{model_type}-model_{KEY}.joblib`) or V2 (`artifacts/models/model_{KEY}.joblib`) shapes, then either re-run the showcase (the next `register` step rewrites the artifact_uri) or extend `_ARTIFACT_KEY_RE` if a new shape is intentional.
18. **`scenario_simulate_and_save` step fails with `Cannot parse artifact-key from artifact_uri` (PRP-40, `showcase_rich` only)** — FIXED in #324. The cascade had two root causes: `safer_promote_flow` (PRP-39) swapped the `demo-production` alias to a worse-WAPE run whose placeholder `artifact_uri` (`demo/safer-promote-placeholder.joblib`) the `_parse_artifact_key` regex (`r"model_([0-9a-f]+)(?:\.joblib)?$"`) could not match, and `scenario_simulate_and_save` then resolved that corrupted alias. The fix: the planning step now resolves the champion via `ctx.winning_run_id` (recorded by `register`, never touched by the swap) instead of the live alias, and `safer_promote_flow` writes a real-shape parseable `artifact_uri`. The orchestrator also runs an alias-restore safeguard (`_restore_demo_alias_after_failure`) on any mid-run failure so `demo-production` is never left on the worse-WAPE run. If you still hit this on a forked pipeline, the run's `artifact_uri` is irregular: confirm it matches one of the V1 (`demo/{model_type}-model_{KEY}.joblib`) or V2 (`artifacts/models/model_{KEY}.joblib`) shapes via `GET /registry/runs/{run_id}`, re-run the showcase (the next `register` step rewrites the artifact_uri), or extend `_ARTIFACT_KEY_RE` if a new shape is intentional.
19. **`multi_plan_compare` step shows ⚠️ with `holiday-plan save failed: ...; price-cut plan still saved` (PRP-40, `showcase_rich` only)** — the second `POST /scenarios` returned 4xx (most likely 422). The price-cut plan was still saved (partial success — R19), so the run keeps going green. Fix: read the RFC 7807 body in the detail; common causes are a horizon out of range or a malformed `holiday.dates` payload. Re-running the showcase regenerates both plans from scratch.
20. **`embedding_provider_probe` step shows ✅ but `reachable=False` (PRP-40, `showcase_rich` only)** — expected when no embedding provider is configured. The probe always emits PASS so the pipeline still greens; downstream `rag_index_subset` and `rag_retrieve_probe` will emit ⏭️ skip with `detail="embedding provider unreachable"`. Fix only if you want the knowledge phase to run: set `OPENAI_API_KEY` (when `RAG_EMBEDDING_PROVIDER=openai`) or start Ollama on `OLLAMA_BASE_URL` (when `RAG_EMBEDDING_PROVIDER=ollama`), then re-run.
21. **`rag_index_subset` step fails with `path_prefix escapes the project root` (PRP-40, `showcase_rich` only)** — the demo step hard-codes `path_prefix="docs/user-guide"`, so a real-world hit means `RAGService._base_dir` no longer points at the repo root (e.g. a misconfigured container start). Fix: confirm the backend was started from the repo root (or that `RAGService(base_dir=...)` was constructed with the right path); rerun the showcase. The path-traversal guard is load-bearing security — never relax it.
Expand Down
Loading