From b51730d184eed82ea53dafa9536f95e3d7c90cc6 Mon Sep 17 00:00:00 2001
From: willamhou <willamhou@ceresman.com>
Date: Sat, 23 May 2026 22:09:42 +0800
Subject: [PATCH] Clean up current documentation status

---
 README.ja-JP.md            |   45 +-
 README.md                  |   27 +-
 README.zh-CN.md            |   45 +-
 docs/agents.md             |    2 +-
 docs/architecture.md       |  298 +++--
 docs/current-status.md     |  252 ++---
 docs/install.md            |   21 +-
 docs/mvp.md                |  126 +--
 docs/release.md            |   24 +-
 docs/repl.md               |   12 +-
 docs/roadmap.md            | 2117 ++----------------------------------
 docs/superpowers/README.md |   17 +
 docs/todos.md              |    2 +-
 13 files changed, 464 insertions(+), 2524 deletions(-)
 create mode 100644 docs/superpowers/README.md

diff --git a/README.ja-JP.md b/README.ja-JP.md
index 30078dc..acbaf3e 100644
--- a/README.ja-JP.md
+++ b/README.ja-JP.md
@@ -123,10 +123,11 @@ DeepSeekCode は自身の開発に使える段階ですが、Claude Code CLI / C
 ほどの製品成熟度にはまだ届いていません。Linux/macOS のローカル coding-agent
 CLI に絞ると、残差は主に evidence depth と配布面の polish です。
 
-- macOS shell/runtime の hosted CI evidence は PR #14 / CI run #35 で通過済み。
-  release binary evidence は次回 release matrix run で取得します。
-- disposable Python invoice fixture の online multi-file external evidence は記録済み。
-  追加の外部サンプルは任意の hardening です。
+- Linux/macOS shell/runtime と multi-file fixture scaffold の hosted CI
+  evidence は記録済みです。PR #16 / CI run #39 が現在の全 platform green
+  run で、release binary evidence は次回 release matrix run で取得します。
+- disposable Python invoice fixture の online multi-file external evidence と
+  verifier 結果は記録済みです。追加の外部サンプルは任意の hardening です。
 - Homebrew 公開。tap 資格情報が未設定です。
 - コミット済み model-backed SVG を超える、任意の polish 済み GIF/MP4 キャプチャ。
 
@@ -188,10 +189,13 @@ node packaging/homebrew/verify-formula.js
 
 ```bash
 deepseek update publish-status
-deepseek update publish-status --dist dist-assets --npm-dist npm-dist --strict
+deepseek update publish-status --dist dist-assets --npm-dist npm-dist \
+  --live-evidence-verification .dscode/dogfood/live-evidence-verification.json \
+  --strict
 deepseek update publish-status --json
 deepseek agents service-doctor --kind all --workdir "$PWD" --bin "$(command -v deepseek)" --json
-deepseek agents service-smoke --workdir "$PWD" --bin "$(command -v deepseek)" --json
+mkdir -p /tmp/dsc-smk
+deepseek agents service-smoke --workdir /tmp/dsc-smk --bin "$(command -v deepseek)" --json
 deepseek agents shell-fixture-smoke --json
 deepseek tui --entrypoint-smoke --smoke-bin "$(command -v deepseek)"
 ```
@@ -209,20 +213,37 @@ repository を使います。まず dry-run で preflight し、その後 isolat
 実行して dogfood report に記録します。
 
 ```bash
-scripts/create-multifile-external-fixture.sh /tmp/deepseek-external-fixtures/python-invoice-multifile
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --dry-run \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --benchmark-gate \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
+fixture_dir=/tmp/deepseek-external-fixtures/python-invoice-multifile
+scripts/create-multifile-external-fixture.sh "$fixture_dir"
+task='replace `return amount - discount` with `return max(amount - discount, 0.0)` in src/invoice_math/pricing.py and replace `Invoice total` with `Final total` in src/invoice_math/summary.py, validate with python3 -m unittest discover -s tests'
+deepseek dogfood external-fixture --workdir "$fixture_dir" --dry-run "$task"
+deepseek dogfood external-fixture --workdir "$fixture_dir" \
+  --evidence-out .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  "$task"
+deepseek dogfood external-evidence \
+  --file .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  --out .dscode/dogfood/external-fixture-python-invoice-multifile-verification.json \
+  --require-successful-external-fixtures 1
 deepseek dogfood report --limit 10
+deepseek dogfood live-plan --limit 10
+deepseek dogfood live-run --limit 3 --json
+deepseek dogfood live-run --limit 3 --evidence-out .dscode/dogfood/live-evidence.json --execute
+deepseek dogfood live-evidence --file .dscode/dogfood/live-evidence.json \
+  --out .dscode/dogfood/live-evidence-verification.json \
+  --require-benchmark-gate --require-report-gate
 deepseek dogfood report --limit 20 \
   --require-min-runs 100 \
   --require-success-rate 90 \
+  --require-live-runs 100 \
+  --require-live-success-rate 90 \
   --require-recent-clean 20 \
   --require-external-write-fixtures 3 \
   --require-category write_validate:25:90 \
   --require-category recovery:25:90 \
-  --require-category pr_workflow:25:90
+  --require-category pr_workflow:25:90 \
+  --require-live-category write_validate:25:90 \
+  --require-live-category recovery:25:90 \
+  --require-live-category pr_workflow:25:90
 ```
 
 ## ドキュメント
diff --git a/README.md b/README.md
index 2a73056..041bef4 100644
--- a/README.md
+++ b/README.md
@@ -122,11 +122,12 @@ DeepSeekCode is close enough to use as its own coding CLI, but it is not yet at
 Claude Code CLI / Codex CLI polish. For a Linux/macOS local coding-agent CLI,
 the remaining gaps are mostly evidence depth and distribution polish:
 
-- macOS shell/runtime CI evidence beyond the entrypoint smoke is now recorded
-  in PR #14 / CI run #35; release-binary evidence will come from the next
-  release matrix run;
-- online multi-file external fixture evidence is now recorded for the
-  disposable Python invoice fixture; additional external samples are optional;
+- Linux/macOS shell/runtime and multi-file fixture scaffold evidence is now
+  recorded in hosted CI; PR #16 / CI run #39 is the current all-platform green
+  run, and release-binary evidence will come from the next release matrix run;
+- online multi-file external fixture evidence is now recorded and verified for
+  the disposable Python invoice fixture; additional external samples are
+  optional hardening;
 - Homebrew publishing, still blocked on tap credentials;
 - optional polished GIF/MP4 capture beyond the committed model-backed SVG.
 
@@ -213,11 +214,17 @@ outside this checkout. The command dry-runs preflight first, then runs against
 an isolated copy and records the result in the dogfood report:
 
 ```bash
-scripts/create-multifile-external-fixture.sh /tmp/deepseek-external-fixtures/python-invoice-multifile
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --dry-run \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --benchmark-gate \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
+fixture_dir=/tmp/deepseek-external-fixtures/python-invoice-multifile
+scripts/create-multifile-external-fixture.sh "$fixture_dir"
+task='replace `return amount - discount` with `return max(amount - discount, 0.0)` in src/invoice_math/pricing.py and replace `Invoice total` with `Final total` in src/invoice_math/summary.py, validate with python3 -m unittest discover -s tests'
+deepseek dogfood external-fixture --workdir "$fixture_dir" --dry-run "$task"
+deepseek dogfood external-fixture --workdir "$fixture_dir" \
+  --evidence-out .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  "$task"
+deepseek dogfood external-evidence \
+  --file .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  --out .dscode/dogfood/external-fixture-python-invoice-multifile-verification.json \
+  --require-successful-external-fixtures 1
 deepseek dogfood report --limit 10
 deepseek dogfood live-plan --limit 10
 deepseek dogfood live-run --limit 3
diff --git a/README.zh-CN.md b/README.zh-CN.md
index ad00d2e..a15d2c9 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -118,10 +118,11 @@ DeepSeekCode 已经可以直接拿来写自己的代码，但还没有达到 Cla
 Codex CLI 的产品成熟度。如果只看 Linux/macOS 本地 coding-agent CLI，剩余差距主要是
 证据厚度和分发打磨：
 
-- macOS shell/runtime 的 hosted CI 证据已经在 PR #14 / CI run #35 通过；
-  release binary 证据等待下一次 release matrix 产出；
-- disposable Python invoice fixture 已有 online multi-file external evidence；
-  继续增加外部样本属于可选加固；
+- Linux/macOS shell/runtime 和 multi-file fixture scaffold 已有 hosted CI
+  证据；PR #16 / CI run #39 是当前全平台绿色 run，release binary 证据等待
+  下一次 release matrix 产出；
+- disposable Python invoice fixture 已有 online multi-file external evidence
+  和 verifier 结果；继续增加外部样本属于可选加固；
 - Homebrew 发布仍缺 tap 凭据；
 - 已提交 model-backed SVG 之外，可选的更精致 GIF/MP4 录屏素材。
 
@@ -181,10 +182,13 @@ node packaging/homebrew/verify-formula.js
 
 ```bash
 deepseek update publish-status
-deepseek update publish-status --dist dist-assets --npm-dist npm-dist --strict
+deepseek update publish-status --dist dist-assets --npm-dist npm-dist \
+  --live-evidence-verification .dscode/dogfood/live-evidence-verification.json \
+  --strict
 deepseek update publish-status --json
 deepseek agents service-doctor --kind all --workdir "$PWD" --bin "$(command -v deepseek)" --json
-deepseek agents service-smoke --workdir "$PWD" --bin "$(command -v deepseek)" --json
+mkdir -p /tmp/dsc-smk
+deepseek agents service-smoke --workdir /tmp/dsc-smk --bin "$(command -v deepseek)" --json
 deepseek agents shell-fixture-smoke --json
 deepseek tui --entrypoint-smoke --smoke-bin "$(command -v deepseek)"
 ```
@@ -201,20 +205,37 @@ deepseek pr live-status owner/repo#42 --json
 命令会先 dry-run 检查，然后在 isolated copy 中执行，并把结果写入 dogfood report：
 
 ```bash
-scripts/create-multifile-external-fixture.sh /tmp/deepseek-external-fixtures/python-invoice-multifile
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --dry-run \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --benchmark-gate \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
+fixture_dir=/tmp/deepseek-external-fixtures/python-invoice-multifile
+scripts/create-multifile-external-fixture.sh "$fixture_dir"
+task='replace `return amount - discount` with `return max(amount - discount, 0.0)` in src/invoice_math/pricing.py and replace `Invoice total` with `Final total` in src/invoice_math/summary.py, validate with python3 -m unittest discover -s tests'
+deepseek dogfood external-fixture --workdir "$fixture_dir" --dry-run "$task"
+deepseek dogfood external-fixture --workdir "$fixture_dir" \
+  --evidence-out .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  "$task"
+deepseek dogfood external-evidence \
+  --file .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  --out .dscode/dogfood/external-fixture-python-invoice-multifile-verification.json \
+  --require-successful-external-fixtures 1
 deepseek dogfood report --limit 10
+deepseek dogfood live-plan --limit 10
+deepseek dogfood live-run --limit 3 --json
+deepseek dogfood live-run --limit 3 --evidence-out .dscode/dogfood/live-evidence.json --execute
+deepseek dogfood live-evidence --file .dscode/dogfood/live-evidence.json \
+  --out .dscode/dogfood/live-evidence-verification.json \
+  --require-benchmark-gate --require-report-gate
 deepseek dogfood report --limit 20 \
   --require-min-runs 100 \
   --require-success-rate 90 \
+  --require-live-runs 100 \
+  --require-live-success-rate 90 \
   --require-recent-clean 20 \
   --require-external-write-fixtures 3 \
   --require-category write_validate:25:90 \
   --require-category recovery:25:90 \
-  --require-category pr_workflow:25:90
+  --require-category pr_workflow:25:90 \
+  --require-live-category write_validate:25:90 \
+  --require-live-category recovery:25:90 \
+  --require-live-category pr_workflow:25:90
 ```
 
 ## 文档
diff --git a/docs/agents.md b/docs/agents.md
index 475d327..5e67ee2 100644
--- a/docs/agents.md
+++ b/docs/agents.md
@@ -306,7 +306,7 @@ deepseek agents clear-current
 
 ## Dispatch
 
-`dispatch_subagent` accepts an optional `agent` argument. When set, DeepseekCode
+`dispatch_subagent` accepts an optional `agent` argument. When set, DeepSeekCode
 loads the matching project or user agent and injects its prompt into the child
 task. Project agents take precedence over user agents with the same name.
 
diff --git a/docs/architecture.md b/docs/architecture.md
index 500b691..7f389a3 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,191 +1,147 @@
-# 架构设计
-
-## 总体原则
-
-架构按四层划分：
-
-1. `CLI / UI`
-2. `Core Runtime`
-3. `Model Adapter`
-4. `Tooling + Strategy`
-
-设计目标：
-
-- 模型适配与 agent 运行时解耦
-- 工具原子化，方便权限控制和审计
-- `Skill` 与 `Language Profile` 作为策略层存在，不污染核心运行时
-- 先保证闭环稳定，再做生态扩展
-
-## 目录建议
-
-```text
-DeepSeek Code/
-  Cargo.toml
-  src/
-    main.rs
-
-    cli/
-      mod.rs
-      app.rs
-      commands/
-        chat.rs
-        run.rs
-        diff.rs
-        resume.rs
-        config.rs
-        doctor.rs
-
-    core/
-      mod.rs
-      agent.rs
-      loop.rs
-      planner.rs
-      executor.rs
-      memory.rs
-      session.rs
-      context.rs
-      approval.rs
-
-    model/
-      mod.rs
-      client.rs
-      deepseek.rs
-      protocol.rs
-      stream.rs
-
-    tools/
-      mod.rs
-      registry.rs
-      types.rs
-      read_file.rs
-      list_files.rs
-      search_text.rs
-      apply_patch.rs
-      run_shell.rs
-      git_diff.rs
-
-    language/
-      mod.rs
-      detect.rs
-      profile.rs
-      infer.rs
-      profiles/
-        generic.rs
-        rust.rs
-        python.rs
-        typescript.rs
-        go.rs
-        java.rs
-
-    skills/
-      mod.rs
-      schema.rs
-      loader.rs
-      registry.rs
-      resolver.rs
-
-    config/
-      mod.rs
-      types.rs
-      load.rs
-      paths.rs
-
-    ui/
-      mod.rs
-      render.rs
-      diff.rs
-      confirm.rs
-      stream.rs
-
-    error/
-      mod.rs
-
-  skills/
-  profiles/
-```
-
-## 分层说明
-
-### CLI / UI
-
-职责：
-
-- 解析参数
-- 呈现交互式会话
-- 展示 diff、确认框、流式输出
-- 提供 `chat/run/diff/resume/config/doctor` 命令
-
-### Core Runtime
-
-职责：
-
-- 维护 agent loop
-- 管理上下文和记忆
-- 驱动工具调用
-- 处理 approval policy
-- 保存会话状态
-
-`Core` 应尽量不依赖具体模型供应商。
+# Architecture
+
+DeepSeekCode is a terminal-first code-agent CLI. The implementation is organized
+around a local agent loop, a durable runtime store, permissioned tools, and
+multiple human/operator surfaces over the same core behavior.
+
+## Layers
+
+### CLI And UI
+
+The public entrypoint is `deepseek`.
+
+- Bare `deepseek` opens the full-screen TUI when stdin/stdout are real TTYs.
+- `deepseek chat` opens the line-oriented REPL.
+- `deepseek run` / `deepseek exec` run one-shot or scriptable tasks.
+- `deepseek agents`, `deepseek task`, `deepseek github`, `deepseek mcp`,
+  `deepseek dogfood`, and `deepseek update` expose operational surfaces for
+  runtime workers, CI, release, and evidence.
+
+The TUI and REPL are separate frontends over the same agent/runtime/tooling
+contracts. The TUI emphasizes durable sessions, approvals, task panels, MCP
+manager screens, and live runtime refresh. The REPL emphasizes fast local prompt
+iteration, slash commands, raw-mode line editing, and session save/load.
+
+### Agent Loop
+
+The agent loop builds a `ModelRequest` from:
+
+- user task text;
+- workspace instructions (`AGENTS.md`, compatible Claude instruction files, and
+  optional user instructions);
+- selected profile/model/provider settings;
+- recent transcript/runtime context;
+- tool observations and recovery hints;
+- skill/custom command prompt additions.
+
+The model returns either a final response or one or more tool calls. Tool
+calls are routed through the same permission, policy, hook, execution,
+observation, and recovery paths regardless of whether the request came from the
+TUI, REPL, exec, runtime daemon, GitHub Action bridge, or dogfood.
 
 ### Model Adapter
 
-职责：
+The model layer normalizes DeepSeek/OpenAI-compatible and Anthropic-compatible
+responses into the same internal action types.
+
+Supported behavior includes:
+
+- OpenAI-compatible function/tool calls;
+- same-turn batch tool calls;
+- Anthropic-style `tool_use` content blocks;
+- streaming assistant deltas and reasoning/thinking deltas;
+- provider/model aliases, including DeepSeek V4 aliases and `auto` routing.
+
+DeepSeek is the default provider, but the rest of the runtime is not tied to
+any single provider-specific response shape.
+
+### Tools And Policy
+
+Tools are small, auditable operations behind a registry. Core tools include:
+
+- file and repository inspection (`list_files`, `read_file`, `search_text`,
+  `git_diff`, project map/status helpers);
+- edits (`apply_patch`, write/move/copy/delete helpers where enabled);
+- shell execution and shell-supervisor control;
+- diagnostics, tests, review helpers, todo/plan state, notes, memory, rollback;
+- MCP/ACP bridge tools;
+- runtime task, automation, subagent, and user-input tools;
+- web/search/finance/image/document helper tools where configured.
+
+Before side effects run, policy and approval checks decide whether the tool may
+execute. Hooks can add context, deny actions, or inject shell environment keys.
+Tool results are summarized into observations and persisted into runtime records
+when a durable thread is active.
+
+### Durable Runtime
+
+The runtime store lives under `.dscode/runtime/` and records:
+
+- sessions and linked threads;
+- turns and item timelines;
+- usage and cost/cache telemetry;
+- events such as permission requests, approvals, cancellations, and user-input
+  requests;
+- task and automation records.
 
-- 与 DeepSeek API 通信
-- 统一消息格式
-- 解析模型输出
-- 适配流式输出与 tool-call 风格
+The same store is exposed through:
 
-这里先只做 `DeepSeek`，但接口上保留扩展空间。
+- local file-backed TUI access;
+- `deepseek serve --http` REST/SSE endpoints;
+- `deepseek agents daemon` and `deepseek agents run-task`;
+- local service templates for systemd/launchd;
+- release and smoke checks.
 
-### Tooling + Strategy
+Runtime events use append-only records so UI clients and background workers can
+coordinate without sharing process memory.
 
-职责：
+### Shell Supervisor
 
-- 提供可执行工具
-- 识别语言与仓库类型
-- 加载 `Skill` 和 `Language Profile`
-- 选择合理命令与文件优先级
+The shell supervisor is the long-running shell control plane. It supports:
 
-## 核心运行循环
+- safe background shell jobs;
+- start/wait/attach/replay/stdin/resize/cancel;
+- Linux native PTY jobs;
+- byte-stream and raw-proxy modes;
+- Linux `pty_fd` handoff for direct PTY master control;
+- Windows ConPTY/TCP smoke paths;
+- service smoke and installed-service probes.
 
-第一版建议采用简单 loop：
+This lets the agent keep terminal work observable, cancellable, replayable, and
+separate from the parent CLI process.
 
-1. 用户输入任务
-2. 检测语言 profile
-3. 解析可选 skill
-4. 组装 system prompt
-5. 调用模型
-6. 如果模型请求工具，则执行工具
-7. 将工具结果回填模型
-8. 循环直到完成或达到 step limit
-9. 展示变更 diff 和最终总结
+### MCP And ACP
 
-第一版不要过早把 planner/executor 做得很重，`loop.rs` 可以先承载主流程。
+DeepSeekCode can act as:
 
-## 核心抽象
+- an MCP server exposing workspace/runtime tools, resources, and prompts;
+- an MCP client for configured stdio/HTTP/SSE servers;
+- an ACP stdio adapter for runtime sessions and tool events.
 
-建议保留两个基础 trait：
+Agent-visible MCP calls go through approval and allowlist policy. Remote MCP
+tools can be exposed as dynamic `mcp__server__tool` tools when explicitly
+enabled.
 
-```rust
-pub trait ModelClient {
-    async fn respond(&self, input: ModelRequest) -> anyhow::Result<ModelResponse>;
-}
-```
+### Automation And CI
 
-```rust
-pub trait Tool {
-    fn name(&self) -> &'static str;
-    async fn execute(&self, input: ToolInput) -> anyhow::Result<ToolOutput>;
-}
-```
+The automation layer includes:
 
-统一通过 `ToolRegistry` 做分发和权限控制。
+- `deepseek task` background worktree runner;
+- `deepseek github action` bridge for review/fix/patch workflows;
+- local fixture smoke commands for tasks, GitHub bridge, hooks, MCP, subagents,
+  shell supervisor, and services;
+- dogfood live/external evidence recorders and verifiers;
+- release packaging and public install readiness checks.
 
-## 关键工程原则
+The release goal is to make important behavior repeatable through one-command
+gates rather than relying on ad hoc manual proof.
 
-- 文件修改优先走 patch，不做整文件覆盖式写入
-- shell 执行必须受控，并支持审批
-- 所有工具调用都要保留日志和可审计记录
-- 策略配置数据驱动，避免把语言和任务逻辑写死在 prompt 里
+## Safety Principles
 
+- Prefer patches over whole-file overwrites.
+- Keep side effects permissioned and auditable.
+- Keep credentials out of transcripts, evidence, and committed files.
+- Bind release evidence to ledger fingerprints and verifier output.
+- Treat historical specs as audit records, not current truth.
+- Keep Linux/macOS local CLI readiness separate from broader Windows, hosted
+  IDE, and publishing hardening.
diff --git a/docs/current-status.md b/docs/current-status.md
index fca18b9..3aa37b0 100644
--- a/docs/current-status.md
+++ b/docs/current-status.md
@@ -8,172 +8,92 @@ DeepSeekCode 的目标是成为一个 DeepSeek-first 的 code agent CLI：用户
 `deepseek` 后，可以像使用 Claude Code CLI、Codex CLI 或 DeepSeek-TUI 一样，完成真实
 仓库里的读代码、改代码、跑命令、查看 diff、继续修复、恢复会话和发布前验证。
 
-当前执行口径先收敛到 Linux/macOS 本地 code agent CLI：只要用户能在 Linux/macOS
-安装并运行 `deepseek`，稳定进入 TUI/REPL，完成模型读写代码、shell 验证、diff
-review、resume 和本地 runtime/shell-supervisor 工作流，就可以认为这个 milestone
-成立。Windows ConPTY/service proof、hosted IDE 证据和 npm 发布属于后续跨平台/集成
-硬化，不再阻塞 Linux/macOS 本地 CLI 目标。
-
-最终验收口径不是“功能列表看起来很多”，而是：
-
-- 裸 `deepseek` 在真实 TTY 里稳定进入 coding-agent TUI；
-- 模型能稳定调用文件、shell、patch、diagnostics、review、runtime、subagent、MCP/ACP 等工具；
-- 一个真实代码任务可以从需求进入、修改文件、跑测试、修失败、输出 diff 和总结；
-- shell/PTY、审批、回滚、secret scan、dogfood 和 CI gate 都能证明行为可靠；
-- 安装分发覆盖 GitHub Release、GHCR、npm、Homebrew，并且 README 能让新用户快速理解和试用；
-- 和 Claude Code CLI / Codex CLI / DeepSeek-TUI 的核心使用差距收敛到 5% 以内。
-
-## 当前做到哪里
-
-当前项目已经不是早期原型，可以直接用于仓库内 dogfood 和中小型代码任务。当前主入口是
-`deepseek`，历史兼容入口 `dscode` 仍保留。
-
-已经具备的核心能力：
-
-- 终端入口：`deepseek`、`deepseek chat`、`deepseek run`、`deepseek tui`、`deepseek exec`。
-- TUI：全屏 workbench、Plan / Agent / YOLO 模式、approval modal、command palette、session/thread 视图、MCP 管理、setup/onboarding、provider/model picker。
-- Runtime：`.dscode/runtime/` 下持久化 sessions、threads、turns、items、events、tasks、usage、automations。
-- 工具：文件读取/搜索、patch、diff、shell、background jobs、diagnostics、review、notes、memory、rollback、skills、subagents。
-- 模型协议：OpenAI-compatible tool calls，同轮 batch tool calls，DeepSeek provider/model alias 兼容。
-- 审批与安全：approve-once、approve-for-session、deny fingerprint、secret scan、shell/network policy、rollback snapshot。
-- Shell/PTY：后台 shell job、wait/replay/attach/stdin/resize/cancel；Linux native-supervisor PTY；workspace shell-supervisor protocol bridge。
-- Linux/macOS CLI readiness：CI 和 Release Matrix 会在非 Windows 平台跑
-  `agents shell-fixture-smoke --json`、`agents service-smoke --json`、TUI entrypoint
-  smoke、task worktree smoke 和 GitHub bridge smoke；这把 Linux/macOS 本地 CLI 的入口、
-  runtime、shell-supervisor 和后台 worktree 基线纳入同一类 release gate。
-  PR #14 的 CI run #35 已在 hosted Linux/macOS debug binary 上通过 shell fixture、
-  service smoke 和 multi-file external fixture scaffold gate：
-  https://github.com/willamhou/DeepSeekCode/actions/runs/26333425574 。
-- 本轮新增：`deepseek chat` / `deepseek repl` / `deepseek interactive` 的真实 TTY 输入现在走内置 raw-mode line editor，补齐 Claude Code-like REPL 的 Up/Down history、history draft restore、左右移动、Home/End、Backspace/Delete、Ctrl+A/E/U/K/W、Tab slash/session completion、空行 Ctrl+D 和 prompt Ctrl+C 退出；运行中的 REPL turn 也会把 SIGINT 接到 `AgentLoopOptions.cancel_check`，让模型 stream 和 cancel-aware tools 协作取消，并在取消后恢复本轮 transcript/snapshot 指针，避免半截 prompt 污染后续上下文；`/sessions [prefix]` 可以列出保存的 REPL session，`/load ` 后 Tab 可补全 session 名；非交互测试路径仍保留 buffered reader，不需要真实终端。
-- 本轮新增：Phase 12E background worktree runner 第一片。新增 `deepseek task start/list/show/stop/diff/merge/reject` 和 `deepseek task fixture-smoke --json`：`task start` 会在当前 git repo 的 `.dscode/task-runner/worktrees/<id>` 创建隔离 worktree 和默认 `deepseek-task/<id>` 分支，把记录写到 `.dscode/task-runner/records/`，stdout/stderr 写到 `.dscode/task-runner/logs/`，并在该 worktree 中启动 `deepseek exec --json`；父 CLI 退出后 child 进程仍可继续。`--no-run` 可只创建 worktree/record，用于无 API key 的本地 gate；`task diff` 展示 task worktree 的 tracked patch/stat 和 untracked files，`task merge --check` dry-run 验证，`task merge` 要求原 worktree 干净后把 patch 和 untracked regular files 合回原 repo，`task reject` 默认删除受管 task worktree 并把记录标记为 rejected。`deepseek github action --background-task` 现在也可把解析出的 GitHub PR review/fix/patch 请求委派到同一 task runner，`--task-id` 支持 workflow 稳定 id，`--task-no-run` 支持无凭据本地 workflow gate。当前 `task fixture-smoke --json` 实测 `ok=true`、`worktree_created=true`、`record_listed=true`、`merge_check_ok=true`、`merge_apply_ok=true`、`reject_ok=true`、`cleanup_ok=true`；CI 已把该 smoke 接到 Linux/macOS/Windows debug binary，Release Matrix 也会在各平台 release binary packaging 前运行。
-- 本轮新增：`deepseek agents shell attach <task_id> --interactive` / `--takeover`。它会进入本地 raw mode，把按键转发到 supervisor `stdin`，把 resize 转发到 supervisor `resize`，并把 output 事件的 raw bytes replay 回当前终端；Linux 集成 smoke 已覆盖 raw-mode PTY 启动、`tty=true` job、stdin、resize、replay 和 bounded detach。它是可用的 bounded interactive attach，不是字节级 PTY fd 直连代理。
-- 本轮新增：shell-supervisor terminal event log 已为 PTY output/input 记录可选 `raw_base64`，supervisor `attach` JSON 响应会透出结构化 `terminal_raw_outputs`，`exec_shell_attach` 摘要仍带兼容的 `terminal_raw_base64` section。human `--follow` 现在走同一个 Unix socket 上的 `attach_stream` newline-JSON frame stream，`--follow` / `--interactive` 会优先解码 output raw bytes；`deepseek agents shell attach <task_id> --raw` 可为脚本直接输出 PTY bytes。新增 `deepseek agents shell byte-stream <task_id>` / supervisor `byte_stream`，可在一个 socket stream 里处理初始 stdin/resize、后续 newline-JSON stdin/resize/close/detach control frames，并以 `byte_outputs[].bytes_base64` 输出 raw PTY bytes；`--raw-proxy` / `raw_proxy=true` 会在初始 JSON 后切到原始 socket bytes：socket input 直接进 PTY stdin，PTY output bytes 直接写回 socket。`deepseek agents shell proxy <task_id>` 现在是面向人的 raw-proxy wrapper，会进入本地 raw mode、同步终端尺寸、转发 key/paste/resize、直接写回 PTY bytes，并用 `Ctrl-]` detach。Linux 上还新增 supervisor `pty_fd` / `deepseek agents shell fd-proxy <task_id>`：通过 SCM_RIGHTS 把 native-supervisor PTY master fd 临时交给本地 Unix client，handoff 期间暂停 supervisor replay reader，detach 后恢复事件记录；测试已覆盖 fd 交还后普通 supervisor `resize`、`stdin`、terminal `replay` 继续可用，Ctrl-D EOF 触发目标 PTY 关闭时 fd-proxy 把 Linux `EIO` 当作正常 EOF 成功退出，运行中本地终端 SIGWINCH resize 会同步到目标 PTY，Ctrl-C 会中断目标 PTY 的 foreground process group，以及 fd-proxy 被 SIGKILL 后 handoff lease 释放并恢复普通 supervisor 控制。HTTP SSE、ACP `session/shell/subscribe` 和 MCP `exec_shell_terminal_events` progress metadata 也会透出 raw bytes。
-- 本轮新增：`deepseek agents shell-fixture-smoke --json` 本地 Shell/PTY gate。它创建短路径临时 workspace，启动当前二进制的 `agents shell-supervisor --json`，验证 `health`、start/wait/attach/replay、Linux native PTY stdin/resize/replay/cancel，并把 `byte_stream` duplex control frames、`raw_proxy=true` 原始 socket bytes、Linux `pty_fd` fd handoff 和 human `agents shell proxy` raw-mode wrapper 纳入同一可复跑 smoke。当前实测 `blockers=0`、`warnings=0`，且 `shell_control` 摘要包含 `byte_stream duplex/raw_proxy/fd_handoff/human_proxy smoke passed`；`service-smoke` 的 shell-supervisor control smoke 也同步覆盖这些 byte-stream/proxy/fd-handoff 切片。聚焦测试还验证了 direct `pty_fd` 和 CLI `fd-proxy` detach 后，普通 supervisor 控制面可继续 resize/stdin/replay 同一个 PTY job。
-- 本轮新增：VS Code `DeepseekCode Agent` panel 的 native workbench 继续推进。侧栏 webview 现在直接启动 `deepseek exec --json`，在 panel 内流式显示 assistant delta、reasoning log、tool call/result、permission request、stderr 和完成状态，并把 active file、selection、diagnostics、dirty-buffer marker、Git status/diff summary 注入任务上下文；post-run 控件已支持 active-file `Review Diff`、panel 内 `Accept` 标记、确认后的 `Revert File`、`Refresh Diff`、workspace validation command 输出捕获、`Resume Latest` 继续最近 runtime session、workspace changed-file queue 的 open diff / mark reviewed / confirmed revert，以及 generated patch artifact queue。模型 final/tool result 里的 unified diff 会进入独立队列，单文件 patch 可打开 VS Code diff，pending patch 可确认后 `git apply --check` + `git apply`，也可 reject；`apply_patch` tool call 的 patch 会以 captured 状态记录，避免和已写入的 Git changes 混在一起。扩展目录也新增了 mocked DeepSeekCode binary 的 extension-host smoke harness，以及可在当前机器运行的 headless panel fixture；后者用临时 Git repo 证明 diagnostic context injection -> generated patch capture -> single-file diff opening -> checked `git apply` -> workspace queue refresh -> validation pass 的闭环，并修掉了 generated patch `-p0` fallback 和 `git status --short` 前导空格解析问题。当前机器没有 `code`/`codium` CLI，所以真实 VS Code runner 执行证据仍缺；这仍不是完整 Phase 12B，manual GUI fixture 证据还需要继续补。
-- 本轮新增：GitHub Action bridge 第一片。`deepseek github action` 会读取 `GITHUB_EVENT_PATH` / `GITHUB_EVENT_NAME`，把 `pull_request`、`issue_comment`、`pull_request_review`、`pull_request_review_comment` 事件解析成 `owner/repo#PR`，comment/review 事件默认要求 `@deepseek` trigger。`--mode auto|review|fix|patch` 会复用现有 `deepseek pr review/fix/patch` 路径，`@deepseek fix` / `@deepseek patch` 可自动路由到 CI-log repair / patch workflow；`--background-task` 可改为创建 `deepseek task start` 背景 worktree 记录，`--task-id` / `--task-no-run` 支持稳定 workflow id 和无凭据 gate；`--dry-run` 只输出解析目标，`--github-output` 会把 target fields 写入 `$GITHUB_OUTPUT`，`--require-mode` 可让 write workflow 对非 fix/patch trigger fail fast，`--post` 通过现有 `gh pr comment` 发布 review summary。`deepseek github pr-head` 会把 PR head owner/ref 解析为 CLI-tested step output，并在 write-capable checkout 前拒绝 fork-owned PR branch。`deepseek github fixture-smoke` 现在可本地 no-network 验证 review/write trigger、同仓库 PR head guard、fork guard、临时 Git remote 上的 checkout/commit/push、pushed-head 校验和 background-task worktree/record 创建；CI 和 Release Matrix 也会用 debug/release binary 跑这个 smoke。仓库新增 disabled-by-default 的 `.github/workflows/deepseek-code-review.yml` 示例，需设置 `DEEPSEEK_CODE_REVIEW_ENABLED=true` 和 `DEEPSEEK_API_KEY` 后才运行；示例 workflow 固定 `--mode review` 作为安全默认。仓库也新增 disabled-by-default 的 `.github/workflows/deepseek-code-write.yml` 写入示例，需设置 `DEEPSEEK_CODE_WRITE_ENABLED=true`，它只响应 `@deepseek fix/patch`，先用 CLI dry-run 解析 PR 并输出 step outputs，再通过 `deepseek github pr-head` 解析并校验同仓库 PR head，运行 `--mode auto`，最后 commit/push 工作区改动。默认 benchmark manifest 也补到 `26` 条 `pr_workflow` cases，新增 action-labeled review-comment-plan、`@deepseek fix`、`@deepseek patch` 和 hosted exact replacement request 覆盖。
-- 本轮新增：真实 hosted GitHub workflow 证据。PR #10 验证了 hosted write bridge 修复，`@deepseek patch change ... becomes ...` 由 GitHub Actions 成功生成并推送 `6fd5010 deepseek: apply requested PR update`；PR #11 在修复合入默认分支后重新做 post-merge smoke，同一路径再次生成并推送 `f0fe9a7 deepseek: apply requested PR update`；PR #12 删除了两份临时 evidence/smoke fixture，merge commit 为 `9423126`。三个临时分支已清理，证据保留在 PR 历史和对应 workflow run 记录里。
-- 本轮新增：Phase 12D 的 MCP fixture smoke 第一片。`deepseek mcp fixture-smoke --json` 会创建临时 fixture workspace 和 MCP config，用当前二进制的 `serve --mcp` 验证 stdio discovery/call，再启动本地 loopback HTTP/SSE MCP fixtures 验证 HTTP/SSE discovery/call，并通过默认 agent tool registry 验证动态 `mcp__server__tool` 暴露和 input schema cache。当前实测 stdio 工具发现 `57` 个、HTTP/SSE 各 `1` 个，三类 call 均通过，动态 schema cache 覆盖 `stdio-self/read_file`、`http-fixture/echo`、`sse-fixture/echo`。fixture 现在还会配置一个故意失败的 `broken-stdio` server，并证明它不会隐藏或破坏健康 server 的动态工具发现；同时验证 generic `mcp_call` 和动态 `mcp__stdio-self__read_file` 的 permission request、allowlisted allow、allowlist deny 三条 policy 路径。最新 JSON 字段 `bad_server_isolated`、`mcp_call_permission_ok`、`dynamic_permission_ok`、`mcp_call_allow_ok`、`mcp_call_allowlist_deny_ok`、`dynamic_allow_ok`、`dynamic_allowlist_deny_ok` 均为 `true`。本轮还把 prompts/resources/templates 纳入同一个 fixture smoke：stdio/HTTP/SSE 的 prompt discovery/get、resource discovery/read、resource template discovery 均通过，最新 JSON 中 `stdio_prompt_ok`、`http_prompt_ok`、`sse_prompt_ok`、`stdio_resource_ok`、`http_resource_ok`、`sse_resource_ok` 均为 `true`，template counts 为 `3/1/1`。这补上了 completion gate 中 MCP stdio/HTTP/SSE tool discovery/call/schema 注入、prompt/resource/template、bad-server isolation、MCP tool approval/allowlist 的本地证据。
-- 本轮新增：Phase 12D hooks fixture smoke。新增 `deepseek hooks fixture-smoke --json`，它创建临时 hook root 和 workspace，安装结构化 `{"decision":"allow","add_context":"..."}` recorder scripts，然后通过真实 `AgentLoop::run_with_client` 触发 `list_files` 工具调用。当前实测 JSON 为 `deepseek.hooks_fixture_smoke.v1`，`session_start_ok`、`user_prompt_submit_ok`、`pre_tool_ok`、`post_tool_ok`、`session_stop_ok`、`hook_contexts_ok`、`tool_ran_ok` 全为 `true`，事件顺序为 `session_start -> user_prompt_submit -> pre_tool_use -> post_tool_use -> session_stop`。这给 Phase 12D hooks prompt/session/tool lifecycle 和 structured allow/add_context 提供了单命令本地 gate。
-- 本轮新增：Phase 12D skills/custom command validation。新增 `deepseek skills list [--json]` 与 `deepseek skills validate [--strict] [--json]`，按运行时同一优先级扫描 bundled skills 和 `workspace.user_skills_dir`，报告同名覆盖、loader 错误、空核心元数据、空 `allowed_tools` 和未知工具名。当前 `cargo run --quiet -- skills validate --strict --json --dir skills` 实测 bundled `skills/` 共 `16` 个 skill，`valid_files=16`、`error_count=0`、`warning_count=0`、`ok=true`。`docs/skills-and-profiles.md` 也补了 PR review、release-check、security-lite 三类 skill 示例。这给 Phase 12D skill metadata validation 和 command/skill discovery 提供了单命令本地 gate。
-- 本轮新增：Phase 12D subagent fixture smoke。`dispatch_subagent` / `dispatch_subagents` 现在支持 `write_scope` / `write_set` 元数据，child prompt 会带 assigned write scope，parallel summary 会输出 `meta.parallel_blocked_children`、`meta.parallel_readback_required`、`meta.parallel_next_action`、`meta.parallel_child_N_files`、`meta.parallel_child_N_write_scope` 和 `meta.parallel_write_scope_conflicts`。父 planner 现在会同时消费 `dispatch_subagent` 与 `dispatch_subagents` 汇总，看到 child files 后先 `read_file` 回读再继续。新增 `deepseek agents subagent-fixture-smoke --json` 本地 gate，当前实测 `parser_ok`、`disjoint_write_scope_ok`、`readback_required_ok`、`blocker_summary_ok`、`conflict_summary_ok`、`artifact_ok` 全为 `true`，`child_count=2`。默认 benchmark manifest 已有 `20` 条 `subagent` category cases；benchmark runner 现在支持 `--category <name>` 与可重复 `--case <name>`，filtered run 只生成 report，不推进 history，也不强制全量 trend/live gate。当前 targeted subagent benchmark 实测 `20/20`，报告在 `/tmp/deepseek-subagent-benchmark.md`，trend/live 均标记为 filtered selection skip。本轮补上了并发 subagent 汇总的 readback 单测、fixture gate 和 targeted benchmark evidence。
-- 本轮新增：MCP 模型规划 benchmark 第一片。benchmark runner 现在支持 per-case 自举 `stdio-self` MCP fixture，可在 isolated workdir 写入 `.dscode/mcp.json`、按 case 开关动态 MCP 工具暴露，并设置 case-local `mcp_call_allowlist`。默认 manifest 新增 `fixture-mcp-dynamic-readme`、`fixture-mcp-generic-call-readme`、`fixture-mcp-allowlist-deny-recovery` 三条 MCP cases，分别覆盖动态 `mcp__stdio-self__read_file`、generic `mcp_call`、allowlist deny 后通过 `mcp_list_tools` 恢复。本轮 targeted MCP manifest 从 `/tmp` 运行实测 `3/3`，报告在 `/tmp/deepseek-mcp-benchmark.md`；完整默认 manifest 已扩到 `82` cases，并已刷新通过 `82/82`，最新完整报告在 `.dscode/benchmarks/latest.md`。
-- 本轮新增：benchmark PR planner hardening。离线 planner 现在会保留 `github_pr_context` / `review` / `pr_review_comment_plan` 这类结构化观察，不会因为同属 `Other` kind 被后续工具压缩成 superseded stub；PR comment 失败恢复在成功重建 plan 后不再重复重建第二次；skill auto-select 也会避免把远程 PR review/comment 任务降级到隐藏 `github_pr_context` / `review` / `pr_review_comment_plan` 的 debug skill。`run_shell` 现在会把 `pytest` / `python -m pytest` 缺失以及 profile 生成的 `uv run pytest` 安全标准化到 `uv --with pytest` fallback，并会自动发现 `~/.local/toolchains/go/*/bin` / `~/sdk/go*/bin` 这类用户级 Go toolchain。当前完整离线 benchmark 历史实测 `82/82`；本轮新增 hosted exact patch request targeted benchmark 通过，`26` 条 `pr_workflow` 中 planner/action/comment-plan/exact-request 项均有覆盖，新增 `mcp` category `3/3`；当前 trend gate 处于 comparable warmup，`found 2`，live gate 基于本机 dogfood ledger 从 `runs=5` 到 `runs=20` 通过。
-- 本轮新增：offline dogfood replay 覆盖补强。`cargo run --quiet -- dogfood replay-benchmark --category pr_workflow --limit 12 --benchmark-gate` 新增了 12 条 `pr_workflow` replay，覆盖 GitHub Action `@deepseek fix` / `@deepseek patch`、JS/Rust/Python/Go PR CI repair、PR retry validate、second-round feedback 和 Go patch validate，全部成功；该批次把 ledger 推到 `17` runs 后暴露 live coverage gate 还缺 `recovery` slice。随后 `cargo run --quiet -- dogfood replay-benchmark --category recovery --limit 3 --benchmark-gate` 新增 3 条 recovery replay，`recovery` 为 `3/3`，最终 `.dscode/dogfood/latest.md` 为 `20` runs、`19/20` success、`1` historical failed、`0` stuck、`0` manual；post-replay default benchmark 为 `82/82`，live gate `pass against previous dogfood snapshot (runs 5 -> 20)`。这些仍是 `offline` transport，不能替代后续真实 model-backed dogfood。
-- 本轮新增：`deepseek dogfood live-plan` 的推荐命令改为 `deepseek dogfood live-run ...`，文本和 JSON 都同时输出 dry-run 与 `--execute` 命令，避免 release operator 为 model-backed 证据误走 offline-friendly `replay-benchmark` 路径。`deepseek dogfood live-plan` 和 `deepseek dogfood live-run --json` 现在还输出 `post_run_report_command` / `evidence_gate`，直接给出 `dogfood report --require-live-runs ... --require-live-category ...` 的后置验收命令，让真实 online 执行后的 model-backed 证据可以 fail closed。`deepseek dogfood live-run --json` 保持机器可读 dry-run plan，包含 selected cases、online readiness、execute blocker 和 follow-up `--execute` command；它故意不和 `--execute` 混用，避免在线执行日志污染 JSON。`dogfood live-run` 还支持 `--api-key-file`/`--key-file` 指向仓库外 key 文件，只把 key 注入当前进程的 `model.api_key_env` 并在返回时恢复，JSON 只记录 `credential_source` 和文件路径，不输出 key 值。`dogfood live-run --execute --evidence-out <path>` 现在会在批次结束或首个失败后写出 `deepseek.dogfood.live_run_evidence.v1` JSON，记录 before/after ledger live counts、每个 case 追加的 model-backed ledger 行、benchmark gate 结果、同一条 post-run report gate，以及当前 ledger 文件的 `fnv1a64` fingerprint，仍不写入 API key 值。`deepseek dogfood live-evidence --file <path>` 现在可验证该 evidence 文件，默认要求 completed、online、至少 1 条 appended model-backed row；`--require-benchmark-gate` 可把 benchmark gate 也纳入 release fail-closed 检查，`--require-report-gate` 会读取 evidence 的 structured `evidence_gate` 和 ledger path，用 `dogfood report` 同一套 live requirement 逻辑验证 full live gate，重新计算 ledger fingerprint 并逐条核对 evidence 中 appended case 的 timestamp/outcome/model_transport/category 能在 ledger 中找到匹配记录，而不是执行 JSON 里的 shell command；`--json` 输出 `deepseek.dogfood.live_evidence_verification.v1`，`--out <path>` 可把 verification JSON 落盘作为 release evidence artifact。`dogfood external-fixture` 真实执行现在也默认要求 `model_transport=online`，离线只能 dry-run 或显式 `--allow-offline` 做 rehearsal，避免把 offline disposable repo 样本误计为 release evidence；`--evidence-out` 会写出 `deepseek.dogfood.external_fixture_evidence.v1`，包含 appended external fixture row、CLI 后置 validation command/pass 状态、release-evidence readiness 和 ledger 
fingerprint，便于上传发布证据。
-- 本轮新增：在线 DeepSeek dogfood 从 smoke 推进到完整 release gate。使用当前进程注入的 DeepSeek key 执行 `dogfood live-run --execute --evidence-out ...`，最终 `deepseek dogfood report --limit 100 --require-live-runs 100 --require-live-success-rate 90 --require-live-category write_validate:25:90 --require-live-category recovery:25:90 --require-live-category pr_workflow:25:90` 通过；外部 fixture 跑完后 `live-plan` 显示 `105` 条 online run、`99` 条 success，分类为 `write_validate 29/30`、`recovery 23/25`、`pr_workflow 47/50`。执行过程中又修掉两类真实模型卡点：Python pytest retry readback 现在能识别 `def test_` / `assert ` 测试文件，并从错误的 `a * b` 回退到 `a + b`；空搜索恢复任务在看到 no matches 后完成 repository layout inspection 会 clean finish，不再重复列目录。release evidence verification 落在 `.dscode/dogfood/live-evidence-final-total-pr-4-release-verification.json`，`report_gate_passed=true`。
-- 本轮新增：外部 disposable repo write-fixture 证据第一批。已在 `/tmp/deepseek-external-fixtures/` 下构造 Rust、Python、JavaScript 三个独立 git repo，初始测试均失败，然后用真实 online DeepSeek 跑 `dogfood external-fixture --workdir ... --evidence-out ...`，三条都完成 `read_file -> apply_patch -> validation -> finish`，并分别通过 `dogfood external-evidence --require-successful-external-fixtures 1`：`.dscode/dogfood/external-fixture-rust-add-v3-verification.json`、`.dscode/dogfood/external-fixture-python-add-verification.json`、`.dscode/dogfood/external-fixture-js-add-verification.json`。本轮还修复了 external fixture evidence record 缺少 `model_backed` 字段导致 verifier 无法和 ledger online row 对齐的问题。
-- 本轮新增：multi-file external fixture scaffold。`scripts/create-multifile-external-fixture.sh`
-  会在 checkout 之外创建 disposable Python invoice repo，初始 `python3 -m unittest discover -s tests`
-  按预期失败，并输出 dry-run/evidence 两条 `deepseek dogfood external-fixture` 命令；任务要求同时修改
-  `src/invoice_math/pricing.py` 和 `src/invoice_math/summary.py`。CI/Release Matrix 在 Linux/macOS
-  上会执行该脚手架，先保证更真实的 multi-file 样本可重复构造；当前已经用 online DeepSeek
-  跑通该样本，`dogfood external-evidence --require-successful-external-fixtures 1` 通过，
-  verification 为 `.dscode/dogfood/external-fixture-python-invoice-multifile-verification.json`，
-  且 `post_validation_passed=true`、`release_evidence_ready=true`。
-- 本轮新增：external fixture release evidence fail-closed 加固。`dogfood external-fixture`
-  会从任务里的 `validate with ...` 抽出验证命令，在 isolated workdir 清理前由 CLI 后置执行；
-  evidence/verifier 现在要求 `post_validation_command` 和 `post_validation_passed=true`。同时
-  explicit edit guardrail 支持同一任务里的多个 `replace ... with ... in ...` 片段，本轮 Python
-  invoice 样本实测会依次 patch `pricing.py` 和 `summary.py`。
-- 本轮新增：README 真实 model-backed demo SVG。`docs/demo/record-model-backed-demo.sh` 使用当前 DeepSeek key 录制了 disposable Rust crate 的 failure -> `deepseek exec` -> patch -> passing `cargo test` -> diff transcript，`docs/demo/verify-model-backed-demo.js` 验证通过后由 `docs/demo/render-model-backed-demo-svg.js` 渲染为 `docs/demo/deepseek-code-model-demo.svg`。本轮还修复了 explicit edit parser 对 `in src/lib.rs, validate ...` 的路径截断问题，以及 renderer 把 `test result: ok ... 0 failed` 误标红的问题；README 英文、中文、日文都已引用该真实模型 SVG。
-- 本轮新增：`deepseek update publish-status` 现在支持 `--live-evidence-verification <path>`（别名 `--live-evidence`），会读取 `dogfood live-evidence --out` 生成的 `deepseek.dogfood.live_evidence_verification.v1`，要求 `ok=true`、completed、online、appended model-backed row、report gate required/passed、ledger fingerprint/current ledger fingerprint 都成立。`--strict` 因此会把缺失或无效的 online dogfood verification artifact 计入 not-ready，`public_install` 对 GitHub Release、npm、Homebrew 和 GHCR 的 `ready_to_publish` 也不再只看包材料，还要求 release evidence 已验证。
-- 本轮新增：Windows target warning cleanup。Unix-only shell byte-stream/PTY helpers、hook fixture helpers、rollback Unix metadata helpers 和相关测试 fixture 现在只在对应 Unix cfg 下编译；`cargo check --target x86_64-pc-windows-gnu --all-targets` 当前已无 warnings 通过。这让 Windows ConPTY/TCP runtime proof 的编译面更接近 release-quality，而不是只做到“能编过但带一串条件编译噪音”。
-- 发布面：`v0.1.1` GitHub Release binaries、GHCR image、npm/Homebrew packaging metadata、release matrix、download-plan、publish-status、README 多语言、README TUI demo recorder。
-- CI 证据：Linux/macOS/Windows bare `deepseek` TUI entrypoint smoke 已经在 CI 里通过；Windows 路径使用 ConPTY-backed smoke。
-
-当前可以怎么用：
-
-```bash
-deepseek
-deepseek chat
-deepseek run "explain this repository"
-deepseek tui --entrypoint-smoke --smoke-bin "$(command -v deepseek)"
-deepseek agents service-smoke --workdir /tmp/dsc-smk --bin "$(command -v deepseek)" --json
-deepseek agents shell-fixture-smoke --json
-```
-
-真实模型调用需要配置 DeepSeek API key。不要把 key 写进仓库；推荐使用环境变量或仓库外文件。
-
-## 还差什么
-
-当前距离 Claude Code CLI / Codex CLI / DeepSeek-TUI 的成熟产品形态，主要差在以下几类：
-
-如果只看 Linux/macOS 本地 CLI milestone，核心交互能力和要求的 evidence gate 已经成立；
-剩下主要是 Homebrew 发布凭据、下一次 release matrix 的 release-binary smoke 证据、
-更多外部样本和文档压缩。Windows/IDE/hosted 发布证据继续保留在更大产品目标里，但不是
-这个 milestone 的 blocker。
-
-1. Shell/PTY 深水区
-   - 已有 bounded interactive attach、duplex `byte_stream` raw-output proxy slice、human `agents shell proxy` raw-mode wrapper、Windows `native-supervisor` ConPTY backend compile gate，以及 Linux 本地 `pty_fd` / SCM_RIGHTS PTY master fd handoff slice。
-   - `deepseek agents shell-fixture-smoke --json` 已把 Linux native PTY、duplex `byte_stream`、`raw_proxy`、`pty_fd` fd handoff 和 human `agents shell proxy` wrapper 纳入本地单命令 gate；direct `pty_fd` 与 CLI `fd-proxy` 测试已覆盖交还后 supervisor stdin/resize/replay 恢复，CLI `fd-proxy` Ctrl-C、Ctrl-D/PTY EOF、SIGWINCH resize 和异常 client 退出恢复也已有集成测试。
-   - Linux shell-supervisor native PTY 已有；Windows shell-supervisor ConPTY 已接入 `portable-pty` 后端，daemon/client 控制面已有 loopback TCP 第一片，端点写入 `.dscode/shell-supervisor/supervisor.tcp` 并复用 newline JSON 协议；`cargo check --target x86_64-pc-windows-gnu --all-targets` 已通过，CI 已新增 Windows endpoint/status、TCP daemon/client runtime smoke、真实二进制 `agents shell-fixture-smoke --json` 和 targeted ConPTY start/resize smoke，但真实 Windows runner 结果仍需产出后才能关闭该证据缺口。Windows `byte_stream/raw_proxy` 和 `pty_fd` 仍未支持。
-   - `service-doctor` 已有本地模板安装前 gate：会解析生成的 systemd `ExecStart`/`WorkingDirectory` 和 launchd `ProgramArguments`/`WorkingDirectory`，并精确比对 expected argv/workdir；`service-doctor --installed` 会只读检查实际 user service 的 systemd unit 或 launchd label 是否 loaded/running/enabled；`service-smoke --installed` 会继续探活实际 runtime `/health` 和已有 shell-supervisor endpoint，Unix 读取 socket、Windows 读取 `supervisor.tcp`，且不会停止 service-manager-owned 进程。真实干净机器安装后的 systemd/launchd service smoke 证据仍需要外部环境产出。
-
-2. 真实模型 dogfood 证据
-   - 已有 recorder、verifier、redaction self-test、release evidence verifier 和 `100` 条 online run release gate 证据。
-   - 已有 `4` 个真实 disposable repo 外部 write-fixture 样本，覆盖 Rust/Python/JavaScript 和 Python invoice multi-file 的 failure -> edit -> test 链路；后续可以继续扩到 5 个以上样本。
-   - README 现在已有真实 model-backed SVG；后续可选补更精致的 GIF/MP4 或 TUI 录屏版。
-
-3. 发布渠道
-   - GitHub Release 和 GHCR 已通。
-   - npm registry 和 Homebrew tap 还被凭据阻塞。
-   - crates.io 是否发布仍需要明确 crate 命名、license/package policy。
-
-4. 产品打磨
-   - TUI 已能用，但还需要更多真实工作流下的性能、长输出、失败恢复、窗口 resize、旧终端兼容性验证。
-   - VS Code 已有 native panel、resume、active-file review、workspace changed-file queue、generated patch queue/apply/reject、extension-host smoke harness、headless diagnostic patch fixture 和 validation 第一片，但完整 IDE agent workbench 仍缺真实 VS Code CLI runner 证据和 manual GUI fixture 证据。
-   - GitHub automation 已有 event bridge、review/fix/patch mode routing、review workflow 示例、写入型 PR-head checkout workflow 示例、本地 workflow fixture smoke、真实 hosted write workflow 证据、`26` 条 `pr_workflow` benchmark cases，以及 `14/14` offline `pr_workflow` dogfood replay；当前本机已有 `50` 条 online model-backed `pr_workflow` 样本、`47` 条 success，分类 release gate 已通过。后续重点是把这类远端证据沉淀成更稳定的周期性 smoke，而不是继续保留临时 fixture 文件。
-   - MCP 已有 stdio/HTTP/SSE 本地 fixture smoke、动态工具暴露、schema cache、prompt/resource/template 单命令 smoke、bad-server isolation、generic/dynamic MCP approval/allowlist policy 证据，也已有三条模型规划型 MCP benchmark 和完整默认 benchmark `82/82` 证据；当前没有已知 MCP-specific Phase 12D smoke 缺口。
-   - Hooks 已有 prompt submit、session start/stop、pre/post tool use 和 structured allow/add_context 的 `deepseek hooks fixture-smoke --json` 本地 gate；skills/custom command 已有 `deepseek skills validate --strict --json` 元数据 gate 和 discovery 文档；subagent 已有 `deepseek agents subagent-fixture-smoke --json` gate、20 条 subagent benchmark cases、targeted subagent benchmark `20/20`、完整默认 benchmark `82/82`、并发 child readback 和 write-scope conflict metadata。Phase 12D 本地 extension gates 已基本齐备，offline dogfood coverage gate 和 online dogfood release gate 都已通过；下一步应继续外部兼容性和真实 demo 证据。
-   - 文档需要继续压缩成新用户能快速理解的安装、配置、试用、故障排查路径。
-   - 和上游 DeepSeek-TUI 的新变化需要持续周期性 refresh。
-
-## 下一步优先级
-
-建议按这个顺序推进，避免在低价值 polish 上分散：
-
-当前执行 spec：`docs/superpowers/specs/2026-05-23-final-parity-execution-spec.md`。
-该 spec 固化了本轮重新核对后的剩余差距、可执行命令、外部阻塞项和停止条件。
-
-1. 固化 Linux/macOS CLI release gate
-   - CI 和 Release Matrix 已新增非 Windows `agents shell-fixture-smoke --json`、
-     `agents service-smoke --json` 和 multi-file external fixture scaffold smoke。
-   - 已记录 PR #14 / CI run #35 的 Linux/macOS debug binary 证据：
-     https://github.com/willamhou/DeepSeekCode/actions/runs/26333425574 。
-     下一步等待下一次 release matrix 产出 release-binary 证据；Windows
-     shell-supervisor 继续作为后续跨平台目标。
-
-2. 补外部 model-backed 证据和真实 demo
-   - 先轮换任何已经泄漏到聊天记录里的 key。
-   - 保留 `.dscode/dogfood/live-evidence-final-total-pr-4-release-verification.json` 作为当前 online dogfood release 证据。
-   - 已完成 4 个 disposable repo/write-fixture 样本，其中 Python invoice multi-file 样本已通过
-     online `dogfood external-fixture --evidence-out ...` 和
-     `dogfood external-evidence --require-successful-external-fixtures 1`。
-
-3. 补 README 真实录屏
-   - 已完成 CLI 版真实模型 SVG：失败测试、模型修改、通过测试和 diff。
-   - 后续如果要更强视觉冲击，可以补 TUI/GUI 风格 GIF/MP4，但不是当前 minimum evidence gap。
-
-4. 完成发布渠道
-   - 配置 npm token 并发布 npm wrapper。
-   - 配置 Homebrew tap token 并发布 formula。
-   - 决定 crates.io 是否进入 v0.2 目标。
-
-5. 最后一轮差距审计
-   - 重新拉取 DeepSeek-TUI 最新 main。
-   - 和 Claude Code CLI / Codex CLI 的核心 loop 对照：入口、TUI、tool use、approval、shell、resume、diff、release、docs。
-   - 只保留会影响真实用户使用的差距，目标是核心差距低于 5%。
+当前执行口径收敛到 Linux/macOS 本地 code agent CLI。只要用户能在 Linux/macOS 安装并
+运行 `deepseek`，稳定进入 TUI/REPL，完成模型读写代码、shell 验证、diff review、
+resume 和本地 runtime/shell-supervisor 工作流，就可以认为这个 milestone 成立。
+Windows、hosted IDE 和 npm 发布属于更大的产品硬化目标；Homebrew 是 macOS 分发打磨，
+但不是核心交互能力 blocker。
 
 ## 当前判断
 
-DeepSeekCode 现在已经是一个可以实际使用的 code agent CLI，尤其适合在本仓库继续 dogfood。
-但它还不是“可以公开宣称等同 Claude Code CLI / Codex CLI”的成熟产品。
-
-如果目标限定为 Linux/macOS 本地 code agent CLI，则当前判断更强：Linux 本机已经通过
-TUI entrypoint、shell fixture、service smoke 和在线 dogfood release gate；PR #14 / CI
-run #35 已在 hosted macOS 上通过 shell/runtime smoke 和 multi-file scaffold gate；online
-multi-file external fixture evidence 也已经记录并验证通过。Windows 不再影响这个限定目标。
-
-最准确的公开表述是：
-
-> DeepSeekCode is usable today for Linux/macOS dogfooding and repository work, with a full-screen TUI, REPL, durable runtime, permissioned tools, hosted Linux/macOS shell-supervisor smoke gates, release binaries, a 100-run online dogfood release gate, verified online multi-file external fixture evidence, real hosted GitHub workflow evidence, and a committed real model-backed README demo SVG. The remaining Linux/macOS CLI work is Homebrew publishing, next-release binary smoke evidence, broader external sample depth, and documentation polish; hosted IDE, Windows/service proof, npm publishing, and optional richer demo media remain broader product-hardening work.
+DeepSeekCode 已经可以实际用于 Linux/macOS dogfood 和仓库内代码任务。它有全屏 TUI、
+line-oriented REPL、持久 runtime、权限工具、shell/PTY、本地服务 smoke、MCP/ACP、
+background worktree task runner、GitHub Action bridge、model-backed demo 和真实 online
+dogfood 证据。
+
+但它还不是“可以公开宣称等同 Claude Code CLI / Codex CLI”的成熟产品。剩余差距主要是：
+
+- 下一轮 release matrix 产出的 release-binary 级别 smoke 证据；
+- Homebrew tap 和 npm registry 的发布凭据与公开安装验证；
+- 更多真实外部 repo 样本；
+- 更精简的新用户文档和故障排查路径；
+- hosted IDE、真实安装后的 systemd/launchd service smoke、以及更广的 Windows 长尾验证。
+
+## 已经成立的证据
+
+- `deepseek` 裸入口、TUI entrypoint、task worktree runner、GitHub Action bridge、
+  shell fixture、service smoke 和 multi-file fixture scaffold 已纳入 CI gate。
+- PR #16 的完整 CI 已在 Linux、macOS 和 Windows 上通过：
+  https://github.com/willamhou/DeepSeekCode/actions/runs/26334525472
+- PR #16 记录并验证了 online multi-file external fixture evidence：
+  `.dscode/dogfood/external-fixture-python-invoice-multifile-verification.json`
+  中 `post_validation_passed=true`、`release_evidence_ready=true`。
+- PR #14 引入的 Linux/macOS CLI readiness gates 已在 hosted debug binary 上通过：
+  https://github.com/willamhou/DeepSeekCode/actions/runs/26333425574
+- online dogfood release gate 已达到 100+ model-backed run 口径；外部 fixture 跑完后 live plan 显示
+  `105` 条 online run、`99` 条 success，分类为 `write_validate 29/30`、
+  `recovery 23/25`、`pr_workflow 47/50`。
+- README 已提交真实 model-backed SVG，展示失败 Rust 测试、模型修改、通过 `cargo test`
+  和最终 diff。
+- `v0.1.1` 已有 GitHub Release binaries、GHCR image、npm/Homebrew packaging metadata、
+  release matrix、download-plan 和 publish-status 检查。
+
+## 当前能力概览
+
+- 入口：`deepseek`、`deepseek chat`、`deepseek run`、`deepseek tui`、`deepseek exec`。
+- TUI：Plan / Agent / YOLO 模式、approval modal、command palette、session/thread 视图、
+  MCP 管理、setup/onboarding、provider/model picker。
+- REPL：raw-mode line editor、history、session list/load completion、SIGINT cancel、
+  `/save`、`/load`、`/sessions`、custom slash commands。
+- Runtime：`.dscode/runtime/` 下持久化 sessions、threads、turns、items、events、
+  tasks、usage、automations，并提供 HTTP/SSE runtime surface。
+- 工具：文件读写/搜索、patch、diff、shell、background jobs、diagnostics、review、
+  notes、memory、rollback、skills、subagents、MCP/ACP。
+- Shell/PTY：Linux native PTY、bounded interactive attach、byte stream、raw proxy、
+  Linux `pty_fd` handoff、shell-supervisor protocol bridge；Windows ConPTY/TCP path 已有
+  CI smoke，但不是 Linux/macOS milestone blocker。
+- 自动化：background worktree task runner，GitHub Action review/fix/patch bridge，
+  disabled-by-default hosted review/write workflow examples。
+- 发布：GitHub Release、GHCR、release package、npm package staging、Homebrew formula
+  rendering、secret scan、publish-status readiness audit。
+
+## 剩余工作
+
+### Linux/macOS 本地 CLI milestone
+
+这个限定目标的核心交互能力和 evidence gate 已经成立。下一步主要是 release hardening：
+
+1. 等下一次 release matrix 产出 release-binary 级别的 Linux/macOS shell/runtime smoke 证据。
+2. 配置 Homebrew tap 凭据，完成 tap 发布和公开安装验证。
+3. 可选再增加 1-2 个真实外部 repo fixture，扩大 multi-file/多语言样本厚度。
+4. 继续压缩 README、install、release、current-status，让新用户能快速安装、配置、试用、排障。
+
+### 更大产品目标
+
+1. 配置 `NPM_TOKEN` 并发布 npm wrapper，验证 `npm install` 后裸 `deepseek` 入口。
+2. 在干净 Linux/macOS 机器上安装 systemd/launchd user services，记录
+   `service-doctor --installed` 和 `service-smoke --installed` 证据。
+3. 补真实 VS Code CLI runner 或 manual GUI fixture 证据。
+4. 持续和 Claude Code CLI / Codex CLI / DeepSeek-TUI 做核心 loop 对照，只保留会影响真实用户使用的差距。
+
+## 推荐公开表述
+
+> DeepSeekCode is usable today for Linux/macOS dogfooding and repository work,
+> with a full-screen TUI, REPL, durable runtime, permissioned tools, hosted
+> Linux/macOS shell-supervisor smoke gates, release binaries, a 100-run online
+> dogfood release gate, verified online multi-file external fixture evidence,
+> real hosted GitHub workflow evidence, and a committed real model-backed README
+> demo SVG. The remaining Linux/macOS CLI work is Homebrew publishing,
+> next-release binary smoke evidence, broader external sample depth, and
+> documentation polish; hosted IDE, Windows/service proof, npm publishing, and
+> optional richer demo media remain broader product-hardening work.
diff --git a/docs/install.md b/docs/install.md
index 84ec958..117d162 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -79,13 +79,22 @@ deepseek dogfood report --limit 5
 
 外部 write-fixture 证据需要当前 checkout 之外的 disposable git 仓库。先 dry-run
 检查，真实运行会复制到 isolated workdir，并在 dogfood report 里计入
-`external-write-fixture`：
+`external-write-fixture`。真实 evidence 还会从 task 里的 `validate with ...`
+抽取后置验证命令，在 isolated workdir 中执行，并要求
+`post_validation_passed=true` 才能通过 `dogfood external-evidence`：
 
 ```bash
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --dry-run \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
-deepseek dogfood external-fixture --workdir /tmp/disposable-repo --benchmark-gate \
-  'replace `a - b` with `a + b` in src/lib.rs and validate with cargo test'
+fixture_dir=/tmp/deepseek-external-fixtures/python-invoice-multifile
+scripts/create-multifile-external-fixture.sh "$fixture_dir"
+task='replace `return amount - discount` with `return max(amount - discount, 0.0)` in src/invoice_math/pricing.py and replace `Invoice total` with `Final total` in src/invoice_math/summary.py, validate with python3 -m unittest discover -s tests'
+deepseek dogfood external-fixture --workdir "$fixture_dir" --dry-run "$task"
+deepseek dogfood external-fixture --workdir "$fixture_dir" \
+  --evidence-out .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  "$task"
+deepseek dogfood external-evidence \
+  --file .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  --out .dscode/dogfood/external-fixture-python-invoice-multifile-verification.json \
+  --require-successful-external-fixtures 1
 deepseek dogfood report --limit 10
 deepseek dogfood live-plan --limit 10
 deepseek dogfood live-run --api-key-file /tmp/deepseek-live.key --limit 3 --json
@@ -380,7 +389,7 @@ reasoning delta 保存为 durable `reasoning` item。默认仍保持 `off`，直
 reasoning transcript replay 和更完整的 thinking/tool-call 兼容性验证完成。
 
 `deepseek` 每次任务开始前也会读取 workspace instruction 文件。团队共享规则可放在 repo root 或子目录的
-`AGENTS.md`；已有 Claude Code 项目也可继续用 `CLAUDE.md` 或 `.claude/CLAUDE.md`，DeepseekCode 会在同一目录没有
+`AGENTS.md`；已有 Claude Code 项目也可继续用 `CLAUDE.md` 或 `.claude/CLAUDE.md`，DeepSeekCode 会在同一目录没有
 `AGENTS*.md` 时把它们作为 fallback。个人默认指令文件是 `~/.config/dscode/AGENTS.md`，可通过
 `workspace.user_instructions_file` 改路径或设为空字符串禁用。
 
diff --git a/docs/mvp.md b/docs/mvp.md
index 1370df2..fd84a3b 100644
--- a/docs/mvp.md
+++ b/docs/mvp.md
@@ -1,102 +1,50 @@
-# MVP 与路线图
+# Historical MVP
 
-## v0.1 目标
+This page records the original v0.1 target. It is historical context, not the
+current product roadmap. For current status and next work, use:
 
-`v0.1` 的目标不是“最强 code agent”，而是“稳定跑通本地代码修改闭环”。
+- [Current status](./current-status.md)
+- [Roadmap](./roadmap.md)
+- [Release checklist](./release.md)
 
-主命令以 `deepseek` 为准；`dscode` 仍保留为兼容别名。
+## Original v0.1 Goal
 
-一句话：
+The first milestone was not "the strongest code agent"; it was a stable local
+code-editing loop:
 
-> 在本地仓库中，能够围绕 DeepSeek 完成读代码、改代码、跑命令、继续修复的基本代理流程。
+> In a local repository, use DeepSeek to read code, edit code, run commands, and
+> continue fixing based on validation output.
 
-## v0.1 功能清单
+The primary command is `deepseek`; `dscode` remains only as a compatibility
+alias.
 
-### 基础交互
+## Original Scope
 
-- `deepseek`
-- `deepseek "task"`
-- `deepseek diff`
-- `deepseek resume`
-- `deepseek config`
-- `deepseek doctor`
+- Start the CLI with `deepseek`.
+- Inspect a repository with file listing, file reading, and text search.
+- Apply patches instead of overwriting whole files.
+- Run approved shell commands.
+- Show diffs and save/resume session state.
+- Use `doctor` and `smoke` to diagnose local setup.
 
-### 项目理解
+That milestone is complete and has been superseded by the current Linux/macOS
+code-agent CLI goal.
 
-- 扫描目录结构
-- 检测主要语言
-- 推断包管理器与常见命令
-- 识别忽略目录
+## What Changed Since v0.1
 
-### 工具能力
+Several items that were explicitly out of scope for the first version now exist
+as implemented features or working prototypes:
 
-- `list_files`
-- `read_file`
-- `search_text`
-- `apply_patch`
-- `run_shell`
-- `git_diff`
+- multi-provider/model configuration and pickers;
+- TUI workbench and REPL line editor;
+- durable runtime with HTTP/SSE surfaces;
+- MCP/ACP client and server surfaces;
+- local skills, remote skill installers, custom slash commands, and hooks;
+- subagents and background worktree tasks;
+- GitHub Action review/write bridge;
+- VS Code native panel prototype;
+- release matrix, GHCR image, npm package staging, and Homebrew formula
+  rendering.
 
-### Runtime 能力
-
-- agent loop
-- 上下文裁剪
-- 会话保存和恢复
-- diff 展示
-- 审批策略
-
-## 首版不做
-
-- 多模型 provider
-- IDE 插件
-- 远程 skill 安装
-- 多 agent 并行
-- 自动提交和推送 git
-- AST 级大规模重构
-
-## 开发阶段建议
-
-### Phase 1: 基础骨架
-
-- 初始化 CLI
-- 接入配置加载
-- 接入 DeepSeek API
-- 打通最简单的单轮问答
-
-### Phase 2: Tool 闭环
-
-- 实现文件读取与搜索
-- 实现 patch 应用
-- 实现 shell 执行
-- 跑通模型请求工具 -> 工具执行 -> 模型继续
-
-### Phase 3: 仓库策略
-
-- 语言检测
-- profile 加载
-- 命令推断
-- 忽略规则
-
-### Phase 4: 任务策略
-
-- skill 加载
-- 工具白名单
-- shell allowlist
-- skill-based prompt augmentation
-
-### Phase 5: 体验打磨
-
-- diff 渲染
-- 流式输出
-- 会话恢复
-- `doctor` 命令
-
-## 验收标准
-
-如果下面几类任务能稳定完成，`v0.1` 就是成立的：
-
-- “解释这个模块是干什么的”
-- “修复这个 failing test”
-- “修复 lint / typecheck 错误”
-- “基于报错做一轮小范围修改”
-- “给一个函数加一处小功能并跑验证命令”
+The current remaining work is therefore not MVP closure. It is release and
+product hardening.
diff --git a/docs/release.md b/docs/release.md
index b199509..29b152f 100644
--- a/docs/release.md
+++ b/docs/release.md
@@ -114,8 +114,28 @@ ledger by timestamp, outcome, transport, and category.
 Use `--out` to persist the verification JSON for release evidence upload.
 For external fixtures, `--evidence-out` writes
 `deepseek.dogfood.external_fixture_evidence.v1` with the source workdir, appended
-external-fixture ledger row(s), release-evidence readiness boolean, and the same
-ledger fingerprint binding used by live-run evidence.
+external-fixture ledger row(s), the extracted `validate with ...` command,
+`post_validation_passed`, release-evidence readiness boolean, and the same
+ledger fingerprint binding used by live-run evidence. `dogfood
+external-evidence --require-successful-external-fixtures N` fails closed unless
+the evidence is online/model-backed, completed, matched back to the current
+ledger, and has `post_validation_passed=true`.
+
+The current tracked multi-file release fixture is the disposable Python invoice
+sample:
+
+```bash
+fixture_dir=/tmp/deepseek-external-fixtures/python-invoice-multifile
+scripts/create-multifile-external-fixture.sh "$fixture_dir"
+task='replace `return amount - discount` with `return max(amount - discount, 0.0)` in src/invoice_math/pricing.py and replace `Invoice total` with `Final total` in src/invoice_math/summary.py, validate with python3 -m unittest discover -s tests'
+deepseek dogfood external-fixture --workdir "$fixture_dir" \
+  --evidence-out .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  "$task"
+deepseek dogfood external-evidence \
+  --file .dscode/dogfood/external-fixture-python-invoice-multifile-evidence.json \
+  --out .dscode/dogfood/external-fixture-python-invoice-multifile-verification.json \
+  --require-successful-external-fixtures 1
+```
 
 For a release-readiness evidence gate, make the report fail closed when the
 ledger does not have enough live proof:
diff --git a/docs/repl.md b/docs/repl.md
index 4860055..d3f9d84 100644
--- a/docs/repl.md
+++ b/docs/repl.md
@@ -77,12 +77,12 @@ Invoke them by filename:
 
 Inside the markdown body, `$ARGUMENTS` expands to all arguments, `$0` / `$1` expand positional
 arguments, and `$ARGUMENTS[0]` / `$ARGUMENTS[1]` are the long indexed forms. If no argument
-placeholder appears, DeepseekCode appends `ARGUMENTS: ...` to the prompt automatically.
+placeholder appears, DeepSeekCode appends `ARGUMENTS: ...` to the prompt automatically.
 
 ### MCP Prompt Slash Commands
 
 Connected MCP servers can expose prompt templates through `prompts/list` and `prompts/get`.
-DeepseekCode can load those prompts directly from the REPL:
+DeepSeekCode can load those prompts directly from the REPL:
 
 ```text
 > /mcp/github/review_pr {"number":42}
@@ -109,7 +109,7 @@ tests and fixtures can feed `/help\n/quit\n` without requiring a real terminal.
 
 ## Workspace Instructions
 
-At the start of each agent loop, DeepseekCode loads bounded markdown instructions into the system
+At the start of each agent loop, DeepSeekCode loads bounded markdown instructions into the system
 prompt. This gives repeated project rules a first-class place instead of requiring users to paste
 them into every task.
 
@@ -123,7 +123,7 @@ Default sources:
 <git-root>/.claude/CLAUDE.md
 ```
 
-For subdirectories, DeepseekCode walks from the git root to the current directory. At each directory
+For subdirectories, DeepSeekCode walks from the git root to the current directory. At each directory
 level it reads the first existing file in this precedence order: `AGENTS.override.md`, `AGENTS.md`,
 `CLAUDE.md`, `.claude/CLAUDE.md`. Later files are appended later in the prompt, so more local
 instructions naturally win when they conflict. Each loaded file is capped at 32 KiB.
@@ -158,7 +158,7 @@ Supported event directories:
 .dscode/hooks/shell_env/*
 ```
 
-Scripts must be executable. DeepseekCode runs user hooks first, then project hooks, in lexical path
+Scripts must be executable. DeepSeekCode runs user hooks first, then project hooks, in lexical path
 order. Each script receives a JSON payload on stdin and `DSCODE_HOOK_EVENT` in the environment.
 `user_prompt_submit`, `pre_tool_use`, and `permission_request` scripts block the turn or tool call
 when they exit nonzero or return `{"decision":"deny","reason":"..."}`. Other hook failures are
@@ -182,7 +182,7 @@ prompt. To keep token usage bounded:
 
 `/compact` mutates the transcript: older turns are replaced by one
 assistant summary turn, while the latest 8 turns are kept verbatim. If
-hooks are enabled, DeepseekCode runs `pre_compact` before rewriting the
+hooks are enabled, DeepSeekCode runs `pre_compact` before rewriting the
 transcript; hook output is printed as advisory context.
 
 `/clear` wipes the transcript when you want to start fresh without
diff --git a/docs/roadmap.md b/docs/roadmap.md
index 4ea8826..eae7fb8 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -1,2070 +1,91 @@
-# Roadmap 与状态
+# Roadmap
 
-最后更新：`2026-05-11`
+Last updated: 2026-05-23
 
-## 当前状态
+This page is the current product roadmap. Older phase-by-phase execution notes
+live in `docs/superpowers/` and in git history; those historical notes are useful
+for audit trails, but they are not the source of truth for current status.
 
-`DeepseekCode` 已经从“可运行的本地 agent 原型”推进到可回归验证的 CLI code-agent 工作树。
+## Current Position
 
-当前代码具备这些基础能力：
+DeepSeekCode is now usable for Linux/macOS dogfooding and repository work:
 
-- Rust CLI 骨架可运行
-- 本地 `skill` / `profile` 可加载
-- 可执行基础工具：
-  - `list_files`
-  - `read_file`
-  - `search_text`
-  - `apply_patch`
-  - `run_shell`
-  - `git_diff`
-- 可运行离线 planner loop、REPL、scriptable `exec --json` 和 benchmark/dogfood 门禁
-- DeepSeek 远端传输层已接入：
-  - `OpenAI-compatible` 路径支持正式 `tools` / function-calling
-  - `Anthropic-compatible` 路径支持正式 `tool_use` content block，输入支持字符串与数值
-- 工具执行受策略约束：
-  - `allowed_tools`
-  - `shell_allowlist`
-  - 写入/命令审批：交互式 TTY prompt（非 TTY 默认拒绝），env 可放行
-  - 错误分类 `PolicyDenied` / `ToolFailure` / `Other`
-- 本机可验证性已具备：
-  - `doctor` 输出 workspace / model / api key / network / hints / capabilities 等诊断
-  - `smoke` 可对 OpenAI 与 Anthropic 兼容路径单独发起最小远端请求
-  - 默认离线 benchmark 当前 `70/70`，live gate 通过；trend gate 因新增 Git history cases 进入新 comparable baseline warmup
-  - dogfood ledger 当前 `39` runs；新增 Phase 12A replay 均为 success，但距离 100+ live gate 仍不足
+- bare `deepseek` opens the full-screen TUI in a real terminal;
+- `deepseek chat` remains available as the line-oriented REPL;
+- model-backed tasks can read files, apply patches, run shell checks, inspect
+  diffs, and resume from durable runtime state;
+- local runtime, shell-supervisor, background worktree tasks, MCP/ACP surfaces,
+  GitHub Action bridge, dogfood evidence, and release packaging checks all have
+  repeatable gates;
+- PR #16 full CI passed on Linux, macOS, and Windows:
+  https://github.com/willamhou/DeepSeekCode/actions/runs/26334525472
+- verified online multi-file external fixture evidence is tracked under
+  `.dscode/dogfood/external-fixture-python-invoice-multifile-verification.json`.
 
-## 已完成
+The Linux/macOS local code-agent CLI milestone is effectively established. The
+remaining work is mostly release hardening, external evidence depth, publishing,
+and documentation polish.
 
-### 基础工程
+## Near-Term Priorities
 
-- 初始化 git 仓库与项目文档
-- 建立 Rust 项目目录结构
-- 建立 `cli / core / model / tools / language / skills / config / ui` 模块边界
-- 在无外网依赖场景下保持工程可离线构建和测试
+### 1. Release Hardening For Linux/macOS
 
-### CLI 与配置
+- Run the next release matrix and preserve release-binary smoke evidence for
+  Linux/macOS TUI entrypoint, shell fixture, service smoke, task worktree smoke,
+  GitHub bridge smoke, and multi-file fixture scaffold.
+- Keep `deepseek update publish-status --strict` failing closed unless verified
+  online dogfood evidence, release assets, npm package artifacts, Homebrew
+  checksums, and public install readiness are all in place.
+- Keep `node scripts/check-secrets.js` in every release path.
 
-说明：当前用户入口以 `deepseek` 为主；历史 phase 记录中仍可能保留旧命令名 `dscode`。
+### 2. Homebrew And npm Publishing
 
-- 支持：
-  - `deepseek`
-  - `dscode`
-  - `deepseek "task"`
-  - `deepseek run "task"`
-  - `deepseek exec "task"` / `deepseek exec --json`
-  - `deepseek agents`
-  - `deepseek benchmark --manifest <file> --out <report>`
-  - `deepseek dogfood run "task"`
-  - `deepseek dogfood report --out <report>`
-  - `deepseek diff`
-  - `deepseek resume`
-  - `deepseek config`
-  - `deepseek doctor`
-  - `deepseek smoke`（支持 `--flavor openai|anthropic` 与 `--prompt`）
-  - `deepseek mcp`
-  - `deepseek completion`
-  - `deepseek update`
-  - `deepseek version`
-- 支持 `--skill`
-- 支持简单 `.dscode/config.toml` 配置读取：
-  - `model.base_url`
-  - `model.model`
-  - `model.api_key_env`
-  - `approval.require_write_confirmation`
-  - `approval.require_shell_confirmation`
-  - `approval.require_mcp_confirmation`
-  - `approval.mcp_call_allowlist`
-  - `workspace.config_dir`
-  - `workspace.session_dir`
+- Configure `HOMEBREW_TAP_REPOSITORY` and `HOMEBREW_TAP_TOKEN`.
+- Publish and verify the generated `Formula/deepseek.rb` against the GitHub
+  Release archives and `.sha256` files.
+- Configure `NPM_TOKEN` / `NODE_AUTH_TOKEN`.
+- Publish platform npm packages and the root wrapper, then verify public
+  `npm install` produces a working `deepseek` command.
 
-### Tooling
+For the Linux/macOS CLI milestone, Homebrew is higher priority than npm because
+it is the most natural install path for macOS users.
 
-- `list_files`
-  - 目录扫描
-  - 深度和数量限制
-  - 常见目录跳过
-- `read_file`
-  - 读取文本文件并返回带行号片段
-- `search_text`
-  - 仓库全文搜索
-  - 结果数量限制
-- `run_shell`
-  - 本地受控执行
-  - 内建安全前缀限制
-- `git_diff`
-  - 查看工作区 diff
-- `apply_patch`
-  - 文本替换模式
-  - unified diff patch 模式（多文件）
-  - patch dry-run 校验
-  - patch 头路径归一化（支持 git `a/` `b/` 前缀与 `/dev/null`）
-  - cwd 路径范围校验（拒绝 `..` 逃逸与 cwd 外绝对路径）
-  - 失败诊断分类（缺文件 / hunk 失败 / 已应用 / 格式错误）
-  - 成功摘要按 modified / created / deleted / renamed 分项列出
+### 3. More External Model-Backed Samples
 
-### Planner / Runtime
+- Keep the Python invoice multi-file fixture as the canonical tracked sample.
+- Add one or two more disposable external repo samples only when they cover new
+  behavior, such as multi-step recovery, larger diffs, or workflows in languages
+  other than Python, Rust, or JS.
+- Require `dogfood external-evidence` verification with
+  `post_validation_passed=true` for every sample counted as release evidence.
 
-- 已实现本地离线 planner loop：
-  - 组装任务上下文
-  - 基于 profile 和 observations 决策下一步
-  - 执行工具
-  - 回填 observation
-  - 最终结束
-- 已支持简单编辑任务：
-  - `replace "a" with "b" in path`
-  - 默认走 patch 模式（构造 unified diff，含正确 `cwd` 与基名）
-  - patch 不可构造时（多匹配 / 多行 / 缺文件）回退到文本替换
-- 已支持 skill 提示增强
-- 已支持 session snapshot 保存
-- Observation 已分类（file_excerpt / listing / search_results / patch / diff / shell_output / other）
-  - 按类裁剪：shell 取尾部、文件/搜索/列表取头部、diff 保留 hunk header
-  - 同类型只保留最新内容，旧观察被替换为 `(superseded ...)` 桩，降低上下文污染
+### 4. Documentation Compression
 
-### Skill / Policy
+- Keep README focused on install, first run, current gap, demo, and validation.
+- Keep `docs/current-status.md` focused on current facts and near-term work.
+- Keep `docs/release.md` as the operator checklist.
+- Treat `docs/superpowers/` as historical execution logs, not user-facing
+  status.
 
-- 本地 `skills/*.toml` 可解析：
-  - `name`
-  - `description`
-  - `allowed_tools`
-  - `system_append`
-  - `suggested_steps`
-  - `policy.require_write_confirmation`
-  - `policy.require_shell_confirmation`
-  - `policy.shell_allowlist`
-- `allowed_tools` 已接入工具可见性控制
-- `policy` 已接入执行约束
-- 环境变量支持：
-  - `DSCODE_AUTO_APPROVE_WRITES=1`
-  - `DSCODE_AUTO_APPROVE_SHELL=1`
-  - `DSCODE_AUTO_APPROVE_MCP=1`
+### 5. Broader Product Hardening
 
-### DeepSeek 远端传输
+- Record installed systemd/launchd service smoke evidence on clean machines.
+- Record a real VS Code runner/manual GUI fixture for the native agent panel.
+- Continue Windows ConPTY/service validation, while keeping it separate from
+  the Linux/macOS local CLI milestone.
+- Periodically compare the core loop against Claude Code CLI, Codex CLI, and
+  DeepSeek-TUI.
 
-- 已接入 `DEEPSEEK_API_KEY` 检测
-- 远端调用失败时自动回退离线 planner
-- `OpenAI-compatible` 路径：
-  - `/chat/completions`
-  - 正式 `tools` / function-calling
-- `Anthropic-compatible` 路径：
-  - `/messages`
-  - 正式 `tools` 数组 + `tool_use` content block 解析，输入接受字符串/数值/布尔/null
+## Current Stop Conditions
 
-## 当前限制
+For the Linux/macOS local CLI milestone, stop treating new work as blocking once
+these are true:
 
-这些是已经明确存在、但还未完成的部分：
+- hosted Linux/macOS CI gates remain green;
+- release-binary Linux/macOS smoke evidence exists from the next release matrix;
+- at least one verified online multi-file external fixture remains tracked;
+- Homebrew public install is either published and verified, or explicitly marked
+  as blocked on tap credentials;
+- README and install docs show an accurate first-run path.
 
-- 当前执行环境无法直接验证外网访问：本轮尝试 live dogfood 时 `api.deepseek.com` DNS 解析失败，guard 正确跳过 ledger 写入
-- Phase 12 完整产品面仍未收口：
-  - VS Code 仍是 terminal-backed 入口，不是完整 native agent workbench
-  - GitHub automation 仍是本地 `gh` workflow，不是 Action / `@deepseek` hosted trigger
-  - 还没有 background worktree runner / app-cloud task surface
-  - dogfood 只有 `39` runs，距离 100+ live CLI runs 和关键 slice `>=90%` 成功率门槛仍不足
-- CLI-only 实现面已接近高个位数差距，但完成声明仍被真实在线 dogfood 厚度阻塞
-
-## 已验证
-
-这些能力已经在当前本地环境里验证过：
-
-- `cargo test --offline`（611 项单测全部通过）
-- `DEEPSEEK_API_KEY_ENV=DEEPSEEK_API_KEY_OFFLINE cargo run --offline -- benchmark`：默认 manifest `70/70`
-- benchmark trend gate：`skipped (need at least 3 prior comparable runs, found 0)`，因为 case 数从 `67` 扩到 `70`
-- benchmark live gate：`pass (runs=39)`
-- `.dscode/dogfood/latest.md`：`39` runs；最新 6 条 Phase 12A replay 均为 success
-- `cargo run --offline -- doctor` 输出五段诊断（workspace / model / api key / network / hints）
-- `cargo run --offline -- smoke` 与 `cargo run --offline -- smoke --flavor anthropic` 在缺少 key 时给出预检失败
-- `cargo run --offline -- "inspect repository"`
-- `cargo run --offline -- run --skill fix-tests "fix tests"`
-- 文本替换式编辑在放行写入审批时可执行
-- unified diff patch 的基础单测已通过
-
-这些能力已经接入代码，但尚未在当前会话里完成在线验证：
-
-- DeepSeek `OpenAI-compatible` 真实远端 tool-calling
-- DeepSeek `Anthropic-compatible` 真实远端调用
-- GitHub 远端创建与推送
-
-原因：
-
-- 当前执行环境的外网访问受限，无法直接访问 `api.deepseek.com` 或 `api.github.com`
-
-## 近期计划
-
-### P0: 本机可验证性
-
-这部分优先级最高，直接决定这个项目能不能在真实环境里快速试用。
-
-- 扩展 `doctor`：已完成
-  - 检查 `DEEPSEEK_API_KEY` 是否存在并以掩码显示
-  - 显示当前 `base_url` 模式（OpenAI vs Anthropic 兼容）与对应 endpoint
-  - 提示 OpenAI / Anthropic 两条路径
-  - 通过 curl HEAD 探测联网前置条件
-- 增加 `smoke` 命令：已完成
-  - 单次最小远端调用，输出 http_status / duration_ms / 助手回复（截断）
-  - 通过 `--flavor openai|anthropic` 单独验证两条路径
-  - 缺 key、curl 缺失、非 2xx 都给出明确错误
-- 提供 `.dscode/config.toml` 示例文件：已完成
-  - `.dscode/config.example.toml` 含完整 key 注释与两种 base_url 模式说明
-  - 通过 `cp .dscode/config.example.toml .dscode/config.toml && deepseek doctor` 验证可解析
-
-### P1: 真实编辑能力增强：已完成
-
-- 扩展 `apply_patch`：已完成
-  - 失败诊断分类（缺文件 / hunk 失败 / 已应用 / 格式错误）
-  - 多文件 patch 支持，成功摘要按 modified / created / deleted / renamed 分项
-  - cwd 路径范围限制（拒绝 `..` 逃逸与 cwd 外绝对路径）
-  - 支持 git 风格 `a/` `b/` 前缀与 `/dev/null` 创建/删除标记
-- 让 planner 能生成 patch 模式编辑：已完成
-  - 离线 planner 通过 `build_single_line_diff` 构造单行 unified diff，附正确 `cwd` 与基名
-  - patch 不可构造（多匹配 / 多行 / 缺文件）时回退到文本替换
-- 在 patch 应用后自动查看 diff、必要时继续修复：已完成基础版
-  - `Observation` 增加 `Ok` / `Failed` 状态，agent loop 把工具异常转为观察项而非中断
-  - `git_diff` 复核仅在 `apply_patch` 成功后才触发
-  - patch 模式失败 + 同一编辑可文本替换时单次回退重试
-
-### P2: Anthropic 兼容路径补齐：已完成
-
-- ~~将 Anthropic 路径从 JSON plan 回退升级为正式 tool use~~（已完成）
-- ~~对齐 OpenAI-compatible 路径的能力边界~~（已完成；同一组工具描述同时下发）
-- ~~统一远端结果到同一个 `ModelAction` 抽象~~（已完成；两条路径都返回 `ModelAction::CallTool` / `Finish`）
-
-### P3: 审批与执行体验：已完成基础版
-
-- ~~增加真正的审批交互~~（已完成）
-  - `apply_patch` 写入前在 stderr 输出 `Apply patch in <path>? [y/N]:` 并读 stdin
-  - `run_shell` 执行前输出 `Run shell command in <cwd>: \`<command>\`? [y/N]:`
-  - agent `mcp_call` 调用远端 MCP tool 前输出 `Call MCP tool <server>/<tool>? [y/N]:`
-  - `approval.mcp_call_allowlist` 可把 agent MCP 调用限制到 `server/tool`、`server/*` 或 `*/tool`
-  - 非 TTY 默认拒绝（安全 fallback）；`DSCODE_AUTO_APPROVE_WRITES=1` / `DSCODE_AUTO_APPROVE_SHELL=1` / `DSCODE_AUTO_APPROVE_MCP=1` 可一次性放行对应类别
-- ~~区分”策略拒绝”和”工具失败”~~（已完成）
-  - `AppErrorKind` 枚举：`Other / PolicyDenied / ToolFailure`
-  - `policy_denied()` / `tool_failure()` 构造器
-  - agent loop renderer 输出 `✓/✗/⊘ name [observation_kind]`（TTY）或 `OK:/ERR:/DENIED: name [observation_kind]`（非 TTY），区分 PolicyDenied 与 ToolFailure
-- 更清晰的错误输出：进行中（DENIED 与 FAILED 区分已落地）
-
-### P4: 上下文与稳定性：已完成基础版
-
-- ~~更细的 observation 类型划分~~（已完成，含 file_excerpt / listing / search_results / patch / diff / shell_output / other 七类）
-- ~~更稳定的摘要与裁剪策略~~（已完成；shell 尾部 / 文件头部 / diff 保留 hunk header）
-- ~~降低大输出反复回填造成的上下文污染~~（已完成；同类只保留最新观察，旧的转为 superseded 桩）
-
-## 完整 Roadmap
-
-### Phase 0: 项目打底
-
-- 项目定位和范围确定
-- 架构分层
-- 文档建立
-
-状态：已完成
-
-### Phase 1: Rust CLI 骨架
-
-- CLI 命令入口
-- 配置和 session 基础设施
-- 模块边界和目录结构
-
-状态：已完成
-
-### Phase 2: 本地工具闭环
-
-- 文件读取
-- 搜索
-- patch
-- shell
-- diff
-
-状态：已完成基础版
-
-### Phase 3: 离线 planner loop
-
-- 任务输入
-- observation 回填
-- 工具调用循环
-- skill 提示增强
-
-状态：已完成基础版
-
-### Phase 4: DeepSeek 远端能力
-
-- 远端调用接入
-- OpenAI-compatible path
-- Anthropic-compatible path
-- 回退机制
-
-状态：
-
-- OpenAI-compatible：已完成基础版，已接正式 tool-calling
-- Anthropic-compatible：已完成基础版，已接正式 `tool_use` content block
-
-### Phase 5: 执行策略与安全
-
-- `allowed_tools`
-- `shell_allowlist`
-- 写入/命令审批
-
-状态：已完成基础版
-
-### Phase 6: 体验打磨
-
-- doctor：已完成扩展版（workspace / model / api key / network / hints / github 六段输出）
-- smoke：已完成（OpenAI 与 Anthropic 兼容路径独立验证）
-- diff 展示：基础版已具备
-- 更好的报错：已完成基础版
-  - `AppError` 增加 `hint` 与 `source` 字段
-  - 关键失败模式自动附 actionable 提示（gh auth / branch mismatch / non-TTY 拒绝 / 等 9 类）
-  - 因果链通过 `<dyn Error>::source()` 暴露给下游
-- 更好的上下文摘要：已完成基础版
-  - superseded 观察保留首行 + 行数（截 80 字符）而非通用桩，给 planner 留弱信号
-
-状态：基础版完成
-
-### Phase 7: 更强编辑能力
-
-- 多文件 patch：已完成（含路径范围校验与失败分类）
-- 更稳定的 edit-retry loop：已完成基础版（apply_patch patch→text 单次回退；工具异常变观察项）
-- 更像真实 code agent 的最小步编辑策略：已完成基础版
-  - `build_single_line_diff` 直接成功时 planner 跳过 list_files / read_file
-  - 简单 `replace "X" with "Y" in path` 任务 4 步压到 2 步（apply_patch + git_diff）
-
-状态：基础版完成
-
-### Phase 8: 高级能力
-
-- PR/CI 集成（v1，已完成基础版）
-  - `deepseek pr review <pr>` —— 只读 review，输出 markdown 到 stdout / 文件 / `gh pr comment`
-  - `deepseek pr fix <pr>` —— 抓首个失败 CI job，本地复现并迭代修复（12 步预算）
-  - `deepseek pr patch <pr>` —— 提改动到工作区；`--commit` 在干净工作区时自动 commit（不 push）
-  - 三命令共享 `gh auth` 检查、PR 上下文获取、prefilled observations 注入
-  - 所有写入与 shell 仍走 P3 confirm
-  - `deepseek doctor` 新增 `[github]` 段，显示 `gh` 版本与 auth 状态
-- 更强语言特化：未开始
-- IDE 集成：未开始
-- 多 agent：未开始
-
-状态：进行中（PR/CI 一项基础版完成）
-
-### Phase 9: 交互式体验
-
-- REPL (`deepseek chat` / `deepseek repl` / `deepseek interactive`)：v1 已完成
-  - 后续默认入口已调整：真实 TTY 中裸 `deepseek` 进入 TUI workbench
-  - 持久化 stdin 循环 + `> ` 提示符 + TTY 守门
-  - 跨轮 transcript 完整传给 LLM (user / assistant / tool 三类 turn)
-  - 9 个 slash 命令：`/quit /help /clear /budget /skill /diff /save /load /cost`
-  - 默认 20 步预算，`/budget N` 可调 (1..200)
-  - JSON 单文件 session 持久化（`/save` 原子写入；`/load` 严格校验）
-  - Token usage 累计（OpenAI / Anthropic 兼容路径）
-  - 老 turn 裁剪：assistant 保留最新 3 条全文；tool 输出走 `summarize_for_kind`
-- v2 已完成：流式 token 输出（DeepSeek SSE，2026-04-30）
-  - `util::sse::read_frame` 通用 SSE 框解析器
-  - `StreamEvents` trait + `TtyRenderer`（cyan / yellow / green / red ANSI conditional on TTY）
-  - `ModelClient::respond` 接收 `&mut dyn StreamEvents`
-  - DeepSeek 流式 OpenAI + Anthropic 双协议（curl `-N`）
-  - 离线 planner 也走 `StreamEvents`，颜色一致
-  - 175 → 198 测试，0 新依赖
-- v3 候选（未开始）：
-  - 上下箭头历史（rustyline 或自写 raw mode）
-  - Ctrl+C 优雅中断
-  - 自动保存 / `/sessions` 列表
-
-状态：v2 完成（streaming SSE）
-
-### Phase 10a — TodoTool
-
-- `todo_write` 工具：Claude Code 风格 task list；LLM 主动维护，跨 REPL 轮持久
-- `Todos:` 块每轮注入 user prompt（`render_for_prompt`）
-- 强 nudge 注入 system prompt（3+ 步任务必用、`in_progress` 唯一性、active_form 用于 in_progress 渲染）
-- session schema v1 → v2，自动迁移；v1 加载到空 todos，下次 `/save` 升级为 v2
-- `/todos` slash 命令读检视当前列表
-- transcript elision：旧的 `todos` ObservationKind 同类只保留最新
-- CR-1 解耦：user 看完整 list（`output.summary`），observation/transcript 走 trim（防 context 泄漏）
-- AgentLoop 增 `run_with_client<C: ModelClient>` 注入 seam（regression test 验证 CR-1）
-- 221 → 264 tests，0 新依赖
-
-状态：已完成（2026-05-01）
-
-### Phase 10c — Agent loop 实用性补强（dogfood 驱动）
-
-`dscode run` 多 agent dashboard dogfood（2026-05-02）暴露 4 类需求：
-
-**已完成：**
-
-- **10c-1 (`feat/todo-tool` merged)** — `recent_steps` replay：`AgentLoop::run_with_client` 把最近 3 步 assistant message 注入下一轮 `ModelRequest`，`build_user_prompt` 渲染 "Recent agent steps" block。补齐 `dscode run` 与 REPL transcript 的能力差。+2 tests。
-- **10c-1 周边** — `dscode run --budget N` flag（与 REPL `/budget` 对齐 1..=200）；`run_shell` allowlist 扩 `curl/wget/gh/mkdir/cat/echo/head/tail`（agentic 调研工作流）。
-- **10c-2 (`feat/loop-progress`)** — repeat-call detection：滑窗 3 步内同 `(tool_name, args)` 指纹，第 2 次执行后 observation summary 追加 `[stuck-warning]`，第 3 次直接短路返 `tool_failure(repeated identical tool call detected)`。+3 tests, 269 total。
-  - dogfood 实测（2026-05-03 retry research）：机制完全工作，stuck-warning 正确触发，第 3 次正确短路。**但暴露下一层问题**：v4-pro 写"Let me start fresh with actual research"却继续做 mkdir/todo_write 振荡（ABAB 模式绕过 fingerprint 检测），**从未调用 gh/curl**。LLM planning 短板，不是机制问题。
-
-**已完成：**
-
-- **10c-3 (`main`, 2026-05-05)** — Empty workspace bootstrap：当 `workspace` 为空、task 命中 research 关键词、且 `run_shell` 可用时，agent loop 注入强制 research bootstrap nudge。
-  - Step 1 必须是真实 research 调用：`gh search ...` 或 `curl -sSL ...`
-  - 禁止以 `todo_write / mkdir / list_files / setup-only shell` 开局
-  - 仅在“空工作区 + 调研任务”触发，避免污染正常代码任务
-  - 单测覆盖关键词识别、空目录判定、`run_shell` 可用性守门
-- **10c-4 (`main`, 2026-05-05)** — LLM-driven planner（v1）：复杂任务进入 explicit planning mode，先产出 todo plan，再按 `in_progress` 步执行。
-  - `AgentLoop` 基于 task/skill/tool 可用性启发式开启 `planning_mode`
-  - 首轮 system prompt 强制 `todo_write` 产出 3-7 条 concrete plan；已有 plan 时切换为 execution nudge
-  - `build_user_prompt` 渲染 `Execution plan` 与 `Current plan step`
-  - 离线 fallback planner 在 `planning_mode` 下也先生成 `todo_write` 计划，保持远端/离线路径一致
-  - 300 tests 全绿，含 planning heuristic / prompt rendering / offline bootstrap 回归
-- **10b (`main`, 2026-05-05)** — Sub-agent dispatch（v1）：新增 `dispatch_subagent` tool，把独立子任务派发给 child loop，带独立 todo list / budget / transcript。
-  - `ToolRegistry` 按 depth 注入 `dispatch_subagent`，当时默认只允许一层 child，避免无限递归（后续 Phase 11+ 已放宽为两层上限）
-  - child loop 复用同一套 runtime / skill / policy 逻辑，但关闭 banner、stream 输出与 session 持久化
-  - `dispatch_subagent` 支持 `task`、可选 `skill`、可选 `steps`，返回 child tool calls + final message 摘要
-  - system prompt 新增 sub-agent delegation nudge，仅在工具可用时出现
-  - 离线 planner 在“已有 plan + 探索型 todo + 尚未 dispatch”场景下会主动派一次 child，避免该能力只在强模型上可见
-  - child 成功返回后，若命中当前 `in_progress` exploration step，父 todo 会自动标完成并推进到下一条 pending
-  - 319 tests 全绿，含 registry depth guard、tool schema、nested loop 回归、offline dispatch heuristic、parent todo auto-advance
-
-**dogfood 累计发现的真实数据：**
-- ✅ DeepSeek v4-pro 在 **bounded 多步任务**上完全 work（todo_write 列表、状态切换 in_progress→completed、跨步 transcript replay 都正确）
-- ❌ DeepSeek v4-pro 在 **open-ended bootstrap** 上做不到从 setup 切到 research（mkdir+todo_write 振荡，10c-2 机制对但 LLM 不用）
-- ❌ DeepSeek v4-flash 不主动用 todo_write（v4-pro 主动用）— nudge 在小模型上效果弱
-- ✅ DeepSeek v4-flash + v4-pro 都用 OpenAI 并行 tool calls — Phase 10a C3 fail-loud 守门救了我们
-- ✅ `dscode chat` REPL transcript 始终工作正常；`dscode run` 多步任务能力差距由 10c-1 部分修复，10c-2 进一步加固
-
-状态：进行中（10b + 10c-1 + 10c-2 + 10c-3 + 10c-4 完成）
-
-### Phase 10d — Skills 拓展
-
-**10d-1 (`feat/skills-expansion`) — 已完成 (2026-05-03)**：
-- 12 个新 skill toml ship 到仓库 `skills/` （research / refactor / debug / write-tests / dependency-update / rust-clippy / python-mypy / pr-fix-feedback / brainstorm / verify-changes / commit-message / readme-update）
-- 用户级目录 `~/.config/dscode/skills/` 加载支持，可经 `workspace.user_skills_dir` 配置
-- last-wins 撞名语义（user override repo）
-- `SkillRegistry::load_dirs(&[paths])` + `LoadStats` 报告 per-path 计数 + override 列表
-- `dscode doctor` 加 `[skills]` 段
-- 273 → 285 tests, 0 新依赖
-
-**10d-2 (`main`, 2026-05-05) — 已完成**：
-- `SkillSpec` schema v2：新增 `triggers` / `initial_todos` / `references`
-- 手写 TOML loader 兼容老 schema，同时支持 `[[initial_todos]]` tables
-- `AgentLoop` 选中 skill 时可用 `initial_todos` seed 首轮 todo plan
-- skill `references` 进入 prompt 和 CLI 输出，便于给模型稳定上下文
-- 仓库代表性 skills（research / debug / refactor / write-tests / verify-changes）已升级到 v2 字段
-- 300 tests 全绿，含 schema 解析、todo seed、prompt rendering 回归
-
-**10d-3 (`main`, 2026-05-05) — 已完成**：
-- 当用户未显式传 `--skill` 时，`resolve_skill` 会基于 `triggers` 从 task 文本自动匹配最相关 skill
-- 显式 `--skill` 仍然优先，不会被 auto-select 覆盖
-- `AgentLoop` 会打印 skill 来源：`explicit` 或 `auto (trigger match)`
-- auto-select 与 10d-2 的 `initial_todos` / `references` 联动，自动选中的 skill 也能 seed todo 和补 prompt 上下文
-- 覆盖 resolver 排序、显式优先、runtime auto-seed 回归；全量测试更新到 308+
-
-状态：10d-1 + 10d-2 + 10d-3 完成
-
-### Phase 10e — Benchmark / Dogfood 基线
-
-**10e-1 (`main`, 2026-05-05) — 已完成基础版**：
-- 新增 `benchmark` CLI，读取无依赖 manifest，顺序执行一组 task case，并输出 markdown report
-- manifest 支持 `name / task / skill / budget / expect_tool / expect_message_contains`
-- 默认路径：
-  - manifest: `.dscode/benchmarks.txt`
-  - report: `.dscode/benchmarks/latest.md`
-- 新增示例文件 [`.dscode/benchmarks.example.txt`](/home/willamhou/codes/DeepseekCode/.dscode/benchmarks.example.txt)
-- 新增 `deepseek` binary launcher，和 `dscode` 共用同一入口逻辑
-
-**10e-2 (`main`, 2026-05-05) — 已完成基础版**：
-- benchmark manifest 扩展了更强约束：
-  - `forbid_tool`
-  - `min_tool_calls`
-  - `max_tool_calls`
-  - `max_failed_tools`
-  - `notes`
-- report 现在会输出：
-  - 总 tool calls
-  - 总 failed tool calls
-  - 每 case 的 failure summary
-- 仓库新增默认基线 [`.dscode/benchmarks.txt`](/home/willamhou/codes/DeepseekCode/.dscode/benchmarks.txt)，覆盖 repo inspection / code search / roadmap read / explicit planning 四类只读任务
-- 该基线允许“带失败地暴露回归”，不是只追求全绿；它的作用是给 planner/recovery 提供稳定对比面
-
-**10e-3 (`main`, 2026-05-05) — 已完成基础版**：
-- planner 对 lookup / planning-only 任务做了首轮收敛：
-  - code lookup 类 task 优先 `search_text`，不再先 `dispatch_subagent` 或错误地从 `Cargo.toml` 开始
-  - “before acting / report execution steps” 类 task 在 `todo_write` 后直接收尾，不再继续 repository inspection
-- 默认 benchmark 基线从 `2/4` 提升到 `4/4`
-- 该轮优化直接减少了无效 tool hops，为下一步 failure-recovery 留出更干净的 baseline
-
-**10e-4 (`main`, 2026-05-05) — 已完成基础版**：
-- runtime 现在会在常见断点后注入结构化 `recovery_hint` observation，而不是只把错误文本丢给模型自己发挥：
-  - `search_text` 无结果
-  - `read_file` 失败
-  - `dispatch_subagent` 失败
-  - `run_shell` 非零退出 / 失败
-- offline planner 会优先消费最新的 `recovery_hint`，走确定性的恢复路径：
-  - `search_text -> list_files`
-  - `read_file -> search_text`
-  - `run_shell -> git_diff/read_file/search_text`
-- lookup 路径也补了正常收敛：已有 `search_text` 命中时，优先 `read_file` 读取匹配文件，而不是退回 `Cargo.toml` 或重复 `list_files`
-- 默认 benchmark 基线保持 `4/4`，说明 recovery 没把前一轮压下来的 tool hops 拉回去
-- 全量测试更新到 `332 passed, 0 failed`
-
-**10e-5 (`main`, 2026-05-05) — 已完成基础版**：
-- benchmark manifest 新增 `expect_tool_sequence`，可验证关键 tool 链路是否按顺序出现，而不只是“有没有调用过”
-- benchmark report 新增 `Tool Trace` 列，直接输出每个 case 的实际 tool 序列，便于看 planner 是否退化成额外 hops
-- 默认基线新增 `recover-empty-search` case，稳定覆盖自然恢复路径：
-  - `todo_write -> search_text -> list_files -> read_file`
-  - 关键断言是 `search_text -> list_files`
-- `search-subagent-flow` 也收紧成顺序断言：要求至少出现 `search_text -> read_file`
-- 默认 benchmark 基线扩到 `5` 个 case，并保持 `5/5` 通过
-
-### Phase 10f — Failure Recovery / Validation Signals
-
-**10f-1 (`main`, 2026-05-05) — 已完成基础版**：
-- `run_shell` 现在会在输出头部附带结构化元数据：
-  - `meta.command_kind`
-  - `meta.exit_code`
-  - `meta.result`
-  - `meta.failure_kind`
-  - `meta.failed_tests`
-  - `meta.stderr_summary`
-- shell observation trim 不再把这些头部字段截掉；即使 stdout/stderr 很长，planner 仍能稳定看到结构化信号
-- 当前支持的失败类型至少区分：
-  - `test_failure`
-  - `lint_failure`
-  - `build_failure`
-  - `command_failure`
-- cargo test / pytest 的失败测试名现在会被抽取到 `meta.failed_tests`
-- recovery reason 也会消费这些字段：test failure 会把 failing test 名带进 hint，而不是只给一个泛化的 “command failed”
-- 全量测试更新到 `338 passed, 0 failed`
-
-**10f-2 (`main`, 2026-05-05) — 已完成基础版**：
-- runtime 现在会在两类场景下注入结构化 `replan_hint` observation：
-  - 最近步骤里连续出现多个 `recovery_hint`
-  - `dispatch_subagent` 返回 `child outcome: blocked`
-- `dispatch_subagent` 摘要新增：
-  - `child failed tool calls`
-  - `child outcome`
-- offline planner 遇到最新 observation 为 `replan_hint` 时，会优先回到 `todo_write` 重排父计划，而不是沿着旧 todo 继续硬顶
-- replan 后的新计划会以 “Reassess the plan using the latest blocker or recovery signal” 开头，再接常规搜索 / 阅读 / 修改 / 验证步骤
-- 全量测试更新到 `342 passed, 0 failed`
-
-**10f-3 (`main`, 2026-05-05) — 已完成基础版**：
-- benchmark 现在支持 `seed_observations`，可以在 case 开始前注入结构化 observation，稳定复现 recovery / replan 前态，而不需要依赖真实失败环境
-- manifest 支持 `\n` 转义，因此可以内嵌多行 shell observation（含结构化 `meta.*` 字段）
-- 默认 benchmark 基线新增 2 个 seeded recovery case：
-  - `recover-read-file-failure`
-  - `recover-failed-validation`
-- 这样当前默认基线一共覆盖 7 个 case，其中恢复类同时覆盖：
-  - 自然 `search_text -> list_files`
-  - seeded `read_file failed -> search_text`
-  - seeded `failed validation -> git_diff`
-- benchmark report 新增 `Notes` 列，便于直接区分 baseline / natural recovery / seeded recovery 的意图
-- 默认 benchmark 基线更新为 `7/7` 通过
-
-**10f-4 (`main`, 2026-05-05) — 已完成基础版**：
-- shell recovery 现在会按 `failure_kind` 分流，而不是对所有 `run_shell` 失败统一走 `git_diff / read_file / search_text`
-- `test_failure` 路线：
-  - 若刚成功过 `apply_patch` 且可用 `git_diff`，优先复核 diff
-  - 若 `meta.failed_tests` 能提取出文件路径，优先 `read_file <that file>`
-  - 否则退回 `primary_file` 或通用恢复路径
-- `lint_failure / build_failure` 路线：
-  - 若 `meta.stderr_summary` 能提取标识符或引用符号，优先 `search_text <derived query>`
-  - 否则优先回到 `primary_file`
-- `recovery_hint` 现在可携带结构化 `query=` 与 `path=`；offline planner 会严格消费这些字段，而不是重新从 task 文本猜参数
-- 默认 benchmark 基线扩到 `9` 个 case，新增：
-  - `recover-lint-failure`
-  - `recover-test-file-path`
-- 默认 benchmark 结果提升并稳定在 `9/9` 通过
-- 全量测试继续保持全绿，并覆盖 failure-kind-aware routing 与 planner 参数消费回归
-
-### Phase 10g — Todo Matching / Fixture Realism
-
-**10g-1 (`main`, 2026-05-05) — 已完成基础版**：
-- parent todo auto-advance 不再靠 `delegated_task.contains(todo.content)` 这种宽松字符串包含关系命中
-- planner 生成的 `dispatch_subagent` 任务现在显式携带：
-  - `Delegated todo step: <exact parent todo>`
-  - `Parent task: <full user task>`
-- `TodoList::complete_in_progress_matching_subagent_task` 优先解析并精确匹配这个 delegated marker；只有旧格式 fallback 才允许“规范化后完全相等”的保守匹配
-- 这避免了“内容相似但语义不同”的 delegated task 错误地把父级当前 `in_progress` todo 自动标为完成
-- 新增回归覆盖：
-  - marker 精确命中会推进
-  - 相似但不同的步骤不会误命中
-  - 旧格式 fallback 仍兼容 exact normalized match
-- 默认 benchmark 保持 `9/9` 通过
-- 全量测试更新到 `351 passed, 0 failed`
-
-**10g-2 (`main`, 2026-05-05) — 已完成基础版**：
-- benchmark manifest 新增 `workdir` 字段，case 可以切到 manifest 相对路径下的真实 fixture 目录运行，而不再只依赖 seeded observations
-- benchmark runner 现在会：
-  - 将 `workdir` 解析为 manifest-relative 路径
-  - 在受锁保护的临时 cwd 中执行单个 case
-  - 在 report 里输出 `Workdir` 列，便于直接区分 repo-root baseline 与 fixture baseline
-- 仓库新增真实小仓库 fixture：
-  - [`.dscode/fixtures/rust-cli-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/rust-cli-mini/Cargo.toml)
-- 默认 benchmark 基线扩到 `13` 个 case，其中新增 4 个 fixture case，覆盖：
-  - read-only inspection
-  - `search_text -> read_file`
-  - 自然 `search_text -> list_files` recovery
-  - seeded `test_failure -> read_file` recovery in fixture workdir
-- 这一轮还顺手修了一个 planner 偏差：lookup-heavy task 不再因为 Rust profile 自动把 `cargo test` 塞进 todo plan 或执行路径
-- 默认 benchmark 更新为 `13/13` 通过
-
-**10g-3 (`main`, 2026-05-05) — 已完成基础版**：
-- `dispatch_subagent` 摘要现在在顶部输出结构化 `meta.child_*` 行，而不只是自由文本：
-  - `meta.child_task`
-  - `meta.child_skill`
-  - `meta.child_budget`
-  - `meta.child_tool_calls`
-  - `meta.child_failed_tool_calls`
-  - `meta.child_outcome`
-  - `meta.child_files`
-  - `meta.child_final_message`
-- child files 会从 subagent 的 `read_file / search_text / list_files` 结果里抽取去重后的文件列表，减少父 planner 从长文本里猜上下文
-- parent runtime 的 blocker 检测现在优先按 `meta.child_outcome=blocked` 解析，而不是只做脆弱的字符串包含判断
-- 这给后续 parent planner 消费 child findings / files / blockers 留出了稳定落点
-
-**10g-4 (`main`, 2026-05-06) — 已完成基础版**：
-- 新增 `dogfood` CLI：
-  - `deepseek dogfood run [--skill <name>] [--budget <n>] [--outcome success|failed|stuck|manual] [--manual-intervention] [--notes "..."] "<task>"`
-  - `deepseek dogfood report [--out <file>] [--limit <n>]`
-- live dogfood ledger 采用 append-only `jsonl`：
-  - [`.dscode/dogfood/ledger.jsonl`](/home/willamhou/codes/DeepseekCode/.dscode/dogfood/ledger.jsonl)
-  - 每条记录包含 task / skill / budget / model / workdir / outcome / manual_intervention / tool_calls / failed_tool_calls / repeated_call_failures / used_subagent / final_message / tool_trace
-- `dogfood run` 会在真实 `AgentLoop` 执行后自动：
-  - 推导默认 outcome（`stuck` 优先于 `failed`，否则 `success`）
-  - 追加 ledger 记录
-  - 刷新 markdown 汇总报告 [`.dscode/dogfood/latest.md`](/home/willamhou/codes/DeepseekCode/.dscode/dogfood/latest.md)
-- `dogfood report` 会重新汇总历史记录，输出：
-  - success rate
-  - failed rate
-  - stuck rate
-  - manual intervention rate
-  - average tool calls
-- 这一步把 benchmark 之外的真实任务结果也纳入可比对面，后续 planner / recovery / subagent 迭代不再只靠 synthetic baseline
-
-**10h-1 (`main`, 2026-05-06) — 已完成基础版**：
-- parent planner 现在会先消费 `dispatch_subagent` 的结构化 child findings，再决定下一步：
-  - 优先读 `meta.child_files`
-  - 若没有显式 child files，也会从 `meta.child_final_message` 里回收形如 `src/main.rs` 的路径
-  - 只有任务本身没有 query、child 也没给文件、而且 parent 还没进入 `read_file` 时，才会把 `meta.child_final_message` 里的符号当成 fallback `search_text` query
-- 这轮顺手收紧了 child query 的使用顺序，避免 parent 在已经读过相关文件后，又被 child final message 拖去多跑一轮 `search_text`
-- 新增回归覆盖：
-  - child summary 直接给文件路径时，parent 会优先 `read_file`
-  - child summary 只给符号查询时，parent 才会 fallback 到 `search_text`
-  - parent 已经读取相关文件后，不会再回头消费 child query
-  - child final message 内嵌文件路径时，也能被提取成 follow-up read target
-- 默认 benchmark 保持并验证为 `13/13` 通过，其中 real fixture inspection case 回到预期链路：
-  - `todo_write -> dispatch_subagent -> list_files -> read_file`
-
-**10h-2 (`main`, 2026-05-06) — 已完成基础版**：
-- benchmark fixture 家族从单一 Rust read-only 样例扩到三类真实小仓库：
-  - Rust read/recovery fixture：[`rust-cli-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/rust-cli-mini/Cargo.toml)
-  - Python CLI fixture：[`python-cli-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/python-cli-mini/pyproject.toml)
-  - JavaScript CLI fixture：[`js-cli-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/js-cli-mini/package.json)
-- benchmark manifest 新增 `isolate_workdir = true`，runner 会把该 fixture 复制到临时目录执行，再清理副本：
-  - 这让 write+validate case 可以真实调用 `apply_patch` 和 `run_shell`
-  - 同时避免污染源 fixture 或仓库工作区
-- isolated case 在 runner 内会临时开启 `DSCODE_AUTO_APPROVE_WRITES=1`、`DSCODE_AUTO_APPROVE_SHELL=1` 和 `DSCODE_AUTO_APPROVE_MCP=1`，消除非交互 benchmark 被 confirm prompt 卡死的问题
-- 新增真实 write+validate fixture：
-  - [`rust-write-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/rust-write-mini/Cargo.toml)
-  - 基线链路稳定为 `apply_patch -> git_diff -> run_shell`
-- 这一轮还顺手修了两个真实会影响线上行为的问题：
-- direct edit parser 现在支持反引号 quoted segments，并会截断 trailing `and validate ...` / `then run ...` 子句
-- skill auto-select 在 task 已经明确要求 direct edit 时，会跳过不允许 `apply_patch` 的 skill，避免 `validate` 误触发 `verify-changes` 把写入任务带偏
-- 默认 benchmark 基线扩到 `18` 个 case，并保持 `18/18` 通过
-
-**10h-3 (`main`, 2026-05-06) — 已完成基础版**：
-- `dogfood` CLI 新增：
-  - `deepseek dogfood export-benchmark [--out <file>] [--limit <n>] [--outcome success|failed|stuck|manual]`
-- dogfood ledger 现在除了 task / outcome / tool_trace 之外，还会持久化 `benchmark_seed_observations`
-  - 内容来自最近 3 个 tool events 的可回放 seed 串
-  - 格式直接兼容 benchmark manifest 的 `seed_observations = "... || ..."` 写法
-- `dogfood report` 新增 `Benchmark seed candidates` 统计，能快速看当前 live dogfood 里有多少失败/stuck/manual 运行适合反推成 benchmark case
-- `dogfood export-benchmark` 会把失败 / stuck / manual 运行导出成可直接追加到 manifest 的草稿：
-  - 自动生成 case 名
-  - 保留 task / skill / budget
-  - 尽量把 repo-root workdir 转成相对路径
-  - 写出可复用的 `seed_observations`
-- 这轮让 dogfood -> benchmark 不再是纯手工抄写，而是有了稳定的“先跑真实任务，再抽取 regression seed”出口
-
-**10h-4 (`main`, 2026-05-06) — 已完成基础版**：
-- fixture-backed write + validate 路线现在不只覆盖 happy path，还覆盖了真实 failed-validation recovery：
-  - 成功链路：`apply_patch -> git_diff -> run_shell`
-  - 失败链路：`apply_patch -> git_diff -> run_shell -> read_file`
-- 为了让这条失败链路稳定收敛，planner recovery 做了两个收紧：
-  - 若 failed validation 的 `recovery_hint` 仍指向 `git_diff`，但 diff 在本轮已经看过，就直接回到刚刚 patch 过的文件
-  - `preferred_read_path` 现在会优先识别最近成功 `apply_patch` 的真实输出，包括 `patched <path>` 和 unified patch 的 `Applied unified patch in ... / modified:` 组合，避免 recovery 退回到无关的 `primary_file`
-- 新增真实 isolated fixture case：
-  - `fixture-recover-write-validate-rust-mini`
-  - 任务会故意把 `a - b` 改成 `a * b`，让 `cargo test` 失败，再验证 planner 是否会回读 `src/lib.rs`
-- 默认 benchmark 基线扩到 `19` 个 case，并保持 `19/19` 通过
-
-**10h-5 (`main`, 2026-05-06) — 已完成基础版**：
-- benchmark 结果现在不再只保留 latest report，还会把每次运行的汇总指标 append 到：
-  - [`.dscode/benchmarks/history.jsonl`](/home/willamhou/codes/DeepseekCode/.dscode/benchmarks/history.jsonl)
-- 每条历史记录会持久化：
-  - manifest 路径
-  - `passed/cases`
-  - `total_tool_calls`
-  - `total_failed_tool_calls`
-  - `duration_ms`
-  - 运行当时的 dogfood snapshot（`runs/success/failed/stuck/manual`）
-- benchmark report 现在新增两块趋势信息：
-  - `Previous benchmark: ... Δ ...`，直接比较最近两轮的通过数、tool calls、failed tools
-  - `## Recent Runs`，展示最近 5 次 benchmark 历史
-- 这让 benchmark 和 dogfood 不再只是“看 latest.md”，而是开始有连续的趋势面
-
-**10h-6 (`main`, 2026-05-06) — 已完成基础版**：
-- `dogfood` CLI 新增：
-  - `deepseek dogfood promote-benchmark [--manifest <file>] [--limit <n>] [--outcome success|failed|stuck|manual] [--dry-run]`
-- promotion workflow 现在是闭环的：
-  - 从 dogfood ledger 读取 replayable `benchmark_seed_observations`
-  - 对照正式 benchmark manifest 做去重
-  - 自动处理 case name 冲突
-  - 非 dry-run 模式下直接 append 到目标 manifest
-- 去重不是只看 case name，而是按 `task + skill + workdir + seed_observations` 做 identity 判断，避免同一条 regression seed 被重复提升
-- 这一步把 `dogfood run -> export seed -> 手工拷贝到 benchmarks.txt` 进一步收敛成 `dogfood run -> promote-benchmark` 的直接路径
-
-**10h-7 (`main`, 2026-05-06) — 已完成基础版**：
-- real fixture 的 failed-validation 路线现在不只会“读回刚 patch 的文件”，还会在任务明确要求“修到通过”为止时继续给出第二次修复尝试
-- planner 新增了一个很窄的 retry 通道：
-  - 仅在 task 明确包含 `until the tests pass` / `keep fixing` 这类意图时开启
-  - 仅在已经发生 `apply_patch -> git_diff -> run_shell(failed) -> read_file` 后触发
-  - 当前只对简单 arithmetic replace 做修复推断，依据 failing test 名里的 `add/sub/mul/div` 语义选择目标 operator
-- `git_diff` 和 `run_shell` 现在不再是“每轮只允许一次”，而是按“每次成功 patch 后最多再跑一次”计数，这让 retry patch 后的 diff / validation 能稳定继续执行
-- `run_shell` 会补上 `~/.cargo/bin` 到 PATH，避免 fixture benchmark 在非 login shell 下把 `cargo test` 错判成环境缺失
-- 新增真实 isolated fixture case：
-  - `fixture-retry-write-validate-rust-mini`
-- 预期链路：`apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-- 默认 benchmark 基线扩到 `20` 个 case，并提升到 `20/20` 通过
-
-**10i-1 (`main`, 2026-05-06) — 已完成基础版**：
-- benchmark 现在不再只展示历史，而是会对“同 manifest + 同 case 数”的最近可比运行做 trend gate
-- gate 规则保持很窄，优先抓明显回归：
-  - 当前 `passed` 低于最近窗口里的可比 best 时直接判回归
-  - 当前 `total_failed_tool_calls` 高于可比中位数时判回归
-  - 当前 `total_tool_calls` 高于可比中位数加容忍度时判回归
-    - 容忍度当前取 `max(3, ceil(median * 10%))`
-- gate 只在有足够历史时启用：
-  - 仅比较最近 `5` 次可比运行
-  - 至少需要 `3` 次 prior comparable runs，否则 report 标成 `skipped`
-- benchmark report 现在会直接输出：
-  - `Trend gate: pass ...`
-  - 或 `Trend gate: FAILED ...`
-  - 或 `Trend gate: skipped ...`
-- CLI 语义也收紧了：
-  - report 和 history 仍然会照常写出
-  - 但如果 trend gate 失败，`deepseek benchmark`（兼容别名 `dscode benchmark`）会返回非零退出，便于直接挂到 CI 或本地回归门禁上
-
-**10i-2 (`main`, 2026-05-06) — 已完成基础版**：
-- `promote-benchmark` 现在不再默认把所有 replayable non-success seed 都提升进正式基线，而是增加了一层 promotion policy
-- 默认 policy 只接受更像真实 regression 的记录：
-  - outcome 默认只接受 `failed` / `stuck`
-  - 必须有真实失败信号：`failed_tool_calls > 0`，或 `repeated_call_failures > 0`
-  - 必须有真实 tool trace，且总 tool calls 不超过 `8`
-- `manual` 记录不会再被默认 promote；只有显式传 `--outcome manual` 时才允许进入 promotion 流程
-- `export-benchmark` 仍保持宽松，继续作为“先导出草稿再人工挑选”的出口；收紧只发生在 `promote-benchmark`
-- promotion 命令输出现在会把筛选结果拆开显示：
-  - `duplicates skipped`
-  - `policy skipped`
-  - `selected`
-  这样更容易看出是“没有候选”，还是“候选被策略拦住了”
-
-**10i-3 (`main`, 2026-05-06) — 已完成基础版**：
-- benchmark case schema 新增了：
-  - `expect_last_tool_output_contains = "..."`
-- 这让 fixture write+validate case 的判定不再只停留在“tool trace 对了”，而是能直接要求最终关键工具输出满足某个结果语义
-- 默认基线里三条 Rust write/validate 相关 case 已切到更强的成功语义：
-  - `fixture-write-validate-rust-mini` 现在要求最后一个 `run_shell` 输出包含 `meta.result=ok`
-  - `fixture-recover-write-validate-rust-mini` 现在要求最后一个 `read_file` 真的读回了坏改动后的内容
-  - `fixture-retry-write-validate-rust-mini` 现在要求最后一个 `run_shell` 输出包含 `meta.result=ok`
-- 这一步让 benchmark 开始验证“最终命令真的成功/失败到了预期状态”，而不是只验证 planner 有没有走过看起来像对的链路
-
-**10i-4 (`main`, 2026-05-06) — 已完成基础版**：
-- benchmark case schema 现在支持显式 `category = "..."`，默认基线已经按能力切成：
-  - `read_only`
-  - `write_validate`
-  - `recovery`
-  - `subagent`
-  - `planning`
-- benchmark history 现在不只落总量，还会把每个 category 的：
-  - `cases`
-  - `passed`
-  - `total_tool_calls`
-  - `total_failed_tool_calls`
-  一起写进 history record
-- benchmark report 新增 `## Category Slices` 表，直接展示每个 slice 的：
-  - 当前通过数 / case 数
-  - 当前 tool calls / failed tools
-  - 相对上一次 benchmark 的 delta
-  - 该 slice 自己的 trend gate 状态
-- trend gate 也从“只看总量”升级成“总量 + category slice”双层判断：
-  - overall 规则保持不变
-  - 每个 category 会单独对比最近可比运行
-  - 如果某个 slice 的 tool calls / failed tools / passed 出现明显回归，即使总量看起来没坏，也会把 gate 打红
-- 这一步解决了一个关键盲点：read-only baseline 的稳定性不再能掩盖 write/recovery/subagent 的真实退化
-
-**10i-5 (`main`, 2026-05-07) — 已完成基础版**：
-- `dogfood promote-benchmark` 现在不只输出：
-  - `selected`
-  - `duplicates skipped`
-  - `policy skipped`
-  还会在存在 policy reject 时打印 `policy skip reasons`
-- explainability 现在按原因聚合，并为每类原因带一条示例 task，当前覆盖的 policy reason 包括：
-  - `manual outcome requires --outcome manual`
-  - `tool trace too long (>8 calls)`
-  - `missing real tool trace`
-  - `missing failed/stuck/manual signal`
-- policy 本身没有放宽；改动只发生在 explainability 层，所以 promotion 结果不会因为这轮变化而变得更松或更紧
-- dogfood promotion 的 case identity 和导出 block 仍然保留 `category`，因此不同 slice 的 regression seed 不会因为 explainability 改动被错误去重
-- 这一步解决的是“为什么没 promote 进去”不可见的问题，让人工挑 seed 时能快速判断：
-  - 是默认 policy 太严格
-  - 还是这条 dogfood 记录本身就不像一个好 benchmark regression seed
-
-**10i-6 (`main`, 2026-05-07) — 已完成基础版**：
-- benchmark manifest 新增：
-  - `expect_tool_output_contains = "tool_name:needle"`
-  - `expect_tool_input_contains = "tool_name:key=value"`（2026-05-13 remote PR semantic review slice）
-- 语义是：目标 tool 最后一次调用的输出必须包含给定 `needle`
-- input 断言语义是：目标 tool 最后一次调用的指定 input key 必须包含给定 value
-- 这让 benchmark 不再只能断言：
-  - final message
-  - last tool output
-  而是可以稳定检查“中间某一步关键工具到底产出了什么”
-- 默认 write+validate 基线已经接上这套更细的断言：
-  - `fixture-recover-write-validate-rust-mini` 现在会显式要求 `run_shell` 输出包含 `meta.failure_kind=test_failure`
-  - `fixture-retry-write-validate-rust-mini` 现在会显式要求中间那次 `read_file` 真的读回了坏改动内容
-- 这一步把 benchmark 从“路径看起来像对了”进一步推到“关键中间状态也真的出现了”
-
-**10i-7 (`main`, 2026-05-07) — 已完成基础版**：
-- dogfood ledger 现在会直接持久化：
-  - `benchmark_category`
-- 新产生的 dogfood 记录会在写入时就把 benchmark category 算好，而不是等到 `export-benchmark` / `promote-benchmark` 时再临时推断
-- `dogfood export-benchmark` 现在会直接输出：
-  - `category = "..."`
-  并优先使用 ledger 里的真实 category
-- `dogfood promote-benchmark` 也同样优先吃 ledger category；老的 ledger 行如果没有这个字段，会在读取时自动 fallback 推断并补回内存对象
-- 这一步的价值不在“今天导出的文本多一行”，而在于：
-  - category 决策点前移到 dogfood 采集时
-  - export / promote / de-dup / later analytics 都开始共享同一份 category truth，而不是各自猜一遍
-
-**10i-8 (`main`, 2026-05-07) — 已完成基础版**：
-- benchmark 的 category slice trend gate 现在支持 warmup：
-  - 当历史里既有老的 `version=1` overall run，又只有少量新的 `version=2` category-aware run 时
-  - 会优先用已有的 category-aware baseline 去保守投影旧 v1 run 的 slice metrics
-- 这个投影只用于：
-  - 让 `comparable_runs` 更快达到门槛
-  - 让 `planning / read_only / recovery / subagent / write_validate` 更早开始做 slice-level gate
-- 它不会用当前 run 反推自己，也不会在完全没有真实 category baseline 时硬猜：
-  - 没有至少一条历史 category-aware 记录时，slice gate 仍然保持 `skipped`
-- 真实效果是：
-  - category 区块已经从 `skipped (0/3)` 收紧成 `pass vs 5 runs`
-  - 而不是继续等更多纯 v2 history 自然积累
-- 这一步的价值在于：
-- mixed-history 迁移阶段更快进入可比状态
-- slice regression 更早暴露
-- 但 overall/category 的 pass/fail 阈值没有被放松
-
-**10i-9 (`main`, 2026-05-07) — 已完成基础版**：
-- benchmark manifest 现在支持：
-  - `assertion_bundle = "..."`
-- bundle 只提供默认断言组合，case 里显式写的字段仍然优先覆盖，因此它的作用是：
-  - 收短重复的 `expect_tool / expect_tool_sequence / max_tool_calls / max_failed_tools`
-  - 但不损失那些 case-specific 的中间状态断言
-- 当前内置 bundle 覆盖了最常见的几类链路：
-  - `read_only_inspect`
-  - `read_only_search`
-  - `recovery_search_fallback`
-  - `recovery_readback_then_search`
-  - `recovery_diff_then_readback`
-  - `recovery_search_then_readback`
-  - `write_validate_ok`
-  - `write_validate_failure_readback`
-  - `write_validate_retry_ok`
-  - `planning_todo_only`
-- 默认 benchmark manifest 和 example manifest 已经切到这套 bundle：
-  - 重复断言明显减少
-  - 但像 `read_file:2     a * b` 这种 case-specific output check 仍然保留在具体 case 上
-- parser 回归覆盖了三条关键边界：
-  - bundle 默认值会生效
-  - 显式字段可以覆盖 bundle 默认值
-  - 未知 bundle 会直接报错
-- 真实 benchmark 继续保持 `20/20` 通过，说明这是结构收敛，不是语义漂移
-
-**10i-10 (`main`, 2026-05-07) — 已完成基础版**：
-- dogfood markdown report 现在新增：
-  - `## Category Breakdown`
-- 会按 ledger 里的 `benchmark_category` 聚合每个 slice 的：
-  - `runs`
-  - `success`
-  - `failed`
-  - `stuck`
-  - `manual`
-  - `avg tool calls`
-  - `seed candidates`
-- 明细表也新增了 `Category` 列，因此从 summary 到单条 run 都能直接看到真实任务落在哪个 slice
-- 这一步的意义是把 benchmark 的 category 视角延伸到 live dogfood：
-  - benchmark 已经知道 `read_only / recovery / write_validate / planning / subagent`
-  - dogfood 现在也能按同一套切面展示真实任务分布和成功率
-- 当前真实 ledger 样本还很小，所以 category 分布暂时不代表趋势；但报表结构已经稳定，可以开始持续积累
-- 这一步还顺手暴露了一个后续该修的问题：
-  - 当前部分 read-only dogfood 任务因为 trace 里先出现 `todo_write`，会被 heuristic 归到 `planning`
-  - 这是 category inference 精度问题，不是 report 聚合问题
-
-**10i-11 (`main`, 2026-05-07) — 已完成基础版**：
-- dogfood category inference 不再把“任何带 `todo_write` 的任务”都粗暴归到 `planning`
-- 新的优先级更接近 task 语义，而不是只看第一步用了什么 tool：
-  - `write_validate` 仍优先由 `apply_patch / run_shell` 判定
-  - `recovery` 仍优先由 `recovery_hint / failed_tool_calls / repeated_call_failures` 判定
-  - `subagent` 主要看 task 是否真的在讨论 subagent / parent-child loop
-  - `planning` 只保留给明确的 plan-only task，或 trace 里只有 `todo_write` 这类前置 planning 行为
-  - 其他正常只读分析任务回到 `read_only`
-- 为了不让旧 ledger 长期带着错误标签，读取 dogfood 记录时还加了一层保守纠偏：
-  - 如果历史记录存的是 `planning`
-  - 但按新规则它明显不是 planning
-  - report/export/promote 会优先用纠偏后的 category
-- 真实效果是：
-  - 现有 dogfood report 里那条 `inspect repository layout ...` 已经从 `planning` 修正为 `read_only`
-  - planning-only case 仍然保持 `planning`
-
-**10i-12 (`main`, 2026-05-08) — 已完成基础版**：
-- benchmark trend gate 现在已经能挂到更接近真实开发入口的命令上，而不只是在手工跑 `deepseek benchmark` 时才生效
-- 新增显式 hook：
-  - `deepseek dogfood run --benchmark-gate ...`
-  - `deepseek pr fix ... --benchmark-gate`
-  - `deepseek pr patch ... --benchmark-gate`
-- 语义保持很直接：
-  - 先执行原始 dogfood / PR 任务
-  - 再自动跑默认 benchmark baseline
-  - 如果 benchmark trend gate 失败，整个入口命令也会非零退出
-- 这一步没有默认把 gate 强塞到所有入口，仍然要求显式打开：
-  - 避免把一次普通探索或 review 变成意外的长链 benchmark 运行
-  - 但需要时已经可以把“真实任务 + 基线回归门禁”串成同一条命令链
-- 真实验证已经跑过：
-  - `deepseek dogfood run --benchmark-gate "inspect repository layout ..."` 会在 dogfood ledger/report 写完后继续跑 benchmark，并打印 trend gate 结果
-
-**10i-13 (`main`, 2026-05-08) — 已完成基础版**：
-- benchmark history 现在不只会顺手带一个 dogfood 总量 snapshot，还会把 live dogfood 的 category slices 一起落盘
-- 新增的 benchmark report 区块：
-  - `## Dogfood Slices`
-  - 会按 category 展示 `runs / success / failed / stuck / manual / avg tool calls`
-- `## Recent Runs` 也会追加 `Dogfood Categories` 摘要列，方便直接看最近几轮 benchmark 对应看到了哪些真实 dogfood slice
-- 这一步把“实验室 benchmark slice”与“真实任务 dogfood slice”第一次放进了同一份 benchmark history/report：
-  - benchmark 的 `Category Slices` 负责看基线能力是否退化
-  - `Dogfood Slices` 负责看最近真实任务主要落在哪些能力面上
-- review 后顺手补了一层兼容：
-  - 对旧 dogfood ledger，如果缺少显式 `benchmark_category`
-  - benchmark 侧会复用 dogfood 的 fallback inference
-  - 避免 report 里继续积累 `unknown` slice
-- 真实验证已经跑过：
-  - `deepseek benchmark --out /tmp/deepseek-bench-dogfood-slices.md`
-  - 当前 report 已经稳定显示 `Dogfood Slices`，且最新一轮不再出现新的 `unknown` category
-
-**10i-14 (`main`, 2026-05-08) — 已完成基础版**：
-- dogfood report 不再只有一份静态 `Category Breakdown`，现在新增了 `## Category Trend`
-- 这块趋势视图直接基于现有 ledger 计算：
-  - recent 5 runs
-  - previous 5 runs
-  - 不额外引入新的 dogfood history 文件
-- 当历史足够时，会按 category 输出：
-  - `Recent Runs / Prev Runs`
-  - `Recent Success / Prev Success`
-  - `Δ Success pp`
-  - `Recent Avg Tools / Prev Avg Tools`
-  - `Δ Tools`
-  - `Recent Seeds / Prev Seeds`
-- 当历史不够时，不会硬算趋势，而是显式打印：
-  - `Status: insufficient history`
-- 这一步的目标是先把“真实任务的 slice 级变化”稳定展示出来：
-  - 方便和 benchmark 的 `Category Slices` 对照看
-  - 后续如果要做 dogfood-side gate，也有稳定的窗口语义可以直接复用
-- 真实验证已经跑过：
-  - `deepseek dogfood report --out /tmp/deepseek-dogfood-trend.md`
-  - 当前真实 ledger 样本只有 2 条，所以 report 会正确显示 `insufficient history`
-
-**10i-15 (`main`, 2026-05-08) — 已完成基础版**：
-- `benchmark gate` 现在不只挂在 `dogfood run` 和 `pr fix/patch` 上，`run` 主入口也支持显式打开
-- 新增 CLI 语义：
-  - `deepseek run --benchmark-gate "..."`
-- 行为与前两条入口保持一致：
-  - 先执行原始任务
-  - 再自动跑默认 benchmark baseline
-  - 如果 trend gate 失败，`run` 命令本身也会非零退出
-- 这一步保持 opt-in，而不是默认打开：
-  - 避免把普通一次性任务都拖进一轮完整 baseline
-  - 但当需要“真实任务后立即做回归门禁”时，主入口已经能直接承担这件事
-- 真实验证已经跑过：
-  - `deepseek run --budget 4 --benchmark-gate "inspect repository layout and summarize the main entrypoints for a new contributor"`
-  - 命令会先跑主任务，再自动跑 benchmark，并打印 trend gate 结果
-
-**10i-16 (`main`, 2026-05-08) — 已完成基础版**：
-- benchmark 的复杂任务样本库不再主要集中在 Rust
-- 新增了 3 条真实 isolated cross-language write+validate case：
-  - `fixture-write-validate-python-mini`
-  - `fixture-recover-write-validate-python-mini`
-  - `fixture-write-validate-js-mini`
-- 对应新增 fixture：
-  - `.dscode/fixtures/python-write-mini`
-  - `.dscode/fixtures/js-write-mini`
-- 这让 `write_validate` slice 从原来的 `3` 条扩到 `6` 条，并且覆盖：
-  - Python `pytest`
-  - JavaScript `npm test`
-  - failed-validation readback（Python）
-- 真实 baseline 已经跑过：
-  - `deepseek benchmark --out /tmp/deepseek-bench-complex-samples.md`
-  - 结果是 `23/23` 通过
-- 这一步的意义不是单纯增加 case 数，而是确认：
-  - planner 的“replace -> diff -> validate”闭环不是 Rust-only 假象
-  - recovery 逻辑至少已经在第二种语言上真实成立
-
-**10i-17 (`main`, 2026-05-08) — 已完成基础版**：
-- subagent follow-up 现在不再只盯第一条 `meta.child_files`
-- parent planner 会基于“最新一次 dispatch_subagent 之后已经发生了多少次成功 `read_file`”来推进 child file 列表
-- 结果是：
-  - child 返回多个相关文件时
-  - parent 会继续读下一个还没消费的 child file
-  - 而不是过早退回 `list_files` 或盲搜
-- 这一步主要收紧的是复杂探索任务里的 orchestration：
-  - 减少无效 hop
-  - 提高 child 结果 merge-back 的利用率
-- 定向验证已经跑过：
-  - 新增单测覆盖“读完第一个 child file 后继续读第二个 child file”
-  - baseline benchmark 继续保持通过
-
-**10i-18 (`main`, 2026-05-08) — 已完成基础版**：
-- baseline benchmark 现在新增了 `pr_workflow` slice
-- 默认 manifest 已覆盖两条 seeded PR 任务：
-  - PR review readback
-  - PR fix recovery
-- `dogfood` 的 benchmark category inference 现在会优先把：
-  - `pull request`
-  - `review feedback`
-  - `failed ci`
-  - `pr #...`
-  - `github pr`
-  这类任务归到 `pr_workflow`
-- 这一步的意义不是“CLI 里有 pr 子命令”而已，而是：
-  - benchmark history 开始对 PR 向任务单独记账
-  - trend gate 可以看见这条任务线有没有退化
-- 定向验证已经跑过：
-  - 默认 baseline 扩到 25 个 case
-  - 连续 warmup 后 `pr_workflow` slice 已进入 `pass vs 3 runs`
-
-**10i-19 (`main`, 2026-05-08) — 已完成基础版**：
-- `run_shell` 现在会把 `node --test` 识别成正式 `test` 命令，而不是普通 shell
-- `meta.failed_tests` 已支持从 Node TAP 风格输出抽取失败用例与文件路径
-- 新增了 JavaScript recovery baseline：
-  - seeded `node --test` failure
-  - planner 应该先 `read_file` 读取失败测试文件，再扩展搜索
-- 这一步的价值是把：
-  - Rust `cargo test`
-  - Python `pytest`
-  - JavaScript `node --test`
-  三条语言路径统一进同一套 failure-kind / recovery_hint 机制
-- 定向验证已经跑过：
-  - Node test failure parsing 单测通过
-  - 默认 baseline 扩到 26 个 case 并保持通过
-
-**10i-20 (`main`, 2026-05-08) — 已完成基础版**：
-- category slice trend gate 现在会先检查 `category.cases` 是否与当前运行可比
-- 如果历史里同名 slice 的 case 数不同，就不再直接拿旧 `total_tool_calls` 强比
-- 这样可以避免一种假回归：
-  - baseline 新增了 recovery case
-  - category 总 tool calls 自然上涨
-  - 但 gate 却把“分母变了”误判成性能退化
-- 修正后，这类结构变化会先回到 warmup / insufficient history，再等待新的同构历史积累
-- 定向验证已经跑过：
-  - 新增单测覆盖“同 manifest 同 category 但 case 数不同”时跳过强比较
-  - 默认 baseline trend gate 已恢复通过
-
-**10i-21 (`main`, 2026-05-08) — 已完成基础版**：
-- JavaScript write+validate 现在不只支持 happy path，也支持真实 retry 闭环
-- `run_shell` 已支持解析 Node 默认测试输出里的：
-  - `test at test/foo.test.js:1:1`
-  这类失败文件路径
-- JS failed-validation recovery 现在会优先读取失败测试文件，而不是只看 diff
-- retry planner 也不再只靠 `meta.failed_tests` 里的测试名推断修复方向；必要时会从刚读回的测试文件内容继续推断
-- 结果是 `npm test` 场景现在也能走通：
-  - `apply_patch -> git_diff -> run_shell -> read_file(test) -> apply_patch -> git_diff -> run_shell`
-- 定向验证已经跑过：
-  - Node 默认输出解析单测通过
-  - JS recovery directive 单测通过
-  - JS retry planner 单测通过
-  - 临时 JS retry benchmark 已真实走通 7-step 链路
-
-**10i-22 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_workflow` baseline 不再只有：
-  - PR review readback
-  - failed test fix
-- 现在又补上了第三类样本：
-  - failed CI lint/build log -> search -> read_file
-- 这一步的重点不是增加 case 数，而是把 PR / CI 任务的入口扩成三种不同恢复模式：
-  - diff 驱动 review
-  - failing test path 驱动 readback
-  - stderr symbol 驱动 search/read
-- 定向验证已经跑过：
-  - 临时 seeded CI lint benchmark 通过
-  - 已收进默认 baseline，等待同构 history warmup
-
-**10i-23 (`main`, 2026-05-08) — 已完成基础版**：
-- PR patch / review-feedback 路径现在也有了专门 baseline
-- offline planner 对这类任务做了两处收紧：
-  - 有 `git_diff + list_files` 的 PR 上下文时，优先 `read_file` 读取 changed file
-  - 不再把单引号 PR title 里的自然语言短语误当成代码搜索词
-- 同时，对这类“上下文已经足够明确”的 PR 任务，planner 不再强制先 `todo_write`
-- 结果是 seeded patch case 从：
-  - `todo_write -> dispatch_subagent -> search_text -> list_files`
-  收敛成：
-  - `read_file`
-- 定向验证已经跑过：
-  - 新增单测覆盖“PR patch 优先读 changed file”
-  - 新增单测覆盖“读完 changed file 后不再搜索 PR title”
-  - 新增单测覆盖“PR patch 在 planning mode 下也跳过初始 todo_write”
-  - 临时 patch benchmark 已压到单步 `read_file`
-
-**10i-24 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_fix` 路径也收紧成了“targeted readback first, then stop”
-- 对已经能从 PR / CI 上下文精确定位文件的离线任务：
-  - 不再继续 `search_text`
-  - 不再追加 `list_files`
-  - 也不再掉进无意义的 `todo_write` replanning
-- 结果是：
-  - Rust `pr_fix` seeded case 从 `read_file -> search_text -> todo_write -> todo_write` 收敛到 `read_file`
-  - JavaScript `pr_fix` seeded case 也能直接落到失败测试文件 `read_file`
-- 这一步把 `pr_workflow` 从“多形态但主要是 Rust”推进成了“至少开始跨语言”
-- 定向验证已经跑过：
-  - 新增单测覆盖“PR fix 在 targeted readback 后直接 finish”
-  - 临时 Rust / JS PR fix benchmark 都通过
-  - JavaScript case 已并入默认 baseline
-
-**10i-25 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_workflow` 不再只覆盖 seeded readback；现在开始进入真实 `patch + validate` 链路
-- 关键修正不是 patch tool 本身，而是 explicit planning gate：
-  - 之前只对 `replace ... with ... in ...` 开头的任务跳过 `todo_write`
-  - PR 风格的 direct edit 任务即使能明确解析出 edit request，也会先掉进 `todo_write -> dispatch_subagent`
-- 现在 planning heuristic 改成：
-  - 只要 task 能解析出 direct edit request，就直接关闭 explicit planning
-- 结果是 PR 风格的 replace+validate 任务从：
-  - `todo_write -> dispatch_subagent -> list_files -> read_file -> apply_patch -> run_shell`
-  收敛成：
-  - `apply_patch -> git_diff -> run_shell`
-- 默认 baseline 现在新增了一条真实 `pr_workflow` patch+validate case：
-  - `fixture-pr-patch-validate-rust-mini`
-- 定向验证已经跑过：
-  - 新增单测覆盖“PR 风格 direct edit task 也跳过 explicit planning”
-  - 临时 Rust PR patch+validate benchmark 已真实走通 3-step 链路
-  - 已并入默认 baseline
-
-**10i-26 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_workflow` 现在不只会做一次 patch + validate，也开始覆盖 failed-validation 后的一次 retry 收敛
-- 新增真实 baseline：
-  - `fixture-pr-retry-validate-rust-mini`
-- 它走的不是 seeded readback，而是完整链路：
-  - `apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-- 这说明前一轮为 PR 风格 direct edit 任务收紧的 planning gate 不只是能过 happy path，也不会挡住后续 retry 逻辑
-- 定向验证已经跑过：
-  - 临时 Rust PR retry benchmark 已真实走通 7-step 链路
-  - 已并入默认 baseline
-
-**10i-27 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_workflow` 的 retry 闭环现在不只是一门语言
-- 新增真实 baseline：
-  - `fixture-pr-retry-validate-js-mini`
-- 它和 Rust retry case 保持同一条结构：
-  - `apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-- 这一步的意义是把：
-  - Rust `cargo test`
-  - JavaScript `npm test`
-  在 PR 风格 direct edit + failed-validation retry 这条线上对齐
-- 定向验证已经跑过：
-  - 临时 JavaScript PR retry benchmark 已真实走通 7-step 链路
-  - 已并入默认 baseline
-
-**10i-28 (`main`, 2026-05-08) — 候选验证，未晋级默认 baseline**：
-- 尝试把 `pr_workflow` 的 direct-edit retry 扩到 Python：
-  - `fixture-pr-retry-validate-python-mini`
-- 单 case benchmark 能真实走通：
-  - `apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-- 但在完整 baseline 里会出现不稳定：
-  - 单跑通过
-  - 全量 baseline 中偶发掉到 `last tool run_shell output did not contain meta.result=ok`
-- 处理结论：
-  - 保留它作为候选 case
-  - 暂不放进默认 benchmark gate，先保证主基线稳定
-
-**10i-29 (`main`, 2026-05-08) — 已完成基础版**：
-- `dogfood run` 现在支持 `--workdir`
-- 行为是：
-  - 任务执行目录可以切到指定 fixture / repo
-  - 但 dogfood ledger、report、benchmark gate 仍然落在当前仓库
-- 这让“主仓库记账 + 临时 fixture 执行”第一次真正可用
-- 已做真实 live 验证：
-  - 在临时复制的 `rust-write-mini` 里跑 `pr_workflow` retry 任务
-  - 真实走通 `apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-  - 记录已写回主仓库 `.dscode/dogfood/ledger.jsonl`
-  - `post-task benchmark gate` 也通过
-- 当前 dogfood report 已不再只有 `read_only`：
-  - `pr_workflow` live history 已开始积累
-
-**10i-30 (`main`, 2026-05-08) — 已完成基础版**：
-- benchmark / dogfood 对 tool output 的判定现在使用真实原始输出，而不是 observation summary
-- 原因是之前 `ToolEvent.output` 复用了给模型的摘要版：
-  - shell 输出较长时，`meta.result=ok` 这类头部信号有机会在摘要裁剪里丢失
-  - 结果就是 planner 真实成功，但 benchmark 断言偶发误判
-- 现在 runtime 改成：
-  - observation 继续走 `summarize_for_kind(...)`
-  - `ToolEvent.output` 保留原始工具输出
-- 这一步的价值不是增加功能，而是把 benchmark / dogfood 的“验证面真值”与“prompt 压缩视图”解耦
-- 定向验证已经跑过：
-  - 新增 benchmark 单测覆盖“长 shell 输出也能命中 `meta.result=ok`”
-  - 默认 33-case baseline 继续保持通过
-
-**10i-31 (`main`, 2026-05-08) — 已完成基础版**：
-- `pr_workflow` 的 merge-back 现在不再要求任务文本里显式写出 `subagent` / `child loop`
-- 只要任务本身就是 PR workflow，并且 parent 已收到 `dispatch_subagent` 的 `meta.child_files`：
-  - 读完第一份 child file 后
-  - 父循环也会继续读第二份 child file
-- 这一步的意义是把：
-  - “有 child files，但只有带 `continue from the subagent findings` 字样的任务才继续消费”
-  收紧成：
-  - “PR workflow 天然允许继续消费 child files”
-- 已补回归：
-  - 新增单测覆盖“PR 任务无 subagent wording 也继续读第二个 child file”
-  - 新增 seeded baseline `fixture-pr-followup-rust-cli-mini`
-- review / 收尾：
-  - baseline 首轮失败不是 merge-back 逻辑错误，而是 benchmark manifest 的 `expect_tool_sequence = ["..."]` 语法被当成原始字符串切分
-  - benchmark parser 现已兼容 bracketed string arrays，与文档/示例保持一致
-  - `fixture-pr-followup-rust-cli-mini` 也改成结果导向断言：
-    - 只要求 follow-up `read_file` 真的读到第二个 child file
-    - 不再把随后的一步 `list_files` 探索误判成失败
-  - 修正后默认 baseline 回到 `34/34`，`pr_workflow` slice gate 恢复为 `pass vs 5 runs`
-
-**10i-32 (`main`, 2026-05-08) — 已完成**：
-- `pr_workflow` 的 child-file merge-back 又收紧了一层：
-  - parent 在 PR follow-up 场景里把最后一个 `meta.child_files` 文件读完后
-  - 现在会直接停止并总结
-  - 不再掉回通用 `list_files` 探索
-- 这一步把 `fixture-pr-followup-rust-cli-mini` 从“能继续读第二个 child file”推进成了“读完 child files 就收敛”
-- 已补回归：
-  - 新增单测覆盖“PR child files 全部消费后直接 Finish”
-  - seeded baseline 断言也重新收紧：
-    - `forbid_tool = "list_files"`
-    - `max_tool_calls = 2`
-- 验证结果：
-  - 默认 benchmark 继续 `34/34`
-  - `fixture-pr-followup-rust-cli-mini` trace 收到 `todo_write -> read_file`
-  - 全量测试通过
-
-**10i-33 (`main`, 2026-05-08) — 已完成**：
-- `dogfood run` 现在支持 `--isolate-workdir`
-- 打开后会把指定 `--workdir` 复制到临时目录执行：
-  - live dogfood 任务可以直接跑 fixture-backed write/validate / pr_workflow
-  - 同时不污染仓库内的固定 fixture
-- 默认行为不变：
-  - 不传 `--isolate-workdir` 时仍在原 workdir 直接运行
-- 已补回归：
-  - CLI 解析覆盖 `--isolate-workdir`
-  - `prepare_run_workdir` 单测覆盖“隔离模式会复制 fixture”
-- 已做真实 dogfood：
-  - 用 `rust-write-mini` 跑通一条 isolated fixture-backed `pr_workflow` retry 任务
-  - benchmark gate 继续通过
-  - live dogfood 报表现在累计到 `5` 条记录，其中 `pr_workflow = 3`
-
-**10i-34 (`main`, 2026-05-08) — 已完成**：
-- `dogfood run` 现在支持 `--from-benchmark <case>`
-- 这条路径会直接复用 benchmark manifest 里的：
-  - `task`
-  - `skill`
-  - `budget`
-  - `workdir`
-  - `isolate_workdir`
-  - `notes`
-- 同时保持命令行 override 优先：
-  - 如果手动传了 `--skill / --budget / --workdir / --isolate-workdir / --notes`
-  - 就不会被 manifest 默认值覆盖
-- review / 收尾：
-  - 首轮真实 replay 暴露出 `workdir` 解析口径不一致：
-    - benchmark case 的 `workdir` 是相对 manifest 目录
-    - `dogfood run --from-benchmark` 一开始误按 repo 根目录解析
-  - 现已修正为按 manifest 目录解析 inherited `workdir`
-- 已补回归：
-  - CLI 解析覆盖 `--from-benchmark` / `--manifest`
-  - 单测覆盖“从 benchmark case 继承默认值”
-  - 单测覆盖“显式命令行参数优先于 benchmark 默认值”
-- 已做真实 dogfood：
-  - `deepseek dogfood run --from-benchmark fixture-pr-retry-validate-rust-mini --benchmark-gate`
-  - 已真实跑通一条 fixture-backed `pr_workflow` retry live run
-  - 默认 benchmark gate 继续通过
-  - live dogfood 报表现在累计到 `6` 条记录，其中 `pr_workflow = 4`
-
-**10i-35 (`main`, 2026-05-08) — 已完成**：
-- 新增 `deepseek dogfood replay-benchmark`
-- 这条命令会批量重放 benchmark manifest 中“真正可 live replay”的 case：
-  - 必须有真实 `workdir`
-  - 必须没有 `seed_observations`
-  - 可选 `--category`
-  - 可选 `--limit`
-  - 可选 `--benchmark-gate`
-- 这一步的目标不是替代 benchmark，而是更快把：
-  - `write_validate`
-  - `pr_workflow`
-  - 后续的其它 fixture-backed 类别
-  的 live dogfood 历史做厚
-- 已补回归：
-  - CLI 解析覆盖 `replay-benchmark`
-  - 单测覆盖“seeded-only case 不应进入 live replay”
-  - 单测覆盖 `category + limit` 过滤
-- 已做真实 replay：
-  - `deepseek dogfood replay-benchmark --category write_validate --limit 2 --benchmark-gate`
-  - 已真实追加两条 `write_validate` live 记录
-  - 默认 benchmark gate 继续通过
-
-**10i-36 (`main`, 2026-05-08) — 已完成**：
-- `dogfood` 的失败判定现在不再只看 `ObservationStatus::Failed`
-- 对于 `run_shell` 这类“工具调用本身成功返回，但结构化结果是失败”的场景：
-  - 只要输出里有 `meta.result=failed`
-  - dogfood ledger 也会把它计入 `failed_tool_calls`
-  - 默认 `outcome` 也会从 `success` 修正成 `failed`
-- review / 收尾：
-  - 这条修复不是从单测里猜出来的，而是 `replay-benchmark` 的真实 replay 暴露的：
-    - `fixture-recover-write-validate-rust-mini` 一开始被误记成 `success`
-    - 修正后同一条 live run 已正确落成 `failed`
-- 已补回归：
-  - 新增单测覆盖“`meta.result=failed` 应计为 failed outcome”
-- 修正后的 live dogfood 报表：
-  - 现在累计 `9` 条记录
-  - `write_validate = 3`，其中 `failed = 1`
-  - `Benchmark seed candidates = 1`
-
-**10i-37 (`main`, 2026-05-09) — 已完成**：
-- `recovery` 现在不再只有一条自然 fixture replay 路径
-- 默认 benchmark 新增：
-  - `fixture-recover-empty-search-js`
-  - 用小型 JavaScript CLI fixture 自然覆盖 `search_text -> list_files -> read_file`
-- review / 验证：
-  - 默认 benchmark 扩到 `35` 个 case，结果 `35/35`
-  - 因为 case 数从 `34` 变成 `35`，benchmark trend gate 进入新的 warmup；这不是退化，而是 comparability 集合变化
-  - 真实 replay 已跑：
-    - `deepseek dogfood replay-benchmark --category recovery --limit 2`
-  - replay 后 live dogfood 已真实追加两条 recovery 记录
-
-**10i-38 (`main`, 2026-05-09) — 已完成**：
-- `dogfood` 的 category 纠偏又补了一层：
-  - 自然 search-miss fallback 类型任务
-  - 例如 “if there are no matches inspect the repository layout instead”
-  - 现在会被识别成 `recovery`
-- 这一步的价值不是新增能力，而是把 live dogfood 报表里的 recovery slice 口径纠正到和 benchmark / replay 语义一致
-- review / 收尾：
-  - 首轮 replay 后，这两条自然 recovery 记录被旧 ledger 标签显示成了 `read_only`
-  - 修正后：
-    - `benchmark_case_category` 会把旧 `read_only` 标签纠偏到 `recovery`
-    - `dogfood report` 已重新渲染，分类正确
-- 验证结果：
-  - 全量测试通过，`434 passed, 0 failed`
-  - 最新 dogfood 报表累计 `11` 条记录
-  - 其中：
-    - `pr_workflow = 4`
-    - `recovery = 2`
-    - `write_validate = 3`
-    - `read_only = 2`
-
-**10i-39 (`main`, 2026-05-09) — 已完成**：
-- `recovery` 现在新增了一条非 seeded 的自然 failing-test fixture：
-  - `fixture-recover-failing-js-test`
-  - 基于新的 `js-cli-failing-mini`
-  - 真实覆盖 `run_shell -> read_file`
-- 这轮不是只加 fixture，也顺手把两处真实误配收紧了：
-  - skill auto-select 不再把 failing-test / lint recovery 任务误路由到 `research`
-  - offline planner 对这类任务会先复现失败，再读 failing file，然后直接停止，不再漂移到无根据的 `search_text`
-- review / 收尾：
-  - 首轮 benchmark 暴露出这条任务先被 `research` skill 吃掉，甚至把 `npm test` 跑成 policy denied
-  - 修正后：
-    - auto-skill 会优先落到 `debug`
-    - plan 的首步会变成 `Reproduce the failing validation command`
-    - 真实 dogfood 已稳定走到 `run_shell -> read_file -> finish`
-- 验证结果：
-  - 默认 benchmark 扩到 `36/36`
-  - trend gate：`pass against 4 comparable runs`
-  - 全量测试通过，`438 passed, 0 failed`
-
-**10i-40 (`main`, 2026-05-09) — 已完成**：
-- `dogfood` 的 category 纠偏又补了一层：
-  - 像 “investigate why npm test fails ...” 这种 failure repro / readback 任务
-  - 即使 trace 里有 `run_shell`
-  - 只要没有 `apply_patch`，并且本质是 recovery
-  - 就不再被误记成 `write_validate`
-- 同时 benchmark baseline 也和新 planner 行为对齐了：
-  - `recovery_readback_then_search` 这组旧 case 不再强行要求 `read_file -> search_text`
-  - 现在按“readback 后即可停止”的新语义判定
-- review / 收尾：
-  - 首轮 dogfood report 暴露出 3 条历史 failing-test replay 被错误压进了 `write_validate`
-  - 修正后：
-    - `dogfood report` 已重新渲染
-    - recovery slice 现在累计 `7` 条，其中 `failed = 3`
-    - write_validate slice 回落到真正的 patch+validate 任务
-- 验证结果：
-  - 最新 dogfood 报表：
-    - `recovery = 7`
-    - `write_validate = 3`
-    - `pr_workflow = 4`
-    - `read_only = 2`
-  - 全量测试通过，`439 passed, 0 failed`
-
-状态：10e-1 + 10e-2 + 10e-3 + 10e-4 + 10e-5 + 10f-1 + 10f-2 + 10f-3 + 10f-4 + 10g-1 + 10g-2 + 10g-3 + 10g-4 + 10h-1 + 10h-2 + 10h-3 + 10h-4 + 10h-5 + 10h-6 + 10h-7 + 10i-1 + 10i-2 + 10i-3 + 10i-4 + 10i-5 + 10i-6 + 10i-7 + 10i-8 + 10i-9 + 10i-10 + 10i-11 + 10i-12 + 10i-13 + 10i-14 + 10i-15 + 10i-16 + 10i-17 + 10i-18 + 10i-19 + 10i-20 + 10i-21 + 10i-22 + 10i-23 + 10i-24 + 10i-25 + 10i-26 + 10i-27 + 10i-29 + 10i-30 + 10i-31 + 10i-32 + 10i-33 + 10i-34 + 10i-35 + 10i-36 + 10i-37 + 10i-38 + 10i-39 + 10i-40 完成
-
-## Phase 11 进展（Claude / Codex gap closure）
-
-**11a-1 (`main`, 2026-05-09) — 已完成**：
-- `deepseek` 已收敛为主入口：
-  - `Cargo.toml` 默认运行目标切到 `deepseek`
-  - 历史上 `deepseek` / `deepseek chat` / `deepseek repl` / `deepseek interactive`
-    同入口；当前真实 TTY 中裸 `deepseek` 已改为进入 TUI workbench，REPL 改由
-    `deepseek chat` / `deepseek repl` / `deepseek interactive` 显式启动
-  - 主文档、PR/CI 文档、streaming/todos 文档、关键 runtime 提示统一为 `deepseek`
-  - `dscode` 退回兼容别名，不再作为主品牌展示
-- review / 收尾：
-  - 非 TTY 交互提示已统一为 `deepseek`
-  - `doctor` / `smoke` / `pr patch` 的用户可见字符串已完成品牌切换
-- 验证结果：
-  - 针对 `deepseek` 入口别名与 REPL binary-name 的单测通过
-  - 后续全量测试已覆盖在本轮 Phase 11 review 里
-
-**11c-1 (`main`, 2026-05-09) — 已完成**：
-- natural failure repro recovery 再收紧一层：
-  - 对 `investigate why ... test fails` 这类任务
-  - 只要有 `suggested_test_command`
-  - planner 就会优先 `run_shell` 复现失败，而不是先漂到 `search_text` / `list_files`
-- 这轮直接修掉了 benchmark 里唯一的 natural recovery miss：
-  - `fixture-recover-failing-js-test`
-  - 现在稳定走 `run_shell -> read_file -> finish`
-- review / 收尾：
-  - 默认 benchmark 从 `35/36` 回到 `36/36`
-  - trend gate 恢复为通过
-- 验证结果：
-  - 新增 recovery 回归单测，覆盖“即使已有 repo signal 也要先 repro”
-  - benchmark：`36/36`
-
-**11b-1 (`main`, 2026-05-09) — 已完成**：
-- `pr_workflow` baseline 新增真实 second-round review feedback fixture：
-  - [`rust-review-feedback-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/rust-review-feedback-mini/Cargo.toml)
-  - 初始状态模拟“上一轮错误 patch 已经落在工作区”
-  - 当前任务再执行一次真实 `apply_patch -> git_diff -> run_shell`
-- 新增 benchmark case：
-  - `fixture-pr-second-round-feedback-rust-mini`
-  - 这让 `pr_workflow` 不再只有 patch / retry / child-file follow-up，还多了一类真实 follow-up repair
-- review / 收尾：
-  - 默认 benchmark 扩到 `37` 条 case
-  - `pr_workflow` slice 扩到 `10` 条 case
-- 验证结果：
-  - benchmark：`37/37`
-  - trend gate：因 case 数变化进入新的 warmup，属预期行为
-
-**11b-2 (`main`, 2026-05-09) — 已完成**：
-- `pr_workflow` baseline 继续补厚，新增真实 JavaScript PR fix+validate case：
-  - `fixture-pr-fix-validate-js-cli-failing-mini`
-  - workdir 复用自然 failing fixture [`js-cli-failing-mini`](/home/willamhou/codes/DeepseekCode/.dscode/fixtures/js-cli-failing-mini/package.json)
-  - 真实链路稳定为 `apply_patch -> git_diff -> run_shell`
-- 这轮顺手收紧了 direct edit parser：
-  - `trim_edit_path_suffix` 现在支持 `and rerun` / `then rerun`
-  - `derive_edit_request` 会取最后两个 quoted segment，避免 PR / CI task 前半段的反引号噪声污染替换对
-- review / 收尾：
-  - 首轮 benchmark 暴露出新 case 漂成 `list_files -> read_file -> apply_patch -> run_shell`
-  - 修正后回到标准 direct-edit validate 路径
-- 验证结果：
-  - 默认 benchmark：`38/38`
-  - 全量测试：`449 passed, 0 failed`
-
-**11c-2 (`main`, 2026-05-09) — 已完成**：
-- `dogfood` 对“诊断型成功”已真实打通：
-  - failure repro / readback 任务现在可记录 `diagnostic_expected_failure=true`
-  - 自然 JS failing-test replay 已真实写入 ledger
-  - 该类 run 不再一律算作 `failed`
-- 最新 dogfood report：
-  - `Runs: 17`
-  - `Diagnostic expected-failure rate: 1/17 (5.9%)`
-  - `recovery: 8 runs, 5 success, 1 diagnostic, 3 failed`
-- 验证结果：
-  - `deepseek dogfood run --from-benchmark fixture-recover-failing-js-test`
-  - `deepseek dogfood report --out /tmp/deepseek-dogfood-phase11.md`
-
-**11e-1 (`main`, 2026-05-09) — 已完成**：
-- `dogfood run --from-benchmark` 在 isolated fixture replay 场景下现在会临时开启：
-  - `DSCODE_AUTO_APPROVE_WRITES=1`
-  - `DSCODE_AUTO_APPROVE_SHELL=1`
-  - `DSCODE_AUTO_APPROVE_MCP=1`
-- 作用范围是刻意收窄的：
-  - 只对 benchmark replay 生效
-  - 只对 isolated fixture workdir 生效
-  - 普通 dogfood / 手工 run 不改 approval 语义
-- 这让 live replay 测到的是 agent workflow，而不是非交互 confirm prompt
-- review / 收尾：
-  - 同一条 `fixture-pr-second-round-feedback-rust-mini` dogfood replay
-  - 修复前会因为非交互 auto-deny 记成 `failed`
-  - 修复后稳定走 `apply_patch -> git_diff -> run_shell` 并写成 `success`
-- 验证结果：
-  - 新增 `benchmark_replay_auto_approve_env_is_temporary` 单测
-  - 全量测试：`448 passed, 0 failed`
-- 最新 live dogfood：
-    - `Runs: 20`
-    - `pr_workflow: 7 runs, 6 success, 1 failed`
-
-**11f-1 (`main`, 2026-05-09) — 已完成**：
-- 新增产品级最小版本/安装入口：
-  - `deepseek version`
-  - `deepseek --version`
-  - `deepseek -V`
-- CLI 不再只靠 README 猜版本或 binary 来源，安装后可直接做：
-  - `deepseek version`
-  - `deepseek doctor`
-- 文档新增：
-  - [安装指南](./install.md)
-  - README 增加 `cargo install --path .` 的快速开始
-- review / 收尾：
-  - 版本命令输出当前包版本，例如 `deepseek 0.1.0`
-  - 解析覆盖 subcommand 与 flag 两条入口
-- 验证结果：
-  - `cargo run --bin deepseek -- version`
-  - `cli_from_argv_routes_version_subcommand`
-  - `cli_from_argv_routes_version_flags`
-
-**11d/11e/11f 收口前状态回填（2026-05-09）**：
-- 本轮重新跑默认 benchmark 后，当前真实 baseline 是：
-  - cases：`39`
-  - passed：`38/39`
-  - total tool calls：`127`
-  - total failed tools：`0`
-  - trend gate：`skipped`，因为 39-case comparable history 只有 1 条
-- 当前唯一 benchmark 红点：
-  - `fixture-pr-reproduce-fix-rust-cli-failing-mini`
-  - trace 已到 `run_shell -> read_file -> apply_patch -> git_diff`
-  - 缺口是 patch 后没有最后一次 `run_shell` validate
-- 当前 live dogfood snapshot：
-  - `Runs: 20`
-  - `Success: 15`
-  - `Failed: 5`
-  - `Stuck: 0`
-  - `Manual: 0`
-  - `pr_workflow: 7 runs, 6 success, 1 failed`
-  - `write_validate: 6 runs, 2 success, 4 failed`
-- 这说明 Phase 11 后半段不能只补文档：
-  - `11d` 要把 subagent v2 的 child summary / next-action / parent merge-back 收成稳定契约
-  - `11e` 要让 benchmark failed expectation 和 dogfood 新增 live failure 都能非零退出，真正成为阻断门禁
-  - `11f` 要把 release / upgrade / rollback 路径写清楚，而不是停留在 `cargo install --path .`
-
-**11d-1 (`main`, 2026-05-09) — 已完成**：
-- subagent v2 的 summary schema 增加 `meta.child_next_action`：
-  - `read_file:<path>`
-  - `search_text:<query>`
-  - `replan_parent`
-  - `continue_parent`
-- parent planner 现在优先消费 `meta.child_next_action`：
-  - `read_file:<path>` 会排在 `meta.child_files` 前面
-  - `search_text:<query>` 会排在 child final message 的自由文本猜测前面
-- `dispatch_subagent` 文本摘要也显示 `child next action`，方便人工看 trace
-- 新增 seeded baseline：
-  - `subagent-next-action-mergeback`
-  - 目标是验证 parent 能按 child next-action 继续读回关键文件
-- 验证结果：
-  - targeted tests 覆盖 next-action summary / parent merge-back
-  - benchmark subagent slice 扩到 `2` 条 case
-
-**11e-2 (`main`, 2026-05-09) — 已完成**：
-- benchmark gate 现在真正具备阻断能力：
-  - 任意 benchmark case expectation 失败都会让 `deepseek benchmark` 非零退出
-  - 不再只是在 markdown report 里显红
-- live gate 已接入 benchmark：
-  - 当前 benchmark run 会读取 dogfood ledger snapshot
-  - 与最近一次 benchmark 保存的 dogfood snapshot 对比
-  - 如果 failed / stuck / manual 计数增加，gate 失败并非零退出
-  - category 级 failed / stuck / manual 增量也会写进失败原因
-- report 新增：
-  - `Live gate: ...`
-  - 失败原因会明确写出是 overall 还是某个 category 增加
-- 同步修复了 `fixture-pr-reproduce-fix-rust-cli-failing-mini`：
-  - repro-first 任务里的第一次 `run_shell` 不再误抵消 patch 后 validation
-  - 现在稳定走 `run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-- 验证结果：
-  - 默认 benchmark：`40/40`
-  - trend gate：`pass against 4 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=20)`
-  - 全量测试：`463 passed, 0 failed`
-
-**11f-2 (`main`, 2026-05-09) — 已完成**：
-- 安装文档从“能安装”补到完整 release / upgrade story：
-  - 发布前检查：`cargo fmt --check` / `cargo test` / `deepseek benchmark` / `deepseek version` / `deepseek doctor`
-  - release binary 路径：`cargo build --release`
-  - 发布产物应包含 binary、commit SHA、版本输出、平台说明、安装升级说明
-  - 源码安装升级：`git pull` + `cargo install --path . --force`
-  - release binary 升级前保留 rollback copy
-  - 回滚流程覆盖 binary 回滚与源码 commit 回滚
-- README 快速开始补了源码升级命令，避免用户只知道首次安装。
-
-## 建议的下一个顺序
-
-当前这一轮按顺序列出的 11d / 11e / 11f 收口任务已经完成。下一阶段更值得做的是：
-
-1. 继续补真实 PR / review / CI fix 方向的 fixture 或 live dogfood 样本，而不是只做仓库内局部代码任务
-2. 把 `pr_workflow` 从“已有 Rust/JS patch + validate + retry + child-file follow-up”继续推向更完整的 live follow-up / merge-back 链路，并单独排查 Python retry 的 baseline 稳定性
-3. 继续积累非 `read_only` 的 live dogfood category history，尤其是 `pr_workflow / write_validate / recovery`
-
-**Phase 11+ baseline hardening (`main`, 2026-05-09) — 已完成**：
-- 按上述下一阶段顺序，先补 Python retry baseline，确认不是只靠 Rust / JavaScript 路径过关：
-  - 新增 `fixture-retry-write-validate-python-mini`
-  - 新增 `fixture-pr-retry-validate-python-mini`
-- 两条 case 都跑在 isolated `fixtures/python-write-mini` 上，覆盖：
-  - `apply_patch -> git_diff -> run_shell -> read_file -> apply_patch -> git_diff -> run_shell`
-  - pytest 失败后的 readback 与 corrective patch
-  - 普通 `write_validate` 和 PR review feedback 语境下的 retry
-- 最新 benchmark：
-  - 默认 benchmark：`42/42`
-  - total tool calls：`145`
-  - failed tool calls：`0`
-  - trend gate：`pass against 4 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=22)`
-- 已把两条 Python retry baseline replay 成 live dogfood：
-  - `fixture-retry-write-validate-python-mini` 记为 `write_validate / success`
-  - `fixture-pr-retry-validate-python-mini` 记为 `pr_workflow / success`
-  - live dogfood 当前累计 `22` runs，`17` success，`5` failed，`0` stuck，`0` manual
-- replay 过程中暴露并修正了一个 dogfood 口径问题：
-  - retry 流里的第一次 failed validation 不再把最终通过的任务误记成 `failed`
-  - `failed_tool_calls` 仍然保留中间失败次数，用于诊断和 seed 候选判断
-  - benchmark snapshot 与 dogfood report 现在共用同一套 category 纠偏；category 重分桶不会在没有新增 run 时误触发 live gate
-- 这一步把 roadmap 里“单独排查 Python retry baseline 稳定性”的风险点收掉，并把对应 live dogfood 样本补进 ledger。
-
-**Phase 11+ live gate hardening (`main`, 2026-05-09) — 已完成**：
-- 与 Claude Code / Codex 的二次差距复盘显示，本地 benchmark 已经稳定，但 live dogfood gate 还有一个产品级缺口：
-  - 新增 live dogfood failed / stuck 记录后，旧实现会先把失败 benchmark 的 dogfood snapshot 写入 history
-  - 下一次 benchmark 可能因此把未处理的 live 失败当成已接受基线
-- 新增显式 live baseline 接受语义：
-  - 普通 `deepseek benchmark` 只有在 case expectations / trend gate / live gate 全通过时才写入 benchmark history
-  - live gate 失败时不会自动推进 history baseline
-  - 排查后如果确实要接受当前 live snapshot，必须显式运行 `deepseek benchmark --accept-live-baseline`
-  - report / console 会标出 `(accepted by --accept-live-baseline)`，避免误以为 live gate 自然通过
-- 最新验证：
-  - 全量测试：`479 passed, 0 failed`
-  - `deepseek benchmark --accept-live-baseline`：`42/42`，trend gate pass，live gate failed but explicitly accepted
-  - 随后的普通 `deepseek benchmark`：`42/42`，trend gate pass，live gate pass（runs=33，无新增 dogfood 记录）
-- 当前差距判断：
-  - CLI 本地修改 / 验证 / recovery / PR fixture baseline 与 Claude Code / Codex 的核心闭环差距已收敛到“小到中”
-  - 真实在线模型稳定性、IDE/编辑器配套、外部 PR/CI live 样本厚度仍不是“小差距”，后续应继续按 live dogfood 暴露的问题推进
-
-**Phase 11+ custom slash commands (`main`, 2026-05-09) — 已完成**：
-- 二次对照 Claude Code / Codex 后补齐一个高频 REPL UX 缺口：
-  - Claude Code 支持项目/用户级 prompt-backed slash commands / skills
-  - DeepseekCode 原先只有内置 slash 命令与 TOML skills，缺少“把常用提示保存成 `/name` 命令”的轻量入口
-- 新增 custom slash command：
-  - 项目命令：`.dscode/commands/<name>.md`
-  - 用户命令：`~/.config/dscode/commands/<name>.md`，可通过 `workspace.user_commands_dir` 配置
-  - 支持 namespace：`.dscode/commands/pr/fix.md` -> `/pr/fix`
-  - 支持参数替换：`$ARGUMENTS`、`$ARGUMENTS[N]`、`$0`、`$1`
-  - 无参数占位符时自动追加 `ARGUMENTS: ...`
-- 这一步把“常用 workflow prompt 复用”从写 TOML skill 降低到写 markdown 文件，缩小了与 Claude Code 交互体验的差距。
-
-**Phase 11+ workspace instructions (`main`, 2026-05-09) — 已完成**：
-- 再次对齐 Claude Code / Codex 后补齐持久项目指令入口：
-  - Codex 使用 `AGENTS.md` 作为 repo/project instruction 层
-  - Claude Code 使用 `CLAUDE.md` / `.claude/CLAUDE.md` 作为项目记忆与团队规则
-  - DeepseekCode 原先只做 profile hints / skill append，没有稳定的“每个任务都读”的 repo 指令文件
-- 新增 workspace instruction loader：
-  - 用户级：默认 `~/.config/dscode/AGENTS.md`，可通过 `workspace.user_instructions_file` 改路径或设空禁用
-  - 项目级：从 git root 到当前目录逐层读取
-  - 每层优先级：`AGENTS.override.md` > `AGENTS.md` > `CLAUDE.md` > `.claude/CLAUDE.md`
-  - 每个文件最多注入 32 KiB，并在系统 prompt 中标出来源路径
-- 这一步把项目规则、构建约定、review/checklist 等长期上下文从“口头重复”移动到可版本化文件，进一步缩小与 Codex/Claude 的基础协作体验差距。
-
-**Phase 11+ local hooks (`main`, 2026-05-09) — 已完成**：
-- 对照 Claude Code / Codex 的 hooks 扩展面，补齐一个保守的本地 hook 机制：
-  - 默认关闭，必须显式配置 `hooks.enabled = true`
-  - 支持 project/user hook dirs：`.dscode/hooks` 与 `~/.config/dscode/hooks`
-  - 支持事件：`user_prompt_submit`、`pre_tool_use`、`post_tool_use`
-  - hook 以可执行脚本形式存在，stdin 接收 JSON payload，环境变量 `DSCODE_HOOK_EVENT` 标注事件
-  - `user_prompt_submit` / `pre_tool_use` 非零退出会阻断 prompt/tool；`post_tool_use` 非零退出作为 advisory observation 返回
-- 这一步不试图复制完整 MCP/plugin 生态，但给团队策略、审计、上下文注入和危险 tool gate 留出了稳定扩展点。
-
-**Phase 11+ config bootstrap (`main`, 2026-05-09) — 已完成**：
-- 新增 `deepseek config init [--force]`，首次使用不再需要手动复制 `.dscode/config.example.toml`：
-  - 创建项目级 `.dscode/config.toml`
-  - 创建 `.dscode/sessions` 与 `.dscode/commands`
-  - 创建 `.dscode/hooks/user_prompt_submit`、`.dscode/hooks/pre_tool_use`、`.dscode/hooks/post_tool_use`
-  - 默认 hooks 仍保持关闭，避免 clone 仓库后隐式执行脚本
-- `deepseek config --print-default` 同步输出 workspace user dirs / instruction file / hooks 配置，避免新字段在诊断时不可见。
-- 这一步把安装和首次配置从“读文档复制模板”推进到“命令初始化”，缩小产品化 gap；但 IDE/编辑器、MCP/plugin 生态、外部 PR/CI live 样本厚度仍不是小差距。
-
-**Phase 11+ live coverage gate (`main`, 2026-05-09) — 已完成**：
-- `deepseek benchmark` 的 live gate 不再只检查新增 failed / stuck / manual 记录：
-  - 当 dogfood snapshot 达到 `12` 条 run 后，关键 live slice 必须保持最低覆盖
-  - 当前要求 `pr_workflow`、`recovery`、`write_validate` 各至少 `3` 条 run
-  - 覆盖不足会让普通 benchmark 非零退出，仍可用 `--accept-live-baseline` 显式接受已排查的 snapshot
-- 这一步把 live gate 从“只挡坏结果”推进到“也挡关键 workflow 样本过薄”，避免 read-only 或单一 category 掩盖真实产品风险。
-
-**Phase 11+ dogfood environment transport guard (`main`, 2026-05-09) — 已完成**：
-- 继续积累 live PR/CI 样本时，受限网络下的 `dogfood replay-benchmark --category pr_workflow --limit 1` 暴露出一个 gate 噪声问题：
-  - 模型 API DNS failure 会在 agent 尚未执行任何工具前失败
-  - 旧逻辑会把这类环境失败写入 dogfood ledger，污染 live gate 的 `pr_workflow` failed 增量
-- dogfood run / replay 现在会识别 DNS、network unreachable、connection timeout、curl 6/7/28 这类 environment transport failure：
-  - 命令仍返回错误，避免误报任务成功
-  - 但不追加 dogfood ledger 记录，report 保持不变
-- 最新验证：
-  - 同一条受限网络 replay 返回错误，但输出 `ledger: ... (skipped: environment transport failure)`
-  - `.dscode/dogfood/ledger.jsonl` 维持 `33` 行
-  - 全量测试：`541 passed, 0 failed`
-  - 默认 benchmark：`48/48`
-  - trend gate：`pass against 4 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-
-**Phase 11+ benchmark asset reproducibility / Go baseline (`main`, 2026-05-09) — 已完成**：
-- 审计发现默认 `.dscode/benchmarks.txt` 与 fixture corpus 仍被 ignore；这会导致 fresh checkout 缺少可复现的默认 benchmark，和 roadmap/spec 中“fixture-backed benchmark”的描述不一致
-- `.gitignore` 现在只继续忽略生成物：
-  - benchmark report/history
-  - dogfood ledger/report
-  - sessions 与 fixture 内嵌 session
-  同时允许默认 benchmark manifest、example manifest 和 `.dscode/fixtures/**` 进入版本控制
-- baseline 新增 Go fixture：
-  - `fixtures/go-write-mini`
-  - `fixture-write-validate-go-mini`
-  - `fixture-pr-patch-validate-go-mini`
-- 这一步把已存在的 Go language profile 从“只会检测”推进到默认 benchmark 可验证：
-  - `go test ./...`
-  - `apply_patch -> git_diff -> run_shell`
-  - 普通 write/validate 与 PR patch/validate 两种语境
-- 最新 benchmark：
-  - 默认 benchmark：`44/44`
-  - total tool calls：`151`
-  - failed tool calls：`0`
-  - trend gate：`pass against 5 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-
-**Phase 11+ Go PR CI reproduce fixture (`main`, 2026-05-09) — 已完成**：
-- 延续 PR/CI fixture thickening，本轮补上 Go 的自然失败复现修复样本：
-  - 新增 `fixtures/go-cli-failing-mini`
-  - 新增 `fixture-pr-reproduce-fix-go-cli-failing-mini`
-  - case 要求先执行 `go test ./...` 复现失败，再读取 `main.go`、把 `run bench` 修成 `run benchmark`、查看 diff 并重新跑测试
-- 这让默认 `pr_workflow` baseline 从 `15` 条扩到 `16` 条，并把自然失败复现修复链路扩到 Rust / JavaScript / Python / Go。
-- 最新验证：
-  - fixture 原始状态下 `go test ./...` 会按预期失败，失败输出指向 `run bench` 与期望 `run benchmark`
-  - 全量测试：`540 passed, 0 failed`
-  - 默认 benchmark：`47/47`
-  - total tool calls：`162`
-  - failed tool calls：`0`
-  - trend gate：`skipped (need at least 3 prior comparable runs, found 0)`，因为 case 数从 `46` 到 `47`，当前没有同 case 数历史
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：PR/CI fixture baseline 已更厚，但外部真实 PR/CI live 样本和在线模型稳定性还不是小差距。
-
-**Phase 11+ product gap planning guard (`main`, 2026-05-09) — 已完成**：
-- 对照当前 gap review，open-ended 产品化请求还有一类常见说法此前没有明确回归样本：`make X more like Y`、`close the gap`、`production-ready` / `product ready`。
-- explicit planning heuristic 现在会把这些短模糊请求纳入 first-turn todo plan，而不是直接进入仓库搜索或随手编辑。
-- 新增 benchmark：
-  - `plan-product-gap-closure`
-  - task：`make DeepseekCode more like Claude Code`
-  - 断言：`planning_todo_only`
-- 最新验证：
-  - 全量测试：`540 passed, 0 failed`
-  - 默认 benchmark：`48/48`
-  - total tool calls：`163`
-  - failed tool calls：`0`
-  - trend gate：`skipped (need at least 3 prior comparable runs, found 0)`，因为 case 数从 `47` 到 `48`，当前没有同 case 数历史
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：这把 open-ended gap 从“没有覆盖产品化模糊请求”推进到“有首轮规划保护”，但仍是 heuristic，不等于复杂开放任务已稳定收敛。
-
-**Phase 11+ bounded nested subagents (`main`, 2026-05-09) — 已完成基础版**：
-- 对照 gap review，subagent 仍被明确记为“单层、保守”。本轮先把硬性单层限制放宽为有上限的两层：
-  - root agent 可 dispatch child
-  - 第一层 child 也可在自己任务内 dispatch 一个清晰可分的 nested child
-  - 第二层之后 registry 不再暴露 `dispatch_subagent`，避免无界递归
-- subagent prompt nudge 同步说明 nested dispatch 是 bounded，只应在 child 内部有明确独立子任务时使用。
-- 最新验证：
-  - 全量测试：`540 passed, 0 failed`
-  - 默认 benchmark：`48/48`
-  - total tool calls：`163`
-  - failed tool calls：`0`
-  - trend gate：后续已恢复为 `pass against 3 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：这不是成熟多 agent 调度器；只是从硬单层推进到保守、可控的两层拆分。
-
-**Phase 11+ IDE bootstrap (`main`, 2026-05-09) — 已完成基础版**：
-- 二次差距审计显示，Claude Code / Codex 的 IDE/app/cloud surface 仍是大差距；本轮先补一个很小但可版本化的 VS Code 入口
-- 新增 [`editors/vscode`](/home/willamhou/codes/DeepseekCode/editors/vscode/README.md)：
-  - `DeepseekCode: Open Chat`
-  - `DeepseekCode: Run Task`
-  - `DeepseekCode: Explain Selection`
-  - `DeepseekCode: Run Benchmark`
-  - `DeepseekCode: Show Dogfood Report`
-- extension 只负责把当前 workspace、当前文件路径和可选 selection 组织成 `deepseek` CLI 命令并在 VS Code terminal 中运行
-- 这一步没有引入 npm dependency，也不声明已具备完整 IDE agent 体验；它只是把 IDE gap 从“几乎没有”推进到“有可试用入口”
-
-**Phase 11+ VS Code quick actions (`main`, 2026-05-09) — 已完成基础版**：
-- 延续最小 VS Code extension，本轮把入口从“隐藏在命令面板里”推进到更可发现：
-  - 新增状态栏 `DeepseekCode` action
-  - 新增 `DeepseekCode: Quick Action` quick-pick，集中启动 chat / task / selection explain / benchmark / dogfood report
-  - 新增 editor title 和 editor context menu 入口，选中文本时可直接 explain selection
-  - extension manifest 为命令补齐 product icons，并保持无外部 npm dependency
-- 当前边界仍明确：这仍是 terminal-backed launcher，不是完整 IDE agent sidebar / inline diff / diagnostics / chat panel；IDE/app/cloud surface 仍不是小差距
-
-**Phase 11+ VS Code Explorer view (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 terminal-backed VS Code extension，本轮补一个常驻侧栏入口：
-  - Explorer sidebar 新增 `DeepseekCode` view
-  - view 内提供 open chat / run task / explain selection / benchmark / dogfood report clickable actions
-  - 保持无外部 npm dependency
-- 最新验证：
-  - `node --check editors/vscode/extension.js`
-  - `jq -c . editors/vscode/package.json`
-  - 全量测试：`540 passed, 0 failed`
-  - 默认 benchmark：`48/48`
-  - trend gate：`pass against 3 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：这提升 IDE 可发现性，但仍不是完整 agent sidebar / inline diff / diagnostics / chat panel。
-
-**Phase 11+ VS Code agent panel (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 terminal-backed VS Code extension，本轮把侧栏从“动作列表”推进到“可直接输入任务”的轻量 agent panel：
-  - Explorer sidebar 新增 `DeepseekCode Agent` webview panel
-  - panel 支持输入 task，并把当前文件路径和选中文本作为上下文传给 `deepseek run`
-  - 同一 panel 暴露 chat / explain selection / benchmark / dogfood report 快捷入口
-  - `DeepseekCode: Quick Action` 和 editor title 也可直接 focus 该 panel
-  - 保持无外部 npm dependency
-- 最新验证：
-  - `node --check editors/vscode/extension.js`
-  - `jq -c . editors/vscode/package.json`
-  - 全量测试：`541 passed, 0 failed`
-  - 默认 benchmark：`48/48`
-  - trend gate：`pass against 5 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：这已经是可试用的 IDE 侧栏任务入口，但仍不是完整 inline diff / diagnostics / native chat / patch review 体验。
-
-**Phase 11+ VS Code diagnostics/diff actions (`main`, 2026-05-09) — 已完成基础版**：
-- 继续缩小 IDE 侧缺口，本轮补两个轻量但贴近日常修复/审阅的入口：
-  - 新增 `DeepseekCode: Explain Diagnostics`，读取当前文件的 VS Code diagnostics，汇总成 `deepseek run` task context
-  - 新增 `DeepseekCode: Show Active Diff`，用 VS Code diff editor 比较 `HEAD` 与当前 editor 内容，支持未保存 editor 内容
-  - Explorer actions、agent panel、quick action 和 editor context menu 均暴露这两个入口
-  - 仍保持 terminal-backed launcher 和无外部 npm dependency
-- 最新验证：
-  - `node --check editors/vscode/extension.js`
-  - `jq -c . editors/vscode/package.json`
-  - `git diff --check`
-  - 默认 benchmark：`48/48`
-  - trend gate：`pass against 5 comparable runs`
-  - live gate：`pass (no new dogfood records since previous snapshot, runs=33)`
-- 当前边界仍明确：diagnostics 和 diff 已有手动入口，但还没有自动诊断修复、inline patch review 或 native chat loop。
-
-**Phase 11+ MCP config surface (`main`, 2026-05-09) — 已完成基础版**：
-- 二次差距审计里 MCP/plugin ecosystem 仍是大差距；本轮先补最小可验证配置面，而不是一次性实现完整 MCP transport
-- 新增 `deepseek mcp`：
-  - `deepseek mcp init` 创建 `.dscode/mcp.json`
-  - `deepseek mcp list` 展示 project/user MCP server 定义
-  - `deepseek mcp doctor` 校验 server schema，并对同名 server 使用 project 覆盖 user 的语义
-- 配置格式采用常见 `mcpServers` JSON object，支持 `stdio` / `http` / `streamable-http` / `sse` 的基础字段校验
-- `deepseek config init` 同步创建 disabled 示例 MCP config；`.dscode/config.example.toml` 与安装文档已补充 `mcp.enabled` / `mcp.project_file` / `mcp.user_file`
-- 这一轮边界明确：只做发现与校验，还不会把 MCP tools 注入 agent tool registry
-
-**Phase 11+ MCP stdio tool discovery (`main`, 2026-05-09) — 已完成基础版**：
-- 延续上一轮 MCP config surface，本轮把 MCP 从“配置可见”推进到“能真实握手并枚举 stdio server tools”
-- 新增 `deepseek mcp tools [server]`：
-  - 启动 enabled stdio server subprocess
-  - 按 MCP lifecycle 发送 `initialize` 与 `notifications/initialized`
-  - 执行 `tools/list`，支持 `nextCursor` 分页，并展示 tool name / description / input schema
-- 已用临时 stdio fake server 做 smoke：`fake [stdio]: 1 tool(s)`，能列出 `echo` tool 与 schema
-- 这一轮边界仍明确：只做 stdio `tools/list`，HTTP/SSE transport、agent tool registry 注入和审批模型还没接入；MCP/plugin ecosystem 仍不是小差距
-
-**Phase 11+ MCP manual tool call (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 stdio tool discovery，本轮继续补显式调用入口，避免 MCP 只能“看见工具”但不能验证工具结果
-- 新增 `deepseek mcp call <server> <tool> [json-args]`：
-  - 启动 enabled stdio server 并完成 `initialize` / `notifications/initialized`
-  - 发送 `tools/call`，参数要求为 JSON object
-  - 输出 text content、structuredContent 和 tool-level `isError`
-- 已用临时 stdio fake server 做 smoke，能调用 `echo` tool 并展示返回内容
-- 当前边界仍明确：这还是人工指定 server/tool/arguments 的调试入口；MCP tools 还没有进入 agent tool registry，也没有对应审批/策略模型
-
-**Phase 11+ MCP agent bridge (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 manual `tools/call`，本轮把 MCP 从 CLI 调试入口推进到 agent 可见的 generic bridge
-- 当 project/user MCP config 文件存在时，默认 registry 会暴露：
-  - `mcp_list_tools`：枚举 configured MCP server tools 和 input schema
-  - `mcp_call`：按 server/tool/JSON arguments 调用 stdio MCP tools
-- OpenAI / Anthropic tool schema 都已加入这两个 bridge tools；没有 MCP config 文件时不会暴露，避免无 MCP 项目的默认 prompt 膨胀
-- 当时边界明确：远端 MCP tools 还不是动态独立 agent tools；HTTP transport、完整 MCP permission UX 和 plugin ecosystem 仍未接入
-
-**Phase 11+ MCP call approval/allowlist policy (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 MCP agent bridge，本轮把远端 tool 调用接回现有 approval/policy 入口：
-  - 新增 `approval.require_mcp_confirmation`，默认 `true`
-  - 新增 `approval.mcp_call_allowlist`，支持 `server/tool`、`server/*`、`*/tool` 和 `*/*`
-  - 新增 `DSCODE_AUTO_APPROVE_MCP=1`，用于非交互 benchmark / dogfood replay 或用户显式放行
-  - agent 通过 `mcp_call` 调用远端 MCP tool 前会确认 `server/tool`
-- `mcp_list_tools` 仍保持只读发现能力，不要求确认
-- 用户直接执行的 `deepseek mcp call <server> <tool> [json-args]` 不走该 prompt，因为它已经是显式 CLI 命令意图
-- 当时边界明确：这只是 MCP bridge 级别的安全闸；还没有把每个远端 MCP tool 动态注入为独立 agent tool，也没有 HTTP/SSE runtime 或更完整的 permission UX
-
-**Phase 11+ MCP HTTP JSON-RPC transport (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 MCP bridge 与 policy，本轮把已能被配置识别的 `http` / `streamable-http` server 推进到可实际调用：
-  - `deepseek mcp tools [server]` 可对 HTTP MCP endpoint 执行 `initialize` / `notifications/initialized` / `tools/list`
-  - `deepseek mcp call <server> <tool> [json-args]` 可通过 HTTP JSON-RPC POST 执行 `tools/call`
-  - 会续传服务端返回的 `Mcp-Session-Id`
-  - HTTP response 如果是 `text/event-stream` 形态，会读取 `data:` 中的 JSON-RPC response
-- agent bridge 的 `mcp_list_tools` / `mcp_call` 复用同一 summary 函数，因此 HTTP MCP server 也进入 agent 可用路径，并继续受 confirmation / allowlist 保护
-- 当前边界仍明确：旧式 `sse` transport 已在下一轮补上；远端 MCP tools 还不是动态独立 agent tools，permission UX 也仍是 bridge 级别
-
-**Phase 11+ MCP legacy SSE transport (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 HTTP MCP transport，本轮把旧式 `sse` server 从“配置可识别”推进到可实际调用：
-  - `deepseek mcp tools [server]` 可打开 SSE event stream，读取 `endpoint` 事件，然后向该 endpoint POST `initialize` / `notifications/initialized` / `tools/list`
-  - `deepseek mcp call <server> <tool> [json-args]` 可通过同一 SSE session 执行 `tools/call`
-  - SSE stream 上的 JSON-RPC response 会按 request id 匹配，并跳过 endpoint / heartbeat / 非目标 response
-- agent bridge 的 `mcp_list_tools` / `mcp_call` 复用同一路径，因此 SSE MCP server 也进入 agent 可用路径，并继续受 confirmation / allowlist 保护
-- 当前边界仍明确：动态独立 agent tools 已在下一轮补 opt-in 初版；完整 schema 注入、permission UX、plugin ecosystem 和云端/外部任务面仍未接入
-
-**Phase 11+ opt-in MCP dynamic tool exposure (`main`, 2026-05-09) — 已完成基础版**：
-- 延续 stdio / HTTP / SSE MCP 调用路径，本轮补上保守的动态 tool 注入：
-  - 新增 `mcp.expose_remote_tools`，默认 `false`，避免 agent 启动时隐式执行不受信任的 MCP server discovery
-  - 开启后，registry 会发现 enabled MCP server tools，并以 `mcp__server__tool` 名称注入为独立 agent tool
-  - 动态 tool 接收 `arguments` JSON object string，内部复用 `deepseek mcp call` 的实际调用路径
-  - 动态 tool 仍按真实 `server/tool` 走 `approval.require_mcp_confirmation` 与 `approval.mcp_call_allowlist`
-  - 单次最多注入 `24` 个动态 MCP tools；发现失败的 server 会被跳过，避免单个坏 server 阻断整个 agent registry
-- 当前边界仍明确：动态 tool schema 还是通用 `arguments` wrapper，尚未把远端 input schema 逐个注入模型 schema；permission UX 仍偏 bridge 级别，完整 plugin ecosystem 和云端/外部任务面仍未接入
-
-**Phase 12 gap audit v2 (`working tree`, 2026-05-10) — Phase 12A baseline 已完成**：
-- 新增 gap audit/spec：
-  - `docs/superpowers/specs/2026-05-10-claude-codex-gap-audit-v2.md`
-  - 对照 Claude Code / Codex 官方资料重新评估完整产品面差距
-  - Phase 12A 后当前相对完整产品面估计约 `34%` gap；只有继续按 Phase 12B-12E 补齐后才可压到 `8% - 10%`
-- 新增执行计划：
-  - `docs/superpowers/plans/2026-05-10-claude-codex-gap-closure-v2.md`
-  - Phase 12A：quality baseline
-  - Phase 12B：native VS Code workbench
-  - Phase 12C：GitHub automation
-  - Phase 12D：MCP/hooks/subagent hardening
-  - Phase 12E：background worktree runner and distribution
-- 同步新增 `plan-product-readiness` benchmark，覆盖 `productionize DeepseekCode for daily coding work` 这类产品化/daily-use 请求
-- Phase 12A 已追加 6 条 dogfood replay：
-  - product gap planning
-  - product readiness planning
-  - failed-validation retry
-  - Rust / JavaScript / Python PR retry validation
-  - 这些 replay 在当前 sandbox 中使用 offline fallback；真实 live API 访问因 DNS failure 未写入 ledger
-- 最新验证：
-  - 全量测试：`611 passed, 0 failed`
-  - 默认离线 benchmark：`67/67`
-  - total tool calls：`200`
-  - failed tool calls：`0`
-  - trend gate：`pass against 3 comparable runs`
-  - live gate：`pass (runs=39)`
-  - dogfood report：`39` runs，新增 Phase 12A replay 未引入 failed/stuck/manual
-- 当前边界仍明确：Phase 12A 质量基线已收口，但还没有实际收掉 IDE workbench、GitHub automation、background worktree/app/cloud、true live dogfood 厚度这些主差距。
-
-**Phase 12 CLI-only gap audit (`working tree`, 2026-05-10) — 首轮实现已完成**：
-- 按最新口径，只对比 Claude Code CLI / Codex CLI，不计入 IDE、Codex app/cloud、GitHub Action 或其他 hosted automation
-- 新增 CLI-only audit/spec：
-  - `docs/superpowers/specs/2026-05-10-cli-only-gap-audit.md`
-  - 实施前 CLI-only residual gap 估计为 `22% - 28%`
-  - CLI-12 首轮实现后 residual gap 估计为 `12% - 16%`
-  - live JSONL、20 个 subagent benchmark cases、parallel subagent/thread management、MCP prompt slash commands、native image payloads、REPL `/compact` 与 release/install verifier 补齐后 residual gap 估计为 `6% - 9%`
-  - 主要差距收敛为 6 类：scriptable CLI UX、subagent orchestration、hooks events、MCP schema UX、model/context、install/update
-- 新增 CLI-only closure plan：
-  - `docs/superpowers/plans/2026-05-10-cli-only-gap-closure.md`
-  - Phase CLI-12A：scriptable CLI contract
-  - Phase CLI-12B：subagent CLI maturity
-  - Phase CLI-12C：hooks event parity
-  - Phase CLI-12D：MCP schema and permission UX
-  - Phase CLI-12E：model context and distribution
-- 已落地首轮 CLI gap closure：
-  - `deepseek exec`：stdin、JSONL、resume follow-up、skill/budget、image file refs
-  - `deepseek agents`：project/user agent files 的 list/show/validate
-  - hooks：session、prompt、tool、permission、subagent、pre-compact event 面与结构化 allow/deny/add_context
-  - MCP：动态 schema cache/injection、fallback wrapper、argument-aware permission prompt、`prompts/list` / `prompts/get` 和 REPL MCP prompt slash commands
-  - model/context：`doctor` capability reporting、`exec --image` 路径验证和 OpenAI/Anthropic native image payloads
-  - install/update：`deepseek update --check/--print-command`
-- 追加补齐：
-  - `deepseek exec --json` 现在 live 输出 `assistant_delta`、`tool_call`、`permission_request`、`tool_result`，完成后输出 `assistant_final`
-  - 默认 benchmark manifest 扩到 67 cases，其中 `subagent` category 为 20 cases；subagent-only verifier 已跑通 `20/20`
-  - `deepseek mcp prompts [server]`、`deepseek mcp prompt <server> <prompt> [json]`、REPL `/mcp/<server>/<prompt>` 和 Claude 风格 `/mcp__server__prompt` 已接入 MCP prompt flow
-  - `deepseek exec --image` 对 OpenAI-compatible vision 模型发送 `image_url` data URL，对 Anthropic-compatible Claude 模型发送 base64 `image` content block；DeepSeek text-only profile 保留文件引用
-  - REPL `/compact` 会先触发 `pre_compact` hook，再把旧 transcript turns 压成一个 summary turn 并保留最近 8 个 turns
-  - `deepseek update package` 生成本地 release package；`install-package` 安装并备份当前 binary；`rollback` 恢复备份；`verify-install` 在隔离目录跑 version/config/doctor/exec JSONL/benchmark sample
-  - `dispatch_subagents` 支持最多 4 个 child tasks 并发执行，返回 consolidated per-thread metadata，并写入 `.dscode/agent-threads/*.md`
-  - `deepseek agents threads/show-thread/switch/current/clear-current` 提供 thread artifact inspection 和 active thread switching
-  - 全量测试：`611 passed, 0 failed`
-  - 默认 benchmark：`67/67`，trend gate `pass against 4 comparable runs`，live gate `pass (runs=39)`
-- 当前边界仍明确：CLI-only 实现面已降到“高个位数”差距，但还不能标记完成。剩余硬差距是 100+ true live dogfood 证据。
-
-**DeepSeek-TUI parity track (`working tree`, 2026-05-10) — Phase A 启动**：
-- 新增源码级追平计划：
-  - `docs/superpowers/plans/2026-05-10-deepseek-tui-parity.md`
-  - 对照本地拉取的 `Hmbown/DeepSeek-TUI` HEAD `506343f`
-  - 明确 8 个产品面 deliverables：true TUI、durable runtime、tool surface、DeepSeek-native UX、LSP diagnostics、subagent/RLM、MCP/runtime API、packaging
-- 第一块低风险基础能力已落地：
-  - `deepseek doctor --json`
-  - JSON 输出 version、workspace、model、capabilities、api key presence、skills、MCP、network probe 状态和本地 binary availability
-  - JSON 模式不会做 live network probe，方便本地 supervisor、future workbench、release automation 稳定消费
-  - JSON 模式不输出真实 API key 或 key 尾号，只报告 `present/source` 与 `masked: redacted`
-- 发布文档已把 `deepseek doctor --json` 加入 source / artifact release checks
-- 验证：
-  - `cargo fmt --check`
-  - `cargo test --offline doctor`：`12 passed`
-  - `deepseek doctor --json`：本地实际执行成功
-- 第二块工具面补齐已落地：
-  - 新增只读 agent tools：`git_log`、`git_show`、`git_blame`
-  - 三个工具均支持显式 `cwd`，输出 `meta.git_command` / `meta.result`，并限制默认输出长度
-  - OpenAI/Anthropic tool schema 已暴露这三种工具
-  - offline planner 会把直接的 recent commits / git show / git blame 请求路由到专用工具，而不是退回 shell
-  - 默认 benchmark manifest 已新增三条 Git history read-only cases
-- 最新验证：
-  - `/home/willamhou/.cargo/bin/cargo fmt --check`
-  - `/home/willamhou/.cargo/bin/cargo test --offline`：`624 passed`
-  - `DEEPSEEK_API_KEY_ENV=DEEPSEEK_API_KEY_OFFLINE /home/willamhou/.cargo/bin/cargo run --offline -- benchmark`：`70/70`
-  - live gate：`pass (runs=39)`
-  - trend gate：`skipped`，因为 70-case comparable history 还不足 3 条
-- 当前边界仍明确：这只是 runtime/workbench integration contract 的第一步；还没有 agent-connected full TUI、SQLite durable runtime、live SSE task runtime 或 release artifact matrix。
-- Phase A integration contract 补齐：
-  - `deepseek serve --http` skeleton 已提供 `/health` 与 `/runtime`
-  - `docs/runtime.md` 记录 HTTP runtime schema、REPL session JSON v2、legacy exec snapshot、subagent thread artifacts、durable runtime draft 和 public readiness checklist
-  - 安装与发布文档已链接 runtime skeleton，并要求 release notes 包含 health/runtime 输出
-- 最新本地验证：
-- `/home/willamhou/.cargo/bin/cargo test`：`666 passed`
-- Phase B durable runtime 第一片已落地：
-  - 新增 `src/core/runtime.rs`，以 `.dscode/runtime/sessions`、`threads`、`turns`、`items`、`tasks`、`automations`、`events`、`usage` 保存 file-backed durable records
-  - `serve --http` 新增 `/v1/automations`、`/v1/automations/{id}`、`/v1/automations/{id}/trigger`、`/v1/sessions`、`/v1/sessions/{id}`、`/v1/sessions/{id}/automations`、`/v1/sessions/{id}/threads`、`/v1/sessions/{id}/tasks`、`/v1/tasks`、`/v1/tasks/{id}`、`/v1/tasks/{id}/claim`、`/v1/tasks/{id}/cancel`、`/v1/tasks/{id}/pause`、`/v1/tasks/{id}/resume`、`/v1/threads`、`/v1/threads/{id}`、`/v1/threads/{id}/automations`、`/v1/threads/{id}/items`、`/v1/threads/{id}/items/{item_id}`、`/v1/threads/{id}/turns`、`/v1/threads/{id}/turns/{turn_id}/items`、`/v1/threads/{id}/tasks`、`/v1/threads/{id}/events`、`/v1/threads/{id}/events/stream`、`/v1/threads/{id}/usage`、`/v1/threads/{id}/usage/summary`、`/v1/usage` 和 `/v1/usage/summary`
-  - `deepseek exec` 成功运行后会追加 durable session、linked user/assistant turns、matching message items、completed task record 和 token/cache/cost usage record
-  - `deepseek agents run-task <task-id>` 会 claim pending thread-linked runtime task，在 thread workspace 执行 agent loop，把 user/assistant turns、tool_result items、usage 和 completed/failed task status 写回同一 durable thread，并为 git worktree 创建 pre-run rollback snapshot
-  - `deepseek agents daemon [--interval-ms 1000] [--budget N]` 会轮询 `.dscode/runtime`，触发 `next_run_at` 到期的 active automation，支持 `every:60s` / `every:5m` / `@every 1h` recurring schedule，并复用 `run-task` 路径执行一个 thread-linked pending task per tick；`run-task`/daemon 的 permissioned write/shell/MCP 调用会写 durable `permission_request` 并等待同 thread 的 `permission_response`
-  - `/runtime` capability 现在对 `sessions` / `threads` / `turns` / `items` / `events` / `events_write` / `cancellation_events` / `events_sse` / `events_sse_wait` / `events_sse_follow` / `diagnostics` / `diagnostics_changed` / `diagnostics_broker` / `tasks` / `task_claim` / `task_cancel` / `task_pause` / `task_resume` / `task_updates` / `automations` / `automation_trigger` / `usage` / `usage_summary` 报 `true`；`events/stream?wait_ms=N` 支持 bounded SSE live wait，`events/stream?follow=1` 支持单连接连续输出多个 runtime event frame；非 `--once` HTTP listener 会并发处理连接，等待/订阅中的 SSE 不会阻塞其它 runtime write；`POST /v1/diagnostics` 提供 runtime-hosted diagnostics broker，同一 HTTP runtime 进程内对同 workspace 复用 warmed stdio LSP session；`deepseek tui --runtime-url http://HOST:PORT` 可从 HTTP runtime 构建快照、把前台 action 写回 HTTP，并对已知 thread 订阅 `follow=1` 事件流，HTTP TUI 的 `diagnostics [--changed|paths...]` 会走 runtime broker；usage summary 聚合 cache hit/miss、recognized DeepSeek V4 USD micro-cost estimate 和 1M-context policy；active automation 可手动 trigger 成 pending task，pending/running task 可被外部 runner、HTTP client 或本地 daemon claim/cancel，pending task 还可 pause/resume 做队列级控制；`deepseek agents service` 可生成 systemd/launchd runtime+daemon+diagnostics+shell-supervisor supervisor 文件；`deepseek update homebrew-formula` 可从 release matrix `.sha256` 文件渲染 Homebrew formula，减少手工发布步骤；Release Matrix 在 `v*` tag 上会创建/更新 GitHub Release、上传平台 archive 与 checksum assets、发布 GHCR Docker image，并在配置 `CARGO_REGISTRY_TOKEN` / `NPM_TOKEN` / `HOMEBREW_TAP_*` 后分别执行 crates.io/npm/Homebrew tap 发布
-  - 全量测试更新为 `/home/willamhou/.cargo/bin/cargo test -- --test-threads=1`：`765 passed`
-- Phase D TUI 第一片已落地：
-  - `Cargo.toml` 新增 `ratatui` / `crossterm`
-  - 新增 `src/tui.rs` 和 `src/cli/commands/tui.rs`
-  - `deepseek tui` 启动全屏 workbench shell；`deepseek tui --demo --once` 输出可测试快照
-  - 已有 Plan / Agent / YOLO mode tabs、sidebar、transcript/composer frame、task panel、command palette、session picker、approval modal
-  - session picker 读取 `.dscode/runtime/sessions` 的 durable session metadata，并在启动时预加载 linked threads 与 item timelines
-  - session picker 和 thread navigator 可以切换当前可见 durable transcript snapshot；composer focus/input 会把用户消息追加成 active thread 的 durable user turn 和 message item，并在 interactive TUI 中启动后台 agent response；command palette 已支持 UI/runtime 命令：`mode plan|agent|yolo`、`sessions`、`threads`、`thread next|prev|<id>`、`task <summary>`、`task pause [id]`、`task resume [id]`、`diagnostics [--changed|paths...]`、`restore snapshot|list|show`、`revert turn <id|last> [--apply]`、`approval`、`cancel`、`compact [tail]`
-  - interactive TUI 会轮询刷新 file-backed runtime sessions / threads / item timelines，并启动本地 runtime watcher，把外部 durable runtime 写入转换成 foreground draw loop 的 full snapshot live event；同时读取直接写入或通过 `POST /v1/threads/{id}/events` 写入的 durable `permission_request` events 打开真实 tool/kind/target approval modal；approval accept/deny 会写入 durable `permission_response` event，已响应请求刷新后不会再弹出；TUI-started agent run 的 write/shell/MCP permission gate 会等待并消费这些 response events；`--once` 仍保持 deterministic snapshot
-  - 后台 TUI agent run 会先创建 running assistant message item，将 assistant deltas 通过 durable item updates 写入该 item，同时通过前台 live event channel 在每次 draw 前直接 upsert 可见 item；随后再把 final assistant message、tool_result items、usage 和 completed/failed task records 写回 active durable thread
-  - 后台 TUI agent run 现在会创建 running runtime task；TUI 会加载 active thread 的 recent runtime task records 并在 task panel 中显示 kind/status/summary；command palette 的 `task <summary>` 可在当前 thread 下创建 pending agent task；同时会加载 active-thread automations，command palette 的 `automation trigger [id] [prompt]` 可触发 automation 生成 pending runtime task；`c` / `cancel` 会为当前 running assistant turn 写入 durable `cancel_requested` event，AgentLoop 在 model/tool step 与 approval wait checkpoints 消费该事件，并把 turn/item/task 标记为 `cancelled`
-  - task panel 会从 durable usage records 显示 active thread 的 token total、cache hit rate、cache chart、estimated cost、input/output cost split、cost chart 和 1M-context strategy；当策略进入 compaction 区间时会提示 `:compact [tail]`
-  - 当前边界仍明确：外部 runtime writer 已能通过本地 watcher 更快反映到当前 TUI，但还不是跨进程 push/SSE subscription；还缺更细的 progress controls 或通用 external command execution；`run_shell` 已通过 cancel-aware process-group kill 支持 in-flight 中止，remote model stream stdout 也通过 cancel-aware pipe reader 避免阻塞在下一帧/transport timeout
-- Packaging 第一片已落地：
-  - 新增 `Dockerfile` / `.dockerignore`，支持从源码构建本地 Docker image；Release Matrix tag run 现在会发布 GHCR image
-  - 新增 `npm/package.json`、`npm/bin/deepseek.js`、`npm/README.md`、`npm/platforms/*/package.json` 和 npm staging/test scripts；root npm wrapper 会优先解析当前平台 optional binary package，也可转发到 packaged target-triple binary 或 `DEEPSEEK_BINARY`
-  - `Cargo.toml` 已补 `description`、`readme`、`license-file`、repository/homepage、keywords、categories；新增 `LICENSE`
-  - 新增 `.github/workflows/release.yml`，覆盖 Linux x64、macOS x64、macOS arm64、Windows x64 release build/test/package matrix，并验证 Cargo metadata、`cargo package`、Cargo/npm/Homebrew version sync、npm wrapper、root/platform npm dry-pack、Homebrew formula syntax 和 Docker artifact smoke；每个 release archive 会上传旁路 `.sha256`，并为 archive/checksum 创建 GitHub signed artifact attestation；每个平台 build 还会把 binary stage 成对应 npm platform package 并 smoke-run staged package binary，随后打出 platform tarball，tag run 在配置 `NPM_TOKEN` 后会先发布平台包再发布 root wrapper 包；配置 `HOMEBREW_TAP_REPOSITORY` / `HOMEBREW_TAP_TOKEN` 后会在 GitHub Release assets 发布后渲染并推送 tap formula
-  - 新增 `packaging/homebrew/deepseek.rb` Homebrew formula 模板，覆盖 macOS arm64/x64 和 Linux x64 release assets
-  - 新增 `packaging/systemd/` 与 `packaging/launchd/` runtime/daemon/diagnostics/shell-supervisor service placeholders；`deepseek update package` 会带上 `SERVICES.md` 和 `services/` 模板，`deepseek agents service` 可按目标 workspace 渲染实际 supervisor 文件
-  - `docs/install.md` 与 `docs/release.md` 已记录 Docker/npm wrapper/Homebrew/release matrix 验证命令
-  - `cargo package --allow-dirty` 已可本地完成 package verify；`Cargo.toml.orig` 是 Cargo 生成的 package manifest 副本，不是仓库残留文件
-  - 仍缺 actual npm/Cargo/Homebrew 外部发布凭据、真实 tag 发布验证，以及实际已发布的 registry/tap 记录
-- Phase F rollback 第一片已落地：
-  - 新增 `src/language/diagnostics.rs`，提供 diagnostics runner；有具体文件且语言服务器可用时优先走 stdio LSP `textDocument/publishDiagnostics`，失败/超时再回退到 compiler/type-check fallback
-  - `deepseek diagnostics [--changed] [paths...]` 和 `deepseek diagnostics --watch [--interval-ms N] ...` 已接入 CLI；watch 模式在同一进程内复用 warmed stdio LSP session 并对后续 tick 发送 `didChange`；`deepseek agents service` 会生成 `diagnostics --watch --changed` 和 `agents shell-supervisor --json` 的 systemd/launchd 常驻 worker；agent registry 暴露只读 `diagnostics` tool，OpenAI/Anthropic tool schema 已包含该工具
-  - 新增 `diagnostics.post_edit = true` 配置开关；开启后成功 `apply_patch` 会把 post-edit diagnostics 附加到 tool result；agent-loop `apply_patch` 会在同一个 tool 实例内复用 warmed stdio LSP session
-  - 新增 `src/core/rollback.rs`，把 tracked combined/staged/unstaged diffs 和 untracked regular files 快照保存到 `.dscode/rollback/snapshots/<id>/`
-  - `deepseek restore snapshot [label]`、`restore list`、`restore show <id> [--patch]`、`restore revert-turn <id> [--apply]` 已接入 CLI
-  - REPL 新增 `/restore snapshot [label]`、`/restore list`、`/restore show <id|last>` 和 `/revert_turn <id|last> [--apply]`；每轮 prompt 会尝试创建 pre-turn rollback snapshot，并把 `last` 指向最近一轮
-  - restore 默认 dry-run，`--apply` 会先反转当前 tracked diff，再应用 snapshot patch；要求当前 git `HEAD` 等于 snapshot 捕获的 commit
-  - restore `--apply` 后会恢复 staged-index / unstaged-worktree split、captured untracked regular files、列出恢复后的 changed files，并用 fallback diagnostics 对这些文件跑 post-restore diagnostics
-  - `deepseek exec` 在 git worktree 内会创建 pre-run rollback snapshot，并在成功后绑定到 durable assistant turn id；TUI-started agent run 也会创建 pre-run snapshot，并在 assistant turn 创建后立即绑定；REPL live turns 通过 `/revert_turn last` 暴露最近 pre-turn snapshot；本地 file-backed TUI command palette 暴露 `diagnostics [--changed|paths...]`、`restore snapshot [label]`、`restore list [limit]`、`restore show <id|last>` 和 `revert turn <id|last> [--apply]`，其中 TUI `last` 解析为 active thread 的 latest durable turn id；`restore show` / `restore revert-turn` 可用 snapshot id、bound turn id 或 REPL/TUI `last`
-  - 当前边界仍明确：diagnostics watch、service-rendered diagnostics worker、agent-loop post-edit diagnostics 和 HTTP runtime diagnostics broker 都能在各自进程内复用 warmed stdio LSP session，HTTP clients 可通过 `/v1/diagnostics` 共享 runtime broker；但还没有 standalone diagnostics daemon protocol；rollback 的 untracked 覆盖普通文件，不覆盖 symlink/目录；legacy snapshots 没有 split patch 时不能恢复 staged-index fidelity；HTTP-runtime TUI 不会隐式回滚远端 host，rollback command palette 仅支持本地 file-backed TUI
-- DeepSeek-native UX 第一片已落地：
-  - `model.model = "auto"` / `DEEPSEEK_MODEL=auto` 会把简单任务路由到 `deepseek-v4-flash`，把规划、审查、架构、安全、迁移和恢复类复杂任务路由到 `deepseek-v4-pro`；usage 会记录实际 resolved model
-  - `model.reasoning_effort = "off|high|max|auto"` / `DEEPSEEK_REASONING_EFFORT` 会映射到官方 DeepSeek V4 thinking mode 与 reasoning effort 参数；默认 `off`，直到 provider-native reasoning transcript replay 和更完整的 thinking/tool-call 兼容性验证完成
-  - OpenAI-compatible stream 会把 `delta.reasoning_content` 作为独立 reasoning delta 输出；Anthropic-compatible stream 会处理 `thinking_delta` / `reasoning_delta`
-  - Agent loop 会捕获每步 reasoning delta，把最近几步 reasoning 摘要和 assistant message 一起回放进后续请求；TUI runtime stream 会把 reasoning delta 保存为 linked durable `reasoning` item
-  - usage records 和 summary 已聚合 prompt cache hit/miss、recognized DeepSeek V4 USD micro-cost estimate、unpriced record count 和 1M-context strategy
-- 当前边界仍明确：已有 non-destructive runtime thread compaction endpoint、
-  TUI active-thread compaction command、`thread_compacted` audit event 和
-  daemon-side 800k-token proactive compaction scheduler，并有 agent-loop
-  近期 reasoning replay / TUI durable reasoning item；
-  但还没有 provider-native full reasoning transcript replay 或 model-generated
-  automatic compaction policy
-
-## 最近里程碑
-
-- `d9b3ae4` `Initialize project docs`
-- `589a5c6` `Bootstrap Rust CLI scaffold`
-- `5cd434a` `Implement basic repository tools`
-- `f20534f` `Add offline planning loop`
-- `3a8d633` `Wire skills into CLI flow`
-- `6d01256` `Add DeepSeek transport and policy enforcement`
-- `efdb191` `Upgrade patching and remote protocol parsing`
-- `a1c45fb` `Use tool calling for OpenAI-compatible DeepSeek`
-- `046106c` `Document roadmap and project status`
+Everything else belongs to the broader product-hardening backlog.
diff --git a/docs/superpowers/README.md b/docs/superpowers/README.md
new file mode 100644
index 0000000..59c7c8a
--- /dev/null
+++ b/docs/superpowers/README.md
@@ -0,0 +1,17 @@
+# Superpowers Archive
+
+This directory contains historical plans and execution specs used while building
+DeepSeekCode. They are useful for audit trails and implementation context, but
+they are not the source of truth for the project's current status.
+
+Use these current documents first:
+
+- [Current status](../current-status.md)
+- [Roadmap](../roadmap.md)
+- [Release checklist](../release.md)
+- [Install](../install.md)
+
+The files under `plans/` and `specs/` may intentionally mention old command
+names, old evidence counts, temporary blockers, or gaps that have since been
+closed. When those historical notes conflict with current docs, prefer the
+current docs.
diff --git a/docs/todos.md b/docs/todos.md
index 13ed840..9047d49 100644
--- a/docs/todos.md
+++ b/docs/todos.md
@@ -23,7 +23,7 @@ The system prompt nudges the LLM to use `todo_write` when:
 - it spans multiple files / non-trivial refactoring, OR
 - it requires running tests or shell commands as part of completion.
 
-The LLM is prompted to mark exactly one todo as `in_progress` at a time. DeepseekCode
+The LLM is prompted to mark exactly one todo as `in_progress` at a time. DeepSeekCode
 does **not** strictly validate this — the renderer shows multiple
 in_progress items if they appear, so the user can see the LLM going off track.