vdavid
diff --git a/‎Cargo.lock‎
Lines changed: 2 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎apps/desktop/src-tauri/Cargo.toml‎
Lines changed: 5 additions & 0 deletions b/‎apps/desktop/src-tauri/Cargo.toml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/CLAUDE.md‎
Lines changed: 31 additions & 7 deletions b/‎apps/desktop/src-tauri/src/ai/CLAUDE.md‎
Lines changed: 31 additions & 7 deletions
diff --git a/‎apps/desktop/src-tauri/src/ai/client.rs‎
Lines changed: 59 additions & 1 deletion b/‎apps/desktop/src-tauri/src/ai/client.rs‎
Lines changed: 59 additions & 1 deletion
diff --git a/‎apps/desktop/src-tauri/src/ai/client_real_anthropic_test.rs‎
Lines changed: 88 additions & 0 deletions b/‎apps/desktop/src-tauri/src/ai/client_real_anthropic_test.rs‎
Lines changed: 88 additions & 0 deletions
@@ -74,6 +74,11 @@ reqwest = { version = "0.13", features = ["json", "rustls", "stream", "multipart
 axum = "0.8"
 tokio = { version = "1", features = ["rt-multi-thread", "net", "time", "sync", "macros"] }
 futures-util = "0.3"
+# tokio-util: `CancellationToken` for cooperative cancellation of streaming AI suggestions
+# (`ai/suggestions.rs::stream_folder_suggestions`). The token lives in `tokio_util::sync`,
+# which needs no feature flag; `rt` additionally enables the `task`/`context` helpers.
+# 0.7.18 is the latest stable (published 2026-01-04). MIT license; tracks tokio's release cadence.
+tokio-util = { version = "0.7.18", features = ["rt"] }
 tower-http = { version = "0.6", features = ["cors"] }
 tauri-plugin-updater = "2"
 tauri-plugin-process = "2"
 
@@ -12,18 +12,21 @@ Three provider modes:
 | File | Purpose |
 |---|---|
 | `mod.rs` | Types (`AiStatus`, `AiState`, `DownloadProgress`, `ModelInfo`), model registry (`AVAILABLE_MODELS`, `DEFAULT_MODEL_ID`), `is_local_ai_supported()` gate |
-| `manager.rs` | Central coordinator. Global `Mutex<Option<ManagerState>>` singleton. Most Tauri commands live here. Stores provider + cloud-AI config (`cloud_api_key`/`cloud_base_url`/`cloud_model`). Exposes `resolve_backend() -> BackendResolution` so callers don't reinvent provider routing. |
+| `manager.rs` | Central coordinator. Global `Mutex<Option<ManagerState>>` singleton. Most Tauri commands live here. Stores provider + cloud-AI config (`cloud_api_key`/`cloud_base_url`/`cloud_model`). Exposes `resolve_backend() -> BackendResolution` so callers don't reinvent provider routing. Also owns the `STREAM_CANCEL_TOKENS` registry (`register_stream`/`unregister_stream`/`cancel_stream`) for in-flight `stream_folder_suggestions` cancellation. |
 | `download.rs` | HTTP streaming download with Range-based resume. Emits `ai-download-progress` events (200ms throttle). Cooperative cancellation via function parameter (`Fn() -> bool`). |
 | `extract.rs` | Copies bundled `llama-server` binary + dylibs from `resources/ai/` to the AI data dir. Sets Unix permissions, handles symlinks. |
 | `process.rs` | Spawns child process with `DYLD_LIBRARY_PATH` set. Instant SIGKILL to stop (llama-server is stateless; macOS reclaims all GPU/mmap resources). `kill_process` for fire-and-forget (quit, orphans), `kill_and_reap_in_background` for normal operation (reaps zombie in bg thread). `kill_stale_llama_servers` for belt-and-suspenders orphan cleanup by process name. Port discovery via `bind(:0)`. |
-| `client.rs` | `genai`-backed chat client. `AiBackend` is a struct bundling a long-lived `genai::Client` with a model name; built via `AiBackend::local(port)` or `AiBackend::remote(api_key, base_url, model)`. The model name picks the adapter (`claude-*` → Anthropic native, `gemini-*` → Gemini native, `gpt-5*`/`*-pro`/`*-codex` → OpenAI Responses API, etc.). Auto-omits `temperature`/`top_p` for OpenAI Responses adapter and for chat-completions reasoning models (`o1*`, `o3*`, `o4*`, `chatgpt-*`, `gpt-5*` defense-in-depth) and substitutes `ReasoningEffort::Low`. Local backend forces the OpenAI adapter via a `ServiceTargetResolver` pinning endpoint to `http://127.0.0.1:<port>/v1/`. |
+| `client.rs` | `genai`-backed chat client. `AiBackend` is a struct bundling a long-lived `genai::Client` with a model name; built via `AiBackend::local(port)` or `AiBackend::remote(api_key, base_url, model)`. The model name picks the adapter (`claude-*` → Anthropic native, `gemini-*` → Gemini native, `gpt-5*`/`*-pro`/`*-codex` → OpenAI Responses API, etc.). Auto-omits `temperature`/`top_p` for OpenAI Responses adapter and for chat-completions reasoning models (`o1*`, `o3*`, `o4*`, `chatgpt-*`, `gpt-5*` defense-in-depth) and substitutes `ReasoningEffort::Low`. Local backend forces the OpenAI adapter via a `ServiceTargetResolver` pinning endpoint to `http://127.0.0.1:<port>/v1/`. Exposes both `chat_completion` (full response) and `chat_completion_stream` (returns a `BoxStream<Result<String, AiError>>` of content chunks; reasoning/thought-signature/tool-call chunks filtered out). |
 | `client_integration_test.rs` | `wiremock`-based tests covering request shape per adapter (chat completions vs Responses API), parsing, error mapping. Always run in CI. |
-| `client_real_openai_test.rs` | `#[ignore]`-gated smoke tests against `api.openai.com`. Run with `OPENAI_API_KEY=$(security find-generic-password -a "$USER" -s "OPENAI_API_KEY" -w) cargo nextest run --lib --run-ignored only ai::client_real_openai_test`. Costs ~$0.001 per full run. Use after refactors that touch `client.rs`. |
-| `suggestions.rs` | Builds few-shot prompt from listing cache, routes to configured backend, sanitizes response. |
+| `client_streaming_test.rs` | `axum`-based SSE mock server tests for `chat_completion_stream`: chunks arrive in order, empty streams end cleanly, drop-mid-stream closes the connection, HTTP 5xx maps to `ServerError`. Always run in CI. (Wiremock can't chunk-deliver SSE bodies — see Gotchas.) |
+| `client_real_openai_test.rs` | `#[ignore]`-gated smoke tests against `api.openai.com`, including streaming variants for `gpt-4o-mini`, `gpt-5-mini`, `o3-mini`. Run with `OPENAI_API_KEY=$(security find-generic-password -a "$USER" -s "OPENAI_API_KEY" -w) cargo nextest run --lib --run-ignored only ai::client_real_openai_test`. Costs ~$0.001 per full run. |
+| `client_real_anthropic_test.rs` | `#[ignore]`-gated smoke tests against `api.anthropic.com` (chat + streaming variants of `claude-3-5-haiku-latest`). Anthropic's native streaming protocol differs from OpenAI's SSE shape; without this we'd only test the OpenAI lineage. Run with `ANTHROPIC_API_KEY=$(security find-generic-password -a "$USER" -s "ANTHROPIC_API_KEY" -w) cargo nextest run --lib --run-ignored only ai::client_real_anthropic_test`. |
+| `suggestions.rs` | Builds few-shot prompt from listing cache, routes to configured backend, sanitizes response. Also exposes `stream_folder_suggestions` + `cancel_folder_suggestions` Tauri commands and a `StreamingSanitizer` that runs the per-line sanitizer on streamed chunks (line-buffers across chunk boundaries, dedupes case-insensitively against existing names + already-emitted, caps at `MAX_SUGGESTIONS`). |
+| `suggestions_streaming_test.rs` | Tests for the `manager::register_stream`/`unregister_stream`/`cancel_stream` registry — concurrent ids don't interfere, double-cancel is idempotent, missing id is a no-op. |
 
 ### Tauri commands
 
-Core: `get_ai_status`, `get_ai_model_info`, `get_ai_runtime_status`, `configure_ai`, `start_ai_server`, `stop_ai_server`, `check_ai_connection`, `start_ai_download`, `cancel_ai_download`, `get_folder_suggestions`. Note: `get_system_memory_info` moved to top-level `system_memory.rs`.
+Core: `get_ai_status`, `get_ai_model_info`, `get_ai_runtime_status`, `configure_ai`, `start_ai_server`, `stop_ai_server`, `check_ai_connection`, `start_ai_download`, `cancel_ai_download`, `get_folder_suggestions`, `stream_folder_suggestions`, `cancel_folder_suggestions`. Note: `get_system_memory_info` moved to top-level `system_memory.rs`.
 Legacy (still wired, used by toast): `uninstall_ai`, `dismiss_ai_offer`, `opt_out_ai`, `opt_in_ai`, `is_ai_opted_out`.
 
 ## Startup flow
@@ -116,6 +119,21 @@ privacy-focused users. The architecture doesn't fight this switch — it's just
 **Decision**: Use `genai` crate as the chat client instead of hand-rolled `reqwest` JSON.
 **Why**: We hit two production bugs that were per-provider quirks: (1) GPT-5/o-series chat models reject any non-default `temperature` (HTTP 400), and (2) `gpt-*-pro` / `*-codex` models only respond on `/v1/responses`, not `/v1/chat/completions` (HTTP 404). Each new model adds another quirk. `genai` normalizes ~20 providers, auto-routes Responses-API models, and gives us Anthropic / Gemini / xAI / OpenRouter for free with the same code path. Tradeoff: pinned at `0.5.3` (stable, ~3 months old) with a solo maintainer; mitigated by it being MIT/Apache-2.0 + small enough to fork if needed.
 
+**Decision**: Streaming uses `tauri::ipc::Channel<T>` per call, not the global `app.emit` pattern that downloads use.
+**Why**: A user can open the new-folder dialog, cancel, and quickly reopen it. With a global event, the two streams could overlap — listeners from the second open would see chunks from the first. A `Channel` scopes the events to a single command invocation, eliminating the race. Tauri 2 docs explicitly recommend `Channel<T>` for streaming events from a command.
+
+**Decision**: Streaming command `stream_folder_suggestions` always returns `Ok(())`; all signaling (suggestions, completion, cancellation, failure) goes through `Channel<SuggestionStreamEvent>`.
+**Why**: Mixing IPC `Result<_, String>` with channel events would split the error contract. One signaling path is simpler for both Rust and TypeScript callers. `#[tauri::command]` requires the `Result` return type purely for syntactic reasons here.
+
+**Decision**: Line-buffering and sanitization happen in Rust (`StreamingSanitizer`), not in the frontend.
+**Why**: AGENTS.md principle "smart backend, thin frontend." Sanitization rules (markdown stripping, numbering detection, dedupe by case-insensitive existing-names + emit-history) are non-trivial; replicating them in TypeScript would create two authorities that drift. Frontend just renders strings.
+
+**Decision**: Cancellation via explicit `cancel_folder_suggestions` command + `tokio_util::sync::CancellationToken`, not implicit drop detection on the Channel.
+**Why**: Tauri 2's `Channel::send` is fire-and-forget into the IPC queue. It does NOT report frontend handler GC back to the backend, and it only starts failing once the webview itself is gone. Without an explicit cancel signal, the backend would keep streaming after the user closes the dialog — billing cloud providers and pegging local-LLM compute. `CancellationToken::cancel` is itself idempotent, so the same token can be canceled by an explicit cancel call AND by the implicit cancel on a `Channel::send` failure in the same tick — both succeed.
+
+**Decision**: Cancel-token registry (`STREAM_CANCEL_TOKENS`) is a separate `LazyLock<Mutex<HashMap>>` in `manager.rs`, not part of `ManagerState`.
+**Why**: Streaming task lifecycle is orthogonal to file-manager AI state. Keeping it isolated lets us drop entries on task end without holding the wider `MANAGER` lock and without inflating `ManagerState`.
+
 ## Gotchas
 
 **Gotcha**: `genai` requires `base_url` to end with `/`. Without the trailing slash, `Url::join("chat/completions")` strips the last segment and you'd hit `https://api.openai.com/chat/completions` (404) instead of `/v1/chat/completions`. `client.rs::build_client` normalizes by appending `/` if missing.
@@ -139,8 +157,14 @@ privacy-focused users. The architecture doesn't fight this switch — it's just
 **Gotcha**: `wait_for_server_health` kills the process on timeout or early death — don't remove that cleanup.
 **Why**: Without it, a process that fails health check would be orphaned (PID tracked but never cleaned up until explicit stop).
 
+**Gotcha**: `Channel::send` returns `Err` only when the webview itself is gone (window closed); it succeeds silently after the JS-side handler is GC'd. Don't rely on send failure for liveness — use the explicit `cancel_folder_suggestions` command. A send error in the streaming-suggestion `try_emit` callback still cancels the token, but only as a defense-in-depth implicit cancel.
+
+**Gotcha**: Cancel via `tokio::select!` drops the in-flight `stream.next()` future. For `genai`'s reqwest-backed SSE this is the desired terminal action — closes the connection, cuts billing. Single-poll cancel-safety is the only model we rely on; we never resume a previously-canceled stream.
+
+**Gotcha**: `wiremock` does not chunk-deliver SSE bodies in distinct frames; it writes the whole body in one HTTP response. Testing streaming against it would give false confidence that the multi-chunk parse paths are being exercised. `client_streaming_test.rs` instead uses an `axum`-based mock SSE server with `tokio::time::sleep` between frames.
+
 ## Dependencies
 
-External: `genai` (chat normalization), `reqwest` (download streaming + `health_check`), `tokio`, `libc`, `futures_util`
-Dev: `wiremock` (HTTP mock for `client_integration_test.rs`)
+External: `genai` (chat normalization), `reqwest` (download streaming + `health_check`), `tokio`, `tokio-util` (`CancellationToken`), `libc`, `futures_util`
+Dev: `wiremock` (HTTP mock for `client_integration_test.rs`); `axum` is used in test-only mode for `client_streaming_test.rs`'s SSE mock.
 Internal: `crate::ignore_poison::IgnorePoison`, `crate::file_system::get_file_at`
@@ -12,8 +12,9 @@
 use std::sync::Arc;
 use std::time::Duration;
 
+use futures_util::stream::{BoxStream, StreamExt};
 use genai::adapter::AdapterKind;
-use genai::chat::{ChatMessage, ChatOptions, ChatRequest, ReasoningEffort};
+use genai::chat::{ChatMessage, ChatOptions, ChatRequest, ChatStreamEvent, ReasoningEffort};
 use genai::resolver::{AuthData, Endpoint, ServiceTargetResolver};
 use genai::{Client, ModelIden, ServiceTarget};
 
@@ -135,6 +136,63 @@ pub async fn chat_completion(
     Ok(text)
 }
 
+/// Streams a chat completion. Returns a boxed stream of content chunks.
+///
+/// Same per-model option fixups as [`chat_completion`] (reasoning models get
+/// `temperature`/`top_p` stripped and `ReasoningEffort::Low` substituted). Only `Chunk`
+/// events are yielded, as `Ok(content)`; `Start`/`End`, reasoning, thought-signature, and
+/// tool-call events are dropped, and mid-stream errors arrive as `Err(AiError)` items.
+/// Fails up front if resolving the target or opening the stream fails. An empty stream
+/// (zero chunks) is valid, like `chat_completion`'s "AI returned no text" case.
+///
+/// Cancellation: drop the returned stream. The `genai::ChatStreamResponse`'s reqwest
+/// body is closed, billing stops on cloud providers, local-LLM compute is freed.
+pub async fn chat_completion_stream(
+    backend: &AiBackend,
+    system_prompt: &str,
+    user_prompt: &str,
+    options: &ChatOptions,
+) -> Result<BoxStream<'static, Result<String, AiError>>, AiError> {
+    let client = &backend.client;
+    let resolved = client
+        .resolve_service_target(&backend.model)
+        .await
+        .map_err(map_genai_error)?;
+    let tuned_options = adjust_for_model(options, &resolved);
+
+    log::debug!(
+        "AI chat_completion_stream: opening stream (adapter={:?}, model={})",
+        resolved.model.adapter_kind,
+        &*resolved.model.model_name
+    );
+
+    let messages = vec![
+        ChatMessage::system(system_prompt.to_owned()),
+        ChatMessage::user(user_prompt.to_owned()),
+    ];
+    let response = client
+        .exec_chat_stream(&backend.model, ChatRequest::new(messages), Some(&tuned_options))
+        .await
+        .map_err(map_genai_error)?;
+
+    // Each event becomes `Option<Result<String, AiError>>`: visible text and errors are
+    // kept; lifecycle markers and reasoning/thought-signature/tool-call chunks are dropped.
+    let chunks = response.stream.filter_map(|event| async move {
+        match event {
+            Err(e) => Some(Err(map_genai_error(e))),
+            Ok(ChatStreamEvent::Chunk(chunk)) => Some(Ok(chunk.content)),
+            Ok(
+                ChatStreamEvent::Start
+                | ChatStreamEvent::End(_)
+                | ChatStreamEvent::ReasoningChunk(_)
+                | ChatStreamEvent::ThoughtSignatureChunk(_)
+                | ChatStreamEvent::ToolCallChunk(_),
+            ) => None,
+        }
+    });
+    Ok(chunks.boxed())
+}
+
 /// Per-model option fixup: reasoning-class models reject `temperature`. Returns a
 /// modified clone of `options` when needed; otherwise hands back a clone unchanged.
 fn adjust_for_model(options: &ChatOptions, target: &ServiceTarget) -> ChatOptions {
 
@@ -0,0 +1,88 @@
+//! Real-API smoke tests against Anthropic. **Not run in CI** — gated behind `#[ignore]`,
+//! requires a valid `ANTHROPIC_API_KEY` env var.
+//!
+//! Why a separate file from `client_real_openai_test.rs`: Anthropic's native streaming
+//! protocol differs from OpenAI's SSE shape (event names like `content_block_delta`
+//! vs `data:` JSON envelopes). Without exercising it, we'd be testing only the OpenAI
+//! lineage despite supporting Anthropic via `genai`.
+//!
+//! Run with:
+//! ```sh
+//! ANTHROPIC_API_KEY=$(security find-generic-password -a "$USER" -s "ANTHROPIC_API_KEY" -w) \
+//!   cargo nextest run --lib --run-ignored only ai::client_real_anthropic_test
+//! ```
+//!
+//! Costs ~$0.001 per full run.
+
+use futures_util::StreamExt;
+use genai::chat::ChatOptions;
+
+use super::client::{AiBackend, chat_completion, chat_completion_stream};
+
+const BASE_URL: &str = "https://api.anthropic.com/v1/";
+
+/// Returns the `ANTHROPIC_API_KEY` environment variable, or `None` when it is unset,
+/// not valid Unicode, or blank after trimming. The key itself is returned untrimmed.
+fn api_key_or_skip() -> Option<String> {
+    std::env::var("ANTHROPIC_API_KEY")
+        .ok()
+        .filter(|key| !key.trim().is_empty())
+}
+
+/// Chat options shared by both smoke tests: temperature 0.3, max_tokens 200,
+/// top_p 0.9.
+fn opts() -> ChatOptions {
+    let defaults = ChatOptions::default();
+    defaults.with_temperature(0.3).with_max_tokens(200).with_top_p(0.9)
+}
+
+/// Non-streaming smoke test: one real `chat_completion` call must return non-blank text.
+/// Panics up front when `ANTHROPIC_API_KEY` is unset or blank (`api_key_or_skip` → `None`).
+#[tokio::test]
+#[ignore = "real API call — set ANTHROPIC_API_KEY to run"]
+async fn smoke_claude_haiku_chat() {
+    let api_key = api_key_or_skip().expect("ANTHROPIC_API_KEY is not set or is empty");
+    let backend = AiBackend::remote(api_key, String::from(BASE_URL), String::from("claude-3-5-haiku-latest"));
+
+    let res = chat_completion(
+        &backend,
+        "You answer in exactly one short sentence.",
+        "Say the word 'pong'.",
+        &opts(),
+    )
+    .await
+    .expect("real Anthropic call should succeed");
+
+    assert!(!res.trim().is_empty(), "response should be non-empty");
+    log::info!(target: "ai_smoke", "claude-3-5-haiku → {res}");
+}
+
+/// Streaming smoke test: drains one real `chat_completion_stream` and requires at least one
+/// chunk of non-blank text. Panics up front when `ANTHROPIC_API_KEY` is unset or blank.
+#[tokio::test]
+#[ignore = "real API call — set ANTHROPIC_API_KEY to run"]
+async fn smoke_claude_haiku_stream() {
+    let api_key = api_key_or_skip().expect("ANTHROPIC_API_KEY is not set or is empty");
+    let backend = AiBackend::remote(api_key, String::from(BASE_URL), String::from("claude-3-5-haiku-latest"));
+
+    let mut stream = chat_completion_stream(
+        &backend,
+        "You answer in exactly one short sentence.",
+        "Say the word 'pong'.",
+        &opts(),
+    )
+    .await
+    .expect("stream open");
+
+    let mut text = String::new();
+    let mut chunks = 0;
+    while let Some(item) = stream.next().await {
+        let chunk = item.expect("chunk ok");
+        text.push_str(&chunk);
+        chunks += 1;
+    }
+
+    assert!(!text.trim().is_empty(), "expected non-empty assembled text");
+    assert!(chunks > 0, "expected at least one chunk");
+    log::info!(target: "ai_smoke", "claude-3-5-haiku stream → {chunks} chunks, total: {text}");
+}