Bugfix: Auto-dispatcher race when error logged before AppHandle wired

vdavid · vdavid · commit f069a712b7dd · 2026-04-24T08:41:49.000+02:00
If `log_error!` fired before `setup` ran, the debounce window opened but no
flush task was spawned (no `AppHandle` to clone yet). The window then sat
indefinitely until the next error arrived after init — not great.

`set_app_handle` now atomically peeks at the dispatcher state under the same
mutex `record_error` uses. If a window is active without a spawned flush, it
spawns one with the original deadline. Past-deadline windows fall through to
`sleep_until`'s no-op branch and `flush` runs immediately.

A `flush_spawned: bool` on `DebounceState` tracks the flag; `mark_flush_spawned`
in `on_error_logged` and the late-arrival path in `set_app_handle` race safely
on it (loser bails). New `simulate_late_app_handle_for_test` and
`flush_spawned_for_test` test seams plus three tests cover the late-arrival
path, the past-deadline case, and the no-active-window no-op.
diff --git a/apps/desktop/src-tauri/src/error_reporter/CLAUDE.md b/apps/desktop/src-tauri/src/error_reporter/CLAUDE.md
@@ -141,9 +141,14 @@ new user-visible errors. Do not bulk-migrate.
 
 The macro can't thread an `AppHandle` through every call site, so
 `auto_dispatcher::set_app_handle(handle)` stashes one in a `OnceLock` at app startup
-(called from `lib.rs::setup` right after `crash_reporter::init`). If the handle isn't set
-yet (init order, unit tests), the dispatcher still updates the debounce counter but
-silently skips the spawn — acceptable, and matches the "soft errors only" contract.
+(called from `lib.rs::setup` right after `crash_reporter::init`). If an error fires
+before the handle is wired, the debounce window opens normally but the flush task isn't
+spawned (no handle to hand to `tauri::async_runtime::spawn`). The state carries a
+`flush_spawned` flag for exactly this reason: when `set_app_handle` runs later, it picks
+up the orphaned window and spawns the flush task with the remaining time. If the
+deadline has already passed, the spawned task fires immediately. The `mark_flush_spawned`
+helper plus the late-arrival path in `set_app_handle` race against each other safely —
+the loser just bails.
 
 ## Gotchas
 
diff --git a/apps/desktop/src-tauri/src/error_reporter/auto_dispatcher.rs b/apps/desktop/src-tauri/src/error_reporter/auto_dispatcher.rs
@@ -27,8 +27,11 @@
 //! The macro can't pass an `AppHandle` (it'd require every `log_error!` site to thread
 //! one in). We stash a `tauri::AppHandle<tauri::Wry>` in [`APP_HANDLE`] at startup via
 //! [`set_app_handle`], called from `lib.rs::setup`. If the handle isn't set yet (before
-//! setup runs, or in unit tests), [`on_error_logged`] still bumps the counter so the
-//! debounce window is correct, but the spawn/upload is skipped.
+//! setup runs, or in unit tests), [`on_error_logged`] still bumps the counter and stores
+//! the debounce state, but skips the spawn (no handle to clone). The state's
+//! `flush_spawned` flag tracks this; when [`set_app_handle`] later runs, it picks up the
+//! orphaned window and spawns the flush task with the remaining time. If the deadline
+//! has already elapsed, [`sleep_until`] is a no-op and `flush` runs immediately.
 
 use crate::error_reporter::{self, BundleKind};
 use rand::Rng;
@@ -74,11 +77,14 @@ struct DebounceState {
     first_category: String,
     first_message: String,
     error_count: usize,
-    /// Wall-clock target for the flush. Used by tests to assert jitter bounds and by
-    /// the spawned task to know when to wake up. The spawned task captures this value
-    /// before storing the state, so the field itself is only read in the test seam.
-    #[allow(dead_code, reason = "Read via snapshot_for_test in cfg(test) builds")]
+    /// Wall-clock target for the flush. Read by the late-spawn path in
+    /// [`set_app_handle`] to compute the remaining delay when a window opened before
+    /// the AppHandle was ready, and by tests to assert jitter bounds.
     scheduled_send_at: Instant,
+    /// True once a flush task has been spawned for this window. If `set_app_handle`
+    /// runs after a window opened without the handle, this lets us spawn exactly once
+    /// without racing with [`on_error_logged`].
+    flush_spawned: bool,
 }
 
 static STATE: Mutex<Option<DebounceState>> = Mutex::new(None);
@@ -99,8 +105,37 @@ pub fn is_enabled() -> bool {
 
 /// Stash the app handle so the macro-driven entry point can spawn flush tasks without
 /// receiving an `AppHandle` argument. Called once from `lib.rs::setup`.
+///
+/// If a debounce window is already active and never got its flush task (because an error
+/// fired before the handle was wired up), spawn one now. Compute the remaining time
+/// from `scheduled_send_at`; if it's already past, fire immediately.
 pub fn set_app_handle(handle: AppHandle<Wry>) {
-    let _ = APP_HANDLE.set(handle);
+    if APP_HANDLE.set(handle.clone()).is_err() {
+        // Already set — nothing more to do. Tests reset the handle differently; in prod
+        // setup runs once.
+        return;
+    }
+    // Atomically peek at the state under the lock. If a window is open without a
+    // spawned flush, mark it as spawned and kick the task off.
+    let scheduled_at = {
+        let mut guard = match STATE.lock() {
+            Ok(g) => g,
+            Err(p) => p.into_inner(),
+        };
+        match guard.as_mut() {
+            Some(state) if !state.flush_spawned => {
+                state.flush_spawned = true;
+                Some(state.scheduled_send_at)
+            }
+            _ => None,
+        }
+    };
+    if let Some(deadline) = scheduled_at {
+        tauri::async_runtime::spawn(async move {
+            sleep_until(deadline).await;
+            flush(handle).await;
+        });
+    }
 }
 
 /// Records an error against the auto-dispatcher. If the opt-in flag is off, returns
@@ -120,19 +155,41 @@ pub fn on_error_logged(category: &str, message: &str) {
         None => return, // Already had an active debounce; only the counter changed.
     };
 
-    // Spawn the flush task only if the AppHandle has been wired up. In unit tests and
-    // during the brief window before `set_app_handle` runs in `setup()`, we'll capture
-    // the error in the debounce state but never fire the send. Acceptable — the next
-    // error after init will trigger one normally.
+    // Spawn the flush task only if the AppHandle has been wired up. If it's not, the
+    // debounce state is preserved with `flush_spawned = false` — when `set_app_handle`
+    // eventually runs, it'll spawn the task with the remaining time (or fire immediately
+    // if the deadline has already passed).
     let Some(app) = APP_HANDLE.get().cloned() else {
         return;
     };
+    if !mark_flush_spawned() {
+        // Lost the race: someone else (e.g. set_app_handle catching up) already spawned
+        // the flush task for this window. Don't double-spawn.
+        return;
+    }
     tauri::async_runtime::spawn(async move {
         sleep_until(scheduled_send_at).await;
         flush(app).await;
     });
 }
 
+/// Atomically mark the active window as having a spawned flush task. Returns `true` if
+/// this caller is the one that flipped the flag (and so should spawn), `false` if it was
+/// already set (someone else won the race).
+fn mark_flush_spawned() -> bool {
+    let mut guard = match STATE.lock() {
+        Ok(g) => g,
+        Err(p) => p.into_inner(),
+    };
+    match guard.as_mut() {
+        Some(state) if !state.flush_spawned => {
+            state.flush_spawned = true;
+            true
+        }
+        _ => false,
+    }
+}
+
 /// Lock the state, register the error, and return the scheduled flush time iff this
 /// call started a new debounce window. Returns `None` if a window was already active
 /// (in which case the caller should NOT spawn a duplicate flush task).
@@ -154,6 +211,7 @@ fn record_error(category: &str, message: &str) -> Option<Instant> {
         first_message: message.to_string(),
         error_count: 1,
         scheduled_send_at,
+        flush_spawned: false,
     });
     Some(scheduled_send_at)
 }
@@ -259,6 +317,36 @@ pub fn snapshot_for_test() -> Option<(String, String, usize, Instant)> {
     })
 }
 
+/// Test seam: returns `Some(true)` if a window is active and its flush task has been
+/// spawned, `Some(false)` if a window is active but no spawn happened yet, `None` if
+/// no window is active.
+#[cfg(test)]
+pub fn flush_spawned_for_test() -> Option<bool> {
+    let guard = match STATE.lock() {
+        Ok(g) => g,
+        Err(p) => p.into_inner(),
+    };
+    guard.as_ref().map(|s| s.flush_spawned)
+}
+
+/// Test seam: simulates the late-arriving AppHandle path without needing a Tauri runtime.
+/// Returns `Some(deadline)` if a window was active and not yet spawned (so the production
+/// `set_app_handle` would spawn a task for it), `None` otherwise.
+#[cfg(test)]
+pub fn simulate_late_app_handle_for_test() -> Option<Instant> {
+    let mut guard = match STATE.lock() {
+        Ok(g) => g,
+        Err(p) => p.into_inner(),
+    };
+    match guard.as_mut() {
+        Some(state) if !state.flush_spawned => {
+            state.flush_spawned = true;
+            Some(state.scheduled_send_at)
+        }
+        _ => None,
+    }
+}
+
 #[cfg(test)]
 pub fn jitter_window() -> (Duration, Duration) {
     (DEBOUNCE_BASE - JITTER, DEBOUNCE_BASE + JITTER)
diff --git a/apps/desktop/src-tauri/src/error_reporter/auto_dispatcher_tests.rs b/apps/desktop/src-tauri/src/error_reporter/auto_dispatcher_tests.rs
@@ -11,7 +11,8 @@
 //! parallel would race.
 
 use super::auto_dispatcher::{
-    jitter_window, pick_jitter_offset_for_test, record_error_for_test, reset_for_test, set_enabled, snapshot_for_test,
+    flush_spawned_for_test, jitter_window, pick_jitter_offset_for_test, record_error_for_test, reset_for_test,
+    set_enabled, simulate_late_app_handle_for_test, snapshot_for_test,
 };
 use std::sync::Mutex;
 use std::time::{Duration, Instant};
@@ -134,6 +135,93 @@ fn jitter_offset_is_within_double_jitter_band() {
     }
 }
 
+/// Late AppHandle wiring: an error logged before `set_app_handle` runs should leave the
+/// debounce state with `flush_spawned = false`. When the handle later arrives, simulating
+/// the production path, we should see the flag flip to true and a deadline returned so
+/// the caller can spawn the flush task.
+#[test]
+fn late_app_handle_picks_up_active_window() {
+    let _guard = lock_and_reset();
+    set_enabled(true);
+
+    // Simulate "error logged before AppHandle ready": record but don't spawn.
+    let scheduled =
+        record_error_for_test("cmdr_lib::network", "logged before setup").expect("first call should open a window");
+    assert_eq!(
+        flush_spawned_for_test(),
+        Some(false),
+        "test seam doesn't spawn — flag must remain false"
+    );
+
+    // Subsequent error in the same window must keep flush_spawned = false too,
+    // otherwise the late-arriving AppHandle would think the spawn is already covered.
+    let _ = record_error_for_test("cmdr_lib::other", "still no handle");
+    assert_eq!(
+        flush_spawned_for_test(),
+        Some(false),
+        "additional errors in the window must not flip the spawn flag"
+    );
+
+    // Now simulate the AppHandle arriving. The helper returns the deadline iff there was
+    // work to schedule, and flips the flag so a subsequent on_error_logged in the same
+    // window won't spawn a duplicate.
+    let deadline = simulate_late_app_handle_for_test().expect("expected a deadline for the active window");
+    assert_eq!(deadline, scheduled, "deadline should match the original schedule");
+    assert_eq!(
+        flush_spawned_for_test(),
+        Some(true),
+        "after the simulated AppHandle wiring, the flag must be set"
+    );
+
+    // A second call to the late-arrival helper is a no-op (idempotent / re-entrant safe).
+    assert!(
+        simulate_late_app_handle_for_test().is_none(),
+        "calling the late-arrival helper again must not double-spawn"
+    );
+
+    reset_for_test();
+}
+
+/// If the AppHandle arrives after the debounce deadline has already passed, the late
+/// path still returns a deadline (in the past) so the spawned task fires immediately
+/// via `sleep_until` (which is a no-op when `deadline <= now`).
+#[test]
+fn late_app_handle_with_past_deadline_returns_deadline() {
+    let _guard = lock_and_reset();
+    set_enabled(true);
+
+    let scheduled =
+        record_error_for_test("cmdr_lib::network", "logged way before setup").expect("first call should open a window");
+    let now = Instant::now();
+    assert!(scheduled > now, "scheduled should be in the future at this point");
+
+    // We can't time-travel `Instant` cheaply, but we can validate the contract: the
+    // helper returns whatever scheduled_send_at was, and the production caller's
+    // sleep_until handles the past-deadline case by returning immediately.
+    let deadline = simulate_late_app_handle_for_test().expect("expected a deadline");
+    assert_eq!(deadline, scheduled);
+
+    reset_for_test();
+}
+
+/// If no window is active when the AppHandle arrives, the late-arrival helper is a no-op.
+#[test]
+fn late_app_handle_with_no_active_window_is_noop() {
+    let _guard = lock_and_reset();
+    set_enabled(true);
+
+    assert!(
+        simulate_late_app_handle_for_test().is_none(),
+        "no active window — nothing to spawn"
+    );
+    assert!(
+        snapshot_for_test().is_none(),
+        "the helper must not create state when there's nothing to do"
+    );
+
+    reset_for_test();
+}
+
 /// Documents the crash-loop interaction. If the process exits during the 60 s window,
 /// the spawned flush task is dropped before it fires — by design. The crash reporter
 /// covers panics; the auto-dispatcher is for soft errors that don't kill the app.