From 84c06c4bde0ccd788322d7d118192636add15acf Mon Sep 17 00:00:00 2001 From: Moe Katib Date: Sat, 18 Apr 2026 10:32:48 -0700 Subject: [PATCH] fix: auto-recover stale sync state after crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a sync process dies mid-run (laptop sleep, OOM, SIGKILL, terminal close), sync_state.json keeps status: "syncing" forever. The next run then threw "already syncing. Use --force to override" even though the filesystem lock had already detected the dead pid and taken over. The two concurrency gates disagreed. Now the filesystem lock is the single source of truth — if acquireSyncLock returns, no live process holds the lock, and a lingering "syncing" state is auto-recovered with a log line instead of a throw. Also installs SIGINT/SIGTERM handlers that flip state to "failed" and release the lock before exiting, so cleanly-signaled terminations don't leave zombie state behind. Co-Authored-By: Claude Opus 4.7 (1M context) --- package-lock.json | 4 ++-- package.json | 2 +- src/lib/sync/runner.ts | 44 ++++++++++++++++++++++++++++++------------ 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/package-lock.json b/package-lock.json index dca075e..d54a2e5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@withone/cli", - "version": "1.37.1", + "version": "1.37.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@withone/cli", - "version": "1.37.1", + "version": "1.37.2", "dependencies": { "@clack/prompts": "^0.9.1", "commander": "^13.1.0", diff --git a/package.json b/package.json index 9e85dea..4b5ae4d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@withone/cli", - "version": "1.37.1", + "version": "1.37.2", "description": "CLI for managing One", "type": "module", "files": [ diff --git a/src/lib/sync/runner.ts b/src/lib/sync/runner.ts index 3d906d5..3b2b4b6 100644 --- a/src/lib/sync/runner.ts +++ b/src/lib/sync/runner.ts @@ -129,16 +129,19 @@ export async function syncModel( // Acquire a cross-process lock so two concurrent syncs (e.g. cron tick + // manual run) don't race on the same table. Dry-run skips the lock since - // it performs no writes. + // it performs no writes. The lock handles stale-lock takeover (dead pid + // or age > 30min) so a successful return means no live process holds it. const lock = options.dryRun ? null : acquireSyncLock(platform, model); - // Check in-process state too (cheap and gives a clearer error message) + // If the lock was free but state still says 'syncing', a previous run + // crashed between "set syncing" and the cleanup paths. Lock is the source + // of truth on concurrency — auto-recover the stale state rather than + // forcing the user to pass --force or hand-edit sync_state.json. const existingState = getModelState(platform, model); - if (existingState?.status === 'syncing' && !options.force) { - if (lock) lock.release(); - throw new Error( - `Sync state says ${platform}/${model} is already syncing. ` + - `Use --force to override (this may happen if a previous sync crashed before cleanup).` + if (existingState?.status === 'syncing' && !options.dryRun && !isAgentMode()) { + process.stderr.write( + `Recovered from crashed previous sync of ${platform}/${model} ` + + `(state was 'syncing', no live owner)\n` ); } @@ -152,15 +155,29 @@ export async function syncModel( ); } - // Don't set status for dry-run — it's just a preview - if (!options.dryRun) { - updateModelState(platform, model, { status: 'syncing' }); - } - let db: Database.Database | null = null; let totalRecords = 0; let pagesProcessed = 0; let lastCursor: unknown = null; + + // Reset sync_state and release the lock if the process is killed mid-run + // (SIGINT from Ctrl-C, SIGTERM from a process manager, cron job being + // killed on laptop sleep). Without these handlers the state stays 'syncing' + // forever and the filesystem lock lingers for STALE_MS. + const signalCleanup = (signal: NodeJS.Signals): void => { + try { + updateModelState(platform, model, { status: 'failed', pagesProcessed, lastCursor }); + } catch { /* best-effort */ } + try { if (lock) lock.release(); } catch { /* best-effort */ } + process.exit(signal === 'SIGINT' ? 130 : 143); + }; + const onSigint = (): void => signalCleanup('SIGINT'); + const onSigterm = (): void => signalCleanup('SIGTERM'); + if (!options.dryRun) { + process.on('SIGINT', onSigint); + process.on('SIGTERM', onSigterm); + updateModelState(platform, model, { status: 'syncing' }); + } // Track every id we saw across pages, for --full-refresh deletion reconciliation. const seenIds = new Set(); // Hook counters @@ -626,5 +643,8 @@ export async function syncModel( (wrapped as any)._recordsSynced = totalRecords; (wrapped as any)._pagesProcessed = pagesProcessed; throw wrapped; + } finally { + process.off('SIGINT', onSigint); + process.off('SIGTERM', onSigterm); } }