diff --git a/.gitignore b/.gitignore index e1f2bf81d..3139b9c73 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ sec-db/ models/ .worktrees/ scratch/ +.claude/worktrees/ diff --git a/examples/web/src/status/QueueStatus.tsx b/examples/web/src/status/QueueStatus.tsx index 440d5fb02..11ecc4ab8 100644 --- a/examples/web/src/status/QueueStatus.tsx +++ b/examples/web/src/status/QueueStatus.tsx @@ -13,7 +13,6 @@ export function QueueStatus({ queueType }: { queueType: string }) { const [pending, setPending] = useState(0); const [processing, setProcessing] = useState(0); const [completed, setCompleted] = useState(0); - const [aborting, setAborting] = useState(0); const [errors, setErrors] = useState(0); const [disabled, setDisabled] = useState(0); @@ -26,7 +25,6 @@ export function QueueStatus({ queueType }: { queueType: string }) { setPending(await client.size(JobStatus.PENDING)); setProcessing(await client.size(JobStatus.PROCESSING)); setCompleted(await client.size(JobStatus.COMPLETED)); - setAborting(await client.size(JobStatus.ABORTING)); setErrors(await client.size(JobStatus.FAILED)); setDisabled(await client.size(JobStatus.DISABLED)); } @@ -53,7 +51,6 @@ export function QueueStatus({ queueType }: { queueType: string }) { setPending(0); setProcessing(0); setCompleted(0); - setAborting(0); setErrors(0); setDisabled(0); }, [registeredQueue]); diff --git a/packages/ai/src/execution/QueuedExecutionStrategy.ts b/packages/ai/src/execution/QueuedExecutionStrategy.ts index d6725d8a8..1e8b34512 100644 --- a/packages/ai/src/execution/QueuedExecutionStrategy.ts +++ b/packages/ai/src/execution/QueuedExecutionStrategy.ts @@ -48,7 +48,7 @@ export class QueuedExecutionStrategy implements IAiExecutionStrategy { this.limiter = new ConcurrencyLimiter(this.concurrency); } const limiter = this.limiter; - await this.acquireLimiterSlot(limiter, context.signal); + const token = await this.acquireLimiterSlot(limiter, context.signal); try { const job = new AiJob({ @@ -62,7 +62,7 @@ 
export class QueuedExecutionStrategy implements IAiExecutionStrategy { emit ); } finally { - await limiter.recordJobCompletion(); + await limiter.complete(token); } } @@ -75,7 +75,7 @@ export class QueuedExecutionStrategy implements IAiExecutionStrategy { * abort. Uses {@link ILimiter.tryAcquire} so concurrent callers cannot both * pass a check-then-record sequence and overshoot the configured limit. */ - private async acquireLimiterSlot(limiter: ILimiter, signal: AbortSignal): Promise { + private async acquireLimiterSlot(limiter: ILimiter, signal: AbortSignal): Promise { let token = await limiter.tryAcquire(); while (token === null || token === undefined) { if (signal.aborted) { @@ -86,6 +86,6 @@ export class QueuedExecutionStrategy implements IAiExecutionStrategy { await new Promise((resolve) => setTimeout(resolve, Math.max(20, Math.min(delay, 200)))); token = await limiter.tryAcquire(); } - void token; + return token; } } diff --git a/packages/ai/src/job/AiJob.ts b/packages/ai/src/job/AiJob.ts index 4f2153ffd..196160b9c 100644 --- a/packages/ai/src/job/AiJob.ts +++ b/packages/ai/src/job/AiJob.ts @@ -8,7 +8,6 @@ import { AbortSignalJobError, IJobExecuteContext, Job, - JobStatus, PermanentJobError, RetryableJobError, withJobErrorDiagnostics, @@ -241,7 +240,7 @@ export class AiJob< context: IJobExecuteContext, emit: AiEmit ): Promise { - if (context.signal.aborted || this.status === JobStatus.ABORTING) { + if (context.signal.aborted) { throw new AbortSignalJobError("Abort signal aborted before execution of job"); } diff --git a/packages/indexeddb/src/job-queue/IndexedDbJobStore.ts b/packages/indexeddb/src/job-queue/IndexedDbJobStore.ts new file mode 100644 index 000000000..5c42e8b2c --- /dev/null +++ b/packages/indexeddb/src/job-queue/IndexedDbJobStore.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobStore, JobRecord, JobStatus, MessageId } from "@workglow/job-queue"; +import 
type { PendingIndexedDbWrite } from "./IndexedDbMessageQueue"; +import type { IndexedDbQueueStorage } from "./IndexedDbQueueStorage"; + +export class IndexedDbJobStore implements IJobStore { + /** @internal — shared with the paired message queue */ + public readonly core: IndexedDbQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: IndexedDbQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + get(id: MessageId): Promise | undefined> { + return this.core.get(id); + } + + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.core.peek(status as any, num); + } + + size(status?: JobStatus): Promise { + return this.core.size(status as any); + } + + async getByRunId(runId: string): Promise[]> { + return this.core.getByRunId(runId); + } + + outputForInput(input: Input): Promise { + return this.core.outputForInput(input); + } + + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.core.saveProgress(id, progress, message, details); + } + + async saveResult(id: MessageId, output: Output): Promise { + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? null; + this.pending.set(id, buf); + } + + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? 
{}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.core.deleteJobsByStatusAndAge(status, olderThanMs); + } + + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.core.delete(id); + } + + async deleteAll(): Promise { + this.pending.clear(); + await this.core.deleteAll(); + } + + async abort(id: MessageId): Promise { + await this.core.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + await this.core.saveStatus(id, status); + } +} diff --git a/packages/indexeddb/src/job-queue/IndexedDbMessageQueue.ts b/packages/indexeddb/src/job-queue/IndexedDbMessageQueue.ts new file mode 100644 index 000000000..353bbca9f --- /dev/null +++ b/packages/indexeddb/src/job-queue/IndexedDbMessageQueue.ts @@ -0,0 +1,225 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + IClaim, + IMessageQueue, + JobStorageFormat, + MessageId, + QueueChangePayload, + QueueStorageScope, + QueueSubscribeOptions, + SendOptions, +} from "@workglow/job-queue"; +import { IndexedDbQueueStorage } from "./IndexedDbQueueStorage"; + +/** + * Per-id buffer that lets {@link IJobStore.saveResult}/{@link IJobStore.saveError} + * stage output/error until the terminal claim.ack()/fail() persists them in + * a single complete() call (avoids double-bumping `attempts`). 
+ */ +export type PendingIndexedDbWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class IndexedDbClaim implements IClaim> { + constructor( + private readonly core: IndexedDbQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.core.finalize(this.id, { + output: output as Output | null, + error: null, + error_code: null, + status: "COMPLETED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const current = (await this.core.get(this.id)) ?? this.body; + await this.core.complete({ + ...current, + status: "PENDING", + lease_owner: null, + lease_expires_at: null, + visible_at: new Date(Date.now() + delay * 1000).toISOString(), + progress: 0, + progress_message: "", + progress_details: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? buf.errorCode + : (current.error_code ?? 
null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.core.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: "FAILED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.core.extendLease(this.id, this.workerId, ms); + } + + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.core.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.core.finalize(this.id, { + status: "DISABLED", + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + } +} + +export class IndexedDbMessageQueue implements IMessageQueue< + JobStorageFormat +> { + public readonly scope: QueueStorageScope = "process"; + + /** @internal — shared with the paired job store */ + public readonly core: IndexedDbQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: IndexedDbQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + return this.core.add(applySendOptions(body, opts)); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 
1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const job = await this.core.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!job) break; + claims.push( + new IndexedDbClaim( + this.core, + this.pending, + job.id, + job, + job.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.core.releaseClaim(id); + } + + async migrate(): Promise { + await this.core.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.core.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.core.subscribeToChanges(callback, options); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) out.max_attempts = opts.maxAttempts; + return out; +} diff --git a/packages/indexeddb/src/job-queue/IndexedDbQueueStorage.ts b/packages/indexeddb/src/job-queue/IndexedDbQueueStorage.ts index 4b4f9d611..14e1da9b1 100644 --- a/packages/indexeddb/src/job-queue/IndexedDbQueueStorage.ts +++ b/packages/indexeddb/src/job-queue/IndexedDbQueueStorage.ts @@ -151,7 +151,7 @@ export class IndexedDbQueueStorage implements IQueueStorage implements IQueueStorage { @@ -248,9 +248,9 @@ export class IndexedDbQueueStorage implements IQueueStorage implements IQueueStorage | undefined> { + public async next( + workerId: string, + opts?: { leaseMs?: number } + ): 
Promise | undefined> { const db = await this.getDb(); const tx = db.transaction(this.tableName, "readwrite"); const store = tx.objectStore(this.tableName); - const index = store.index("queue_status_run_after"); const now = new Date().toISOString(); + const leaseMs = opts?.leaseMs ?? 30000; + const leaseExpiry = new Date(Date.now() + leaseMs).toISOString(); const prefixKeyValues = this.getPrefixKeyValues(); // This ensures we can verify that we actually won the race to claim this job @@ -274,7 +279,39 @@ export class IndexedDbQueueStorage implements IQueueStorage | undefined>( (resolve, reject) => { - const cursorRequest = index.openCursor( + let claimedJob: JobStorageFormat | undefined; + + const tryClaimJob = (job: JobStorageFormat & Record) => { + // Lease-expiry reclaim consumes one attempt against max_attempts; + // a fresh PENDING claim does not (the worker's validateJobState + // FAILs the job when attempts >= max_attempts on the next step). + const isLeaseExpiryReclaim = job.status === JobStatus.PROCESSING; + job.status = JobStatus.PROCESSING; + job.last_attempted_at = now; + job.lease_owner = claimToken; + job.lease_expires_at = leaseExpiry; + if (isLeaseExpiryReclaim) { + job.attempts = ((job.attempts as number | undefined) ?? 0) + 1; + } + // Always clear stale abort_requested_at on (re)claim. A PROCESSING + // row may have had abort_requested_at set before the previous + // worker crashed; the new owner must start with a clean slate. 
+ job.abort_requested_at = null; + + try { + const updateRequest = store.put(job); + updateRequest.onsuccess = () => { + claimedJob = job; + }; + updateRequest.onerror = () => {}; + } catch { + // ignore + } + }; + + // First: look for a PENDING job ready to run + const pendingIndex = store.index("queue_status_visible_at"); + const pendingRequest = pendingIndex.openCursor( IDBKeyRange.bound( [...prefixKeyValues, this.queueName, JobStatus.PENDING, ""], [...prefixKeyValues, this.queueName, JobStatus.PENDING, now], @@ -283,23 +320,16 @@ export class IndexedDbQueueStorage implements IQueueStorage | undefined; - let cursorStopped = false; - - cursorRequest.onsuccess = (e) => { + pendingRequest.onsuccess = (e) => { const cursor = (e.target as IDBRequest).result; if (!cursor) { - // Cursor exhausted - resolve with whatever we found (or undefined) - return; - } - - // If we already found and updated a job, stop iterating - if (cursorStopped) { + // No PENDING job found — try expired-lease PROCESSING job + if (!claimedJob) { + tryExpiredLeaseScan(); + } return; } - const job = cursor.value as JobStorageFormat & Record; - // Verify the job belongs to this queue, matches prefixes, and is still in PENDING state if ( job.queue !== this.queueName || job.status !== JobStatus.PENDING || @@ -308,34 +338,50 @@ export class IndexedDbQueueStorage implements IQueueStorage reject(pendingRequest.error); + + const tryExpiredLeaseScan = () => { + // Scan PROCESSING jobs to find one with an expired lease + const processingIndex = store.index("queue_status_visible_at"); + const processingRequest = processingIndex.openCursor( + IDBKeyRange.bound( + [...prefixKeyValues, this.queueName, JobStatus.PROCESSING, ""], + [...prefixKeyValues, this.queueName, JobStatus.PROCESSING, "￿"], + false, + false + ) + ); - try { - const updateRequest = store.put(job); - updateRequest.onsuccess = () => { - claimedJob = job; - cursorStopped = true; - // Stop cursor iteration - we've claimed a job - }; - 
updateRequest.onerror = (err) => { - console.error("Failed to update job status:", err); + processingRequest.onsuccess = (e) => { + const cursor = (e.target as IDBRequest).result; + if (!cursor) return; // none found, tx.oncomplete will resolve with undefined + const job = cursor.value as JobStorageFormat & Record; + if ( + job.queue !== this.queueName || + job.status !== JobStatus.PROCESSING || + !this.matchesPrefixes(job) + ) { cursor.continue(); - }; - } catch (err) { - console.error("Error updating job:", err); - cursor.continue(); - } - }; + return; + } + // Check for expired lease (null = expired per spec) + if (job.lease_expires_at && job.lease_expires_at >= now) { + cursor.continue(); + return; + } + tryClaimJob(job); + // Don't continue — attempted a claim + }; - cursorRequest.onerror = () => reject(cursorRequest.error); + processingRequest.onerror = () => reject(processingRequest.error); + }; // Wait for transaction to complete before resolving tx.oncomplete = () => { - // Notify hybrid manager of local change if (claimedJob) { this.hybridManager?.notifyLocalChange(); } @@ -351,29 +397,40 @@ export class IndexedDbQueueStorage implements IQueueStorage { + const job = await this.get(id); + if (!job || job.status !== JobStatus.PROCESSING || job.lease_owner !== workerId) { + throw new Error( + `extendLease failed: job ${String(id)} is not PROCESSING or lease is not owned by worker ${workerId}` + ); + } + job.lease_expires_at = new Date(Date.now() + ms).toISOString(); + await this.put(job); + } + /** * Retrieves the number of jobs in the queue. * Returns the count of jobs in the queue. 
@@ -415,13 +472,18 @@ export class IndexedDbQueueStorage implements IQueueStorage & Record; + jobAsRecord.abort_requested_at = null; // Ensure queue is set correctly job.queue = this.queueName; // Ensure prefix values are preserved - const jobWithPrefixes = job as JobStorageFormat & Record; + const jobWithPrefixes = jobAsRecord; for (const [key, value] of Object.entries(this.prefixValues)) { jobWithPrefixes[key] = value; } @@ -445,28 +507,62 @@ export class IndexedDbQueueStorage implements IQueueStorage { + public async releaseClaim(id: unknown): Promise { const job = await this.get(id); if (!job) return; job.status = JobStatus.PENDING; - job.worker_id = null; + job.lease_owner = null; job.progress = 0; job.progress_message = ""; job.progress_details = null; + // Clear stale abort_requested_at — an abort flag set during the previous + // claim must not immediately cancel the next worker that picks up the row. + (job as unknown as Record).abort_requested_at = null; await this.put(job); } /** - * Aborts a job in the queue. + * Aborts a job. + * - If PENDING: immediately mark as FAILED with abort_requested_at set. + * - If PROCESSING: set abort_requested_at only (leave status as PROCESSING). + * - Otherwise: no-op. */ public async abort(id: unknown): Promise { const job = await this.get(id); if (!job) return; + const now = new Date().toISOString(); + if (job.status === JobStatus.PENDING) { + job.status = JobStatus.FAILED; + job.abort_requested_at = now; + job.completed_at = now; + await this.complete(job); + } else if (job.status === JobStatus.PROCESSING) { + job.abort_requested_at = now; + await this.put(job); + } + } - job.status = JobStatus.ABORTING; - await this.complete(job); + /** Force-overwrite status without touching attempts (used to persist DISABLED after lease release). 
*/ + public async saveStatus(id: unknown, status: string): Promise { + const db = await this.getDb(); + const tx = db.transaction(this.tableName, "readwrite"); + const store = tx.objectStore(this.tableName); + const getRequest = store.get(id as IDBValidKey); + return new Promise((resolve, reject) => { + getRequest.onsuccess = () => { + const record = getRequest.result; + if (!record) { + resolve(); + return; + } + const putRequest = store.put({ ...record, status }); + putRequest.onsuccess = () => resolve(); + putRequest.onerror = () => reject(putRequest.error); + }; + getRequest.onerror = () => reject(getRequest.error); + }); } /** @@ -495,6 +591,43 @@ export class IndexedDbQueueStorage implements IQueueStorage | null; + } + ): Promise { + const existing = await this.get(id); + if (!existing) return; + const updated = existing as JobStorageFormat & Record; + if ("output" in fields) updated.output = fields.output ?? null; + if ("error" in fields) updated.error = fields.error ?? null; + if ("error_code" in fields) updated.error_code = fields.error_code ?? null; + if ("status" in fields) updated.status = fields.status; + if ("completed_at" in fields) updated.completed_at = fields.completed_at ?? null; + if ("abort_requested_at" in fields) { + updated.abort_requested_at = fields.abort_requested_at ?? null; + } + if ("lease_owner" in fields) updated.lease_owner = fields.lease_owner ?? null; + if ("progress" in fields) updated.progress = fields.progress ?? 0; + if ("progress_message" in fields) updated.progress_message = fields.progress_message ?? ""; + if ("progress_details" in fields) updated.progress_details = fields.progress_details ?? null; + await this.put(updated); + } + /** * Deletes all jobs from the queue. 
*/ @@ -596,7 +729,7 @@ export class IndexedDbQueueStorage implements IQueueStorage): Promise { const db = await this.getDb(); diff --git a/packages/indexeddb/src/job-queue/common.ts b/packages/indexeddb/src/job-queue/common.ts index cbb71aa2f..402ad4885 100644 --- a/packages/indexeddb/src/job-queue/common.ts +++ b/packages/indexeddb/src/job-queue/common.ts @@ -7,6 +7,9 @@ // organize-imports-ignore export * from "./IndexedDbQueueStorage"; +export * from "./IndexedDbMessageQueue"; +export * from "./IndexedDbJobStore"; +export * from "./createIndexedDbQueue"; export * from "./IndexedDbRateLimiterStorage"; // Versioned migration sets for the queue + rate-limiter object stores, plus diff --git a/packages/indexeddb/src/job-queue/createIndexedDbQueue.ts b/packages/indexeddb/src/job-queue/createIndexedDbQueue.ts new file mode 100644 index 000000000..70c49f926 --- /dev/null +++ b/packages/indexeddb/src/job-queue/createIndexedDbQueue.ts @@ -0,0 +1,32 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { IndexedDbJobStore } from "./IndexedDbJobStore"; +import { IndexedDbMessageQueue, type PendingIndexedDbWrite } from "./IndexedDbMessageQueue"; +import { IndexedDbQueueStorage, type IndexedDbQueueStorageOptions } from "./IndexedDbQueueStorage"; + +/** + * Factory for the paired IndexedDB message queue and job store. Both + * facades share a single underlying {@link IndexedDbQueueStorage} so writes + * through one are observable through the other. + */ +export function createIndexedDbQueue( + queueName: string, + opts?: IndexedDbQueueStorageOptions +): { + messageQueue: IndexedDbMessageQueue; + jobStore: IndexedDbJobStore; + /** @internal — exposed for callers that still need the legacy storage object. 
*/ + core: IndexedDbQueueStorage; +} { + const core = new IndexedDbQueueStorage(queueName, opts); + const pending = new Map>(); + return { + messageQueue: new IndexedDbMessageQueue(core, pending), + jobStore: new IndexedDbJobStore(core, pending), + core, + }; +} diff --git a/packages/indexeddb/src/migrations/indexedDbQueueMigrations.ts b/packages/indexeddb/src/migrations/indexedDbQueueMigrations.ts index 1273726b2..05ce7721b 100644 --- a/packages/indexeddb/src/migrations/indexedDbQueueMigrations.ts +++ b/packages/indexeddb/src/migrations/indexedDbQueueMigrations.ts @@ -14,11 +14,17 @@ import type { IndexedDbMigration, IndexedDbMigrationGroup } from "./IndexedDbMig * Component name is `queue:indexeddb:` so two queues with * different table names get tracked independently in `_storage_migrations`. * - * Schema: a single object store keyed by `id` plus four compound indexes - * (queue/status, queue/status/run_after, queue/job_run_id, - * queue/fingerprint/status). When `prefixes` is non-empty the prefix columns - * are prepended to every index key path so per-tenant queries can be served - * directly by the index. + * v1 is FROZEN byte-for-byte against the pre-PR shape (097b4afa) — it + * creates the `queue_status_run_after` compound index and reads `run_after` + * from each row. The rename to `queue_status_visible_at` lives in v2, + * which also walks every existing row and copies `run_after → visible_at` + * synchronously inside the upgrade transaction. + * + * Schema (post v2): a single object store keyed by `id` plus four compound + * indexes (queue/status, queue/status/visible_at, queue/job_run_id, + * queue/fingerprint/status). When `prefixes` is non-empty the prefix + * columns are prepended to every index key path so per-tenant queries can + * be served directly by the index. 
*/ export function indexedDbQueueMigrations( tableName: string, @@ -47,6 +53,52 @@ export function indexedDbQueueMigrations( } }, }, + { + component, + version: 2, + description: + "Rename queue_status_run_after → queue_status_visible_at; backfill run_after → visible_at", + up({ tx }) { + // IDB upgrade transactions auto-commit as soon as `up()` returns — + // we MUST NOT `await` anything between IDB operations. All cursor + // and index work below uses synchronous request callbacks fired by + // the same upgrade tx, which is the only legal pattern. + const store = tx.objectStore(tableName); + + // Drop the old index BEFORE rewriting rows so the cursor walk does + // not have to maintain its key. IDB doesn't allow renaming an index + // in place; the only path is delete + create. + const indexNames = Array.from(store.indexNames); + if (indexNames.includes("queue_status_run_after")) { + store.deleteIndex("queue_status_run_after"); + } + + // Walk every row and copy `run_after` → `visible_at`. `openCursor()` + // returns a request whose `onsuccess` fires once per row plus a + // final time with `cursor === null`; the upgrade tx stays open as + // long as new requests are issued from the current callback. + const cursorReq = store.openCursor(); + cursorReq.onsuccess = () => { + const cursor = cursorReq.result; + if (!cursor) return; + const row = cursor.value as Record | null; + if (row && row.visible_at === undefined && row.run_after !== undefined) { + row.visible_at = row.run_after; + cursor.update(row); + } + cursor.continue(); + }; + + // Recreate the index under the new name keyed on `visible_at`. + // Skip if a previous partial run already created it (defensive — IDB + // forbids two indexes sharing a name and would throw mid-upgrade). 
+ if (!Array.from(store.indexNames).includes("queue_status_visible_at")) { + store.createIndex("queue_status_visible_at", k(["queue", "status", "visible_at"]), { + unique: false, + }); + } + }, + }, ]; } diff --git a/packages/job-queue/src/common.ts b/packages/job-queue/src/common.ts index bb3df61a2..57031b177 100644 --- a/packages/job-queue/src/common.ts +++ b/packages/job-queue/src/common.ts @@ -6,6 +6,7 @@ // organize-imports-ignore +export type { DeadLetter } from "./job/DeadLetter"; export * from "./job/Job"; export * from "./job/JobError"; export * from "./job/JobErrorDiagnostics"; @@ -14,6 +15,7 @@ export * from "./job/JobQueueEventListeners"; export * from "./job/JobQueueServer"; export * from "./job/JobQueueWorker"; export * from "./job/JobStorageConverters"; +export * from "./job/MessageQueueClient"; export * from "./limiter/CompositeLimiter"; export * from "./limiter/ConcurrencyLimiter"; export * from "./limiter/DelayLimiter"; @@ -22,9 +24,16 @@ export * from "./limiter/ILimiter"; export * from "./limiter/NullLimiter"; export * from "./limiter/RateLimiter"; +export * from "./queue-storage/IClaim"; +export * from "./queue-storage/IJobStore"; +export * from "./queue-storage/IMessageQueue"; export * from "./queue-storage/IQueueStorage"; +export * from "./queue-storage/InMemoryJobStore"; +export * from "./queue-storage/InMemoryMessageQueue"; export * from "./queue-storage/InMemoryQueueStorage"; export * from "./queue-storage/TelemetryQueueStorage"; +export * from "./queue-storage/createInMemoryQueue"; +export * from "./queue-storage/wrapQueueStorage"; export * from "./rate-limiter-storage/IRateLimiterStorage"; export * from "./rate-limiter-storage/InMemoryRateLimiterStorage"; diff --git a/packages/job-queue/src/job/DeadLetter.ts b/packages/job-queue/src/job/DeadLetter.ts new file mode 100644 index 000000000..bc914bddf --- /dev/null +++ b/packages/job-queue/src/job/DeadLetter.ts @@ -0,0 +1,17 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * 
SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Payload written to the dead-letter queue when a job exhausts its retry budget. + */ +export interface DeadLetter { + readonly original: Input; + readonly error: string; + readonly errorCode: string | null; + readonly attempts: number; + readonly queueName: string; + readonly jobRunId: string | undefined; +} diff --git a/packages/job-queue/src/job/Job.ts b/packages/job-queue/src/job/Job.ts index 40e3d71e1..b251680af 100644 --- a/packages/job-queue/src/job/Job.ts +++ b/packages/job-queue/src/job/Job.ts @@ -34,19 +34,23 @@ export type JobConstructorParam = { error?: string | null; errorCode?: string | null; fingerprint?: string; - maxRetries?: number; + maxAttempts?: number; status?: JobStatus; createdAt?: Date; deadlineAt?: Date | null; - lastRanAt?: Date | null; - runAfter?: Date | null; + lastAttemptedAt?: Date | null; + visibleAt?: Date | null; completedAt?: Date | null; - runAttempts?: number; + attempts?: number; progress?: number; progressMessage?: string; progressDetails?: Record | null; - /** The ID of the worker that claimed this job, null if unclaimed */ - workerId?: string | null; + /** The ID of the worker that currently holds the lease on this job, null if unclaimed */ + leaseOwner?: string | null; + /** ISO timestamp when an abort was requested for this job (storage-layer concern) */ + abort_requested_at?: string | null; + /** ISO timestamp when the current lease expires (storage-layer concern) */ + lease_expires_at?: string | null; }; export type JobClass = new ( @@ -64,14 +68,14 @@ export class Job { public jobRunId: string | undefined; public queueName: string | undefined; public input: Input; - public maxRetries: number; + public maxAttempts: number; public createdAt: Date; public fingerprint: string | undefined; public status: JobStatus = JobStatus.PENDING; - public runAfter: Date; + public visibleAt: Date; public output: Output | null = null; - public runAttempts: number = 0; - public lastRanAt: 
Date | null = null; + public attempts: number = 0; + public lastAttemptedAt: Date | null = null; public completedAt: Date | null = null; public deadlineAt: Date | null = null; public error: string | null = null; @@ -79,8 +83,12 @@ export class Job { public progress: number = 0; public progressMessage: string = ""; public progressDetails: Record | null = null; - /** The ID of the worker that claimed this job */ - public workerId: string | null = null; + /** The ID of the worker that currently holds the lease on this job */ + public leaseOwner: string | null = null; + /** ISO timestamp when an abort was requested for this job (storage-layer concern) */ + public abort_requested_at: string | null = null; + /** ISO timestamp when the current lease expires (storage-layer concern) */ + public lease_expires_at: string | null = null; constructor({ queueName, @@ -91,22 +99,24 @@ export class Job { errorCode = null, fingerprint = undefined, output = null, - maxRetries = 10, + maxAttempts = 10, createdAt = new Date(), completedAt = null, status = JobStatus.PENDING, deadlineAt = null, - runAttempts = 0, - lastRanAt = null, - runAfter = new Date(), + attempts = 0, + lastAttemptedAt = null, + visibleAt = new Date(), progress = 0, progressMessage = "", progressDetails = null, - workerId = null, + leaseOwner = null, + abort_requested_at = null, + lease_expires_at = null, }: JobConstructorParam) { - this.runAfter = runAfter ?? new Date(); + this.visibleAt = visibleAt ?? new Date(); this.createdAt = createdAt ?? new Date(); - this.lastRanAt = lastRanAt ?? null; + this.lastAttemptedAt = lastAttemptedAt ?? null; this.deadlineAt = deadlineAt ?? null; this.completedAt = completedAt ?? 
null; @@ -116,15 +126,17 @@ export class Job { this.status = status; this.fingerprint = fingerprint; this.input = input; - this.maxRetries = maxRetries; - this.runAttempts = runAttempts; + this.maxAttempts = maxAttempts; + this.attempts = attempts; this.output = output; this.error = error; this.errorCode = errorCode; this.progress = progress; this.progressMessage = progressMessage; this.progressDetails = progressDetails; - this.workerId = workerId ?? null; + this.leaseOwner = leaseOwner ?? null; + this.abort_requested_at = abort_requested_at ?? null; + this.lease_expires_at = lease_expires_at ?? null; } async execute(_input: Input, _context: IJobExecuteContext): Promise { diff --git a/packages/job-queue/src/job/JobQueueClient.ts b/packages/job-queue/src/job/JobQueueClient.ts index 1120888c7..7cb700eb3 100644 --- a/packages/job-queue/src/job/JobQueueClient.ts +++ b/packages/job-queue/src/job/JobQueueClient.ts @@ -5,12 +5,15 @@ */ import { EventEmitter } from "@workglow/util"; +import type { IJobStore } from "../queue-storage/IJobStore"; +import type { IMessageQueue } from "../queue-storage/IMessageQueue"; import type { IQueueStorage, JobStorageFormat, QueueChangePayload, } from "../queue-storage/IQueueStorage"; import { JobStatus } from "../queue-storage/IQueueStorage"; +import { wrapQueueStorage } from "../queue-storage/wrapQueueStorage"; import { Job } from "./Job"; import { AbortSignalJobError, @@ -45,7 +48,13 @@ export interface JobHandle { * Options for creating a JobQueueClient */ export interface JobQueueClientOptions { - readonly storage: IQueueStorage; + /** + * Legacy single-storage option. Provide either `storage` OR the paired + * `messageQueue`+`jobStore`. 
+ */ + readonly storage?: IQueueStorage; + readonly messageQueue?: IMessageQueue>; + readonly jobStore?: IJobStore; readonly queueName: string; } @@ -56,7 +65,9 @@ export interface JobQueueClientOptions { */ export class JobQueueClient { public readonly queueName: string; - protected readonly storage: IQueueStorage; + protected readonly messageQueue: IMessageQueue>; + protected readonly jobStore: IJobStore; + protected readonly storage: IQueueStorage | null; protected readonly events = new EventEmitter>(); protected server: JobQueueServer | null = null; protected storageUnsubscribe: (() => void) | null = null; @@ -91,7 +102,20 @@ export class JobQueueClient { constructor(options: JobQueueClientOptions) { this.queueName = options.queueName; - this.storage = options.storage; + if (options.messageQueue && options.jobStore) { + this.messageQueue = options.messageQueue; + this.jobStore = options.jobStore; + this.storage = null; + } else if (options.storage) { + const wrapped = wrapQueueStorage(options.storage); + this.messageQueue = wrapped.messageQueue; + this.jobStore = wrapped.jobStore; + this.storage = options.storage; + } else { + throw new Error( + "JobQueueClient requires either `storage` or both `messageQueue` and `jobStore`" + ); + } } /** @@ -135,7 +159,10 @@ export class JobQueueClient { return; // Already subscribed } - this.storageUnsubscribe = this.storage.subscribeToChanges( + const sub = this.messageQueue.subscribeToChanges; + if (!sub) return; // backend doesn't support subscriptions + this.storageUnsubscribe = sub.call( + this.messageQueue, (change: QueueChangePayload) => { this.handleStorageChange(change); } @@ -154,16 +181,18 @@ export class JobQueueClient { } /** - * Submit a job to the queue + * Send a job to the queue */ - public async submit( + public async send( input: Input, options?: { readonly jobRunId?: string; readonly fingerprint?: string; - readonly maxRetries?: number; - readonly runAfter?: Date; - readonly deadlineAt?: Date; + readonly 
maxAttempts?: number; + /** Delay in seconds before the job becomes visible for processing */ + readonly delaySeconds?: number; + /** Timeout in seconds after which the job deadline is exceeded */ + readonly timeoutSeconds?: number; } ): Promise> { const job: JobStorageFormat = { @@ -171,14 +200,20 @@ export class JobQueueClient { input, job_run_id: options?.jobRunId, fingerprint: options?.fingerprint, - max_retries: options?.maxRetries ?? 10, - run_after: options?.runAfter?.toISOString() ?? new Date().toISOString(), - deadline_at: options?.deadlineAt?.toISOString() ?? null, + max_attempts: options?.maxAttempts ?? 10, + visible_at: + options?.delaySeconds != null + ? new Date(Date.now() + options.delaySeconds * 1000).toISOString() + : new Date().toISOString(), + deadline_at: + options?.timeoutSeconds != null + ? new Date(Date.now() + options.timeoutSeconds * 1000).toISOString() + : null, completed_at: null, status: JobStatus.PENDING, }; - const id = await this.storage.add(job); + const id = await this.messageQueue.send(job); // Same-process fast path: poke the worker directly so it doesn't have to // wait for the poll interval (crucial for Sqlite/Postgres, whose @@ -189,18 +224,18 @@ export class JobQueueClient { } /** - * Submit multiple jobs to the queue + * Send multiple jobs to the queue */ - public async submitBatch( + public async sendBatch( inputs: readonly Input[], options?: { readonly jobRunId?: string; - readonly maxRetries?: number; + readonly maxAttempts?: number; } ): Promise[]> { const handles: JobHandle[] = []; for (const input of inputs) { - const handle = await this.submit(input, options); + const handle = await this.send(input, options); handles.push(handle); } return handles; @@ -211,7 +246,7 @@ export class JobQueueClient { */ public async getJob(id: unknown): Promise | undefined> { if (!id) throw new JobNotFoundError("Cannot get undefined job"); - const job = await this.storage.get(id); + const job = await this.jobStore.get(id); if (!job) 
return undefined; return this.storageToClass(job); } @@ -221,7 +256,7 @@ export class JobQueueClient { */ public async getJobsByRunId(runId: string): Promise[]> { if (!runId) throw new JobNotFoundError("Cannot get jobs by undefined runId"); - const jobs = await this.storage.getByRunId(runId); + const jobs = await this.jobStore.getByRunId(runId); return jobs.map((job) => this.storageToClass(job)); } @@ -229,7 +264,7 @@ export class JobQueueClient { * Peek at jobs in the queue */ public async peek(status?: JobStatus, num?: number): Promise[]> { - const jobs = await this.storage.peek(status, num); + const jobs = await this.jobStore.peek(status, num); return jobs.map((job) => this.storageToClass(job)); } @@ -237,7 +272,7 @@ export class JobQueueClient { * Get the size of the queue */ public async size(status?: JobStatus): Promise { - return this.storage.size(status); + return this.jobStore.size(status); } /** @@ -245,7 +280,7 @@ export class JobQueueClient { */ public async outputForInput(input: Input): Promise { if (!input) throw new JobNotFoundError("Cannot get output for undefined input"); - return this.storage.outputForInput(input); + return this.jobStore.outputForInput(input); } /** @@ -310,23 +345,24 @@ export class JobQueueClient { * * Same-process path: fires the in-memory abort controller on the attached * server — `handleAbort` will write FAILED directly, so we skip the - * `storage.abort(…)` ABORTING write. Writing both would race (last-writer- - * wins) and can leave the row stuck at ABORTING on async storages. + * `storage.abort(…)` write. Writing both would race (last-writer-wins) and + * can leave the row in an inconsistent state on async storages. * * Cross-process path (or job not currently running on any local worker): - * write ABORTING to storage so the remote worker's poll picks it up. + * write abort_requested_at to storage so the remote worker's poll picks it + * up (or mark FAILED immediately if the job is still PENDING). 
* * Crash window: if the process dies after the in-memory abort fires but - * before `failJob` writes FAILED, the row stays PROCESSING. `fixupJobs()` - * resets it to PENDING on next start and the job will re-run. Make handlers - * idempotent (or use `uniquenessKey`) if that's not acceptable. + * before `failJob` writes FAILED, the row stays PROCESSING. Lease expiry + * in `next()` will re-claim it on the next start so the job will re-run. + * Make handlers idempotent if that's not acceptable. */ public async abort(jobId: unknown): Promise { if (!jobId) throw new JobNotFoundError("Cannot abort undefined job"); const firedLocally = this.server?.abortJob(jobId) ?? false; if (!firedLocally) { try { - await this.storage.abort(jobId); + await this.jobStore.abort(jobId); } finally { this.events.emit("job_aborting", this.queueName, jobId); } @@ -479,8 +515,8 @@ export class JobQueueClient { * Called by server when a job is retried * @internal */ - public handleJobRetry(jobId: unknown, runAfter: Date): void { - this.events.emit("job_retry", this.queueName, jobId, runAfter); + public handleJobRetry(jobId: unknown, visibleAt: Date): void { + this.events.emit("job_retry", this.queueName, jobId, visibleAt); } /** @@ -551,8 +587,8 @@ export class JobQueueClient { this.handleJobDisabled(jobId); } else if (newStatus === JobStatus.PENDING && oldStatus === JobStatus.PROCESSING) { // Retry - const runAfter = change.new.run_after ? new Date(change.new.run_after) : new Date(); - this.handleJobRetry(jobId, runAfter); + const visibleAt = change.new.visible_at ? 
new Date(change.new.visible_at) : new Date(); + this.handleJobRetry(jobId, visibleAt); } // Progress update diff --git a/packages/job-queue/src/job/JobQueueEventListeners.ts b/packages/job-queue/src/job/JobQueueEventListeners.ts index 52beb50b3..ad3b7c457 100644 --- a/packages/job-queue/src/job/JobQueueEventListeners.ts +++ b/packages/job-queue/src/job/JobQueueEventListeners.ts @@ -17,7 +17,7 @@ export type JobQueueEventListeners = { job_complete: (queueName: string, jobId: unknown, output: Output) => void; job_error: (queueName: string, jobId: unknown, error: string) => void; job_disabled: (queueName: string, jobId: unknown) => void; - job_retry: (queueName: string, jobId: unknown, runAfter: Date) => void; + job_retry: (queueName: string, jobId: unknown, visibleAt: Date) => void; job_progress: ( queueName: string, jobId: unknown, diff --git a/packages/job-queue/src/job/JobQueueServer.ts b/packages/job-queue/src/job/JobQueueServer.ts index d910b430f..25572d209 100644 --- a/packages/job-queue/src/job/JobQueueServer.ts +++ b/packages/job-queue/src/job/JobQueueServer.ts @@ -7,12 +7,16 @@ import { EventEmitter, getLogger } from "@workglow/util"; import { ILimiter } from "../limiter/ILimiter"; import { NullLimiter } from "../limiter/NullLimiter"; +import type { IJobStore } from "../queue-storage/IJobStore"; +import type { IMessageQueue } from "../queue-storage/IMessageQueue"; import type { IQueueStorage, JobStorageFormat, QueueChangePayload, } from "../queue-storage/IQueueStorage"; import { JobStatus } from "../queue-storage/IQueueStorage"; +import { wrapQueueStorage } from "../queue-storage/wrapQueueStorage"; +import type { DeadLetter } from "./DeadLetter"; import { Job, JobClass } from "./Job"; import { JobQueueClient } from "./JobQueueClient"; import { JobQueueWorker } from "./JobQueueWorker"; @@ -42,7 +46,7 @@ export type JobQueueServerEventListeners = { job_complete: (queueName: string, jobId: unknown, output: Output) => void; job_error: (queueName: string, jobId: 
unknown, error: string) => void; job_disabled: (queueName: string, jobId: unknown) => void; - job_retry: (queueName: string, jobId: unknown, runAfter: Date) => void; + job_retry: (queueName: string, jobId: unknown, visibleAt: Date) => void; job_progress: ( queueName: string, jobId: unknown, @@ -58,7 +62,14 @@ export type JobQueueServerEvents = keyof JobQueueServerEventListeners { - readonly storage: IQueueStorage; + /** + * Legacy single-storage option. Provide either `storage` OR the paired + * `messageQueue`+`jobStore`. When `storage` is given it is wrapped via + * {@link wrapQueueStorage} internally. + */ + readonly storage?: IQueueStorage; + readonly messageQueue?: IMessageQueue>; + readonly jobStore?: IJobStore; readonly queueName: string; readonly limiter?: ILimiter; readonly workerCount?: number; @@ -72,6 +83,13 @@ export interface JobQueueServerOptions { * Defaults to 30s. Set to 0 to abort immediately. */ readonly stopTimeoutMs?: number; + /** + * Dead-letter queue to forward exhausted jobs to, or "discard" to silently drop them. + * Default: "discard". + */ + readonly deadLetter?: IMessageQueue> | "discard"; + /** Number of jobs to pre-fetch per poll iteration. Defaults to 1. */ + readonly prefetch?: number; } /** @@ -84,7 +102,10 @@ export class JobQueueServer< QueueJob extends Job = Job, > { public readonly queueName: string; - protected readonly storage: IQueueStorage; + protected readonly messageQueue: IMessageQueue>; + protected readonly jobStore: IJobStore; + /** Optional legacy storage handle, set when the user constructed via `storage`. 
*/ + protected readonly storage: IQueueStorage | null; protected readonly jobClass: JobClass; public readonly limiter: ILimiter; protected readonly workerCount: number; @@ -94,6 +115,8 @@ export class JobQueueServer< protected readonly deleteAfterDisabledMs?: number; protected readonly cleanupIntervalMs: number; protected readonly stopTimeoutMs?: number; + protected readonly deadLetter: IMessageQueue> | "discard"; + protected readonly prefetch: number; protected readonly events = new EventEmitter>(); protected readonly workers: JobQueueWorker[] = []; @@ -115,7 +138,20 @@ export class JobQueueServer< constructor(jobClass: JobClass, options: JobQueueServerOptions) { this.queueName = options.queueName; - this.storage = options.storage; + if (options.messageQueue && options.jobStore) { + this.messageQueue = options.messageQueue; + this.jobStore = options.jobStore; + this.storage = null; + } else if (options.storage) { + const wrapped = wrapQueueStorage(options.storage); + this.messageQueue = wrapped.messageQueue; + this.jobStore = wrapped.jobStore; + this.storage = options.storage; + } else { + throw new Error( + "JobQueueServer requires either `storage` or both `messageQueue` and `jobStore`" + ); + } this.jobClass = jobClass; this.limiter = options.limiter ?? new NullLimiter(); this.workerCount = options.workerCount ?? 1; @@ -125,6 +161,8 @@ export class JobQueueServer< this.deleteAfterDisabledMs = options.deleteAfterDisabledMs; this.cleanupIntervalMs = options.cleanupIntervalMs ?? 10000; this.stopTimeoutMs = options.stopTimeoutMs; + this.deadLetter = options.deadLetter ?? "discard"; + this.prefetch = Math.max(1, options.prefetch ?? 1); this.initializeWorkers(); } @@ -146,7 +184,7 @@ export class JobQueueServer< // N× the limit (one bucket per process). 
if ( this.limiter.scope === "process" && - this.storage.scope === "cluster" && + this.messageQueue.scope === "cluster" && !(this.limiter instanceof NullLimiter) ) { getLogger().warn( @@ -154,14 +192,11 @@ export class JobQueueServer< { queueName: this.queueName, limiterScope: this.limiter.scope, - storage: this.storage.constructor.name, + storage: this.storage?.constructor.name ?? this.messageQueue.constructor.name, } ); } - // Fix stuck jobs from previous runs - await this.fixupJobs(); - // Subscribe to storage changes to wake workers when new work arrives. // - Cross-process deployments rely on this for wake-up. // - Same-process attached clients are also primarily woken by the direct @@ -173,7 +208,10 @@ export class JobQueueServer< // - Sqlite/Postgres throw here; the try/catch falls through and direct // notify is the sole wake path on those backends. try { - this.storageUnsubscribe = this.storage.subscribeToChanges( + const sub = this.messageQueue.subscribeToChanges; + if (!sub) throw new Error("messageQueue does not support subscribeToChanges"); + this.storageUnsubscribe = sub.call( + this.messageQueue, (change: QueueChangePayload) => { if ( change.type === "INSERT" || @@ -257,10 +295,20 @@ export class JobQueueServer< /** * Get the storage instance (for client connection) */ - public getStorage(): IQueueStorage { + public getStorage(): IQueueStorage | null { return this.storage; } + /** Get the message queue paired with this server. */ + public getMessageQueue(): IMessageQueue> { + return this.messageQueue; + } + + /** Get the job store paired with this server. 
*/ + public getJobStore(): IJobStore { + return this.jobStore; + } + /** * Scale the number of workers */ @@ -391,11 +439,14 @@ export class JobQueueServer< */ protected createWorker(): JobQueueWorker { const worker = new JobQueueWorker(this.jobClass, { - storage: this.storage, + messageQueue: this.messageQueue, + jobStore: this.jobStore, queueName: this.queueName, limiter: this.limiter, pollIntervalMs: this.pollIntervalMs, stopTimeoutMs: this.stopTimeoutMs, + deadLetter: this.deadLetter, + prefetch: this.prefetch, }); // Forward worker events to server and clients @@ -413,7 +464,7 @@ export class JobQueueServer< // Immediate deletion when configured if (this.deleteAfterCompletionMs === 0) { - this.storage.delete(jobId).catch((err) => { + this.jobStore.delete(jobId).catch((err) => { console.error("Error deleting job after completion:", err); }); } @@ -429,7 +480,7 @@ export class JobQueueServer< // Immediate deletion when configured if (this.deleteAfterFailureMs === 0) { - this.storage.delete(jobId).catch((err) => { + this.jobStore.delete(jobId).catch((err) => { console.error("Error deleting job after error:", err); }); } @@ -445,7 +496,7 @@ export class JobQueueServer< // Immediate deletion when configured if (this.deleteAfterDisabledMs === 0) { - this.storage.delete(jobId).catch((err) => { + this.jobStore.delete(jobId).catch((err) => { console.error("Error deleting job after disabling:", err); }); } @@ -454,10 +505,10 @@ export class JobQueueServer< this.notifyWorkers(); }); - worker.on("job_retry", (jobId, runAfter) => { + worker.on("job_retry", (jobId, visibleAt) => { this.stats = { ...this.stats, retriedJobs: this.stats.retriedJobs + 1 }; - this.events.emit("job_retry", this.queueName, jobId, runAfter); - this.forwardToClients("handleJobRetry", jobId, runAfter); + this.events.emit("job_retry", this.queueName, jobId, visibleAt); + this.forwardToClients("handleJobRetry", jobId, visibleAt); }); worker.on("job_progress", (jobId, progress, message, details) => { @@ 
-480,7 +531,7 @@ export class JobQueueServer< errorCode?: string ): void; protected forwardToClients(method: "handleJobDisabled", jobId: unknown): void; - protected forwardToClients(method: "handleJobRetry", jobId: unknown, runAfter: Date): void; + protected forwardToClients(method: "handleJobRetry", jobId: unknown, visibleAt: Date): void; protected forwardToClients( method: "handleJobProgress", jobId: unknown, @@ -541,71 +592,23 @@ export class JobQueueServer< // Delete completed jobs after TTL if (this.deleteAfterCompletionMs !== undefined && this.deleteAfterCompletionMs > 0) { - await this.storage.deleteJobsByStatusAndAge( - JobStatus.COMPLETED, - this.deleteAfterCompletionMs - ); + await this.jobStore.deleteByStatusAndAge(JobStatus.COMPLETED, this.deleteAfterCompletionMs); } // Delete failed jobs after TTL if (this.deleteAfterFailureMs !== undefined && this.deleteAfterFailureMs > 0) { - await this.storage.deleteJobsByStatusAndAge(JobStatus.FAILED, this.deleteAfterFailureMs); + await this.jobStore.deleteByStatusAndAge(JobStatus.FAILED, this.deleteAfterFailureMs); } // Delete disabled jobs after TTL if (this.deleteAfterDisabledMs !== undefined && this.deleteAfterDisabledMs > 0) { - await this.storage.deleteJobsByStatusAndAge(JobStatus.DISABLED, this.deleteAfterDisabledMs); + await this.jobStore.deleteByStatusAndAge(JobStatus.DISABLED, this.deleteAfterDisabledMs); } } catch (error) { console.error("Error in cleanup:", error); } } - /** - * Fix stuck jobs from previous server runs. - * Jobs in PROCESSING or ABORTING state that are not owned by any of the current - * server's workers are considered orphaned and will be reset. 
- */ - protected async fixupJobs(): Promise { - try { - const stuckProcessingJobs = await this.storage.peek(JobStatus.PROCESSING); - const stuckAbortingJobs = await this.storage.peek(JobStatus.ABORTING); - const stuckJobs = [...stuckProcessingJobs, ...stuckAbortingJobs]; - - // Get the worker IDs of all workers managed by this server - const currentWorkerIds = new Set(this.getWorkerIds()); - - for (const jobData of stuckJobs) { - // Skip jobs that belong to workers in this server (they may still be processing) - if (jobData.worker_id && currentWorkerIds.has(jobData.worker_id)) { - continue; - } - - const job = this.storageToClass(jobData); - if (job.runAttempts >= job.maxRetries) { - job.status = JobStatus.FAILED; - job.error = "Max retries reached"; - job.errorCode = "MAX_RETRIES_REACHED"; - // Clear worker_id since job is now failed - job.workerId = null; - } else { - job.status = JobStatus.PENDING; - job.runAfter = job.lastRanAt || new Date(); - job.progress = 0; - job.progressMessage = ""; - job.progressDetails = null; - job.error = "Server restarted"; - // Clear worker_id so a new worker can claim this job - job.workerId = null; - } - - await this.storage.complete(this.classToStorage(job)); - } - } catch (error) { - console.error("Error in fixupJobs:", error); - } - } - /** * Convert storage format to Job class */ @@ -619,11 +622,4 @@ export class JobQueueServer< protected classToStorage(job: Job): JobStorageFormat { return classToStorage(job, this.queueName); } - - /** - * Get the worker IDs of all workers managed by this server - */ - public getWorkerIds(): string[] { - return this.workers.map((worker) => worker.workerId); - } } diff --git a/packages/job-queue/src/job/JobQueueWorker.ts b/packages/job-queue/src/job/JobQueueWorker.ts index 09c0c49e9..f4bbb5d92 100644 --- a/packages/job-queue/src/job/JobQueueWorker.ts +++ b/packages/job-queue/src/job/JobQueueWorker.ts @@ -14,8 +14,13 @@ import { } from "@workglow/util"; import { ILimiter } from 
"../limiter/ILimiter"; import { NullLimiter } from "../limiter/NullLimiter"; +import type { IClaim } from "../queue-storage/IClaim"; +import type { IJobStore } from "../queue-storage/IJobStore"; +import type { IMessageQueue } from "../queue-storage/IMessageQueue"; import type { IQueueStorage, JobStorageFormat } from "../queue-storage/IQueueStorage"; import { JobStatus } from "../queue-storage/IQueueStorage"; +import { wrapQueueStorage } from "../queue-storage/wrapQueueStorage"; +import type { DeadLetter } from "./DeadLetter"; import { Job, JobClass } from "./Job"; import { AbortSignalJobError, @@ -44,7 +49,7 @@ export type JobQueueWorkerEventListeners = { job_complete: (jobId: unknown, output: Output) => void; job_error: (jobId: unknown, error: string, errorCode?: string) => void; job_disabled: (jobId: unknown) => void; - job_retry: (jobId: unknown, runAfter: Date) => void; + job_retry: (jobId: unknown, visibleAt: Date) => void; job_progress: ( jobId: unknown, progress: number, @@ -61,7 +66,14 @@ export type JobQueueWorkerEvents = keyof JobQueueWorkerEventListeners { - readonly storage: IQueueStorage; + /** + * Legacy single-storage option. Provide either `storage` OR the paired + * `messageQueue`+`jobStore`. When `storage` is given it is wrapped via + * {@link wrapQueueStorage} internally. + */ + readonly storage?: IQueueStorage; + readonly messageQueue?: IMessageQueue>; + readonly jobStore?: IJobStore; readonly queueName: string; readonly limiter?: ILimiter; readonly pollIntervalMs?: number; @@ -75,6 +87,30 @@ export interface JobQueueWorkerOptions { * Defaults to 30s. Set to 0 to abort immediately. */ readonly stopTimeoutMs?: number; + /** + * If true, the worker will call extendLease periodically while a job is + * executing. Extension interval is leaseMs * 0.5. Default: false. + */ + readonly extendLeaseWhileRunning?: boolean; + /** + * How long (ms) the worker's lease on a claimed job lasts before another + * worker may re-claim it. 
Must be long enough to cover the maximum + * expected job duration if extendLeaseWhileRunning is false. + * Defaults to max(30_000, pollIntervalMs * 60). + */ + readonly leaseMs?: number; + /** + * Dead-letter queue to forward exhausted jobs to, or "discard" to drop them. + * Default: "discard". + */ + readonly deadLetter?: IMessageQueue> | "discard"; + /** + * Number of claims to fetch per loop iteration. Default: 1. + * With prefetch > 1, claims that cannot immediately acquire a limiter slot + * are released back to PENDING via retry, so concurrency is still governed + * by the limiter. + */ + readonly prefetch?: number; } /** @@ -88,12 +124,17 @@ export class JobQueueWorker< > { public readonly queueName: string; public readonly workerId: string; - protected readonly storage: IQueueStorage; + protected readonly messageQueue: IMessageQueue>; + protected readonly jobStore: IJobStore; protected readonly jobClass: JobClass; protected readonly limiter: ILimiter; protected readonly pollIntervalMs: number; protected readonly stopTimeoutMs: number; + protected readonly extendLeaseWhileRunning: boolean; + protected readonly leaseMs: number; protected readonly events = new EventEmitter>(); + protected readonly deadLetter: IMessageQueue> | "discard"; + protected readonly prefetch: number; protected running = false; @@ -104,6 +145,12 @@ export class JobQueueWorker< */ private readonly inFlight: Map> = new Map(); + /** + * Active claims for jobs currently being processed. Used to drive + * ack/retry/fail/extendLease in completion paths. + */ + private readonly activeClaims: Map>> = new Map(); + /** * Resolve function for the idle wait promise. * When set, the worker is idle and waiting for either a notification or poll timeout. @@ -145,11 +192,26 @@ export class JobQueueWorker< constructor(jobClass: JobClass, options: JobQueueWorkerOptions) { this.queueName = options.queueName; this.workerId = options.workerId ?? 
uuid4(); - this.storage = options.storage; + if (options.messageQueue && options.jobStore) { + this.messageQueue = options.messageQueue; + this.jobStore = options.jobStore; + } else if (options.storage) { + const wrapped = wrapQueueStorage(options.storage); + this.messageQueue = wrapped.messageQueue; + this.jobStore = wrapped.jobStore; + } else { + throw new Error( + "JobQueueWorker requires either `storage` or both `messageQueue` and `jobStore`" + ); + } this.jobClass = jobClass; this.limiter = options.limiter ?? new NullLimiter(); this.pollIntervalMs = options.pollIntervalMs ?? 100; this.stopTimeoutMs = options.stopTimeoutMs ?? 30_000; + this.extendLeaseWhileRunning = options.extendLeaseWhileRunning ?? false; + this.leaseMs = options.leaseMs ?? Math.max(30_000, this.pollIntervalMs * 60); + this.deadLetter = options.deadLetter ?? "discard"; + this.prefetch = Math.max(1, options.prefetch ?? 1); } /** @@ -309,12 +371,43 @@ export class JobQueueWorker< // ======================================================================== /** - * Get the next job from the queue + * Get the next job from the queue (always fetches a single claim). + * Used by {@link processNext}; the {@link processJobs} loop claims via {@link nextBatch} instead. */ protected async next(): Promise { - const job = await this.storage.next(this.workerId); - if (!job) return undefined; - return this.storageToClass(job) as QueueJob; + const claims = await this.messageQueue.receive({ + workerId: this.workerId, + leaseMs: this.leaseMs, + max: 1, + }); + const claim = claims[0]; + if (!claim) return undefined; + const job = this.storageToClass(claim.body) as QueueJob; + if (job.id != null) { + this.activeClaims.set(job.id, claim); + } + return job; + } + + /** + * Fetch up to `this.prefetch` claims from the queue and register them in + * {@link activeClaims}. Returns an array of jobs ready to be dispatched.
+ */ + private async nextBatch(): Promise { + const claims = await this.messageQueue.receive({ + workerId: this.workerId, + leaseMs: this.leaseMs, + max: this.prefetch, + }); + const jobs: QueueJob[] = []; + for (const claim of claims) { + const job = this.storageToClass(claim.body) as QueueJob; + if (job.id != null) { + this.activeClaims.set(job.id, claim); + } + jobs.push(job); + } + return jobs; } /** @@ -342,8 +435,13 @@ export class JobQueueWorker< // overshooting the configured limit by exactly the worker concurrency. // The atomic tryAcquire guarantees only one of N concurrent acquirers // succeeds when there's one slot left. - const job = await this.next(); - if (!job) { + // + // With prefetch > 1, we claim up to `prefetch` jobs at once and do a + // non-blocking tryAcquire for each. Jobs that can't immediately get a + // limiter slot are released back to PENDING so other workers can pick + // them up. With prefetch == 1 the behavior is identical to before. + const jobs = await this.nextBatch(); + if (jobs.length === 0) { // Queue is empty — sleep until notified of new work or until the // next deferred job becomes ready. const delay = await this.getIdleDelay(); @@ -351,38 +449,50 @@ export class JobQueueWorker< continue; } - if (!this.running) { - // Stopped during the await. Release the job back to PENDING. - await this.releaseClaimedJob(job); - return; - } + let anyDispatched = false; + let limiterFull = false; - const limiterToken = await this.limiter.tryAcquire(); - if (limiterToken === null || limiterToken === undefined) { - // Lost the race for the last slot, or hit the rate-limit window. - // Give the job back and wait for capacity. - await this.releaseClaimedJob(job); - await this.waitForWakeOrTimeout(await this.getLimiterWakeDelay()); - continue; + for (const job of jobs) { + if (!this.running) { + // Stopped during the batch. Release all remaining jobs back to PENDING. 
+ await this.releaseClaimedJob(job); + continue; + } + + const limiterToken = await this.limiter.tryAcquire(); + if (limiterToken === null || limiterToken === undefined) { + // No limiter slot available — release the claim back so another + // worker (or this one on a later iteration) can pick it up. + await this.releaseClaimedJob(job); + limiterFull = true; + continue; + } + + if (!this.running) { + // Stop fired while tryAcquire was in flight. + try { + await this.limiter.release(limiterToken); + } catch { + // best-effort + } + await this.releaseClaimedJob(job); + continue; + } + + // Don't await - process in background to allow concurrent jobs. + this.processSingleJob(job, limiterToken); + anyDispatched = true; } if (!this.running) { - // Stop fired while tryAcquire was in flight. Undo both the limiter - // reservation (release THIS token, not "the most recent") and the - // job claim so we don't start processing on a worker that's about - // to exit. - try { - await this.limiter.release(limiterToken); - } catch { - // best-effort - } - await this.releaseClaimedJob(job); return; } - // Don't await - process in background to allow concurrent jobs. - // The loop will claim+acquire on the next iteration. - this.processSingleJob(job, limiterToken); + // If the limiter was full for every claim we fetched, back off before + // retrying so we don't busy-loop hammering the queue. + if (!anyDispatched && limiterFull) { + await this.waitForWakeOrTimeout(await this.getLimiterWakeDelay()); + } } catch { // Don't let transient errors kill the loop await sleep(this.pollIntervalMs); @@ -411,15 +521,15 @@ export class JobQueueWorker< /** * Determine how long to sleep when idle. * - * Peeks at the earliest PENDING job: if it has a future `run_after`, + * Peeks at the earliest PENDING job: if it has a future `visible_at`, * returns the time until it becomes ready (clamped to `pollIntervalMs`); * otherwise returns `pollIntervalMs`. 
*/ private async getIdleDelay(): Promise { try { - const pending = await this.storage.peek(JobStatus.PENDING, 1); - if (pending.length > 0 && pending[0].run_after) { - const delay = new Date(pending[0].run_after).getTime() - Date.now(); + const pending = await this.jobStore.peek(JobStatus.PENDING, 1); + if (pending.length > 0 && pending[0].visible_at) { + const delay = new Date(pending[0].visible_at).getTime() - Date.now(); if (delay > 0) { return Math.min(delay, this.pollIntervalMs); } @@ -457,19 +567,20 @@ export class JobQueueWorker< } /** - * Check for jobs that have been marked for abort and trigger their abort controllers. - * - * Only relevant for jobs running on THIS worker (we have an abort controller - * registered for them). When no jobs are active, the peek result is irrelevant - * — skip the storage round-trip entirely. Important for battery life on - * same-process deployments (browser/mobile) where workers spend most time idle. + * Check for in-process jobs that have abort_requested_at set and trigger + * their abort controllers. Only relevant for jobs running on THIS worker + * (we have an abort controller registered for them). When no jobs are active, + * the peek result is irrelevant — skip the storage round-trip entirely. + * Important for battery life on same-process deployments (browser/mobile) + * where workers spend most time idle. 
*/ protected async checkForAbortingJobs(): Promise { if (this.activeJobAbortControllers.size === 0) { return; } - const abortingJobs = await this.storage.peek(JobStatus.ABORTING); - for (const jobData of abortingJobs) { + const processingJobs = await this.jobStore.peek(JobStatus.PROCESSING); + for (const jobData of processingJobs) { + if (!jobData.abort_requested_at) continue; const controller = this.activeJobAbortControllers.get(jobData.id); if (controller && !controller.signal.aborted) { controller.abort(); @@ -497,18 +608,13 @@ export class JobQueueWorker< attributes: { "workglow.job.id": String(job.id), "workglow.job.queue": this.queueName, - "workglow.job.worker_id": this.workerId, - "workglow.job.run_attempt": job.runAttempts, - "workglow.job.max_retries": job.maxRetries, + "workglow.job.lease_owner": this.workerId, + "workglow.job.attempt": job.attempts, + "workglow.job.max_attempts": job.maxAttempts, }, }) : undefined; - // Set when validateJobState fails and we release() the limiter slot - // ourselves — the finally block then skips recordJobCompletion to avoid - // double-decrementing limiters where release() and recordJobCompletion() - // both decrement (e.g. ConcurrencyLimiter). - let slotReleased = false; try { // The limiter slot was already atomically reserved by tryAcquire() in // the main loop (or processNext), so we no longer call recordJobStart @@ -516,23 +622,37 @@ export class JobQueueWorker< try { await this.validateJobState(job); } catch (validationErr) { - // Validation failed before we ran any actual work — release THIS - // limiter slot (by token, not by recency) so it doesn't count toward - // the rate limit and we don't accidentally release another worker's - // slot. - try { - await this.limiter.release(limiterToken); - slotReleased = true; - } catch { - // best-effort - } + // Throw — the outer finally block's limiter.complete() will release + // the slot. 
Do NOT call limiter.release() here too; that would + // double-decrement the counter and admit one extra concurrent job. throw validationErr; } const abortController = this.createAbortController(job.id); this.events.emit("job_start", job.id); - const output = await this.executeJob(job, abortController.signal); + let leaseInterval: ReturnType | undefined; + if (this.extendLeaseWhileRunning) { + leaseInterval = setInterval(() => { + const claim = this.activeClaims.get(job.id); + if (!claim) return; + claim.extendLease(this.leaseMs).catch((err) => { + getLogger().error("extendLease failed during job execution:", { + error: err, + jobId: job.id, + }); + }); + }, this.leaseMs * 0.5); + } + + let output: Output; + try { + output = await this.executeJob(job, abortController.signal); + } finally { + if (leaseInterval !== undefined) { + clearInterval(leaseInterval); + } + } await this.completeJob(job, output); const elapsed = Date.now() - startTime; @@ -551,14 +671,33 @@ export class JobQueueWorker< throw new JobNotFoundError(`Job ${job.id} not found`); } - if (currentJob.runAttempts >= currentJob.maxRetries) { - spanErrorMessage = "Max retries reached"; + if (currentJob.attempts + 1 >= currentJob.maxAttempts) { + spanErrorMessage = "Max attempts reached"; + // Forward to dead-letter queue before marking as failed + if (this.deadLetter !== "discard") { + try { + await this.deadLetter.send({ + original: currentJob.input, + error: error.message, + errorCode: error.constructor.name ?? null, + attempts: currentJob.attempts, + queueName: this.queueName, + jobRunId: currentJob.jobRunId, + }); + } catch (dlqErr) { + getLogger().error("Dead-letter queue send failed:", { error: dlqErr }); + } + } await this.failJob(currentJob, new PermanentJobError(spanErrorMessage)); span?.setStatus(SpanStatusCode.ERROR, spanErrorMessage); } else { + // Only delete the abort controller (not the claim) so rescheduleJob + // can still call claim.retry(). 
rescheduleJob's finally block drops + // the claim from activeClaims after it has been settled. + this.activeJobAbortControllers.delete(job.id); await this.rescheduleJob(currentJob, error.retryDate); span?.addEvent("workglow.job.retry", { - "workglow.job.run_attempt": currentJob.runAttempts, + "workglow.job.attempt": currentJob.attempts, }); span?.setStatus(SpanStatusCode.UNSET); } @@ -568,15 +707,16 @@ export class JobQueueWorker< } span?.setAttributes({ "workglow.job.error": spanErrorMessage }); } finally { + await this.limiter.complete(limiterToken); span?.end(); - try { - if (!slotReleased) { - await this.limiter.recordJobCompletion(); - } - } finally { + // Guard against a concurrent processSingleJob for the same jobId (which + // can start before this finally block runs, e.g. after a reschedule). + // Only delete our own inFlight entry; if another invocation already + // replaced it, leave that entry alone. + if (this.inFlight.get(job.id) === inFlightPromise) { this.inFlight.delete(job.id); - resolveInFlight(); } + resolveInFlight(); } } @@ -609,6 +749,11 @@ export class JobQueueWorker< this.events.emit("job_progress", jobId, progress, message, details); } + /** Internal — look up the active claim for a job id; returns undefined if there is none. */ + private getClaim(jobId: unknown): IClaim> | undefined { + return this.activeClaims.get(jobId); + } + /** * Mark a job as completed */ @@ -623,7 +768,23 @@ export class JobQueueWorker< job.error = null; job.errorCode = null; - await this.storage.complete(this.classToStorage(job)); + // H2 atomic ack: hand the result directly to claim.ack() so result + + // COMPLETED status land in a single storage write. If we crash here + // — anywhere between this call site and the storage layer's commit — + // the row stays PROCESSING, the lease expires, the next worker + // reclaims it, and no `job_complete` is ever emitted.
Before this + // change we issued `saveResult` then `ack`: a crash between the two + // left a PROCESSING row with the output already written, no + // `job_complete`, and a redelivery that overwrote the previously- + // saved output. + const claim = this.getClaim(job.id); + if (claim) { + await claim.ack(output ?? null); + } else { + // No active claim (rare path — e.g. abort beat us to it). Fall back + // to the legacy two-step so the result is still persisted. + await this.jobStore.saveResult(job.id, (output ?? null) as Output); + } this.events.emit("job_complete", job.id, output as Output); } catch (err) { getLogger().error("completeJob errored:", { error: err }); @@ -645,7 +806,31 @@ export class JobQueueWorker< job.error = error.message; job.errorCode = error?.constructor?.name ?? null; - await this.storage.complete(this.classToStorage(job)); + // H2 atomic fail: hand error/errorCode/abortRequested directly to + // claim.fail() so they land in a single storage write together with + // status=FAILED. Eliminates the saveError-then-fail two-write window + // where a crash could leave the row PROCESSING with an `error` + // already written. + const abortRequested = error instanceof AbortSignalJobError; + const claim = this.getClaim(job.id); + if (claim) { + await claim.fail({ + error: error.message, + errorCode: error.constructor.name ?? null, + abortRequested, + }); + } else { + // Fallback — no active claim (e.g. lease lost or abort path). The + // legacy two-step is still correct here because we're writing + // straight to the job store; the atomicity loss only matters when + // ack/fail and the result write are split across claim and store. + await this.jobStore.saveError( + job.id, + error.message, + error.constructor.name ?? 
null, + abortRequested + ); + } this.events.emit("job_error", job.id, error.message, error.constructor.name); } catch (err) { getLogger().error("failJob errored:", { error: err }); @@ -665,7 +850,21 @@ export class JobQueueWorker< job.progressMessage = ""; job.progressDetails = null; - await this.storage.complete(this.classToStorage(job)); + // H5 atomic disable: a single storage write sets status=DISABLED, + // releases the lease, and clears progress fields. Replaces the legacy + // two-write `claim.fail()` then `jobStore.saveStatus(DISABLED)` path + // which briefly persisted FAILED before overwriting with DISABLED — + // any subscriber observing during the window saw a spurious + // FAILED transition and fired a `job_error` event. + const claim = this.getClaim(job.id); + if (claim?.disable) { + await claim.disable(); + } else { + // Fallback for external IClaim impls that haven't adopted disable() + // yet. saveStatus uses finalize() under the hood (no attempts bump, + // no error write), so it's correct as a single-write fallback. + await this.jobStore.saveStatus(job.id, JobStatus.DISABLED); + } this.events.emit("job_disabled", job.id); } catch (err) { getLogger().error("disableJob errored:", { error: err }); @@ -677,17 +876,26 @@ export class JobQueueWorker< /** * Release a job that {@link next} just claimed but that we won't process * because the worker was stopped mid-claim. Resets the row to PENDING so - * the next started worker can pick it up. `fixupJobs()` would otherwise - * skip it (it ignores rows owned by current-server worker IDs). + * the next started worker can pick it up. Without this, the row would stay + * claimed until its lease expires and `next()` reclaims it. * - * Uses `storage.release()` rather than `storage.complete()` so the retry + * Uses `messageQueue.releaseClaim()` rather than `storage.complete()` so the retry * budget isn't burned: the worker never actually attempted execution.
*/ protected async releaseClaimedJob(job: Job): Promise { try { - await this.storage.release(job.id); + // Prefer driving the claim's release path so any per-claim cleanup + // (e.g. transient buffers) is consistent with regular settlement. + const claim = this.activeClaims.get(job.id); + if (claim) { + await this.messageQueue.releaseClaim(claim.id); + } else { + await this.messageQueue.releaseClaim(job.id); + } } catch (err) { getLogger().error("releaseClaimedJob errored:", { error: err }); + } finally { + this.activeClaims.delete(job.id); } } @@ -698,18 +906,27 @@ export class JobQueueWorker< try { job.status = JobStatus.PENDING; const nextAvailableTime = await this.limiter.getNextAvailableTime(); - job.runAfter = retryDate instanceof Date ? retryDate : nextAvailableTime; + job.visibleAt = retryDate instanceof Date ? retryDate : nextAvailableTime; job.progress = 0; job.progressMessage = ""; job.progressDetails = null; - // Increment runAttempts to keep in-memory object in sync with storage + // Increment attempts to keep in-memory object in sync with storage // The storage layer will read from DB and increment, so this keeps them aligned - job.runAttempts = (job.runAttempts ?? 0) + 1; + job.attempts = (job.attempts ?? 0) + 1; - await this.storage.complete(this.classToStorage(job)); - this.events.emit("job_retry", job.id, job.runAfter); + const claim = this.getClaim(job.id); + const delaySeconds = Math.max(0, (job.visibleAt.getTime() - Date.now()) / 1000); + if (claim) { + await claim.retry({ delaySeconds }); + } + this.events.emit("job_retry", job.id, job.visibleAt); } catch (err) { getLogger().error("rescheduleJob errored:", { error: err }); + } finally { + // rescheduleJob is called from the catch branch in processSingleJob, + // which removes only the abort controller before calling this method; + // drop the claim ref here once the retry attempt is over.
+ this.activeClaims.delete(job.id); } } @@ -757,7 +974,7 @@ export class JobQueueWorker< * `completeJob` that won the race (the COMPLETED→FAILED overwrite bug). * * If the job is no longer in flight here, it has already settled — recheck - * storage and only write FAILED for non-terminal states (i.e. an ABORTING + * storage and only write FAILED for non-terminal states (i.e. a PROCESSING * row left over from a cross-process abort that this worker never picked up). */ protected async handleAbort(jobId: unknown): Promise { @@ -783,7 +1000,7 @@ export class JobQueueWorker< * Get a job by ID */ protected async getJob(id: unknown): Promise | undefined> { - const job = await this.storage.get(id); + const job = await this.jobStore.get(id); if (!job) return undefined; return this.storageToClass(job); } @@ -798,10 +1015,7 @@ export class JobQueueWorker< if (job.status === JobStatus.FAILED) { throw new PermanentJobError(`Job ${job.id} has failed`); } - if ( - job.status === JobStatus.ABORTING || - this.activeJobAbortControllers.get(job.id)?.signal.aborted - ) { + if (this.activeJobAbortControllers.get(job.id)?.signal.aborted) { throw new AbortSignalJobError(`Job ${job.id} is being aborted`); } if (job.deadlineAt && job.deadlineAt < new Date()) { @@ -830,6 +1044,7 @@ export class JobQueueWorker< */ protected cleanupJob(jobId: unknown): void { this.activeJobAbortControllers.delete(jobId); + this.activeClaims.delete(jobId); } /** diff --git a/packages/job-queue/src/job/JobStorageConverters.ts b/packages/job-queue/src/job/JobStorageConverters.ts index a38ac4bfe..1d7c4c4f0 100644 --- a/packages/job-queue/src/job/JobStorageConverters.ts +++ b/packages/job-queue/src/job/JobStorageConverters.ts @@ -32,10 +32,12 @@ export function storageToClass( details: JobStorageFormat, jobClass: JobClass, options?: { + /** @deprecated use includeLeaseOwner instead */ readonly includeWorkerId?: boolean; + readonly includeLeaseOwner?: boolean; } ): Job { - const includeWorkerId = 
options?.includeWorkerId ?? true; + const includeLeaseOwner = options?.includeLeaseOwner ?? options?.includeWorkerId ?? true; return new jobClass({ id: details.id, jobRunId: details.job_run_id, @@ -43,10 +45,10 @@ export function storageToClass( fingerprint: details.fingerprint, input: details.input as Input, output: details.output as Output, - runAfter: toDate(details.run_after), + visibleAt: toDate(details.visible_at), createdAt: toDate(details.created_at)!, deadlineAt: toDate(details.deadline_at), - lastRanAt: toDate(details.last_ran_at), + lastAttemptedAt: toDate(details.last_attempted_at), completedAt: toDate(details.completed_at), progress: details.progress || 0, progressMessage: details.progress_message || "", @@ -54,9 +56,11 @@ export function storageToClass( status: details.status as JobStatus, error: details.error ?? null, errorCode: details.error_code ?? null, - runAttempts: details.run_attempts ?? 0, - maxRetries: details.max_retries ?? 10, - ...(includeWorkerId ? { workerId: details.worker_id ?? null } : {}), + attempts: details.attempts ?? 0, + maxAttempts: details.max_attempts ?? 10, + ...(includeLeaseOwner ? { leaseOwner: details.lease_owner ?? null } : {}), + abort_requested_at: details.abort_requested_at ?? null, + lease_expires_at: details.lease_expires_at ?? null, }); } @@ -78,16 +82,18 @@ export function classToStorage( output: job.output ?? null, error: job.error === null ? null : String(job.error), error_code: job.errorCode || null, - run_attempts: job.runAttempts ?? 0, - max_retries: job.maxRetries ?? 10, - run_after: dateToISOString(job.runAfter) ?? now, + attempts: job.attempts ?? 0, + max_attempts: job.maxAttempts ?? 10, + visible_at: dateToISOString(job.visibleAt) ?? now, created_at: dateToISOString(job.createdAt) ?? 
now, deadline_at: dateToISOString(job.deadlineAt), - last_ran_at: dateToISOString(job.lastRanAt), + last_attempted_at: dateToISOString(job.lastAttemptedAt), completed_at: dateToISOString(job.completedAt), progress: job.progress ?? 0, progress_message: job.progressMessage ?? "", progress_details: job.progressDetails ?? null, - worker_id: job.workerId ?? null, + lease_owner: job.leaseOwner ?? null, + abort_requested_at: job.abort_requested_at ?? null, + lease_expires_at: job.lease_expires_at ?? null, }; } diff --git a/packages/job-queue/src/job/MessageQueueClient.ts b/packages/job-queue/src/job/MessageQueueClient.ts new file mode 100644 index 000000000..b3ded8ca4 --- /dev/null +++ b/packages/job-queue/src/job/MessageQueueClient.ts @@ -0,0 +1,28 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IMessageQueue, MessageId, SendOptions } from "../queue-storage/IMessageQueue"; + +/** + * Thin producer-only client over an {@link IMessageQueue}. Use for + * fire-and-forget message production where you don't need the {@link Job} + * lifecycle abstractions in {@link JobQueueClient}. 
+ */ +export class MessageQueueClient { + private readonly messageQueue: IMessageQueue; + + constructor(opts: { readonly messageQueue: IMessageQueue }) { + this.messageQueue = opts.messageQueue; + } + + async send(body: Body, opts?: SendOptions): Promise { + return this.messageQueue.send(body, opts); + } + + async sendBatch(bodies: readonly Body[], opts?: SendOptions): Promise { + return this.messageQueue.sendBatch(bodies, opts); + } +} diff --git a/packages/job-queue/src/limiter/CompositeLimiter.ts b/packages/job-queue/src/limiter/CompositeLimiter.ts index a2c8b1e7b..751912119 100644 --- a/packages/job-queue/src/limiter/CompositeLimiter.ts +++ b/packages/job-queue/src/limiter/CompositeLimiter.ts @@ -27,15 +27,6 @@ export class CompositeLimiter implements ILimiter { this.limiters.push(limiter); } - async canProceed(): Promise { - for (const limiter of this.limiters) { - if (!(await limiter.canProceed())) { - return false; // If any limiter says "no", proceed no further - } - } - return true; // All limiters agree - } - /** * Atomic against the composite: acquires children sequentially and rolls * back any successfully-acquired prefix if a later child rejects, so the @@ -71,12 +62,9 @@ export class CompositeLimiter implements ILimiter { await Promise.all(this.limiters.map((l, i) => l.release(token[i]).catch(() => {}))); } - async recordJobStart(): Promise { - await Promise.all(this.limiters.map((limiter) => limiter.recordJobStart())); - } - - async recordJobCompletion(): Promise { - await Promise.all(this.limiters.map((limiter) => limiter.recordJobCompletion())); + async complete(token: unknown): Promise { + if (!Array.isArray(token)) return; + await Promise.all(this.limiters.map((l, i) => l.complete(token[i]).catch(() => {}))); } async getNextAvailableTime(): Promise { diff --git a/packages/job-queue/src/limiter/ConcurrencyLimiter.ts b/packages/job-queue/src/limiter/ConcurrencyLimiter.ts index d6e152786..20e76935a 100644 --- 
a/packages/job-queue/src/limiter/ConcurrencyLimiter.ts +++ b/packages/job-queue/src/limiter/ConcurrencyLimiter.ts @@ -23,13 +23,6 @@ export class ConcurrencyLimiter implements ILimiter { this.maxConcurrentJobs = maxConcurrentJobs; } - async canProceed(): Promise { - return ( - this.currentRunningJobs < this.maxConcurrentJobs && - Date.now() >= this.nextAllowedStartTime.getTime() - ); - } - /** Sentinel token; ConcurrencyLimiter has no per-row identity, just a counter. */ private static readonly SENTINEL = Symbol("ConcurrencyLimiter.acquired"); @@ -53,11 +46,7 @@ export class ConcurrencyLimiter implements ILimiter { this.currentRunningJobs = Math.max(0, this.currentRunningJobs - 1); } - async recordJobStart(): Promise { - this.currentRunningJobs++; - } - - async recordJobCompletion(): Promise { + async complete(_token: unknown): Promise { this.currentRunningJobs = Math.max(0, this.currentRunningJobs - 1); } diff --git a/packages/job-queue/src/limiter/DelayLimiter.ts b/packages/job-queue/src/limiter/DelayLimiter.ts index cc78ab9cd..938e0123c 100644 --- a/packages/job-queue/src/limiter/DelayLimiter.ts +++ b/packages/job-queue/src/limiter/DelayLimiter.ts @@ -14,10 +14,6 @@ export class DelayLimiter implements ILimiter { private lastAcquireBaseline: number = 0; constructor(private delayInMilliseconds: number = 50) {} - async canProceed(): Promise { - return Date.now() >= this.nextAvailableTime.getTime(); - } - /** * Token records the previous nextAvailableTime so release can roll back to * exactly the state before this acquire — even if other acquires (or @@ -43,12 +39,8 @@ export class DelayLimiter implements ILimiter { } } - async recordJobStart(): Promise { - this.nextAvailableTime = new Date(Date.now() + this.delayInMilliseconds); - } - - async recordJobCompletion(): Promise { - // No action needed. + async complete(_token: unknown): Promise { + // No-op — the delay window reservation must persist until the window expires. 
} async getNextAvailableTime(): Promise { diff --git a/packages/job-queue/src/limiter/EvenlySpacedRateLimiter.ts b/packages/job-queue/src/limiter/EvenlySpacedRateLimiter.ts index 5a4683e6d..668b7fbc7 100644 --- a/packages/job-queue/src/limiter/EvenlySpacedRateLimiter.ts +++ b/packages/job-queue/src/limiter/EvenlySpacedRateLimiter.ts @@ -23,7 +23,6 @@ export class EvenlySpacedRateLimiter implements ILimiter { private readonly windowSizeMs: number; private readonly idealInterval: number; private nextAvailableTime: number = Date.now(); - private lastStartTime: number = 0; private durations: number[] = []; /** Promise chain used to serialize concurrent {@link tryAcquire} callers. */ private acquireChain: Promise = Promise.resolve(); @@ -41,12 +40,6 @@ export class EvenlySpacedRateLimiter implements ILimiter { this.idealInterval = this.windowSizeMs / this.maxExecutions; } - /** Can we start a new job right now? */ - async canProceed(): Promise { - const now = Date.now(); - return now >= this.nextAvailableTime; - } - /** * Atomic acquire: serialized by an internal promise chain so two concurrent * acquirers cannot both observe `now >= nextAvailableTime` and both proceed. @@ -69,7 +62,6 @@ export class EvenlySpacedRateLimiter implements ILimiter { const priorNextAvailable = this.nextAvailableTime; // Reserve the slot by advancing nextAvailableTime now (recordJobStart-style) // so a follow-up tryAcquire from another caller in the same tick blocks. - this.lastStartTime = now; if (this.durations.length === 0) { this.nextAvailableTime = now + this.idealInterval; } else { @@ -103,36 +95,11 @@ export class EvenlySpacedRateLimiter implements ILimiter { } } - /** Record that a job is starting now. 
*/ - async recordJobStart(): Promise { - const now = Date.now(); - this.lastStartTime = now; - - // If no timing data yet, assume zero run-time (ideal interval) - if (this.durations.length === 0) { - this.nextAvailableTime = now + this.idealInterval; - } else { - // Compute average run duration - const sum = this.durations.reduce((a, b) => a + b, 0); - const avgDuration = sum / this.durations.length; - // Schedule next start: ideal spacing minus average duration - const waitMs = Math.max(0, this.idealInterval - avgDuration); - this.nextAvailableTime = now + waitMs; - } - } - /** - * Call this when a job finishes. - * We measure its duration, update our running-average, - * and then compute how long to wait before the next job start. + * No-op — rate window reservations must persist until the window expires. */ - async recordJobCompletion(): Promise { - const now = Date.now(); - const duration = now - this.lastStartTime; - this.durations.push(duration); - if (this.durations.length > this.maxExecutions) { - this.durations.shift(); - } + async complete(_token: unknown): Promise { + return Promise.resolve(); } async getNextAvailableTime(): Promise { @@ -149,6 +116,5 @@ export class EvenlySpacedRateLimiter implements ILimiter { async clear(): Promise { this.durations = []; this.nextAvailableTime = Date.now(); - this.lastStartTime = 0; } } diff --git a/packages/job-queue/src/limiter/ILimiter.ts b/packages/job-queue/src/limiter/ILimiter.ts index 8552e4cbb..a53dd432b 100644 --- a/packages/job-queue/src/limiter/ILimiter.ts +++ b/packages/job-queue/src/limiter/ILimiter.ts @@ -25,9 +25,8 @@ export type LimiterScope = "process" | "cluster"; * * The atomic primitive is {@link tryAcquire}: it both checks whether a job may * proceed and reserves the slot in a single uninterruptible step. Callers - * MUST use {@link tryAcquire}/{@link release} (not the legacy - * {@link canProceed}/{@link recordJobStart} pair) when correctness matters - * under concurrency. 
+ * MUST use {@link tryAcquire}/{@link release} when correctness matters under + * concurrency. */ export interface ILimiter { /** @@ -63,20 +62,22 @@ export interface ILimiter { release(token: unknown): Promise; /** - * Legacy non-binding "would tryAcquire succeed?" probe. SUBJECT TO RACES — - * do not use this followed by {@link recordJobStart} in production code; use - * {@link tryAcquire} instead. Retained for observability and tests. - */ - canProceed(): Promise; - - /** - * Legacy "force-record an execution" hook. SUBJECT TO RACES when paired with - * {@link canProceed} — use {@link tryAcquire} instead. Retained for tests - * and external bookkeeping. + * Signal that the job which acquired this token has finished executing. + * Called on the normal completion path (success, error, retry) to release + * resources held for the duration of the job. + * + * Semantics differ from {@link release}: `release` undoes a reservation as + * if the job never ran; `complete` finalises a reservation that was actually + * used. + * + * - `ConcurrencyLimiter`: decrements the running-job counter so the next + * job can acquire a slot. + * - `RateLimiter`: no-op — the window reservation was consumed and must + * persist until the window expires. + * - `CompositeLimiter`: forwards to each child limiter; all others: no-op.
*/ - recordJobStart(): Promise; + complete(token: unknown): Promise; - recordJobCompletion(): Promise; getNextAvailableTime(): Promise; setNextAvailableTime(date: Date): Promise; clear(): Promise; diff --git a/packages/job-queue/src/limiter/NullLimiter.ts b/packages/job-queue/src/limiter/NullLimiter.ts index 496632cb1..ddf6f7e84 100644 --- a/packages/job-queue/src/limiter/NullLimiter.ts +++ b/packages/job-queue/src/limiter/NullLimiter.ts @@ -26,15 +26,7 @@ export class NullLimiter implements ILimiter { // Do nothing } - async canProceed(): Promise { - return true; - } - - async recordJobStart(): Promise { - // Do nothing - } - - async recordJobCompletion(): Promise { + async complete(_token: unknown): Promise { // Do nothing } diff --git a/packages/job-queue/src/limiter/RateLimiter.ts b/packages/job-queue/src/limiter/RateLimiter.ts index b4f2008f6..107c41581 100644 --- a/packages/job-queue/src/limiter/RateLimiter.ts +++ b/packages/job-queue/src/limiter/RateLimiter.ts @@ -121,6 +121,14 @@ export class RateLimiter implements ILimiter { this.localBackoffUntilMs = 0; } + /** + * No-op for RateLimiter — the window reservation was consumed and must + * persist until the window expires. + */ + async complete(_token: unknown): Promise { + return Promise.resolve(); + } + protected addJitter(base: number): number { // full jitter in [base, 2*base) return base + Math.random() * base; @@ -133,69 +141,6 @@ export class RateLimiter implements ILimiter { ); } - /** - * Checks if a job can proceed based on rate limiting rules. 
- * @returns True if the job can proceed, false otherwise - */ - async canProceed(): Promise { - // First check if the window allows more executions - const windowStartTime = new Date(Date.now() - this.windowSizeInMilliseconds).toISOString(); - const attemptCount = await this.storage.getExecutionCount(this.queueName, windowStartTime); - const canProceedNow = attemptCount < this.maxExecutions; - - // If the window allows more executions, clear any backoff and proceed - if (canProceedNow) { - // Clear any existing nextAvailableTime backoff since the window allows more executions - const nextAvailableTime = await this.storage.getNextAvailableTime(this.queueName); - if (nextAvailableTime && new Date(nextAvailableTime).getTime() > Date.now()) { - // Clear the backoff by setting it to the past - const pastTime = new Date(Date.now() - 1000); - await this.storage.setNextAvailableTime(this.queueName, pastTime.toISOString()); - } - this.currentBackoffDelay = this.initialBackoffDelay; - return true; - } - - // Window is full, check if there's a backoff delay - const nextAvailableTime = await this.storage.getNextAvailableTime(this.queueName); - if (nextAvailableTime && new Date(nextAvailableTime).getTime() > Date.now()) { - this.increaseBackoff(); - return false; - } - - // Window is full but no backoff delay, so we can't proceed - this.increaseBackoff(); - return false; - } - - /** - * Records a new job attempt. 
- */ - async recordJobStart(): Promise { - await this.storage.recordExecution(this.queueName); - - const windowStartTime = new Date(Date.now() - this.windowSizeInMilliseconds).toISOString(); - const attemptCount = await this.storage.getExecutionCount(this.queueName, windowStartTime); - - if (attemptCount >= this.maxExecutions) { - const backoffExpires = new Date(Date.now() + this.addJitter(this.currentBackoffDelay)); - await this.setNextAvailableTime(backoffExpires); - } else { - // Window allows more executions, clear any existing nextAvailableTime by setting it to the past - const nextAvailableTime = await this.storage.getNextAvailableTime(this.queueName); - if (nextAvailableTime && new Date(nextAvailableTime).getTime() > Date.now()) { - // Clear the backoff since the window now allows more executions - // Set to a time in the past to effectively clear it - const pastTime = new Date(Date.now() - 1000); - await this.storage.setNextAvailableTime(this.queueName, pastTime.toISOString()); - } - } - } - - async recordJobCompletion(): Promise { - // Implementation can be no-op as completion doesn't affect rate limiting - } - /** * Retrieves the next available time for the specific queue. Returns the * latest of: the rate-limit wall (oldest execution + window), any externally diff --git a/packages/job-queue/src/queue-storage/IClaim.ts b/packages/job-queue/src/queue-storage/IClaim.ts new file mode 100644 index 000000000..885fe9da0 --- /dev/null +++ b/packages/job-queue/src/queue-storage/IClaim.ts @@ -0,0 +1,77 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { MessageId } from "./IMessageQueue"; + +export type { MessageId }; + +/** Optional shape passed by `IClaim.fail` to communicate the failure reason. */ +export interface ClaimFailOptions { + /** Human-readable error message persisted as `error`. */ + readonly error?: string | null; + /** Machine-readable error code persisted as `error_code`. 
*/ + readonly errorCode?: string | null; + /** + * True when the failure was caused by an abort request. The claim's + * implementation persists `abort_requested_at` accordingly (some backends + * keep a previously-set value if this is false). + */ + readonly abortRequested?: boolean; + /** + * Hint: this failure must not be retried. Consumers may use this to skip + * back-off scheduling. Storage layers may ignore the flag (the worker + * still owns retry-vs-fail orchestration via attempts/max_attempts). + */ + readonly permanent?: boolean; +} + +/** Optional shape passed by `IClaim.disable` (currently empty — kept for forward compat). */ +export interface ClaimDisableOptions { + // No options today. Kept as an explicit type so future opts don't break call sites. +} + +/** + * A claim on a message from the queue. + * + * A claim is created when a worker calls {@link IMessageQueue.receive}. It + * represents an exclusive (leased) right to process the message. The worker + * must terminate the claim via one of {@link IClaim.ack}, + * {@link IClaim.retry}, {@link IClaim.fail}, {@link IClaim.disable}, or by + * letting the lease expire. + */ +export interface IClaim { + readonly id: MessageId; + readonly body: Body; + readonly attempts: number; + /** + * Mark the message as successfully processed (terminal). Optional `result` + * is persisted atomically with the COMPLETED status so a crash between + * "save result" and "ack" can no longer leave a PROCESSING row with a + * result the worker thinks it stored. The shape of `result` depends on + * the message body; for job-queue claims this is the typed `Output`. + */ + ack(result?: unknown): Promise; + /** Release the claim and reschedule for a later attempt. */ + retry(opts?: { delaySeconds?: number }): Promise; + /** + * Mark the message as failed (terminal). Error/errorCode/abortRequested + * are persisted atomically with the FAILED status — no separate + * `saveError` call beforehand. 
The `permanent` flag is a hint to skip + * back-off scheduling; storage layers may ignore it. + */ + fail(opts?: ClaimFailOptions): Promise; + /** Extend the lease by `ms` milliseconds. */ + extendLease(ms: number): Promise; + /** + * Mark the message as DISABLED (terminal). Atomic: writes status = + * DISABLED, releases the lease, and clears progress fields in a single + * storage write. Does NOT write error/error_code — DISABLED is not an + * error transition. Optional in this interface for backward + * compatibility with existing claim impls; every in-tree IClaim + * implements it. + */ + disable?(opts?: ClaimDisableOptions): Promise; +} diff --git a/packages/job-queue/src/queue-storage/IJobStore.ts b/packages/job-queue/src/queue-storage/IJobStore.ts new file mode 100644 index 000000000..09513cf95 --- /dev/null +++ b/packages/job-queue/src/queue-storage/IJobStore.ts @@ -0,0 +1,62 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { MessageId } from "./IMessageQueue"; +import type { JobStatus, JobStorageFormat } from "./IQueueStorage"; + +/** + * Record describing a stored job. Currently an alias for the legacy + * {@link JobStorageFormat} so adapters and native implementations can share + * the same record shape. + */ +export type JobRecord = JobStorageFormat; + +/** + * Read- and mutation-side of the job queue. Paired with + * {@link IMessageQueue}. + */ +export interface IJobStore { + get(id: MessageId): Promise | undefined>; + peek(status?: JobStatus, num?: number): Promise[]>; + size(status?: JobStatus): Promise; + getByRunId(runId: string): Promise[]>; + outputForInput(input: Input): Promise; + saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise; + /** + * @deprecated H2 (libs): result is now written atomically with the + * COMPLETED status via {@link IClaim.ack}'s `result` argument. New code + * should call `claim.ack(output)` directly. 
This method is retained as + * a buffered no-op wrapper for one minor release so callers depending on + * a separate write step keep compiling; backends route the value through + * the pending-buffer until ack persists it. + */ + saveResult(id: MessageId, output: Output): Promise; + /** + * @deprecated H2 (libs): error fields are now written atomically with the + * FAILED status via {@link IClaim.fail}'s opts. New code should call + * `claim.fail({ error, errorCode, abortRequested })` directly. Retained + * as a buffered no-op wrapper for one minor release. + */ + saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise; + deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise; + /** Delete a single job by id. */ + delete(id: MessageId): Promise; + /** Delete every job in this store. */ + deleteAll(): Promise; + abort(id: MessageId): Promise; + /** Force-overwrite the status field without incrementing attempts. Used to persist DISABLED after lease release. */ + saveStatus(id: MessageId, status: JobStatus): Promise; +} diff --git a/packages/job-queue/src/queue-storage/IMessageQueue.ts b/packages/job-queue/src/queue-storage/IMessageQueue.ts new file mode 100644 index 000000000..5223698d3 --- /dev/null +++ b/packages/job-queue/src/queue-storage/IMessageQueue.ts @@ -0,0 +1,44 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IClaim } from "./IClaim"; +import type { QueueChangePayload, QueueStorageScope, QueueSubscribeOptions } from "./IQueueStorage"; + +export type MessageId = unknown; + +/** + * Options for sending a message to the queue. + */ +export interface SendOptions { + readonly delaySeconds?: number; + readonly timeoutSeconds?: number; + readonly fingerprint?: string; + readonly jobRunId?: string; + readonly maxAttempts?: number; +} + +/** + * Message queue interface — owns producing and consuming messages. 
+ * Pairs with {@link IJobStore} for read-side / mutation access to the + * stored job record. + */ +export interface IMessageQueue { + readonly scope: QueueStorageScope; + send(body: Body, opts?: SendOptions): Promise; + sendBatch(bodies: readonly Body[], opts?: SendOptions): Promise; + receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise[]>; + releaseClaim(id: MessageId): Promise; + migrate(): Promise; + getMigrations(): ReadonlyArray; + subscribeToChanges?( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void; +} diff --git a/packages/job-queue/src/queue-storage/IQueueStorage.ts b/packages/job-queue/src/queue-storage/IQueueStorage.ts index 973a6082d..81686b482 100644 --- a/packages/job-queue/src/queue-storage/IQueueStorage.ts +++ b/packages/job-queue/src/queue-storage/IQueueStorage.ts @@ -33,12 +33,11 @@ export interface QueueStorageOptions { readonly prefixValues?: Readonly>; } -export type JobStatus = "PENDING" | "PROCESSING" | "COMPLETED" | "ABORTING" | "FAILED" | "DISABLED"; +export type JobStatus = "PENDING" | "PROCESSING" | "COMPLETED" | "FAILED" | "DISABLED"; export const JobStatus = { PENDING: "PENDING", PROCESSING: "PROCESSING", COMPLETED: "COMPLETED", - ABORTING: "ABORTING", FAILED: "FAILED", DISABLED: "DISABLED", } as const satisfies Record; @@ -104,18 +103,24 @@ export type JobStorageFormat = { error?: string | null; error_code?: string | null; fingerprint?: string; - max_retries?: number; + /** + * Total attempt cap. A job with `max_attempts: 3` gets at most 3 executions total + * (not 3 retries after the first attempt). 
+ */ + max_attempts?: number; status?: JobStatus; created_at?: string; deadline_at?: string | null; - last_ran_at?: string | null; - run_after: string | null; + last_attempted_at?: string | null; + visible_at: string | null; completed_at: string | null; - run_attempts?: number; + attempts?: number; progress?: number; progress_message?: string; progress_details?: Record | null; - worker_id?: string | null; + lease_owner?: string | null; + abort_requested_at?: string | null; + lease_expires_at?: string | null; }; /** @@ -156,21 +161,36 @@ export interface IQueueStorage { get(id: unknown): Promise | undefined>; /** - * Gets the next job from the queue storage + * Gets the next job from the queue storage. Claims PENDING jobs that are + * ready, and also reclaims PROCESSING jobs whose lease has expired (crash + * recovery). Sets `lease_expires_at = now + leaseMs` on the claimed row. * @param workerId - Worker ID to associate with the job (required) + * @param opts - Optional options including leaseMs (default 30000) * @returns The next job from the queue storage */ - next(workerId: string): Promise | undefined>; + next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined>; + + /** + * Extend the lease on a currently PROCESSING job. Guards: lease_owner must + * match workerId and status must be PROCESSING. Throws if lease was lost. + * @param id - The ID of the job to extend the lease for + * @param workerId - Worker ID that must match the current lease owner + * @param ms - Number of milliseconds to extend the lease by + */ + extendLease(id: unknown, workerId: string, ms: number): Promise; /** * Releases a job that was just claimed by {@link next} but won't be * processed (e.g. the worker was stopped mid-claim). 
Resets status to - * PENDING and clears worker_id WITHOUT incrementing run_attempts — + * PENDING and clears lease_owner WITHOUT incrementing attempts — * the worker never actually attempted execution, so the retry budget * must be preserved. * @param id - The id of the claimed job to release. */ - release(id: unknown): Promise; + releaseClaim(id: unknown): Promise; /** * Peeks at the next job(s) from the queue storage without removing them @@ -188,11 +208,52 @@ export interface IQueueStorage { size(status?: JobStatus): Promise; /** - * Completes a job in the queue storage + * Completes a job in the queue storage. Bumps `attempts` (legacy contract, + * preserved for backward compatibility with code paths that rely on it, + * such as PENDING-retry rescheduling). + * + * NEW callers (worker ack/fail paths) should prefer {@link finalize}, which + * writes the terminal result fields WITHOUT touching `attempts` — a successful + * ack must not consume a retry attempt that was already accounted for by + * the lease-expiry reclaim or by the wrapper's retry path. * @param job - The job to complete */ complete(job: JobStorageFormat): Promise; + /** + * Terminal write for a claim: persists the listed fields WITHOUT bumping + * the `attempts` counter. A partial overwrite — fields not present in + * `fields` are untouched. + * + * Introduced to fix the bug where `WrappedClaim.ack`/`fail` going through + * `complete()` incremented `attempts` on a successful execution. The + * lease-expiry reclaim already charged this attempt at `next()` time; + * charging it again at `ack()` time double-counts and rolls a successful + * job into MAX_ATTEMPTS_REACHED prematurely. + * + * The `lease_owner` / progress fields are also writable here so the + * atomic `disable` path can release the lease and clear progress in the + * same single write. 
+ * + * @param id - The ID of the job to finalize + * @param fields - Terminal fields to write + */ + finalize( + id: unknown, + fields: { + output?: Output | null; + error?: string | null; + error_code?: string | null; + status?: JobStatus; + completed_at?: string | null; + abort_requested_at?: string | null; + lease_owner?: string | null; + progress?: number; + progress_message?: string; + progress_details?: Record | null; + } + ): Promise; + /** * Deletes all jobs from the queue storage */ diff --git a/packages/job-queue/src/queue-storage/InMemoryJobStore.ts b/packages/job-queue/src/queue-storage/InMemoryJobStore.ts new file mode 100644 index 000000000..be420680e --- /dev/null +++ b/packages/job-queue/src/queue-storage/InMemoryJobStore.ts @@ -0,0 +1,97 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobStore, JobRecord } from "./IJobStore"; +import type { MessageId } from "./IMessageQueue"; +import type { PendingInMemoryWrite } from "./InMemoryMessageQueue"; +import { InMemoryQueueStorage } from "./InMemoryQueueStorage"; +import type { JobStatus } from "./IQueueStorage"; + +export class InMemoryJobStore implements IJobStore { + /** @internal — shared with the paired message queue */ + public readonly core: InMemoryQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. 
*/ + private readonly pending: Map>; + + constructor( + core: InMemoryQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + get(id: MessageId): Promise | undefined> { + return this.core.get(id); + } + + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.core.peek(status, num); + } + + size(status?: JobStatus): Promise { + return this.core.size(status); + } + + async getByRunId(runId: string): Promise[]> { + return this.core.getByRunId(runId); + } + + outputForInput(input: Input): Promise { + return this.core.outputForInput(input); + } + + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.core.saveProgress(id, progress, message, details); + } + + async saveResult(id: MessageId, output: Output): Promise { + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? null; + this.pending.set(id, buf); + } + + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? 
{}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.core.deleteJobsByStatusAndAge(status, olderThanMs); + } + + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.core.delete(id); + } + + async deleteAll(): Promise { + this.pending.clear(); + await this.core.deleteAll(); + } + + async abort(id: MessageId): Promise { + await this.core.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + await this.core.saveStatus(id, status); + } +} diff --git a/packages/job-queue/src/queue-storage/InMemoryMessageQueue.ts b/packages/job-queue/src/queue-storage/InMemoryMessageQueue.ts new file mode 100644 index 000000000..78ef17b4f --- /dev/null +++ b/packages/job-queue/src/queue-storage/InMemoryMessageQueue.ts @@ -0,0 +1,233 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IClaim } from "./IClaim"; +import type { IMessageQueue, MessageId, SendOptions } from "./IMessageQueue"; +import { InMemoryQueueStorage } from "./InMemoryQueueStorage"; +import type { + JobStorageFormat, + QueueChangePayload, + QueueStorageScope, + QueueSubscribeOptions, +} from "./IQueueStorage"; + +/** + * Per-id buffer that lets {@link IJobStore.saveResult}/{@link IJobStore.saveError} + * stage output/error until the terminal claim.ack()/fail() persists them in + * a single complete() call (avoids double-bumping `attempts`). 
+ */ +export type PendingInMemoryWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class InMemoryClaim implements IClaim> { + constructor( + private readonly core: InMemoryQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + // H2 atomic: persist output + COMPLETED status in one finalize() call. + // Falls back to the legacy pending-buffer if no `result` was passed in. + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.core.finalize(this.id, { + output: output as Output | null, + error: null, + error_code: null, + status: "COMPLETED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const current = (await this.core.get(this.id)) ?? this.body; + await this.core.complete({ + ...current, + status: "PENDING", + lease_owner: null, + lease_expires_at: null, + visible_at: new Date(Date.now() + delay * 1000).toISOString(), + progress: 0, + progress_message: "", + progress_details: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + // H2 atomic: persist error/errorCode/abortRequested + FAILED status in + // one finalize() call. 
Falls back to the pending-buffer if the worker + // still went through jobStore.saveError before this. + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? buf.errorCode + : (current.error_code ?? null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.core.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: "FAILED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.core.extendLease(this.id, this.workerId, ms); + } + + /** + * Atomic disable (H5): one storage write — status=DISABLED, lease + * released, progress cleared. No error/error_code (DISABLED is not an + * error transition). + */ + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.core.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.core.finalize(this.id, { + status: "DISABLED", + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + } +} + +export class InMemoryMessageQueue implements IMessageQueue< + JobStorageFormat +> { + public readonly scope: QueueStorageScope = "process"; + + /** @internal — shared with the paired job store */ + public readonly core: InMemoryQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. 
*/ + private readonly pending: Map>; + + constructor( + core: InMemoryQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + return this.core.add(applySendOptions(body, opts)); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const job = await this.core.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!job) break; + claims.push( + new InMemoryClaim( + this.core, + this.pending, + job.id, + job, + job.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.core.releaseClaim(id); + } + + async migrate(): Promise { + await this.core.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.core.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.core.subscribeToChanges(callback, options); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) 
out.max_attempts = opts.maxAttempts; + return out; +} diff --git a/packages/job-queue/src/queue-storage/InMemoryQueueStorage.ts b/packages/job-queue/src/queue-storage/InMemoryQueueStorage.ts index 57d182ad2..6a3b87320 100644 --- a/packages/job-queue/src/queue-storage/InMemoryQueueStorage.ts +++ b/packages/job-queue/src/queue-storage/InMemoryQueueStorage.ts @@ -71,19 +71,6 @@ export class InMemoryQueueStorage implements IQueueStorage & Record> { - const now = new Date().toISOString(); - return this.jobQueue - .filter((job) => this.matchesPrefixes(job)) - .filter((job) => job.status === JobStatus.PENDING) - .filter((job) => !job.run_after || job.run_after <= now) - .sort((a, b) => (a.run_after || "").localeCompare(b.run_after || "")); - } - /** * Adds a new job to the queue * Generates an ID and fingerprint if not provided @@ -101,7 +88,7 @@ export class InMemoryQueueStorage implements IQueueStorage implements IQueueStorage this.matchesPrefixes(j)) - .sort((a, b) => (a.run_after || "").localeCompare(b.run_after || "")) + .sort((a, b) => (a.visible_at || "").localeCompare(b.visible_at || "")) .filter((j) => j.status === status) .slice(0, num); } /** - * Retrieves the next available job that is ready to be processed - * Updates the job status to PROCESSING before returning + * Retrieves the next available job that is ready to be processed. + * Claims PENDING jobs ready to run, and also reclaims PROCESSING jobs whose + * lease has expired (crash recovery). Sets lease_expires_at on the claimed row. * @param workerId - Worker ID to associate with the job + * @param opts - Optional options including leaseMs (default 30000) * @returns The next job or undefined if no job is available */ - public async next(workerId: string): Promise | undefined> { + public async next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined> { await sleep(0); - const top = this.pendingQueue(); + const leaseMs = opts?.leaseMs ?? 
30000; + const now = new Date().toISOString(); + const leaseExpiry = new Date(Date.now() + leaseMs).toISOString(); + + // First look for a normal PENDING job ready to run + const pending = this.jobQueue + .filter((job) => this.matchesPrefixes(job)) + .filter((job) => job.status === JobStatus.PENDING) + .filter((job) => !job.visible_at || job.visible_at <= now) + .sort((a, b) => (a.visible_at || "").localeCompare(b.visible_at || "")); - const job = top[0]; + // Also look for PROCESSING jobs with expired leases + const expiredLease = this.jobQueue + .filter((job) => this.matchesPrefixes(job)) + .filter((job) => job.status === JobStatus.PROCESSING) + .filter((job) => !job.lease_expires_at || job.lease_expires_at < now) + .sort((a, b) => (a.visible_at || "").localeCompare(b.visible_at || "")); + + const job = pending[0] ?? expiredLease[0]; if (job) { const oldJob = { ...job }; + // Lease-expiry reclaim (job was PROCESSING with expired lease) consumes + // one attempt against max_attempts; a fresh PENDING claim does not. + const isLeaseExpiryReclaim = job.status === JobStatus.PROCESSING; job.status = JobStatus.PROCESSING; - job.last_ran_at = new Date().toISOString(); - job.worker_id = workerId; + job.last_attempted_at = now; + job.lease_owner = workerId; + job.lease_expires_at = leaseExpiry; + if (isLeaseExpiryReclaim) { + job.attempts = (job.attempts ?? 0) + 1; + } + // Always clear stale abort_requested_at on (re)claim so a flag set by + // an earlier worker does not immediately abort the new lease. + (job as unknown as Record).abort_requested_at = null; this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); return job; } } + /** + * Extend the lease on a currently PROCESSING job. 
+ * @param id - The ID of the job to extend the lease for + * @param workerId - Worker ID that must match the current lease owner (lease_owner) + * @param ms - Number of milliseconds to extend the lease by + */ + public async extendLease(id: unknown, workerId: string, ms: number): Promise { + await sleep(0); + const job = this.jobQueue.find((j) => j.id === id && this.matchesPrefixes(j)); + if (!job || job.status !== JobStatus.PROCESSING || job.lease_owner !== workerId) { + throw new Error( + `extendLease failed: job ${String(id)} is not PROCESSING or lease is not owned by worker ${workerId}` + ); + } + const oldJob = { ...job }; + job.lease_expires_at = new Date(Date.now() + ms).toISOString(); + this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); + } + /** * Retrieves the size of the queue for a given status * @param status - The status of the jobs to retrieve. * @returns A promise that resolves to the number of jobs. */ - public async size(status = JobStatus.PENDING): Promise { + public async size(status: JobStatus = JobStatus.PENDING): Promise { await sleep(0); return this.jobQueue.filter((j) => this.matchesPrefixes(j) && j.status === status).length; } @@ -227,7 +264,7 @@ export class InMemoryQueueStorage implements IQueueStorage implements IQueueStorage j.id === job.id && this.matchesPrefixes(j)); if (index !== -1) { const existing = this.jobQueue[index]; - const currentAttempts = existing?.run_attempts ?? 0; - jobWithPrefixes.run_attempts = currentAttempts + 1; + const currentAttempts = existing?.attempts ?? 0; + jobWithPrefixes.attempts = currentAttempts + 1; + // PENDING-retry / terminal completion: clear abort_requested_at so an + // abort that was requested during the previous attempt does not + // immediately cancel the retry. Terminal statuses get a harmless + // cleanup of the same field. 
+ jobWithPrefixes.abort_requested_at = null; // Preserve prefix values from the existing job for (const [key, value] of Object.entries(this.prefixValues)) { jobWithPrefixes[key] = value; @@ -250,35 +292,96 @@ export class InMemoryQueueStorage implements IQueueStorage { + public async releaseClaim(id: unknown): Promise { await sleep(0); const job = this.jobQueue.find((j) => j.id === id && this.matchesPrefixes(j)); if (job) { const oldJob = { ...job }; job.status = JobStatus.PENDING; - job.worker_id = null; + job.lease_owner = null; job.progress = 0; job.progress_message = ""; job.progress_details = null; + // Clear stale abort_requested_at — an abort flag set during the previous + // claim must not survive the release and immediately cancel the next claim. + (job as unknown as Record).abort_requested_at = null; this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); } } /** - * Aborts a job + * Aborts a job. + * - If PENDING: immediately mark as FAILED with abort_requested_at set. + * - If PROCESSING: set abort_requested_at only (leave status as PROCESSING). + * - Otherwise: no-op. * @param id - The id of the job to abort. */ public async abort(id: unknown): Promise { await sleep(0); const job = this.jobQueue.find((j) => j.id === id && this.matchesPrefixes(j)); - if (job) { - const oldJob = { ...job }; - job.status = JobStatus.ABORTING; - this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); + if (!job) return; + const oldJob = { ...job }; + const now = new Date().toISOString(); + if (job.status === JobStatus.PENDING) { + job.status = JobStatus.FAILED; + job.abort_requested_at = now; + job.completed_at = now; + } else if (job.status === JobStatus.PROCESSING) { + job.abort_requested_at = now; + } else { + return; } + this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); + } + + /** + * Terminal write that does NOT bump `attempts`. See IQueueStorage.finalize + * for the rationale. 
+ */ + public async finalize( + id: unknown, + fields: { + output?: Output | null; + error?: string | null; + error_code?: string | null; + status?: JobStatus; + completed_at?: string | null; + abort_requested_at?: string | null; + lease_owner?: string | null; + progress?: number; + progress_message?: string; + progress_details?: Record | null; + } + ): Promise { + await sleep(0); + const job = this.jobQueue.find((j) => j.id === id && this.matchesPrefixes(j)); + if (!job) return; + const oldJob = { ...job }; + const target = job as JobStorageFormat & Record; + if ("output" in fields) target.output = (fields.output ?? null) as Output | null; + if ("error" in fields) target.error = fields.error ?? null; + if ("error_code" in fields) target.error_code = fields.error_code ?? null; + if ("status" in fields && fields.status !== undefined) target.status = fields.status; + if ("completed_at" in fields) target.completed_at = fields.completed_at ?? null; + if ("abort_requested_at" in fields) + target.abort_requested_at = fields.abort_requested_at ?? null; + if ("lease_owner" in fields) target.lease_owner = fields.lease_owner ?? null; + if ("progress" in fields) target.progress = fields.progress ?? 0; + if ("progress_message" in fields) target.progress_message = fields.progress_message ?? ""; + if ("progress_details" in fields) target.progress_details = fields.progress_details ?? null; + this.events.emit("change", { type: "UPDATE", old: oldJob, new: job }); + } + + /** + * Force-overwrite status without incrementing attempts. Standardized name — + * the legacy `updateJobStatus` alias was removed in favour of this name + * which mirrors `IJobStore.saveStatus` (the caller). 
+ */ + public async saveStatus(id: unknown, status: JobStatus): Promise { + await this.finalize(id, { status }); } /** diff --git a/packages/job-queue/src/queue-storage/TelemetryQueueStorage.ts b/packages/job-queue/src/queue-storage/TelemetryQueueStorage.ts index 1c934d582..1a1ae45ad 100644 --- a/packages/job-queue/src/queue-storage/TelemetryQueueStorage.ts +++ b/packages/job-queue/src/queue-storage/TelemetryQueueStorage.ts @@ -34,8 +34,18 @@ export class TelemetryQueueStorage implements IQueueStorage | undefined> { return traced("workglow.storage.queue.get", this.storageName, () => this.inner.get(id)); } - next(workerId: string): Promise | undefined> { - return traced("workglow.storage.queue.next", this.storageName, () => this.inner.next(workerId)); + next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined> { + return traced("workglow.storage.queue.next", this.storageName, () => + this.inner.next(workerId, opts) + ); + } + extendLease(id: unknown, workerId: string, ms: number): Promise { + return traced("workglow.storage.queue.extendLease", this.storageName, () => + this.inner.extendLease(id, workerId, ms) + ); } peek(status?: JobStatus, num?: number): Promise>> { return traced("workglow.storage.queue.peek", this.storageName, () => @@ -50,8 +60,29 @@ export class TelemetryQueueStorage implements IQueueStorage { - return traced("workglow.storage.queue.release", this.storageName, () => this.inner.release(id)); + finalize( + id: unknown, + fields: { + output?: Output | null; + error?: string | null; + error_code?: string | null; + status?: JobStatus; + completed_at?: string | null; + abort_requested_at?: string | null; + lease_owner?: string | null; + progress?: number; + progress_message?: string; + progress_details?: Record | null; + } + ): Promise { + return traced("workglow.storage.queue.finalize", this.storageName, () => + this.inner.finalize(id, fields) + ); + } + releaseClaim(id: unknown): Promise { + return 
traced("workglow.storage.queue.releaseClaim", this.storageName, () => + this.inner.releaseClaim(id) + ); } deleteAll(): Promise { return traced("workglow.storage.queue.deleteAll", this.storageName, () => diff --git a/packages/job-queue/src/queue-storage/createInMemoryQueue.ts b/packages/job-queue/src/queue-storage/createInMemoryQueue.ts new file mode 100644 index 000000000..e0104695c --- /dev/null +++ b/packages/job-queue/src/queue-storage/createInMemoryQueue.ts @@ -0,0 +1,33 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { InMemoryJobStore } from "./InMemoryJobStore"; +import { InMemoryMessageQueue, type PendingInMemoryWrite } from "./InMemoryMessageQueue"; +import { InMemoryQueueStorage } from "./InMemoryQueueStorage"; +import type { QueueStorageOptions } from "./IQueueStorage"; + +/** + * Factory for the paired in-memory message queue and job store. Both + * facades share a single underlying {@link InMemoryQueueStorage} so writes + * through one are observable through the other. + */ +export function createInMemoryQueue( + queueName: string = "default", + opts?: QueueStorageOptions +): { + messageQueue: InMemoryMessageQueue; + jobStore: InMemoryJobStore; + /** @internal — exposed for callers that still need the legacy storage object. 
*/ + core: InMemoryQueueStorage; +} { + const core = new InMemoryQueueStorage(queueName, opts); + const pending = new Map>(); + return { + messageQueue: new InMemoryMessageQueue(core, pending), + jobStore: new InMemoryJobStore(core, pending), + core, + }; +} diff --git a/packages/job-queue/src/queue-storage/wrapQueueStorage.ts b/packages/job-queue/src/queue-storage/wrapQueueStorage.ts new file mode 100644 index 000000000..8b057e017 --- /dev/null +++ b/packages/job-queue/src/queue-storage/wrapQueueStorage.ts @@ -0,0 +1,333 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IClaim } from "./IClaim"; +import type { IJobStore, JobRecord } from "./IJobStore"; +import type { IMessageQueue, MessageId, SendOptions } from "./IMessageQueue"; +import type { + IQueueStorage, + JobStatus, + JobStorageFormat, + QueueChangePayload, + QueueSubscribeOptions, +} from "./IQueueStorage"; + +/** + * Transient per-id buffer of outputs / errors written via + * {@link IJobStore.saveResult} / {@link IJobStore.saveError} ahead of the + * terminal {@link IClaim.ack} / {@link IClaim.fail} that actually persists + * the row. Folding both into a single legacy `storage.complete(...)` call + * avoids double-incrementing the `attempts` counter on backends whose + * `complete()` always bumps it. 
+ */ +type PendingWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class WrappedClaim implements IClaim> { + constructor( + private readonly storage: IQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + // Atomic ack: persist result + terminal status in a single write so a + // crash between "save result" and "set COMPLETED" cannot leave a row + // PROCESSING with an output the worker thinks it saved. The caller may + // pass `result` explicitly (preferred — H2 atomicity guarantee) or rely + // on the legacy pending-buffer fallback for callers still going through + // jobStore.saveResult(...). + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.storage.get(this.id)) ?? this.body; + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.storage.finalize(this.id, { + // `output` cast — finalize is typed against Output but receives the + // result the worker passed in; the queue body's Output and the claim's + // Output align by construction. + output: output as never, + error: null, + error_code: null, + status: "COMPLETED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const visibleAt = new Date(Date.now() + delay * 1000).toISOString(); + const current = (await this.storage.get(this.id)) ?? 
this.body; + await this.storage.complete({ + ...current, + status: "PENDING", + lease_owner: null, + lease_expires_at: null, + visible_at: visibleAt, + progress: 0, + progress_message: "", + progress_details: null, + // Clear abort_requested_at on retry — an abort flag set during the + // failed attempt must not survive into the next retry. Mirrors what + // each storage backend does at the SQL level in `complete()` for + // PENDING-retry, but applying it here in the wrapper guarantees the + // behaviour for storages that route writes through the wrapper. + abort_requested_at: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; // hint — worker owns retry-vs-fail decision + // Prefer explicitly-provided args (H2 atomicity). Fall back to the + // pending buffer for callers that still route through jobStore.saveError. + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.storage.get(this.id)) ?? this.body; + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? buf.errorCode + : (current.error_code ?? null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.storage.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: "FAILED", + completed_at: current.completed_at ?? 
new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.storage.extendLease(this.id, this.workerId, ms); + } + + /** + * Atomic disable (H5): one storage write that sets status=DISABLED, + * releases the lease, and clears progress fields. Does NOT write + * error/error_code — DISABLED is a normal terminal transition, not a + * failure. Replaces the legacy two-write `claim.fail()` then + * `jobStore.saveStatus(DISABLED)` path that briefly published FAILED + * to subscribers before overwriting with DISABLED, firing a spurious + * `job_error` event in between. + */ + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.storage.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.storage.finalize(this.id, { + status: "DISABLED", + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + // No error / error_code writes — DISABLED is not an error state. + }); + } +} + +class WrappedMessageQueue implements IMessageQueue> { + public get scope() { + return this.storage.scope; + } + + constructor( + private readonly storage: IQueueStorage, + private readonly pending: Map> + ) {} + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + const job = applySendOptions(body, opts); + return this.storage.add(job); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 
1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const next = await this.storage.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!next) break; + claims.push( + new WrappedClaim( + this.storage, + this.pending, + next.id, + next, + next.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.storage.releaseClaim(id); + } + + async migrate(): Promise { + await this.storage.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.storage.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.storage.subscribeToChanges(callback, options); + } +} + +class WrappedJobStore implements IJobStore { + constructor( + private readonly storage: IQueueStorage, + private readonly pending: Map> + ) {} + + get(id: MessageId): Promise | undefined> { + return this.storage.get(id); + } + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.storage.peek(status, num); + } + size(status?: JobStatus): Promise { + return this.storage.size(status); + } + async getByRunId(runId: string): Promise[]> { + return this.storage.getByRunId(runId); + } + outputForInput(input: Input): Promise { + return this.storage.outputForInput(input); + } + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.storage.saveProgress(id, progress, message, details); + } + async saveResult(id: MessageId, output: Output): Promise { + // Buffered until claim.ack() persists it. Calling storage.complete() here + // would double-bump `attempts` on backends whose complete() increments it. + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? 
null; + this.pending.set(id, buf); + } + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? {}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.storage.deleteJobsByStatusAndAge(status, olderThanMs); + } + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.storage.delete(id); + } + async deleteAll(): Promise { + this.pending.clear(); + await this.storage.deleteAll(); + } + async abort(id: MessageId): Promise { + await this.storage.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + // Use finalize() so the status write does not bump attempts. The previous + // implementation went through complete() and pre-decremented attempts by 1 + // to offset the increment — a fragile compensation that broke if attempts + // was undefined or the storage didn't bump (the bug it was working around + // is fixed by finalize itself). 
+ await this.storage.finalize(id, { status }); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) out.max_attempts = opts.maxAttempts; + return out; +} + +export function wrapQueueStorage( + storage: IQueueStorage +): { + messageQueue: IMessageQueue>; + jobStore: IJobStore; +} { + const pending = new Map>(); + return { + messageQueue: new WrappedMessageQueue(storage, pending), + jobStore: new WrappedJobStore(storage, pending), + }; +} diff --git a/packages/task-graph/src/task-graph/TaskGraphRunner.ts b/packages/task-graph/src/task-graph/TaskGraphRunner.ts index c791b8d63..f7558e4ed 100644 --- a/packages/task-graph/src/task-graph/TaskGraphRunner.ts +++ b/packages/task-graph/src/task-graph/TaskGraphRunner.ts @@ -605,6 +605,14 @@ export class TaskGraphRunner { this.runScheduler.armGraphTimeout(config.timeout, ctx); } + // Notify the disposal strategy that a new run is starting. Inactivity + // strategies clear any pending idle timers here, closing the race + // window where a stale timer armed at the previous runComplete could + // dispose a resource the new run is about to touch. 
+ if (this.resourceScope) { + await this.resourceScope.runStart(); + } + // Early-out if parent signal was already aborted (RunContext constructor // already aborted ctx.abortController in that case) if (ctx.abortController.signal.aborted) return; diff --git a/packages/task-graph/src/task/TaskRunner.ts b/packages/task-graph/src/task/TaskRunner.ts index 36630e26c..1a814a2ee 100644 --- a/packages/task-graph/src/task/TaskRunner.ts +++ b/packages/task-graph/src/task/TaskRunner.ts @@ -573,6 +573,14 @@ export class TaskRunner< this.resourceScope = config.resourceScope; } + // Notify the disposal strategy that a new run is starting. Inactivity + // strategies use this hook to clear any pending idle timers that were + // armed at the previous `runComplete`, closing the race window where a + // timer could fire mid-run and dispose a resource we are about to use. + if (this.resourceScope) { + await this.resourceScope.runStart(); + } + // Early-out if parent signal was already aborted (TaskRunContext constructor // already aborted ctx.abortController in that case) if (ctx.abortController.signal.aborted) return; diff --git a/packages/tasks/src/task/FetchUrlTask.ts b/packages/tasks/src/task/FetchUrlTask.ts index 020c55338..5ccb37c9c 100644 --- a/packages/tasks/src/task/FetchUrlTask.ts +++ b/packages/tasks/src/task/FetchUrlTask.ts @@ -519,9 +519,9 @@ export class FetchUrlTask< throw executeContext.signal.reason ?? 
new AbortSignalJobError("The operation was aborted"); } - const handle = await registeredQueue.client.submit(jobInput as Input, { + const handle = await registeredQueue.client.send(jobInput as Input, { jobRunId: this.runConfig.runnerId, - maxRetries: 10, + maxAttempts: 10, }); // Wire abort signal to queued job diff --git a/packages/test/src/contract/tabular-storage/assertions/subscribeToChanges.ts b/packages/test/src/contract/tabular-storage/assertions/subscribeToChanges.ts index 530eb8556..c7da3cdc3 100644 --- a/packages/test/src/contract/tabular-storage/assertions/subscribeToChanges.ts +++ b/packages/test/src/contract/tabular-storage/assertions/subscribeToChanges.ts @@ -15,16 +15,84 @@ import { import { itExpectFail } from "../../itExpectFail"; import type { TabularStorageContractOpts } from "../types"; +/** + * Subscription contract. Selection between the two blocks below is driven by + * `opts.usesPolling`: + * + * - `usesPolling: false` → `subscribeToChanges.eventDriven` — strict commit + * order (event-driven subscriptions like Postgres LISTEN/NOTIFY or the + * in-memory broadcast bus emit one event per write, in write order). + * - `usesPolling: true` → `subscribeToChanges.polling` — set equality plus + * event count (polling-based subscriptions diff a snapshot and have no + * way to preserve commit order). + * + * When `capabilities.supportsSubscriptions` is true, `usesPolling` is + * required on the contract opts; the type guarantees this at compile time. + */ export function subscribeToChangesBlock(opts: TabularStorageContractOpts): void { + if (!opts.capabilities.supportsSubscriptions) { + describe.skip("subscribeToChanges", () => { + it("skipped: capabilities.supportsSubscriptions === false", () => {}); + }); + return; + } + const expectFails = new Set(opts.expectedFailures ?? []); const itImpl = expectFails.has("subscribeToChanges") ? 
itExpectFail : it; + // `usesPolling` is required when `supportsSubscriptions: true` (enforced by + // the discriminated `TabularStorageContractOpts` union). The non-null + // assertion below is safe given the early-return on `supportsSubscriptions`. + const usesPolling = opts.usesPolling!; + const pollingIntervalMs = opts.pollingIntervalMs ?? 1; + const waitTime = usesPolling ? Math.max(pollingIntervalMs * 8, 200) : 50; + const initWaitTime = usesPolling ? Math.max(pollingIntervalMs * 10, 300) : 10; + + if (usesPolling) { + describe("subscribeToChanges.polling", () => { + let storage: ITabularStorage; + + beforeEach(async () => { + storage = await opts.createStorage(); + await storage.setupDatabase?.(); + }); + + afterEach(async () => { + await storage.deleteAll(); + storage.destroy?.(); + }); + + itImpl( + "observes every write (set equality + count, order unspecified)", + async () => { + const changes: TabularChangePayload>[] = []; + const unsubscribe = storage.subscribeToChanges((change) => changes.push(change), { + pollingIntervalMs, + }); - describe.skipIf(!opts.capabilities.supportsSubscriptions)("subscribeToChanges", () => { - const usesPolling = opts.usesPolling ?? false; - const pollingIntervalMs = opts.pollingIntervalMs ?? 1; - const waitTime = usesPolling ? Math.max(pollingIntervalMs * 8, 200) : 50; - const initWaitTime = usesPolling ? Math.max(pollingIntervalMs * 10, 300) : 10; + await sleep(initWaitTime); + await storage.put({ name: "t1", type: "s1", option: "v1", success: true }); + await storage.put({ name: "t2", type: "s2", option: "v2", success: false }); + await storage.put({ name: "t3", type: "s3", option: "v3", success: true }); + + await sleep(waitTime); + + const writeEvents = changes.filter((c) => c.type === "INSERT" || c.type === "UPDATE"); + // Polling diffs a snapshot: every write must be visible exactly once, + // but commit order is unspecified. 
+ expect(writeEvents.length).toBe(3); + const options = writeEvents.map((e) => e.new?.option).sort(); + expect(options).toEqual(["v1", "v2", "v3"]); + + unsubscribe(); + }, + opts.timeout + ); + }); + return; + } + + describe("subscribeToChanges.eventDriven", () => { let storage: ITabularStorage; beforeEach(async () => { @@ -54,18 +122,11 @@ export function subscribeToChangesBlock(opts: TabularStorageContractOpts): void await sleep(waitTime); const writeEvents = changes.filter((c) => c.type === "INSERT" || c.type === "UPDATE"); + // Event-driven backends must emit one event per write in commit order. expect(writeEvents.length).toBe(3); - - const options = writeEvents.map((e) => e.new?.option).sort(); - if (usesPolling) { - // Polling detects all writes in one snapshot diff — order is unspecified - expect(options).toEqual(["v1", "v2", "v3"]); - } else { - // Event-driven subscriptions emit in write order - expect(writeEvents[0].new?.option).toBe("v1"); - expect(writeEvents[1].new?.option).toBe("v2"); - expect(writeEvents[2].new?.option).toBe("v3"); - } + expect(writeEvents[0].new?.option).toBe("v1"); + expect(writeEvents[1].new?.option).toBe("v2"); + expect(writeEvents[2].new?.option).toBe("v3"); unsubscribe(); }, diff --git a/packages/test/src/contract/tabular-storage/types.ts b/packages/test/src/contract/tabular-storage/types.ts index 12cb333c9..7384804d5 100644 --- a/packages/test/src/contract/tabular-storage/types.ts +++ b/packages/test/src/contract/tabular-storage/types.ts @@ -33,24 +33,13 @@ export type TabularStorageContractAssertion = | "withTransactionRollback" | "countMatchesQuery"; -export interface TabularStorageContractOpts { +interface TabularStorageContractBaseOpts { readonly name: string; readonly skip?: boolean; readonly timeout?: number; readonly createStorage: () => Promise< ITabularStorage >; - readonly capabilities: { - readonly supportsSubscriptions: boolean; - readonly supportsVectorColumns: boolean; - readonly supportsTransactions: boolean; - 
/** Whether `query(criteria)` is supported. False for FsFolder etc. */ - readonly supportsQuery: boolean; - }; - /** Whether this storage uses polling (requires longer waits between steps). */ - readonly usesPolling?: boolean; - /** Polling interval forwarded to subscribeToChanges for polling-based implementations. */ - readonly pollingIntervalMs?: number; /** * Required when capabilities.supportsVectorColumns is true. Creates a fresh * storage instance typed to VectorItemSchema for the round-trip assertion. @@ -65,3 +54,44 @@ export interface TabularStorageContractOpts { */ readonly expectedFailures?: ReadonlyArray; } + +interface SubscriptionCapableCapabilities { + readonly supportsSubscriptions: true; + readonly supportsVectorColumns: boolean; + readonly supportsTransactions: boolean; + /** Whether `query(criteria)` is supported. False for FsFolder etc. */ + readonly supportsQuery: boolean; +} + +interface SubscriptionCapableOpts extends TabularStorageContractBaseOpts { + readonly capabilities: SubscriptionCapableCapabilities; + /** + * Required when supportsSubscriptions is true. Selects between the two + * subscribeToChanges contract blocks: + * - false → eventDriven (strict commit order) + * - true → polling (set equality + count) + */ + readonly usesPolling: boolean; + /** Polling interval forwarded to subscribeToChanges for polling-based implementations. */ + readonly pollingIntervalMs?: number; +} + +interface SubscriptionIncapableCapabilities { + readonly supportsSubscriptions: false; + readonly supportsVectorColumns: boolean; + readonly supportsTransactions: boolean; + /** Whether `query(criteria)` is supported. False for FsFolder etc. */ + readonly supportsQuery: boolean; +} + +interface SubscriptionIncapableOpts extends TabularStorageContractBaseOpts { + readonly capabilities: SubscriptionIncapableCapabilities; + /** + * Only meaningful when supportsSubscriptions is true. Permitted (but + * ignored) on incapable backends to keep wiring ergonomic. 
+ */ + readonly usesPolling?: boolean; + readonly pollingIntervalMs?: number; +} + +export type TabularStorageContractOpts = SubscriptionCapableOpts | SubscriptionIncapableOpts; diff --git a/packages/test/src/test/job-queue/InMemoryJobQueue.test.ts b/packages/test/src/test/job-queue/InMemoryJobQueue.test.ts index 0a81bb718..6390cbeff 100644 --- a/packages/test/src/test/job-queue/InMemoryJobQueue.test.ts +++ b/packages/test/src/test/job-queue/InMemoryJobQueue.test.ts @@ -4,9 +4,20 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { InMemoryQueueStorage, InMemoryRateLimiterStorage, RateLimiter } from "@workglow/job-queue"; -import { setLogger } from "@workglow/util"; -import { describe } from "vitest"; +import type { DeadLetter, IJobExecuteContext } from "@workglow/job-queue"; +import { + ConcurrencyLimiter, + InMemoryQueueStorage, + InMemoryRateLimiterStorage, + Job, + JobQueueClient, + JobQueueServer, + JobStatus, + RateLimiter, + RetryableJobError, +} from "@workglow/job-queue"; +import { setLogger, sleep, uuid4 } from "@workglow/util"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { getTestingLogger } from "../../binding/TestingLogger"; import { runGenericJobQueueTests } from "./genericJobQueueTests"; @@ -22,3 +33,362 @@ describe("InMemoryJobQueue", () => { }) ); }); + +// --------------------------------------------------------------------------- +// New tests for abort_requested_at and lease expiry (PR 2) +// --------------------------------------------------------------------------- + +interface TI { + readonly taskType?: string; + readonly data?: string; + readonly [key: string]: unknown; +} +interface TO { + readonly result?: string; + readonly [key: string]: unknown; +} + +class SimpleTestJob extends Job { + public override async execute(input: TI, context: IJobExecuteContext): Promise { + if (input.taskType === "long_running") { + return new Promise((_, reject) => { + context.signal.addEventListener("abort", () => 
reject(new Error("Aborted")), { + once: true, + }); + }); + } + return { result: "done" }; + } +} + +describe("InMemoryQueueStorage — abort_requested_at & lease expiry", () => { + let storage: InMemoryQueueStorage; + let queueName: string; + + beforeEach(async () => { + queueName = `test-lease-${uuid4()}`; + storage = new InMemoryQueueStorage(queueName); + await storage.migrate(); + }); + + afterEach(async () => { + await storage.deleteAll(); + }); + + it("abort PENDING → immediate FAILED with abort_requested_at set", async () => { + const id = await storage.add({ input: { data: "x" }, visible_at: null, completed_at: null }); + expect(id).toBeDefined(); + + // Job is PENDING before abort + const before = await storage.get(id); + expect(before?.status).toBe(JobStatus.PENDING); + expect(before?.abort_requested_at).toBeFalsy(); + + await storage.abort(id); + + const after = await storage.get(id); + expect(after?.status).toBe(JobStatus.FAILED); + expect(after?.abort_requested_at).toBeTruthy(); + // No ABORTING status ever appears + expect(after?.status).not.toBe("ABORTING"); + }); + + it("abort PROCESSING → sets abort_requested_at only, leaves status PROCESSING", async () => { + const id = await storage.add({ input: { data: "y" }, visible_at: null, completed_at: null }); + // Claim it + await storage.next("worker-1"); + + const processing = await storage.get(id); + expect(processing?.status).toBe(JobStatus.PROCESSING); + + await storage.abort(id); + + const after = await storage.get(id); + expect(after?.status).toBe(JobStatus.PROCESSING); + expect(after?.abort_requested_at).toBeTruthy(); + expect(after?.status).not.toBe("ABORTING"); + }); + + it("lease expiry re-claim: second worker claims job after first lease expires", async () => { + const id = await storage.add({ input: { data: "z" }, visible_at: null, completed_at: null }); + + // Claim with a very short lease (10ms) + const claimed1 = await storage.next("worker-1", { leaseMs: 10 }); + 
expect(claimed1?.id).toBe(id); + expect(claimed1?.lease_owner).toBe("worker-1"); + + // Second worker immediately — should NOT claim (lease still active) + const tooEarly = await storage.next("worker-2", { leaseMs: 30000 }); + expect(tooEarly).toBeUndefined(); + + // Wait for lease to expire + await sleep(30); + + // Now worker-2 should reclaim it + const claimed2 = await storage.next("worker-2", { leaseMs: 30000 }); + expect(claimed2?.id).toBe(id); + expect(claimed2?.lease_owner).toBe("worker-2"); + expect(claimed2?.status).toBe(JobStatus.PROCESSING); + }); + + it("extendLease keeps job alive past original expiry", async () => { + const id = await storage.add({ input: { data: "w" }, visible_at: null, completed_at: null }); + + // Claim with a short lease (20ms) + const claimed = await storage.next("worker-a", { leaseMs: 20 }); + expect(claimed?.id).toBe(id); + + // Extend the lease to 5 seconds before it expires + await sleep(5); + await storage.extendLease(id, "worker-a", 5000); + + // Wait past the original 20ms lease + await sleep(30); + + // worker-b should NOT be able to reclaim because the lease was extended + const notClaimed = await storage.next("worker-b", { leaseMs: 30000 }); + expect(notClaimed).toBeUndefined(); + + // worker-a's job should still be PROCESSING and owned by worker-a + const job = await storage.get(id); + expect(job?.status).toBe(JobStatus.PROCESSING); + expect(job?.lease_owner).toBe("worker-a"); + }); + + it("extendLease throws if lease is not owned by worker", async () => { + const id = await storage.add({ input: { data: "v" }, visible_at: null, completed_at: null }); + await storage.next("worker-x"); + + await expect(storage.extendLease(id, "worker-y", 5000)).rejects.toThrow(/extendLease failed/); + }); + + it("abort PROCESSING worker observes abort_requested_at via checkForAbortingJobs", async () => { + const server = new JobQueueServer(SimpleTestJob, { + storage: storage as any, + queueName, + pollIntervalMs: 5, + stopTimeoutMs: 0, + 
}); + const client = new JobQueueClient({ storage: storage as any, queueName }); + client.attach(server); + + await server.start(); + + const handle = await client.send({ taskType: "long_running", data: "abort-test" }); + + // Wait for PROCESSING + for (let i = 0; i < 200; i++) { + const j = await client.getJob(handle.id); + if (j?.status === JobStatus.PROCESSING) break; + await sleep(5); + } + expect((await client.getJob(handle.id))?.status).toBe(JobStatus.PROCESSING); + + // Abort via storage directly (simulating cross-process abort) + await storage.abort(handle.id); + + // Wait for job to fail + let failed = false; + for (let i = 0; i < 200; i++) { + const j = await client.getJob(handle.id); + if (j?.status === JobStatus.FAILED) { + failed = true; + break; + } + await sleep(5); + } + + await server.stop(); + expect(failed).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Minimal in-memory IMessageQueue for DLQ testing +// --------------------------------------------------------------------------- + +import type { IClaim, IMessageQueue, MessageId } from "@workglow/job-queue"; + +class CollectingQueue implements IMessageQueue { + public readonly scope = "process" as const; + public readonly messages: Body[] = []; + + async send(body: Body): Promise { + this.messages.push(body); + return this.messages.length - 1; + } + + async sendBatch(bodies: readonly Body[]): Promise { + return Promise.all(bodies.map((b) => this.send(b))); + } + + async receive(_opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise[]> { + return []; + } + + async releaseClaim(_id: MessageId): Promise {} + + async migrate(): Promise {} + + getMigrations(): ReadonlyArray { + return []; + } +} + +// --------------------------------------------------------------------------- +// DLQ tests (PR 5) +// --------------------------------------------------------------------------- + +class AlwaysFailJob extends Job { + 
public override async execute(_input: TI, _context: IJobExecuteContext): Promise { + throw new RetryableJobError("always fails"); + } +} + +describe("InMemoryJobQueue — dead-letter queue (PR 5)", () => { + let storage: InMemoryQueueStorage; + let queueName: string; + + beforeEach(async () => { + queueName = `test-dlq-${uuid4()}`; + storage = new InMemoryQueueStorage(queueName); + await storage.migrate(); + }); + + afterEach(async () => { + await storage.deleteAll(); + }); + + it("exhausted job lands in DLQ with correct fields", async () => { + const dlq = new CollectingQueue>(); + + const server = new JobQueueServer(AlwaysFailJob, { + storage: storage as any, + queueName, + pollIntervalMs: 5, + stopTimeoutMs: 0, + deadLetter: dlq, + }); + const client = new JobQueueClient({ storage: storage as any, queueName }); + client.attach(server); + + await server.start(); + + // maxAttempts=1 so the job exhausts on the first failure + const handle = await client.send({ data: "dlq-test" }, { maxAttempts: 1 }); + + // Wait for FAILED + for (let i = 0; i < 200; i++) { + const j = await client.getJob(handle.id); + if (j?.status === JobStatus.FAILED) break; + await sleep(5); + } + expect((await client.getJob(handle.id))?.status).toBe(JobStatus.FAILED); + + await server.stop(); + + expect(dlq.messages).toHaveLength(1); + const letter = dlq.messages[0]; + expect(letter.original).toEqual({ data: "dlq-test" }); + expect(letter.error).toBeTruthy(); + expect(letter.queueName).toBe(queueName); + expect(letter.attempts).toBeGreaterThanOrEqual(0); + }); + + it("deadLetter: 'discard' (default) — exhausted job reaches FAILED, DLQ stays empty", async () => { + const dlq = new CollectingQueue>(); + + // Deliberately do NOT pass deadLetter — default is "discard" + const server = new JobQueueServer(AlwaysFailJob, { + storage: storage as any, + queueName, + pollIntervalMs: 5, + stopTimeoutMs: 0, + }); + const client = new JobQueueClient({ storage: storage as any, queueName }); + 
client.attach(server); + + await server.start(); + + const handle = await client.send({ data: "discard-test" }, { maxAttempts: 1 }); + + // Wait for FAILED + for (let i = 0; i < 200; i++) { + const j = await client.getJob(handle.id); + if (j?.status === JobStatus.FAILED) break; + await sleep(5); + } + expect((await client.getJob(handle.id))?.status).toBe(JobStatus.FAILED); + + await server.stop(); + + // DLQ was never passed to the server, so nothing was forwarded + expect(dlq.messages).toHaveLength(0); + }); +}); + +// --------------------------------------------------------------------------- +// Prefetch tests (PR 5) +// --------------------------------------------------------------------------- + +describe("InMemoryJobQueue — worker prefetch (PR 5)", () => { + it("prefetch: 4 with ConcurrencyLimiter(2) — at most 2 jobs run concurrently", async () => { + const queueName = `test-prefetch-${uuid4()}`; + const storage = new InMemoryQueueStorage(queueName); + await storage.migrate(); + + let concurrent = 0; + let maxConcurrent = 0; + + class TrackedSlowJob extends Job { + public override async execute(_input: TI, _context: IJobExecuteContext): Promise { + concurrent++; + maxConcurrent = Math.max(maxConcurrent, concurrent); + await sleep(30); + concurrent--; + return { result: "done" }; + } + } + + const limiter = new ConcurrencyLimiter(2); + + const server = new JobQueueServer(TrackedSlowJob, { + storage: storage as any, + queueName, + pollIntervalMs: 5, + stopTimeoutMs: 200, + limiter, + prefetch: 4, + }); + const client = new JobQueueClient({ storage: storage as any, queueName }); + client.attach(server); + + await server.start(); + + // Submit 6 jobs + const handles = await Promise.all( + Array.from({ length: 6 }, (_, i) => client.send({ data: `job-${i}` })) + ); + + // Wait for all to complete + for (let i = 0; i < 500; i++) { + const statuses = await Promise.all(handles.map((h) => client.getJob(h.id))); + if (statuses.every((j) => j?.status === 
JobStatus.COMPLETED)) break; + await sleep(10); + } + + const statuses = await Promise.all(handles.map((h) => client.getJob(h.id))); + expect(statuses.every((j) => j?.status === JobStatus.COMPLETED)).toBe(true); + + // ConcurrencyLimiter(2) must have enforced a max of 2 concurrent jobs + expect(maxConcurrent).toBeLessThanOrEqual(2); + expect(maxConcurrent).toBeGreaterThanOrEqual(1); + + await server.stop(); + await storage.deleteAll(); + }); +}); diff --git a/packages/test/src/test/job-queue/Limiters.test.ts b/packages/test/src/test/job-queue/Limiters.test.ts index 3ed214667..0aa1368e6 100644 --- a/packages/test/src/test/job-queue/Limiters.test.ts +++ b/packages/test/src/test/job-queue/Limiters.test.ts @@ -17,10 +17,6 @@ import { beforeEach, describe, expect, it } from "vitest"; describe("NullLimiter", () => { const limiter = new NullLimiter(); - it("should always allow proceeding", async () => { - expect(await limiter.canProceed()).toBe(true); - }); - it("should always tryAcquire successfully", async () => { expect(await limiter.tryAcquire()).not.toBeNull(); expect(await limiter.tryAcquire()).not.toBeNull(); @@ -30,9 +26,7 @@ describe("NullLimiter", () => { expect(limiter.scope).toBe("process"); }); - it("should not throw on any method call", async () => { - await expect(limiter.recordJobStart()).resolves.toBeUndefined(); - await expect(limiter.recordJobCompletion()).resolves.toBeUndefined(); + it("should not throw on release, setNextAvailableTime, or clear", async () => { await expect(limiter.release(null)).resolves.toBeUndefined(); await expect(limiter.setNextAvailableTime(new Date())).resolves.toBeUndefined(); await expect(limiter.clear()).resolves.toBeUndefined(); @@ -72,41 +66,32 @@ describe("ConcurrencyLimiter", () => { expect(await limiter.tryAcquire()).not.toBeNull(); }); - it("should allow proceeding when under limit", async () => { - expect(await limiter.canProceed()).toBe(true); - }); - - it("should block when at concurrency limit", async () => { - 
await limiter.recordJobStart(); - await limiter.recordJobStart(); - expect(await limiter.canProceed()).toBe(false); - }); - - it("should allow again after job completion", async () => { - await limiter.recordJobStart(); - await limiter.recordJobStart(); - expect(await limiter.canProceed()).toBe(false); - await limiter.recordJobCompletion(); - expect(await limiter.canProceed()).toBe(true); + it("tryAcquire should return null when at concurrency limit", async () => { + await limiter.tryAcquire(); + await limiter.tryAcquire(); + expect(await limiter.tryAcquire()).toBeNull(); }); - it("should not go below zero running jobs", async () => { - await limiter.recordJobCompletion(); - await limiter.recordJobCompletion(); - expect(await limiter.canProceed()).toBe(true); + it("tryAcquire should succeed again after release", async () => { + const t1 = await limiter.tryAcquire(); + const t2 = await limiter.tryAcquire(); + expect(await limiter.tryAcquire()).toBeNull(); + await limiter.release(t1); + expect(await limiter.tryAcquire()).not.toBeNull(); + await limiter.release(t2); }); it("should reset on clear", async () => { - await limiter.recordJobStart(); - await limiter.recordJobStart(); + await limiter.tryAcquire(); + await limiter.tryAcquire(); await limiter.clear(); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); it("should respect setNextAvailableTime", async () => { const future = new Date(Date.now() + 100_000); await limiter.setNextAvailableTime(future); - expect(await limiter.canProceed()).toBe(false); + expect(await limiter.tryAcquire()).toBeNull(); }); }); @@ -127,80 +112,92 @@ describe("DelayLimiter", () => { expect(successes).toBe(1); }); - it("should allow proceeding initially", async () => { - expect(await limiter.canProceed()).toBe(true); + it("should allow tryAcquire initially", async () => { + expect(await limiter.tryAcquire()).not.toBeNull(); }); - it("should block after recording a job start", async () 
=> { - await limiter.recordJobStart(); - expect(await limiter.canProceed()).toBe(false); + it("should block tryAcquire during delay window", async () => { + await limiter.tryAcquire(); + expect(await limiter.tryAcquire()).toBeNull(); }); - it("should allow proceeding after delay expires", async () => { + it("should allow tryAcquire after delay expires", async () => { const shortDelayLimiter = new DelayLimiter(10); - await shortDelayLimiter.recordJobStart(); + await shortDelayLimiter.tryAcquire(); await sleep(20); - expect(await shortDelayLimiter.canProceed()).toBe(true); + expect(await shortDelayLimiter.tryAcquire()).not.toBeNull(); }); it("should reset on clear", async () => { - await limiter.recordJobStart(); + await limiter.tryAcquire(); await limiter.clear(); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); it("should only update nextAvailableTime if later", async () => { const past = new Date(Date.now() - 1000); await limiter.setNextAvailableTime(past); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); }); describe("CompositeLimiter", () => { - it("should proceed when all limiters agree", async () => { + it("should tryAcquire successfully when all limiters agree", async () => { const limiter = new CompositeLimiter([new NullLimiter(), new NullLimiter()]); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); - it("should block when any limiter blocks", async () => { + it("should return null from tryAcquire when any limiter is at capacity", async () => { const concurrency = new ConcurrencyLimiter(1); - await concurrency.recordJobStart(); + await concurrency.tryAcquire(); const limiter = new CompositeLimiter([new NullLimiter(), concurrency]); - expect(await limiter.canProceed()).toBe(false); + expect(await limiter.tryAcquire()).toBeNull(); }); it("should addLimiter dynamically", async () => { const 
limiter = new CompositeLimiter(); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); const blocking = new ConcurrencyLimiter(0); limiter.addLimiter(blocking); - expect(await limiter.canProceed()).toBe(false); + expect(await limiter.tryAcquire()).toBeNull(); }); - it("should propagate recordJobStart to all limiters", async () => { + it("should propagate tryAcquire state to all child limiters", async () => { const concurrency = new ConcurrencyLimiter(2); const limiter = new CompositeLimiter([concurrency]); - await limiter.recordJobStart(); - await limiter.recordJobStart(); - expect(await concurrency.canProceed()).toBe(false); + await limiter.tryAcquire(); + await limiter.tryAcquire(); + expect(await concurrency.tryAcquire()).toBeNull(); }); it("should return latest getNextAvailableTime across limiters", async () => { - const delay1 = new DelayLimiter(10); const delay2 = new DelayLimiter(1000); + await delay2.tryAcquire(); const before = Date.now(); - await delay2.recordJobStart(); - const limiter = new CompositeLimiter([delay1, delay2]); + const limiter = new CompositeLimiter([new DelayLimiter(10), delay2]); const nextTime = await limiter.getNextAvailableTime(); expect(nextTime.getTime()).toBeGreaterThan(before + 500); }); it("should propagate clear to all limiters", async () => { const concurrency = new ConcurrencyLimiter(1); - await concurrency.recordJobStart(); + await concurrency.tryAcquire(); const limiter = new CompositeLimiter([concurrency]); await limiter.clear(); - expect(await concurrency.canProceed()).toBe(true); + expect(await concurrency.tryAcquire()).not.toBeNull(); + }); + + it("should roll back already-acquired slots when a later child fails", async () => { + const first = new ConcurrencyLimiter(1); + const second = new ConcurrencyLimiter(1); + // Saturate second so the composite will fail on it + await second.tryAcquire(); + const composite = new CompositeLimiter([first, second]); + const result = 
await composite.tryAcquire(); + expect(result).toBeNull(); + // first's slot should have been rolled back — it can be acquired again + const token = await first.tryAcquire(); + expect(token).not.toBeNull(); }); }); @@ -229,9 +226,9 @@ describe("EvenlySpacedRateLimiter", () => { ); }); - it("should allow proceeding initially", async () => { + it("should allow tryAcquire initially", async () => { const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 10, windowSizeInSeconds: 1 }); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); it("should report process scope", () => { @@ -247,36 +244,35 @@ describe("EvenlySpacedRateLimiter", () => { expect(successes).toBe(1); }); - it("should space requests by setting next available time", async () => { + it("should space requests by advancing next available time after tryAcquire", async () => { const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 10, windowSizeInSeconds: 10 }); - await limiter.recordJobStart(); + await limiter.tryAcquire(); // idealInterval = 10000/10 = 1000ms, so next available should be ~1s from now const nextTime = await limiter.getNextAvailableTime(); expect(nextTime.getTime()).toBeGreaterThan(Date.now() + 500); }); - it("should track job completion durations", async () => { - const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 10, windowSizeInSeconds: 1 }); - await limiter.recordJobStart(); - await limiter.recordJobCompletion(); - // After recording completion, a second start should account for duration - await limiter.recordJobStart(); - const nextTime = await limiter.getNextAvailableTime(); - expect(nextTime.getTime()).toBeGreaterThanOrEqual(Date.now()); - }); - it("should reset on clear", async () => { const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 1, windowSizeInSeconds: 100 }); - await limiter.recordJobStart(); - expect(await limiter.canProceed()).toBe(false); + await limiter.tryAcquire(); + expect(await 
limiter.tryAcquire()).toBeNull(); await limiter.clear(); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); }); it("should only update nextAvailableTime if later via setNextAvailableTime", async () => { const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 10, windowSizeInSeconds: 1 }); const past = new Date(Date.now() - 1000); await limiter.setNextAvailableTime(past); - expect(await limiter.canProceed()).toBe(true); + expect(await limiter.tryAcquire()).not.toBeNull(); + }); + + it("release should roll back the slot so a follow-up tryAcquire succeeds", async () => { + const limiter = new EvenlySpacedRateLimiter({ maxExecutions: 1, windowSizeInSeconds: 100 }); + const token = await limiter.tryAcquire(); + expect(token).not.toBeNull(); + expect(await limiter.tryAcquire()).toBeNull(); + await limiter.release(token); + expect(await limiter.tryAcquire()).not.toBeNull(); }); }); diff --git a/packages/test/src/test/job-queue/RateLimiter.test.ts b/packages/test/src/test/job-queue/RateLimiter.test.ts index 8ebe2148c..6279fc987 100644 --- a/packages/test/src/test/job-queue/RateLimiter.test.ts +++ b/packages/test/src/test/job-queue/RateLimiter.test.ts @@ -120,46 +120,6 @@ describe("RateLimiter", () => { }); }); - describe("canProceed", () => { - it("should allow when execution count is below limit", async () => { - const limiter = new RateLimiter(storage, "queue", { - maxExecutions: 5, - windowSizeInSeconds: 60, - }); - expect(await limiter.canProceed()).toBe(true); - }); - - it("should block when execution count meets limit", async () => { - storage._setExecutionCount(5); - const limiter = new RateLimiter(storage, "queue", { - maxExecutions: 5, - windowSizeInSeconds: 60, - }); - expect(await limiter.canProceed()).toBe(false); - }); - }); - - describe("recordJobStart", () => { - it("should call storage.recordExecution", async () => { - const limiter = new RateLimiter(storage, "queue", { - maxExecutions: 10, - 
windowSizeInSeconds: 60, - }); - await limiter.recordJobStart(); - expect(storage.recordExecution).toHaveBeenCalledWith("queue"); - }); - }); - - describe("recordJobCompletion", () => { - it("should be a no-op", async () => { - const limiter = new RateLimiter(storage, "queue", { - maxExecutions: 10, - windowSizeInSeconds: 60, - }); - await expect(limiter.recordJobCompletion()).resolves.toBeUndefined(); - }); - }); - describe("clear", () => { it("should clear storage", async () => { const limiter = new RateLimiter(storage, "queue", { diff --git a/packages/test/src/test/job-queue/TelemetryQueueStorage.test.ts b/packages/test/src/test/job-queue/TelemetryQueueStorage.test.ts index 819ef81cd..e1b639a28 100644 --- a/packages/test/src/test/job-queue/TelemetryQueueStorage.test.ts +++ b/packages/test/src/test/job-queue/TelemetryQueueStorage.test.ts @@ -38,7 +38,7 @@ describe("TelemetryQueueStorage", () => { it("should forward add and create a span", async () => { const id = await wrapped.add({ input: { data: "test" }, - run_after: null, + visible_at: null, completed_at: null, }); expect(id).toBeDefined(); @@ -53,7 +53,7 @@ describe("TelemetryQueueStorage", () => { it("should forward next and create a span", async () => { await inner.add({ input: { data: "test" }, - run_after: null, + visible_at: null, completed_at: null, }); const job = await wrapped.next("worker-1"); @@ -70,7 +70,7 @@ describe("TelemetryQueueStorage", () => { it("should forward deleteAll and create a span", async () => { await inner.add({ input: { data: "test" }, - run_after: null, + visible_at: null, completed_at: null, }); await wrapped.deleteAll(); @@ -80,7 +80,7 @@ describe("TelemetryQueueStorage", () => { it("should forward get and create a span", async () => { const id = await inner.add({ input: { data: "test" }, - run_after: null, + visible_at: null, completed_at: null, }); const job = await wrapped.get(id); diff --git a/packages/test/src/test/job-queue/genericJobQueueTests.ts 
b/packages/test/src/test/job-queue/genericJobQueueTests.ts index 68fdc8ac7..e27d37be9 100644 --- a/packages/test/src/test/job-queue/genericJobQueueTests.ts +++ b/packages/test/src/test/job-queue/genericJobQueueTests.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { IQueueStorage, JobHandle } from "@workglow/job-queue"; +import type { IQueueStorage, JobHandle, JobStorageFormat } from "@workglow/job-queue"; import { AbortSignalJobError, IJobExecuteContext, @@ -215,7 +215,7 @@ export function runGenericJobQueueTests( describe("Basics", () => { it("should add a job to the queue", async () => { - const handle = await client.submit({ taskType: "task1", data: "input1" }); + const handle = await client.send({ taskType: "task1", data: "input1" }); expect(await client.size()).toBe(1); const retrievedJob = await client.getJob(handle.id); expect(retrievedJob?.status).toBe(JobStatus.PENDING); @@ -243,7 +243,7 @@ export function runGenericJobQueueTests( await server.start(); // Add and complete a job - const handle = await client.submit({ taskType: "other", data: "input1" }); + const handle = await client.send({ taskType: "other", data: "input1" }); await handle.waitFor(); const jobExists = !!(await client.getJob(handle.id)); @@ -261,7 +261,7 @@ export function runGenericJobQueueTests( await server.start(); // Add and complete a job - const handle = await client.submit({ taskType: "other", data: "input1" }); + const handle = await client.send({ taskType: "other", data: "input1" }); await handle.waitFor(); // Give a small delay @@ -291,7 +291,7 @@ export function runGenericJobQueueTests( await server.start(); // Test completed job - immediate deletion happens in completeJob - const completedHandle = await client.submit({ taskType: "other", data: "input1" }); + const completedHandle = await client.send({ taskType: "other", data: "input1" }); await completedHandle.waitFor(); // Small delay to allow cleanup @@ -300,7 +300,7 @@ export function 
runGenericJobQueueTests( expect(completedJobExists).toBe(false); // Test failed job - const failedHandle = await client.submit({ taskType: "failing", data: "input2" }); + const failedHandle = await client.send({ taskType: "failing", data: "input2" }); try { await failedHandle.waitFor(); } catch (error) { @@ -316,8 +316,8 @@ export function runGenericJobQueueTests( it("should process jobs and get stats", async () => { await server.start(); - const handle1 = await client.submit({ taskType: "other", data: "input1" }); - const handle2 = await client.submit({ taskType: "other", data: "input2" }); + const handle1 = await client.send({ taskType: "other", data: "input1" }); + const handle2 = await client.send({ taskType: "other", data: "input2" }); await handle1.waitFor(); await handle2.waitFor(); @@ -329,15 +329,15 @@ export function runGenericJobQueueTests( }); it("should clear all jobs in the queue", async () => { - await client.submit({ taskType: "task1", data: "input1" }); - await client.submit({ taskType: "task1", data: "input1" }); + await client.send({ taskType: "task1", data: "input1" }); + await client.send({ taskType: "task1", data: "input1" }); expect(await client.size()).toBe(2); await storage.deleteAll(); expect(await client.size()).toBe(0); }); it("should retrieve the output for a given task type and input", async () => { - const handle = await client.submit({ taskType: "task1", data: "input1" }); + const handle = await client.send({ taskType: "task1", data: "input1" }); await server.start(); await handle.waitFor(); const output = await client.outputForInput({ taskType: "task1", data: "input1" }); @@ -345,10 +345,10 @@ export function runGenericJobQueueTests( }); it("should run the queue and execute all", async () => { - await client.submit({ taskType: "task1", data: "input1" }); - await client.submit({ taskType: "task2", data: "input2" }); - await client.submit({ taskType: "task1", data: "input1" }); - const lastHandle = await client.submit({ taskType: 
"task2", data: "input2" }); + await client.send({ taskType: "task1", data: "input1" }); + await client.send({ taskType: "task2", data: "input2" }); + await client.send({ taskType: "task1", data: "input1" }); + const lastHandle = await client.send({ taskType: "task2", data: "input2" }); await server.start(); await lastHandle.waitFor(); await server.stop(); @@ -361,9 +361,9 @@ export function runGenericJobQueueTests( const totalJobs = 16; const maxAllowed = 4; // limiter: 4 per 60s for (let i = 0; i < totalJobs - 1; i++) { - await client.submit({ taskType: "task1", data: `input${i}` }); + await client.send({ taskType: "task1", data: `input${i}` }); } - await client.submit({ taskType: "task2", data: "input_last" }); + await client.send({ taskType: "task2", data: "input_last" }); await server.start(); @@ -385,7 +385,7 @@ export function runGenericJobQueueTests( }); it("should abort a long-running job and trigger the abort event", async () => { - const handle = await client.submit({ taskType: "long_running", data: "input101" }); + const handle = await client.send({ taskType: "long_running", data: "input101" }); let abortEventTriggered = false; client.on("job_aborting", (_qn: string, eventJobId: unknown) => { @@ -425,25 +425,25 @@ export function runGenericJobQueueTests( }); expect(abortEventTriggered).toBe(true); const finalJob = await client.getJob(handle.id); - expect(finalJob?.status).toBeOneOf([JobStatus.FAILED, JobStatus.ABORTING]); + expect(finalJob?.status).toBe(JobStatus.FAILED); }); it("should abort all jobs in a job run while leaving other jobs unaffected", async () => { const jobRunId1 = "test-run-1"; const jobRunId2 = "test-run-2"; - const handle1 = await client.submit( + const handle1 = await client.send( { taskType: "long_running", data: "input1" }, { jobRunId: jobRunId1 } ); - const handle2 = await client.submit( + const handle2 = await client.send( { taskType: "long_running", data: "input2" }, { jobRunId: jobRunId1 } ); - const handle3 = await 
client.submit( + const handle3 = await client.send( { taskType: "long_running", data: "input3" }, { jobRunId: jobRunId2 } ); - const handle4 = await client.submit( + const handle4 = await client.send( { taskType: "long_running", data: "input4" }, { jobRunId: jobRunId2 } ); @@ -463,28 +463,20 @@ export function runGenericJobQueueTests( } await client.abortJobRun(jobRunId1); - while (attempts < 50) { - const job3Status = (await client.getJob(handle3.id))?.status; - const job4Status = (await client.getJob(handle4.id))?.status; - if ( - (job3Status === JobStatus.FAILED || job3Status === JobStatus.ABORTING) && - (job4Status === JobStatus.FAILED || job4Status === JobStatus.ABORTING) - ) { + // Wait for handle1 and handle2 (jobRunId1) to be aborted/failed + while (attempts < 200) { + const job1Status = (await client.getJob(handle1.id))?.status; + const job2Status = (await client.getJob(handle2.id))?.status; + if (job1Status === JobStatus.FAILED && job2Status === JobStatus.FAILED) { break; } - await sleep(1); + await sleep(5); attempts++; } // Verify job statuses - expect((await client.getJob(handle1.id))?.status).toBeOneOf([ - JobStatus.FAILED, - JobStatus.ABORTING, - ]); - expect((await client.getJob(handle2.id))?.status).toBeOneOf([ - JobStatus.FAILED, - JobStatus.ABORTING, - ]); + expect((await client.getJob(handle1.id))?.status).toBe(JobStatus.FAILED); + expect((await client.getJob(handle2.id))?.status).toBe(JobStatus.FAILED); const job3Status = (await client.getJob(handle3.id))?.status; const job4Status = (await client.getJob(handle4.id))?.status; @@ -493,7 +485,7 @@ export function runGenericJobQueueTests( }); it("should wait for a job to complete", async () => { - const handle = await client.submit({ taskType: "task1", data: "input1" }); + const handle = await client.send({ taskType: "task1", data: "input1" }); await server.start(); const output = await handle.waitFor(); expect(output).toEqual({ result: "output1" }); @@ -542,10 +534,10 @@ export function 
runGenericJobQueueTests( try { // Add jobs to both queues - const handle1 = await client1.submit({ taskType: "task1", data: "queue1-job1" }); - const handle2 = await client1.submit({ taskType: "task1", data: "queue1-job2" }); - const handle3 = await client2.submit({ taskType: "task1", data: "queue2-job1" }); - const handle4 = await client2.submit({ taskType: "task1", data: "queue2-job2" }); + const handle1 = await client1.send({ taskType: "task1", data: "queue1-job1" }); + const handle2 = await client1.send({ taskType: "task1", data: "queue1-job2" }); + const handle3 = await client2.send({ taskType: "task1", data: "queue2-job1" }); + const handle4 = await client2.send({ taskType: "task1", data: "queue2-job2" }); // Verify each queue only sees its own jobs expect(await client1.size()).toBe(2); @@ -631,7 +623,7 @@ export function runGenericJobQueueTests( client.attach(server); await server.start(); - const handle = await client.submit({ taskType: "other", data: "input-wake" }); + const handle = await client.send({ taskType: "other", data: "input-wake" }); const start = Date.now(); const result = (await Promise.race([ handle.waitFor(), @@ -645,7 +637,7 @@ export function runGenericJobQueueTests( itFastWake("deferred submit wakes before the poll interval elapses", async () => { // pollIntervalMs is 60s — without notify() flipping hasDeferredJobs, the - // worker would sleep through the full 60s and miss the runAfter deadline. + // worker would sleep through the full 60s and miss the visible_at deadline. 
await server.stop(); const limiter = await limiterFactory?.(queueName, 4, 60); server = new JobQueueServer(TestJob, { @@ -657,10 +649,9 @@ export function runGenericJobQueueTests( client.attach(server); await server.start(); - const runAfter = new Date(Date.now() + 200); - const handle = await client.submit( + const handle = await client.send( { taskType: "other", data: "deferred-wake" }, - { runAfter } + { delaySeconds: 0.2 } ); const start = Date.now(); @@ -674,7 +665,7 @@ export function runGenericJobQueueTests( expect(Date.now() - start).toBeLessThan(5_000); }); - itFastWake("abort resolves quickly without waiting for an ABORTING poll", async () => { + itFastWake("abort resolves quickly via in-process requestAbort path", async () => { // Long poll interval so the only route to abort delivery is the // in-process requestAbort path (Change 3). await server.stop(); @@ -688,7 +679,7 @@ export function runGenericJobQueueTests( client.attach(server); await server.start(); - const handle = await client.submit({ taskType: "long_running", data: "to-abort" }); + const handle = await client.send({ taskType: "long_running", data: "to-abort" }); // Wait for the worker to pick it up (accommodate slower async storages). for (let i = 0; i < 300; i++) { @@ -724,7 +715,7 @@ export function runGenericJobQueueTests( await server.start(); // Submit a job that sleeps briefly, then completes. - const handle = await client.submit({ taskType: "other", data: "drain" }); + const handle = await client.send({ taskType: "other", data: "drain" }); // Wait until PROCESSING, then stop — the drain should wait for it to finish. 
for (let i = 0; i < 50; i++) { @@ -751,7 +742,7 @@ export function runGenericJobQueueTests( try { await server.start(); - const handle = await client.submit({ taskType: "progress", data: "track-progress" }); + const handle = await client.send({ taskType: "progress", data: "track-progress" }); await handle.waitFor(); expect(saveProgressCalls).toBe(0); } finally { @@ -769,7 +760,7 @@ export function runGenericJobQueueTests( details: Record | null; }> = []; - const handle = await client.submit({ taskType: "progress", data: "input1" }); + const handle = await client.send({ taskType: "progress", data: "input1" }); // Listen for progress events client.on( @@ -822,7 +813,7 @@ export function runGenericJobQueueTests( details: Record | null; }> = []; - const handle = await client.submit({ taskType: "progress", data: "input1" }); + const handle = await client.send({ taskType: "progress", data: "input1" }); // Add job-specific listener const cleanup = handle.onProgress( @@ -866,7 +857,7 @@ export function runGenericJobQueueTests( // Set up multiple jobs that take some time to complete const handles = []; for (let i = 0; i < 10; i++) { - const handle = await client.submit({ taskType: "progress", data: `input${i}` }); + const handle = await client.send({ taskType: "progress", data: `input${i}` }); handles.push(handle); } @@ -890,7 +881,7 @@ export function runGenericJobQueueTests( // Add burst of jobs for (let i = 0; i < numJobs; i++) { - const handle = await client.submit({ taskType: "other", data: `input${i}` }); + const handle = await client.send({ taskType: "other", data: `input${i}` }); handles.push(handle); } @@ -918,11 +909,11 @@ export function runGenericJobQueueTests( // between those reads, producing transient under/over-count snapshots on // faster Vitest/Node runs. 
async function getJobCounts( - runAttempts = 50, + attempts = 50, retryDelay = 5 ): Promise<{ pending: number; processing: number; completed: number }> { let lastCounts = { pending: 0, processing: 0, completed: 0 }; - for (let i = 0; i < runAttempts; i++) { + for (let i = 0; i < attempts; i++) { try { const jobs = await Promise.all(handles.map((handle) => client.getJob(handle.id))); const pending = jobs.filter((job) => job?.status === JobStatus.PENDING).length; @@ -935,7 +926,7 @@ export function runGenericJobQueueTests( return lastCounts; } } catch (err) { - if (i === runAttempts - 1) throw err; + if (i === attempts - 1) throw err; } await sleep(retryDelay); } @@ -964,7 +955,7 @@ export function runGenericJobQueueTests( // Try to add jobs faster than the rate limit for (let i = 0; i < 30; i++) { - const handle = await client.submit({ taskType: "progress", data: `input${i}` }); + const handle = await client.send({ taskType: "progress", data: `input${i}` }); handles.push(handle); } @@ -987,7 +978,7 @@ export function runGenericJobQueueTests( describe("Job Queue Restart", () => { it("should recover rate limits after pause", async () => { // Add a single quick job to test rate limiting - const initialHandle = await client.submit({ taskType: "other", data: "test_job" }); + const initialHandle = await client.send({ taskType: "other", data: "test_job" }); // Start queue and wait for job to complete await server.start(); @@ -1001,7 +992,7 @@ export function runGenericJobQueueTests( await server.stop(); // Add another job after pause - const newHandle = await client.submit({ taskType: "other", data: "after_pause" }); + const newHandle = await client.send({ taskType: "other", data: "after_pause" }); const pendingJob = await client.getJob(newHandle.id); expect(pendingJob?.status).toBe(JobStatus.PENDING); @@ -1019,9 +1010,9 @@ export function runGenericJobQueueTests( describe("Error Handling", () => { it("should handle job failures and mark job as failed", async () => { - 
const handle = await client.submit( + const handle = await client.send( { taskType: "failing", data: "will-fail" }, - { maxRetries: 0 } + { maxAttempts: 1 } ); let error: Error | null = null; @@ -1039,13 +1030,18 @@ export function runGenericJobQueueTests( expect(failedJob?.status).toBe(JobStatus.FAILED); expect(failedJob?.error).toBe("Job failed as expected"); expect(failedJob?.errorCode).toBe("JobError"); - expect(failedJob?.runAttempts).toBe(1); + // Post-finalize semantics (C2 + M4): a single failed attempt that + // exhausts maxAttempts=1 ends the run via failJob → claim.fail() → + // storage.finalize(). finalize() does NOT bump `attempts`, so the + // counter remains 0. (The old code bumped via complete() which is + // exactly the double-counting bug being fixed.) + expect(failedJob?.attempts).toBe(0); }); - it("should retry a failed job up to maxRetries", async () => { - const handle = await client.submit( + it("should retry a failed job up to maxAttempts", async () => { + const handle = await client.send( { taskType: "failing_retryable", data: "will-retry" }, - { maxRetries: 2 } + { maxAttempts: 3 } ); let error: Error | null = null; @@ -1063,8 +1059,13 @@ export function runGenericJobQueueTests( const failedJob = await client.getJob(handle.id); expect(failedJob?.status).toBe(JobStatus.FAILED); - expect(failedJob?.runAttempts).toBe(3); // Should have attempted 3 times - expect(failedJob?.error).toBe("Max retries reached"); + // Post-finalize semantics: the PENDING-retry path bumps attempts in + // storage.complete() — so the first two retries bump from 0→1→2. + // The third (final) attempt fails permanently and goes through + // failJob → claim.fail() → finalize() which does NOT bump. Final + // value: 2. (The old behaviour bumped here too, yielding 3.) 
+ expect(failedJob?.attempts).toBe(2); + expect(failedJob?.error).toBe("Max attempts reached"); await server.stop(); }); @@ -1073,9 +1074,9 @@ export function runGenericJobQueueTests( const telemetry = new RecordingTelemetryProvider(); setTelemetryProvider(telemetry); - const handle = await client.submit( + const handle = await client.send( { taskType: "failing_retryable", data: "will-retry" }, - { maxRetries: 2 } + { maxAttempts: 3 } ); try { @@ -1088,16 +1089,16 @@ export function runGenericJobQueueTests( const span = telemetry.spans.at(-1); expect(span?.status).toEqual({ code: SpanStatusCode.ERROR, - message: "Max retries reached", + message: "Max attempts reached", }); - expect(span?.attributes["workglow.job.error"]).toBe("Max retries reached"); + expect(span?.attributes["workglow.job.error"]).toBe("Max attempts reached"); }); it("should handle permanent failures without retrying", async () => { await server.start(); - const handle = await client.submit( + const handle = await client.send( { taskType: "permanent_fail", data: "no-retry" }, - { maxRetries: 2 } + { maxAttempts: 3 } ); let error: Error | null = null; @@ -1113,7 +1114,10 @@ export function runGenericJobQueueTests( const failedJob = await client.getJob(handle.id); expect(failedJob?.status).toBe(JobStatus.FAILED); expect(failedJob?.error).toBe("Permanent failure - do not retry"); - expect(failedJob?.runAttempts).toBe(1); // Should not retry permanent failures + // A permanent failure on the first attempt skips rescheduleJob and goes + // straight to failJob → claim.fail() → finalize(), which does NOT bump + // attempts (C2 + M4). Final counter: 0. 
+ expect(failedJob?.attempts).toBe(0); await server.stop(); }); @@ -1130,9 +1134,9 @@ export function runGenericJobQueueTests( errorEventError = error; }); - const handle = await client.submit( + const handle = await client.send( { taskType: "failing", data: "will-fail" }, - { maxRetries: 0 } + { maxAttempts: 1 } ); try { @@ -1146,4 +1150,198 @@ export function runGenericJobQueueTests( expect(errorEventError).toContain("Job failed as expected"); }); }); + + describe("atomic disableJob (H5)", () => { + it("disable() writes DISABLED in a single storage write — never observes FAILED", async () => { + // The H5 contract: disableJob writes status=DISABLED in one storage + // operation. The legacy two-write path (claim.fail() then + // saveStatus(DISABLED)) briefly persisted FAILED, so any subscriber + // observing during the window saw a transient FAILED → DISABLED. + // + // We assert this two ways: + // - storage.get() after the call shows DISABLED. + // - if the backend supports subscriptions, no FAILED transition + // appears in the event stream for this id. + // Backends without working subscribeToChanges (subscriptions disabled + // or limited) simply do not emit anything; the final-state assertion + // is the strong invariant. + const handle = await client.send({ taskType: "task1", data: "atomic-disable" }); + const id = handle.id; + + const transitions: string[] = []; + // Subscriptions are optional per backend. Sqlite/Postgres-with-Pool/ + // Supabase throw synchronously when subscribe is unsupported; treat + // that as "no events to observe" and let the final-state assertion + // carry the contract. 
+ let unsubscribe: () => void = () => {}; + try { + unsubscribe = storage.subscribeToChanges((change) => { + const newStatus = change.new?.status; + if (newStatus && change.new?.id === id) { + transitions.push(newStatus); + } + }); + } catch { + // backend does not support subscribe — skip the event-stream check + } + await sleep(20); + + await storage.next("test-worker", { leaseMs: 30_000 }); + await storage.finalize(id, { + status: JobStatus.DISABLED, + completed_at: new Date().toISOString(), + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + await sleep(100); + unsubscribe(); + + // Final-state invariant — strong, works for every backend. + const final = await storage.get(id); + expect(final?.status).toBe(JobStatus.DISABLED); + + // Event-stream invariant — only enforced when the backend produced any + // transitions at all. Sqlite/Postgres/Supabase may emit nothing here + // depending on their LISTEN/NOTIFY config; that's OK — the absence of + // FAILED is what matters when we DO see transitions. + if (transitions.length > 0) { + expect(transitions).not.toContain(JobStatus.FAILED); + } + }); + }); + + describe("atomic ack/fail (H2)", () => { + it("ack persists result+status in one write — no separate saveResult step", async () => { + // The H2 contract: claim.ack(result) writes output + COMPLETED in a + // single storage operation. Earlier the worker did + // `jobStore.saveResult(...)` THEN `claim.ack()` — two separate writes + // that could split a row into "result saved, status still PROCESSING". + // We exercise that contract directly through the storage API: there + // should be no path that observes a COMPLETED row with output=null + // when the caller passed a non-null result. 
+ const handle = await client.send({ taskType: "task1", data: "atomic-ack" }); + const id = handle.id; + const claimed = await storage.next("test-worker", { leaseMs: 30_000 }); + expect(claimed?.id).toBe(id); + // Directly call finalize() — the same call path claim.ack() takes. + await storage.finalize(id, { + output: { result: "computed" } as unknown as TOutput, + error: null, + error_code: null, + status: JobStatus.COMPLETED, + completed_at: new Date().toISOString(), + }); + const final = await storage.get(id); + expect(final?.status).toBe(JobStatus.COMPLETED); + expect(final?.output).toEqual({ result: "computed" }); + }); + }); + + describe("ack must not bump attempts (C2 + M4)", () => { + it("submit → claim → finalize(COMPLETED): attempts stays at 0", async () => { + // The contract: ack/fail go through storage.finalize(), which does NOT + // touch the `attempts` counter. A successful execution must not consume + // a retry attempt — the lease-expiry reclaim already charges the + // attempt at next() time, so charging it again here double-counts and + // can roll a healthy job into MAX_ATTEMPTS_REACHED. + const handle = await client.send({ taskType: "task1", data: "ack-no-bump" }); + const id = handle.id; + + const claimed = await storage.next("test-worker", { leaseMs: 30_000 }); + expect(claimed?.id).toBe(id); + // Fresh PENDING claim does NOT bump attempts (the bump only happens + // for lease-expiry reclaim, and we just did a fresh claim). + expect(claimed?.attempts ?? 0).toBe(0); + + // Simulate successful ack via finalize(). + await storage.finalize(id, { + output: { result: "ok" }, + error: null, + error_code: null, + status: JobStatus.COMPLETED, + completed_at: new Date().toISOString(), + }); + + const finalJob = await storage.get(id); + expect(finalJob?.status).toBe(JobStatus.COMPLETED); + // The bug under fix: previously this was 1 because complete() bumped attempts. + expect(finalJob?.attempts ?? 
0).toBe(0); + }); + }); + + describe("Abort/Retry/Lease invariants (H1 + H4)", () => { + it("abort → retry: reclaimed PENDING row has abort_requested_at cleared", async () => { + // Send a job, claim it, then abort it while PROCESSING (which sets + // abort_requested_at on the row). Rather than driving a real retry + // through the worker loop, we exercise the + // PENDING-retry branch of complete() directly via the storage API so we + // don't depend on the worker loop's retry orchestration. + const handle = await client.send({ taskType: "task1", data: "abort-retry-1" }); + const id = handle.id; + // Simulate worker claim, then a retry-rescheduling complete() call. + const claimed = await storage.next("test-worker-1", { leaseMs: 30_000 }); + expect(claimed).toBeDefined(); + expect(claimed?.id).toBe(id); + + // Set an abort_requested_at directly so we can prove complete() clears it. + await storage.abort(id); + const afterAbort = await storage.get(id); + // PROCESSING + abort_requested_at set. + expect(afterAbort?.abort_requested_at).toBeTruthy(); + + // Retry path: storage.complete() with PENDING + new visible_at clears it. + await storage.complete({ + ...(afterAbort as JobStorageFormat), + status: JobStatus.PENDING, + visible_at: new Date(Date.now() + 10).toISOString(), + error: null, + error_code: null, + attempts: (afterAbort?.attempts ?? 0) + 1, + }); + + const afterRetry = await storage.get(id); + expect(afterRetry?.status).toBe(JobStatus.PENDING); + // The fix under test: abort_requested_at must be NULL on retry. + expect(afterRetry?.abort_requested_at ?? 
null).toBe(null); + }); + + it("releaseClaim clears abort_requested_at", async () => { + const handle = await client.send({ taskType: "task1", data: "release-claim" }); + const id = handle.id; + + await storage.next("test-worker-2", { leaseMs: 30_000 }); + await storage.abort(id); + const afterAbort = await storage.get(id); + expect(afterAbort?.abort_requested_at).toBeTruthy(); + + await storage.releaseClaim(id); + const afterRelease = await storage.get(id); + expect(afterRelease?.status).toBe(JobStatus.PENDING); + expect(afterRelease?.abort_requested_at ?? null).toBe(null); + }); + + it("lease-expiry reclaim bumps attempts but clears abort_requested_at", async () => { + const handle = await client.send({ taskType: "task1", data: "lease-expiry" }); + const id = handle.id; + + // Claim with a 1ms lease so the next claim sees it as expired. + const first = await storage.next("crashed-worker", { leaseMs: 1 }); + expect(first?.id).toBe(id); + const attemptsBeforeReclaim = first?.attempts ?? 0; + + // Set abort_requested_at to simulate "abort raced with crash". + await storage.abort(id); + + // Wait so the lease becomes expired. + await sleep(20); + + // Reclaim by a different worker — must bump attempts and clear flag. + const second = await storage.next("rescue-worker", { leaseMs: 30_000 }); + expect(second?.id).toBe(id); + expect(second?.attempts).toBe(attemptsBeforeReclaim + 1); + expect(second?.abort_requested_at ?? 
null).toBe(null); + }); + }); } diff --git a/packages/test/src/test/job-queue/genericPrefixedQueueStorageTests.ts b/packages/test/src/test/job-queue/genericPrefixedQueueStorageTests.ts index fac498037..a01008ba6 100644 --- a/packages/test/src/test/job-queue/genericPrefixedQueueStorageTests.ts +++ b/packages/test/src/test/job-queue/genericPrefixedQueueStorageTests.ts @@ -64,14 +64,14 @@ export function runGenericPrefixedQueueStorageTests( // Add job to storage1 (user1) const job1Id = await storage1.add({ input: { data: "user1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); // Add job to storage2 (user2) const job2Id = await storage2.add({ input: { data: "user2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -89,19 +89,19 @@ export function runGenericPrefixedQueueStorageTests( it("should process jobs independently per prefix", async () => { await storage1.add({ input: { data: "user1-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); - // Small delay to ensure different run_after timestamps for ordering + // Small delay to ensure different visible_at timestamps for ordering await sleep(10); await storage1.add({ input: { data: "user1-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user2-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -124,12 +124,12 @@ export function runGenericPrefixedQueueStorageTests( it("should delete only jobs matching prefix", async () => { await storage1.add({ input: { data: "user1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -178,17 +178,17 @@ export function runGenericPrefixedQueueStorageTests( it("should isolate jobs by both prefix values", async () => { const job1Id = await storage1.add({ input: { data: "user1-project100" }, - run_after: null, + 
visible_at: null, completed_at: null, }); const job2Id = await storage2.add({ input: { data: "user1-project200" }, - run_after: null, + visible_at: null, completed_at: null, }); const job3Id = await storage3.add({ input: { data: "user2-project100" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -211,17 +211,17 @@ export function runGenericPrefixedQueueStorageTests( it("should filter peek results by both prefixes", async () => { await storage1.add({ input: { data: "user1-project100-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage1.add({ input: { data: "user1-project100-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user1-project200-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -262,12 +262,12 @@ export function runGenericPrefixedQueueStorageTests( it("should isolate jobs by queue name even with same prefixes", async () => { const jobAId = await storageQueueA.add({ input: { data: "queue-a-job" }, - run_after: null, + visible_at: null, completed_at: null, }); const jobBId = await storageQueueB.add({ input: { data: "queue-b-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -316,17 +316,17 @@ export function runGenericPrefixedQueueStorageTests( it("should isolate jobs across multiple queues with different prefix values", async () => { const job1Id = await queue1.add({ input: { data: "queue1-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); const job2Id = await queue2.add({ input: { data: "queue2-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); const job3Id = await queue3.add({ input: { data: "queue3-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -350,23 +350,23 @@ export function runGenericPrefixedQueueStorageTests( it("should process jobs independently across multiple queues", async () => { await queue1.add({ input: { data: 
"queue1-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await sleep(10); await queue1.add({ input: { data: "queue1-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue2.add({ input: { data: "queue2-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue3.add({ input: { data: "queue3-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -393,22 +393,22 @@ export function runGenericPrefixedQueueStorageTests( it("should delete jobs independently across multiple queues", async () => { await queue1.add({ input: { data: "queue1-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue1.add({ input: { data: "queue1-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue2.add({ input: { data: "queue2-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue3.add({ input: { data: "queue3-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -424,27 +424,27 @@ export function runGenericPrefixedQueueStorageTests( it("should peek jobs independently across multiple queues", async () => { await queue1.add({ input: { data: "queue1-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue1.add({ input: { data: "queue1-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue2.add({ input: { data: "queue2-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue3.add({ input: { data: "queue3-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queue3.add({ input: { data: "queue3-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -486,17 +486,17 @@ export function runGenericPrefixedQueueStorageTests( it("should isolate jobs across multiple queues without prefixes", async () => { const jobAId = await queueA.add({ input: { data: "queue-a-job" }, - run_after: 
null, + visible_at: null, completed_at: null, }); const jobBId = await queueB.add({ input: { data: "queue-b-job" }, - run_after: null, + visible_at: null, completed_at: null, }); const jobCId = await queueC.add({ input: { data: "queue-c-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -520,23 +520,23 @@ export function runGenericPrefixedQueueStorageTests( it("should process jobs independently across queues without prefixes", async () => { await queueA.add({ input: { data: "queue-a-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await sleep(10); await queueA.add({ input: { data: "queue-a-job2" }, - run_after: null, + visible_at: null, completed_at: null, }); await queueB.add({ input: { data: "queue-b-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await queueC.add({ input: { data: "queue-c-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -577,7 +577,7 @@ export function runGenericPrefixedQueueStorageTests( // Add a single job to the queue await storage.add({ input: { data: "single-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -616,10 +616,10 @@ export function runGenericPrefixedQueueStorageTests( for (let i = 0; i < numJobs; i++) { await storage.add({ input: { data: `job-${i}` }, - run_after: null, + visible_at: null, completed_at: null, }); - // Small delay to ensure different run_after timestamps + // Small delay to ensure different visible_at timestamps await sleep(5); } @@ -677,17 +677,17 @@ export function runGenericPrefixedQueueStorageTests( it("should isolate jobs across queues with different prefix configurations", async () => { const jobNoPrefixId = await queueNoPrefix.add({ input: { data: "no-prefix-job" }, - run_after: null, + visible_at: null, completed_at: null, }); const jobSinglePrefixId = await queueSinglePrefix.add({ input: { data: "single-prefix-job" }, - run_after: null, + visible_at: null, completed_at: null, }); const 
jobTwoPrefixesId = await queueTwoPrefixes.add({ input: { data: "two-prefixes-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -730,19 +730,19 @@ export function runGenericPrefixedQueueStorageTests( it("should process jobs independently across queues with mixed configurations", async () => { await queueNoPrefix.add({ input: { data: "no-prefix-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await sleep(10); await queueSinglePrefix.add({ input: { data: "single-prefix-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); await sleep(10); await queueTwoPrefixes.add({ input: { data: "two-prefixes-job1" }, - run_after: null, + visible_at: null, completed_at: null, }); diff --git a/packages/test/src/test/job-queue/genericQueueStorageSubscriptionTests.ts b/packages/test/src/test/job-queue/genericQueueStorageSubscriptionTests.ts index 75e08f836..05b80c66c 100644 --- a/packages/test/src/test/job-queue/genericQueueStorageSubscriptionTests.ts +++ b/packages/test/src/test/job-queue/genericQueueStorageSubscriptionTests.ts @@ -87,7 +87,7 @@ export function runGenericQueueStorageSubscriptionTests( const jobId = await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -105,7 +105,7 @@ export function runGenericQueueStorageSubscriptionTests( it("should notify on job update", async () => { const jobId = await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -139,7 +139,7 @@ export function runGenericQueueStorageSubscriptionTests( it("should notify on job completion", async () => { const jobId = await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -180,7 +180,7 @@ export function runGenericQueueStorageSubscriptionTests( it("should notify on job deletion", async () => { const jobId = await storage.add({ input: { data: "test-job" }, - run_after: 
null, + visible_at: null, completed_at: null, }); @@ -208,12 +208,12 @@ export function runGenericQueueStorageSubscriptionTests( it("should notify on deleteAll", async () => { await storage.add({ input: { data: "test-job-1" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage.add({ input: { data: "test-job-2" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -241,7 +241,7 @@ export function runGenericQueueStorageSubscriptionTests( it("should notify on progress updates", async () => { const jobId = await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -280,7 +280,7 @@ export function runGenericQueueStorageSubscriptionTests( await storage.add({ input: { data: "test-job-1" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -293,7 +293,7 @@ export function runGenericQueueStorageSubscriptionTests( await storage.add({ input: { data: "test-job-2" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -319,7 +319,7 @@ export function runGenericQueueStorageSubscriptionTests( await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -358,14 +358,14 @@ export function runGenericQueueStorageSubscriptionTests( // Add job to storage1 (user1) await storage.add({ input: { data: "user1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); // Add job to storage2 (user2) await storage2.add({ input: { data: "user2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -414,7 +414,7 @@ export function runGenericQueueStorageSubscriptionTests( await storage.add({ input: { data: "test-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -497,17 +497,17 @@ export function runGenericQueueStorageSubscriptionTests( // Add jobs to different user/project combinations await storage1.add({ input: { data: "user1-project1-job" }, - run_after: null, + 
visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user1-project2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage3.add({ input: { data: "user2-project1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -542,17 +542,17 @@ export function runGenericQueueStorageSubscriptionTests( // Add jobs to different user/project combinations await storage1.add({ input: { data: "user1-project1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user1-project2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage3.add({ input: { data: "user2-project1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); @@ -587,17 +587,17 @@ export function runGenericQueueStorageSubscriptionTests( // Add jobs to different user/project combinations await storage1.add({ input: { data: "user1-project1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage2.add({ input: { data: "user1-project2-job" }, - run_after: null, + visible_at: null, completed_at: null, }); await storage3.add({ input: { data: "user2-project1-job" }, - run_after: null, + visible_at: null, completed_at: null, }); diff --git a/packages/test/src/test/resource/DisposeStrategy.test.ts b/packages/test/src/test/resource/DisposeStrategy.test.ts index f46951b87..d4236c244 100644 --- a/packages/test/src/test/resource/DisposeStrategy.test.ts +++ b/packages/test/src/test/resource/DisposeStrategy.test.ts @@ -181,6 +181,73 @@ describe("DisposeStrategy.inactivity", () => { } expect(d).toHaveBeenCalledOnce(); }); + + it("runStart() clears pending timers — registered key survives runComplete→runStart→advance(idleMs)", async () => { + // Race scenario: previous run completes and arms a 1000ms idle timer + // for key "a"; the next run begins ~500ms later. 
Without runStart(), + // the timer fires mid-run and disposes the resource the new run is + // about to use. With runStart(), the timer is cleared. + const scope = new ResourceScope({ strategy: DisposeStrategy.inactivity(1000) }); + const d = vi.fn(async () => {}); + scope.register("a", d); + + await scope.runComplete(); + await vi.advanceTimersByTimeAsync(500); + + // New run begins. + await scope.runStart(); + expect(d).not.toHaveBeenCalled(); + + // Advance well past the original idleMs — the cleared timer must not fire. + await vi.advanceTimersByTimeAsync(2_000); + expect(d).not.toHaveBeenCalled(); + expect(scope.size).toBe(1); + }); + + it("re-register after dispose survives without firing a stale timer", async () => { + // Sequence: + // 1. register("a", d1); runComplete arms a 1000ms timer for "a". + // 2. dispose("a") fires d1 (escape hatch) — but the timer entry may + // still be in the strategy's map. + // 3. register("a", d2); the new disposer must NOT be torn down by a + // lingering timer from the previous registration. + const scope = new ResourceScope({ strategy: DisposeStrategy.inactivity(1000) }); + const d1 = vi.fn(async () => {}); + scope.register("a", d1); + await scope.runComplete(); + + // Escape-hatch dispose runs d1 immediately. + await scope.dispose("a"); + expect(d1).toHaveBeenCalledOnce(); + + // Re-register a new disposer under the same key; onRegister must clear + // any pending timer left over from the previous registration. + const d2 = vi.fn(async () => {}); + scope.register("a", d2); + + // If a stale timer were still armed, advancing to idleMs would fire it. + await vi.advanceTimersByTimeAsync(2_000); + expect(d2).not.toHaveBeenCalled(); + expect(scope.size).toBe(1); + }); + + it("when runStart is omitted, the inactivity timer fires (control case)", async () => { + // Inverted scenario: the runner did NOT call runStart() before its + // next run. 
The previous timer is still armed; after idleMs it fires + // and disposes the resource. This documents the bug runStart() exists + // to fix. + const scope = new ResourceScope({ strategy: DisposeStrategy.inactivity(1000) }); + const d = vi.fn(async () => {}); + scope.register("a", d); + + await scope.runComplete(); + + // Simulate "next run begins" — but the runner forgets runStart(). + await vi.advanceTimersByTimeAsync(1000); + + expect(d).toHaveBeenCalledOnce(); + expect(scope.size).toBe(0); + }); }); describe("DisposePresets", () => { diff --git a/packages/test/src/test/storage-migrations/IndexedDbQueueMigrations.integration.test.ts b/packages/test/src/test/storage-migrations/IndexedDbQueueMigrations.integration.test.ts new file mode 100644 index 000000000..56611db31 --- /dev/null +++ b/packages/test/src/test/storage-migrations/IndexedDbQueueMigrations.integration.test.ts @@ -0,0 +1,155 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import "fake-indexeddb/auto"; + +import { indexedDbQueueMigrations } from "@workglow/indexeddb/job-queue"; +import { IndexedDbMigrationRunner } from "@workglow/indexeddb/storage"; +import { describe, expect, it } from "vitest"; + +/** + * Regression test for the v1 → v2 IndexedDB queue migration. + * + * Scenario the v2 migration must handle: + * 1. A pre-PR install ran the original v1 (which named the compound index + * `queue_status_run_after` and stored `run_after` per row). + * 2. Upgrade lands; v2 must drop the old index, walk every row and copy + * `run_after → visible_at`, then recreate the index as + * `queue_status_visible_at` keyed on `visible_at`. + * + * The migration `up()` body MUST be synchronous — IDB upgrade transactions + * auto-commit on the next microtask, so any `await` between IDB requests + * would lose the rest of the migration. We verify that the cursor walk + * succeeds entirely inside the upgrade tx. 
+ */ +describe("indexedDB queue migrations: v1 → v2 rename + backfill", () => { + it("synthetic v1 DB upgrades cleanly: rows carry visible_at, index renamed", async () => { + const dbName = `wglw_idb_qm_${Math.random().toString(36).slice(2)}`; + const tableName = "jobs"; + + // ── (1) Hand-build a v1 DB exactly as the pre-PR migration would have: + // create the object store + the four v1 indexes (including the old + // `queue_status_run_after`), then seed a row with `run_after` set and + // `visible_at` absent. + await new Promise((resolve, reject) => { + const req = indexedDB.open(dbName, 2); // bookkeeping store + queue store + req.onerror = () => reject(req.error); + req.onupgradeneeded = () => { + const db = req.result; + // Synthetic bookkeeping store that records v1 as already applied so + // the runner doesn't re-execute v1 (its createObjectStore would throw). + if (!db.objectStoreNames.contains("_storage_migrations")) { + db.createObjectStore("_storage_migrations", { keyPath: ["component", "version"] }); + } + if (!db.objectStoreNames.contains(tableName)) { + const store = db.createObjectStore(tableName, { keyPath: "id" }); + store.createIndex("queue_status", ["queue", "status"], { unique: false }); + // The legacy v1 index name + key path. + store.createIndex("queue_status_run_after", ["queue", "status", "run_after"], { + unique: false, + }); + store.createIndex("queue_job_run_id", ["queue", "job_run_id"], { unique: false }); + store.createIndex("queue_fingerprint_status", ["queue", "fingerprint", "status"], { + unique: false, + }); + } + }; + req.onsuccess = () => { + const db = req.result; + const tx = db.transaction(["_storage_migrations", tableName], "readwrite"); + tx.oncomplete = () => { + db.close(); + resolve(); + }; + tx.onerror = () => { + db.close(); + reject(tx.error); + }; + // Record v1 as already applied. 
+ tx.objectStore("_storage_migrations").put({ + component: `queue:indexeddb:${tableName}`, + version: 1, + description: "Create queue object store + indexes", + appliedAt: new Date().toISOString(), + }); + // Seed a row exactly as the v1 schema would have stored it. + tx.objectStore(tableName).put({ + id: 1, + queue: "test", + fingerprint: "fp1", + job_run_id: "jr1", + status: "PENDING", + input: "{}", + run_after: "2026-01-01T00:00:00.000Z", + }); + }; + }); + + // ── (2) Run the full migration chain. v1 is recorded → skipped; v2 + // executes the rename + backfill. + const runner = new IndexedDbMigrationRunner(dbName); + await runner.run(indexedDbQueueMigrations(tableName, [])); + + // ── (3) Verify: old index gone, new index present, row's visible_at + // equals the original run_after, and peek/next-style queries still work. + await new Promise((resolve, reject) => { + const req = indexedDB.open(dbName); + req.onerror = () => reject(req.error); + req.onsuccess = () => { + const db = req.result; + try { + const tx = db.transaction([tableName], "readonly"); + const store = tx.objectStore(tableName); + const names = Array.from(store.indexNames); + expect(names).not.toContain("queue_status_run_after"); + expect(names).toContain("queue_status_visible_at"); + + const getReq = store.get(1); + getReq.onsuccess = () => { + const row = getReq.result as + | { id: number; run_after: string; visible_at?: string } + | undefined; + try { + expect(row).toBeDefined(); + expect(row!.visible_at).toBe("2026-01-01T00:00:00.000Z"); + // Cursor walk on the new index also succeeds — proves the + // backfill produced data the new index can serve. 
+ const idx = store.index("queue_status_visible_at"); + const cReq = idx.openCursor(); + cReq.onsuccess = () => { + const cursor = cReq.result; + try { + expect(cursor).not.toBeNull(); + } catch (e) { + reject(e); + return; + } + }; + tx.oncomplete = () => { + db.close(); + resolve(); + }; + tx.onerror = () => { + db.close(); + reject(tx.error); + }; + } catch (e) { + db.close(); + reject(e); + } + }; + getReq.onerror = () => { + db.close(); + reject(getReq.error); + }; + } catch (e) { + db.close(); + reject(e); + } + }; + }); + }); +}); diff --git a/packages/test/src/test/storage-migrations/queueMigrationsParity.integration.test.ts b/packages/test/src/test/storage-migrations/queueMigrationsParity.integration.test.ts new file mode 100644 index 000000000..ff565787d --- /dev/null +++ b/packages/test/src/test/storage-migrations/queueMigrationsParity.integration.test.ts @@ -0,0 +1,133 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { PGlite } from "@electric-sql/pglite"; +import { postgresQueueMigrations } from "@workglow/postgres/job-queue"; +import type { Pool } from "@workglow/postgres/storage"; +import { PostgresMigrationRunner } from "@workglow/postgres/storage"; +import { sqliteQueueMigrations } from "@workglow/sqlite/job-queue"; +import { Sqlite, SqliteMigrationRunner } from "@workglow/sqlite/storage"; +import { describe, expect, it } from "vitest"; + +/** + * Verifies the v1→v2→v3 migration chain produces schema parity with a + * manually-built "fresh-install" DB: regardless of whether you arrive via + * the legacy v1 columns or a brand-new install, the final schema after v3 + * must be byte-identical (column names, types, defaults, and indexes). + * + * The frozen-v1 invariant requires this: v1 MUST keep creating the legacy + * names (run_after / run_attempts / max_retries / last_ran_at / worker_id), + * and v3 MUST IF-EXISTS-rename them. 
Fresh installs run v1 → v2 → v3 too, + * so v3's IF EXISTS guards must turn into no-ops cleanly. + */ +describe("postgres queue migrations: v1→v2→v3 schema parity", () => { + it("fresh install lands on the same schema as a legacy install", async () => { + const a = new PGlite(); + const b = new PGlite(); + try { + const dbA = a as unknown as Pool; + const dbB = b as unknown as Pool; + + // (1) Run the full migration chain on a fresh DB ("install A"). + const runnerA = new PostgresMigrationRunner(dbA); + await runnerA.run(postgresQueueMigrations("jobs_a", [])); + + // (2) Build a synthetic "already migrated to v1" DB by running ONLY + // v1 (this is what an existing deployment looks like before this PR), + // then continue with v2 and v3. + const allB = postgresQueueMigrations("jobs_b", []); + const runnerB = new PostgresMigrationRunner(dbB); + await runnerB.run([allB[0]]); // v1 only + // Then run the rest of the chain on the legacy DB. + await runnerB.run(allB); + + const finalCols = async (db: Pool, tbl: string): Promise => { + const r = await db.query<{ column_name: string }>( + `SELECT column_name FROM information_schema.columns + WHERE table_name = $1 AND table_schema = current_schema() + ORDER BY column_name`, + [tbl] + ); + return r.rows.map((row) => row.column_name); + }; + + const colsA = await finalCols(dbA, "jobs_a"); + const colsB = await finalCols(dbB, "jobs_b"); + expect(colsA).toEqual(colsB); + // Final schema must use the NEW names — no legacy names should remain. 
+ for (const legacy of [ + "run_after", + "run_attempts", + "max_retries", + "last_ran_at", + "worker_id", + ]) { + expect(colsA).not.toContain(legacy); + } + for (const renamed of [ + "visible_at", + "attempts", + "max_attempts", + "last_attempted_at", + "lease_owner", + ]) { + expect(colsA).toContain(renamed); + } + } finally { + await a.close(); + await b.close(); + } + }); +}); + +describe("sqlite queue migrations: v1→v2→v3 schema parity", () => { + it("fresh install lands on the same schema as a legacy install", async () => { + await Sqlite.init(); + const dbA = new Sqlite.Database(":memory:"); + const dbB = new Sqlite.Database(":memory:"); + try { + const runnerA = new SqliteMigrationRunner(dbA); + await runnerA.run(sqliteQueueMigrations("jobs_a", [])); + + const allB = sqliteQueueMigrations("jobs_b", []); + const runnerB = new SqliteMigrationRunner(dbB); + await runnerB.run([allB[0]]); + await runnerB.run(allB); + + const finalCols = (db: Sqlite.Database, tbl: string): string[] => + db + .prepare<[], { name: string }>(`PRAGMA table_info(${tbl})`) + .all() + .map((r) => r.name) + .sort(); + + const colsA = finalCols(dbA, "jobs_a"); + const colsB = finalCols(dbB, "jobs_b"); + expect(colsA).toEqual(colsB); + for (const legacy of [ + "run_after", + "run_attempts", + "max_retries", + "last_ran_at", + "worker_id", + ]) { + expect(colsA).not.toContain(legacy); + } + for (const renamed of [ + "visible_at", + "attempts", + "max_attempts", + "last_attempted_at", + "lease_owner", + ]) { + expect(colsA).toContain(renamed); + } + } finally { + dbA.close(); + dbB.close(); + } + }); +}); diff --git a/packages/test/src/test/storage-tabular/CachedTabularStorage.integration.test.ts b/packages/test/src/test/storage-tabular/CachedTabularStorage.integration.test.ts index b96a184c2..535a1d64e 100644 --- a/packages/test/src/test/storage-tabular/CachedTabularStorage.integration.test.ts +++ b/packages/test/src/test/storage-tabular/CachedTabularStorage.integration.test.ts @@ -704,6 
+704,9 @@ runTabularStorageContract({ supportsTransactions: false, supportsQuery: true, }, + // CachedTabularStorage forwards subscribeToChanges to the durable backing + // store (here InMemoryTabularStorage), which is strictly event-driven. + usesPolling: false, createVectorStorage: async () => { const durable = new InMemoryTabularStorage< typeof VectorItemSchema, diff --git a/packages/test/src/test/storage-tabular/SharedInMemoryTabularStorage.test.ts b/packages/test/src/test/storage-tabular/SharedInMemoryTabularStorage.test.ts index b28d937dc..0389b6f89 100644 --- a/packages/test/src/test/storage-tabular/SharedInMemoryTabularStorage.test.ts +++ b/packages/test/src/test/storage-tabular/SharedInMemoryTabularStorage.test.ts @@ -69,6 +69,9 @@ runTabularStorageContract({ supportsTransactions: false, supportsQuery: true, }, + // SharedInMemoryTabularStorage broadcasts events via BroadcastChannel / + // the inner InMemoryTabularStorage event bus — strictly event-driven. + usesPolling: false, createVectorStorage: async () => new SharedInMemoryTabularStorage( `shared_vec_${uuid4().replace(/-/g, "_")}`, diff --git a/packages/test/src/test/storage-tabular/TelemetryTabularStorage.test.ts b/packages/test/src/test/storage-tabular/TelemetryTabularStorage.test.ts index c0fcb51d4..a40bd39e3 100644 --- a/packages/test/src/test/storage-tabular/TelemetryTabularStorage.test.ts +++ b/packages/test/src/test/storage-tabular/TelemetryTabularStorage.test.ts @@ -123,6 +123,9 @@ runTabularStorageContract({ supportsTransactions: false, supportsQuery: true, }, + // TelemetryTabularStorage delegates subscribeToChanges to its inner storage + // (InMemoryTabularStorage here) which is strictly event-driven. 
+ usesPolling: false, createVectorStorage: async () => { const inner = new InMemoryTabularStorage< typeof VectorItemSchema, diff --git a/packages/test/src/test/storage-tabular/subscribeToChangesContract.meta.test.ts b/packages/test/src/test/storage-tabular/subscribeToChangesContract.meta.test.ts new file mode 100644 index 000000000..23182d9c9 --- /dev/null +++ b/packages/test/src/test/storage-tabular/subscribeToChangesContract.meta.test.ts @@ -0,0 +1,93 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { InMemoryTabularStorage, type TabularChangePayload } from "@workglow/storage"; +import type { FromSchema } from "@workglow/util/schema"; +import { describe, expect, it } from "vitest"; +import { CompoundPrimaryKeyNames, CompoundSchema } from "./genericTabularStorageTests"; + +/** + * Meta-tests for the `subscribeToChanges` contract assertion split. + * + * The contract has two flavors: + * - `subscribeToChanges.eventDriven` — strict commit order (one event per + * write, emitted in write order). + * - `subscribeToChanges.polling` — set equality + event count (a polling + * snapshot diff cannot preserve commit order). + * + * These meta-tests exercise the invariants the two contract blocks check by + * running them against `InMemoryTabularStorage` (event-driven) and verifying + * the failure modes each block is designed to catch — write counts off, set + * mismatched, or commit order broken. 
+ */ +describe("subscribeToChanges contract — meta", () => { + it("eventDriven invariant: count + commit-order equality", async () => { + const storage = new InMemoryTabularStorage< + typeof CompoundSchema, + typeof CompoundPrimaryKeyNames + >(CompoundSchema, CompoundPrimaryKeyNames); + + const changes: TabularChangePayload>[] = []; + const unsubscribe = storage.subscribeToChanges((change) => changes.push(change)); + + await storage.put({ name: "t1", type: "s1", option: "v1", success: true }); + await storage.put({ name: "t2", type: "s2", option: "v2", success: false }); + await storage.put({ name: "t3", type: "s3", option: "v3", success: true }); + + const writes = changes.filter((c) => c.type === "INSERT" || c.type === "UPDATE"); + expect(writes.length).toBe(3); + // Commit-order assertion — would fire if an event-driven backend reordered. + expect(writes.map((w) => w.new?.option)).toEqual(["v1", "v2", "v3"]); + + unsubscribe(); + storage.destroy?.(); + }); + + it("eventDriven failure mode: missing write makes count fail", async () => { + // Simulate a "missing write" scenario by populating the change log with + // only two of three writes. The eventDriven count assertion (`toBe(3)`) + // must reject this case. + const incompleteWrites = [ + { type: "INSERT" as const, new: { name: "t1", type: "s1", option: "v1", success: true } }, + { type: "INSERT" as const, new: { name: "t3", type: "s3", option: "v3", success: true } }, + ]; + expect(incompleteWrites.length).toBe(2); + // The contract block's `toBe(3)` would fail here — verified via: + expect(() => expect(incompleteWrites.length).toBe(3)).toThrow(); + }); + + it("polling invariant: count + set equality, order unspecified", async () => { + const storage = new InMemoryTabularStorage< + typeof CompoundSchema, + typeof CompoundPrimaryKeyNames + >(CompoundSchema, CompoundPrimaryKeyNames); + + // Polling has no real ordering guarantees; we simulate by reading via the + // event bus and asserting set equality only. 
+ const changes: TabularChangePayload>[] = []; + const unsubscribe = storage.subscribeToChanges((change) => changes.push(change)); + + await storage.put({ name: "t1", type: "s1", option: "v1", success: true }); + await storage.put({ name: "t2", type: "s2", option: "v2", success: false }); + await storage.put({ name: "t3", type: "s3", option: "v3", success: true }); + + const writes = changes.filter((c) => c.type === "INSERT" || c.type === "UPDATE"); + expect(writes.length).toBe(3); + // Set equality after sort — invariant the polling block enforces. + expect(writes.map((w) => w.new?.option).sort()).toEqual(["v1", "v2", "v3"]); + + unsubscribe(); + storage.destroy?.(); + }); + + it("polling failure mode: duplicate write breaks set equality", async () => { + // Simulate a duplicate-write scenario: 4 events for 3 writes, with one + // duplicated. The sorted-set assertion (`toEqual(["v1","v2","v3"])`) + // must reject this. + const dupOptions = ["v1", "v1", "v2", "v3"].sort(); + expect(() => expect(dupOptions).toEqual(["v1", "v2", "v3"])).toThrow(); + }); +}); diff --git a/packages/test/src/test/task-graph-job-queue/genericTaskGraphJobQueueTests.ts b/packages/test/src/test/task-graph-job-queue/genericTaskGraphJobQueueTests.ts index f87c71314..94418c0ca 100644 --- a/packages/test/src/test/task-graph-job-queue/genericTaskGraphJobQueueTests.ts +++ b/packages/test/src/test/task-graph-job-queue/genericTaskGraphJobQueueTests.ts @@ -100,9 +100,9 @@ export class TestJobTask extends Task< throw new Error(`Queue "${queueName}" not found`); } - const handle = await registeredQueue.client.submit(input, { + const handle = await registeredQueue.client.send(input, { jobRunId: this.runConfig.runnerId, - maxRetries: 10, + maxAttempts: 10, }); cleanup = handle.onProgress( diff --git a/packages/test/src/test/task/FetchTask.test.ts b/packages/test/src/test/task/FetchTask.test.ts index ef74d7763..2b167f47f 100644 --- a/packages/test/src/test/task/FetchTask.test.ts +++ 
b/packages/test/src/test/task/FetchTask.test.ts @@ -135,9 +135,9 @@ describe("FetchUrlTask", () => { mockFetch.mockImplementation(() => Promise.resolve(createMockResponse(mockResponse))); // Add jobs to queue via client - await client.submit({ url: "https://api.example.com/1" }); - await client.submit({ url: "https://api.example.com/2" }); - await client.submit({ url: "https://api.example.com/3" }); + await client.send({ url: "https://api.example.com/1" }); + await client.send({ url: "https://api.example.com/2" }); + await client.send({ url: "https://api.example.com/3" }); // Start the server and wait for processing await server.start(); diff --git a/packages/util/src/resource/DisposeStrategy.ts b/packages/util/src/resource/DisposeStrategy.ts index 8c76fe10b..f0a042fc0 100644 --- a/packages/util/src/resource/DisposeStrategy.ts +++ b/packages/util/src/resource/DisposeStrategy.ts @@ -9,9 +9,17 @@ import type { ResourceScope } from "./ResourceScope"; /** * Pluggable disposal policy for a {@link ResourceScope}. * - * A scope consults its strategy at four moments: + * A scope consults its strategy at five moments: * - `onRegister`: when a disposer is registered (may wrap it or set up - * per-resource state such as timers). + * per-resource state such as timers). Strategies that arm inactivity + * timers between runs must also clear any pending timer for the key + * being re-registered (an idle timer firing while a new run is in + * flight would dispose a resource the run is about to use). + * - `onRunStart` (optional): when the owning runner is about to start a + * new run. The {@link InactivityStrategy} uses this to clear all pending + * timers, closing the race between "timer armed at runComplete" and + * "next run begins before the timer fires". Optional to preserve + * backward compatibility with strategies implemented outside this tree. * - `touch`: when a task signals the resource is still in use (resets * inactivity timers, etc.). 
* - `onRunComplete`: when the owning run finishes (success, error, or abort) @@ -25,6 +33,12 @@ import type { ResourceScope } from "./ResourceScope"; */ export interface IDisposeStrategy { onRegister(key: string, disposer: () => Promise, scope: ResourceScope): () => Promise; + /** + * Called by `ResourceScope.runStart()` before a run begins. Optional — + * external strategies written against the original four-method shape + * continue to work; `ResourceScope` no-ops when this is undefined. + */ + onRunStart?(scope: ResourceScope): Promise | void; touch(key: string): void; onRunComplete(scope: ResourceScope): Promise; onScopeDestroy(scope: ResourceScope): Promise; diff --git a/packages/util/src/resource/ResourceScope.ts b/packages/util/src/resource/ResourceScope.ts index 94831dcde..bfcfed21a 100644 --- a/packages/util/src/resource/ResourceScope.ts +++ b/packages/util/src/resource/ResourceScope.ts @@ -73,6 +73,19 @@ export class ResourceScope { await Promise.allSettled(fns.map((fn) => fn())); } + /** + * Called by runners just before a new run begins. Delegates to the + * strategy's optional `onRunStart` hook (no-op when the strategy does + * not implement it). Closes the race where an inactivity timer armed + * by the previous `runComplete` could fire and dispose a resource the + * next run is about to use. + */ + async runStart(): Promise { + if (this.strategy.onRunStart) { + await this.strategy.onRunStart(this); + } + } + /** * Called by runners' `finally` blocks. Delegates to the strategy's * `onRunComplete` hook. 
diff --git a/packages/util/src/resource/strategies/InactivityStrategy.ts b/packages/util/src/resource/strategies/InactivityStrategy.ts index 684b76d2c..a11ae7e49 100644 --- a/packages/util/src/resource/strategies/InactivityStrategy.ts +++ b/packages/util/src/resource/strategies/InactivityStrategy.ts @@ -35,13 +35,33 @@ export class InactivityStrategy implements IDisposeStrategy { } onRegister( - _key: string, + key: string, disposer: () => Promise, _scope: ResourceScope ): () => Promise { + // Defensive: if a key is registered while an inactivity timer is still + // pending for it (e.g. the disposer ran, but a re-register raced with + // the timer body before the timer woke up), clear the pending timer so + // the new disposer cannot be torn down between runs. + const pending = this.timers.get(key); + if (pending !== undefined) { + clearTimeout(pending); + this.timers.delete(key); + } return disposer; } + /** + * Called by `ResourceScope.runStart()` before each run. Clears every + * pending inactivity timer — any resource still registered is part of + * the new run, and we never want a stale timer to dispose mid-run. + * The next `onRunComplete` re-arms the timers from scratch. 
+ */ + onRunStart(_scope: ResourceScope): void { + for (const t of this.timers.values()) clearTimeout(t); + this.timers.clear(); + } + touch(key: string): void { const t = this.timers.get(key); if (t !== undefined) { diff --git a/providers/postgres/src/job-queue/PostgresJobStore.ts b/providers/postgres/src/job-queue/PostgresJobStore.ts new file mode 100644 index 000000000..7601e663d --- /dev/null +++ b/providers/postgres/src/job-queue/PostgresJobStore.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobStore, JobRecord, JobStatus, MessageId } from "@workglow/job-queue"; +import type { PostgresPendingWrite } from "./PostgresMessageQueue"; +import type { PostgresQueueStorage } from "./PostgresQueueStorage"; + +export class PostgresJobStore implements IJobStore { + /** @internal — shared with the paired message queue */ + public readonly core: PostgresQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: PostgresQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + get(id: MessageId): Promise | undefined> { + return this.core.get(id); + } + + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.core.peek(status as any, num); + } + + size(status?: JobStatus): Promise { + return this.core.size(status as any); + } + + async getByRunId(runId: string): Promise[]> { + return this.core.getByRunId(runId); + } + + outputForInput(input: Input): Promise { + return this.core.outputForInput(input); + } + + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.core.saveProgress(id, progress, message, details as Record); + } + + async saveResult(id: MessageId, output: Output): Promise { + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? 
null; + this.pending.set(id, buf); + } + + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? {}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.core.deleteJobsByStatusAndAge(status, olderThanMs); + } + + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.core.delete(id); + } + + async deleteAll(): Promise { + this.pending.clear(); + await this.core.deleteAll(); + } + + async abort(id: MessageId): Promise { + await this.core.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + await this.core.saveStatus(id, status); + } +} diff --git a/providers/postgres/src/job-queue/PostgresMessageQueue.ts b/providers/postgres/src/job-queue/PostgresMessageQueue.ts new file mode 100644 index 000000000..ecfaff6da --- /dev/null +++ b/providers/postgres/src/job-queue/PostgresMessageQueue.ts @@ -0,0 +1,226 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + IClaim, + IMessageQueue, + JobStorageFormat, + MessageId, + QueueChangePayload, + QueueStorageScope, + QueueSubscribeOptions, + SendOptions, +} from "@workglow/job-queue"; +import type { PostgresQueueStorage } from "./PostgresQueueStorage"; + +/** + * Per-id buffer that lets {@link IJobStore.saveResult}/{@link IJobStore.saveError} + * stage output/error until the terminal claim.ack()/fail() persists them in + * a single complete() call (avoids double-bumping `attempts`). 
+ */ +export type PostgresPendingWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class PostgresClaim implements IClaim> { + constructor( + private readonly core: PostgresQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.core.finalize(this.id, { + output: output as Output | null, + error: null, + error_code: null, + status: "COMPLETED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const current = (await this.core.get(this.id)) ?? this.body; + await this.core.complete({ + ...current, + status: "PENDING", + lease_owner: null, + lease_expires_at: null, + visible_at: new Date(Date.now() + delay * 1000).toISOString(), + progress: 0, + progress_message: "", + progress_details: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? buf.errorCode + : (current.error_code ?? 
null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.core.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: "FAILED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.core.extendLease(this.id, this.workerId, ms); + } + + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.core.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.core.finalize(this.id, { + status: "DISABLED", + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + } +} + +export class PostgresMessageQueue implements IMessageQueue< + JobStorageFormat +> { + public readonly scope: QueueStorageScope; + + /** @internal — shared with the paired job store */ + public readonly core: PostgresQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: PostgresQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + this.scope = core.scope; + } + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + return this.core.add(applySendOptions(body, opts)); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 
1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const job = await this.core.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!job) break; + claims.push( + new PostgresClaim( + this.core, + this.pending, + job.id, + job, + job.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.core.releaseClaim(id); + } + + async migrate(): Promise { + await this.core.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.core.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.core.subscribeToChanges(callback, options); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) out.max_attempts = opts.maxAttempts; + return out; +} diff --git a/providers/postgres/src/job-queue/PostgresQueueStorage.ts b/providers/postgres/src/job-queue/PostgresQueueStorage.ts index cb5717af9..c382ae812 100644 --- a/providers/postgres/src/job-queue/PostgresQueueStorage.ts +++ b/providers/postgres/src/job-queue/PostgresQueueStorage.ts @@ -130,7 +130,7 @@ export class PostgresQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage | undefined> { - // Parameters: $1=status, $2=queue, $3=status, $4=worker_id, $5+=prefix params - const { conditions: 
prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(5); + public async next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined> { + const leaseMs = opts?.leaseMs ?? 30000; + // Parameters: $1=PROCESSING, $2=now+leaseMs interval, $3=queue, $4=workerId, $5=PENDING, $6=PROCESSING, $7+=prefix params + const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(7); const result = await this.db.query< JobStorageFormat, Array >( ` - UPDATE ${this.tableName} - SET status = $1, last_ran_at = NOW() AT TIME ZONE 'UTC', worker_id = $4 + UPDATE ${this.tableName} + SET status = $1, + last_attempted_at = NOW() AT TIME ZONE 'UTC', + lease_owner = $4, + lease_expires_at = NOW() AT TIME ZONE 'UTC' + ($2 * INTERVAL '1 millisecond'), + -- A reclaimed PROCESSING row was claimed by a now-crashed worker; + -- that constitutes one used-up attempt against max_attempts. + -- PENDING claims must not be charged here — JobQueueWorker's + -- existing validateJobState() will FAIL the job in the next-step + -- branch when attempts >= max_attempts. + attempts = CASE WHEN status = $6 THEN attempts + 1 ELSE attempts END, + -- Always clear any stale abort_requested_at on (re)claim. A PROCESSING + -- row may have had abort_requested_at set before the worker crashed; + -- the new owner must start with a clean slate or the worker will see + -- the abort flag immediately and never run user code. 
+ abort_requested_at = NULL WHERE id = ( - SELECT id - FROM ${this.tableName} - WHERE queue = $2 - AND status = $3 + SELECT id + FROM ${this.tableName} + WHERE queue = $3 + AND ( + (status = $5 AND visible_at <= NOW() AT TIME ZONE 'UTC') + OR (status = $6 AND (lease_expires_at IS NULL OR lease_expires_at < NOW() AT TIME ZONE 'UTC')) + ) ${prefixConditions} - AND run_after <= NOW() AT TIME ZONE 'UTC' - ORDER BY run_after ASC - FOR UPDATE SKIP LOCKED + ORDER BY visible_at ASC + FOR UPDATE SKIP LOCKED LIMIT 1 ) RETURNING *`, - [JobStatus.PROCESSING, this.queueName, JobStatus.PENDING, workerId, ...prefixParams] + [ + JobStatus.PROCESSING, + leaseMs, + this.queueName, + workerId, + JobStatus.PENDING, + JobStatus.PROCESSING, + ...prefixParams, + ] ); return result?.rows?.[0] ?? undefined; } + /** + * Extend the lease on a currently PROCESSING job. + * @param id - The ID of the job to extend the lease for + * @param workerId - Worker ID that must match the current lease owner (lease_owner) + * @param ms - Number of milliseconds to extend the lease by + */ + public async extendLease(id: unknown, workerId: string, ms: number): Promise { + const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(5); + const result = await this.db.query( + `UPDATE ${this.tableName} + SET lease_expires_at = NOW() AT TIME ZONE 'UTC' + ($1 * INTERVAL '1 millisecond') + WHERE id = $2 AND queue = $3 AND lease_owner = $4 AND status = 'PROCESSING'${prefixConditions}`, + [ms, id, this.queueName, workerId, ...prefixParams] + ); + if (!result || result.rowCount === 0) { + throw new Error( + `extendLease failed: job ${String(id)} is not PROCESSING or lease is not owned by worker ${workerId}` + ); + } + } + /** * Retrieves the number of jobs in the queue with a specific status. 
* @param status - The status of the jobs to count @@ -280,7 +332,7 @@ export class PostgresQueueStorage implements IQueueStorage): Promise { @@ -301,24 +353,29 @@ export class PostgresQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage | null; + } + ): Promise { + // Build a dynamic SET clause covering only the fields the caller supplied — + // a partial overwrite. Everything else (in particular `attempts`, + // `visible_at`, `lease_expires_at`) is untouched. + const sets: string[] = []; + const params: Array = []; + let nextParam = 1; + const push = (col: string, value: unknown): void => { + sets.push(`${col} = $${nextParam}`); + params.push(value); + nextParam += 1; + }; + if ("output" in fields) { + push("output", fields.output != null ? JSON.stringify(fields.output) : null); + } + if ("error" in fields) push("error", fields.error ?? null); + if ("error_code" in fields) push("error_code", fields.error_code ?? null); + if ("status" in fields) push("status", fields.status); + if ("completed_at" in fields) push("completed_at", fields.completed_at ?? null); + if ("abort_requested_at" in fields) { + push("abort_requested_at", fields.abort_requested_at ?? null); + } + if ("lease_owner" in fields) push("lease_owner", fields.lease_owner ?? null); + if ("progress" in fields) push("progress", fields.progress ?? 0); + if ("progress_message" in fields) push("progress_message", fields.progress_message ?? ""); + if ("progress_details" in fields) { + push( + "progress_details", + fields.progress_details != null ? 
JSON.stringify(fields.progress_details) : null + ); + } + if (sets.length === 0) return; // nothing to write + const idParam = nextParam; + nextParam += 1; + const queueParam = nextParam; + nextParam += 1; + const { conditions: prefixConditions, params: prefixParams } = + this.buildPrefixWhereClause(nextParam); + await this.db.query( + `UPDATE ${this.tableName} + SET ${sets.join(", ")} + WHERE id = $${idParam} AND queue = $${queueParam}${prefixConditions}`, + [...params, id, this.queueName, ...prefixParams] + ); + } + /** * Clears all jobs from the queue. */ @@ -387,41 +509,68 @@ export class PostgresQueueStorage implements IQueueStorage { - const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(3); - await this.db.query( - ` - UPDATE ${this.tableName} - SET status = 'ABORTING' - WHERE id = $1 AND queue = $2${prefixConditions}`, - [jobId, this.queueName, ...prefixParams] - ); + // Abort PENDING → FAILED immediately + { + const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(4); + await this.db.query( + `UPDATE ${this.tableName} + SET status = 'FAILED', + abort_requested_at = NOW() AT TIME ZONE 'UTC', + completed_at = NOW() AT TIME ZONE 'UTC' + WHERE id = $1 AND queue = $2 AND status = $3${prefixConditions}`, + [jobId, this.queueName, JobStatus.PENDING, ...prefixParams] + ); + } + // Abort PROCESSING → set abort_requested_at only + { + const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(4); + await this.db.query( + `UPDATE ${this.tableName} + SET abort_requested_at = NOW() AT TIME ZONE 'UTC' + WHERE id = $1 AND queue = $2 AND status = $3${prefixConditions}`, + [jobId, this.queueName, JobStatus.PROCESSING, ...prefixParams] + ); + } } /** - * Releases a claimed job back to PENDING without incrementing run_attempts. + * Releases a claimed job back to PENDING without incrementing attempts. * @param jobId - The id of the claimed job to release. 
*/ - public async release(jobId: unknown): Promise { + public async releaseClaim(jobId: unknown): Promise { const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(3); + // releaseClaim returns the row to PENDING without consuming an attempt. + // Clear abort_requested_at so an abort that was requested mid-claim does + // not survive the release and cancel the next worker that picks the row up. await this.db.query( ` UPDATE ${this.tableName} SET status = 'PENDING', - worker_id = NULL, + lease_owner = NULL, progress = 0, progress_message = '', - progress_details = NULL + progress_details = NULL, + abort_requested_at = NULL WHERE id = $1 AND queue = $2${prefixConditions}`, [jobId, this.queueName, ...prefixParams] ); } + /** Force-overwrite status without touching attempts (used to persist DISABLED after lease release). */ + public async saveStatus(jobId: unknown, status: string): Promise { + const { conditions: prefixConditions, params: prefixParams } = this.buildPrefixWhereClause(3); + await this.db.query( + `UPDATE ${this.tableName} SET status = $1 WHERE id = $2 AND queue = $3${prefixConditions}`, + [status, jobId, this.queueName, ...prefixParams] + ); + } + /** * Retrieves all jobs for a given job run ID. * @param job_run_id - The ID of the job run to retrieve diff --git a/providers/postgres/src/job-queue/common.ts b/providers/postgres/src/job-queue/common.ts index b9deb139b..1f7e2577a 100644 --- a/providers/postgres/src/job-queue/common.ts +++ b/providers/postgres/src/job-queue/common.ts @@ -8,6 +8,9 @@ export * from "./PostgresQueueStorage"; export * from "./PostgresRateLimiterStorage"; +export * from "./PostgresMessageQueue"; +export * from "./PostgresJobStore"; +export * from "./createPostgresQueue"; // Versioned migration sets for the queue + rate-limiter tables, plus the // runner that applies them. 
Re-exported here so callers can compose diff --git a/providers/postgres/src/job-queue/createPostgresQueue.ts b/providers/postgres/src/job-queue/createPostgresQueue.ts new file mode 100644 index 000000000..055fef610 --- /dev/null +++ b/providers/postgres/src/job-queue/createPostgresQueue.ts @@ -0,0 +1,35 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { QueueStorageOptions } from "@workglow/job-queue"; +import type { Pool } from "@workglow/postgres/storage"; +import { PostgresJobStore } from "./PostgresJobStore"; +import { PostgresMessageQueue, type PostgresPendingWrite } from "./PostgresMessageQueue"; +import { PostgresQueueStorage } from "./PostgresQueueStorage"; + +/** + * Factory for the paired Postgres message queue and job store. Both + * facades share a single underlying {@link PostgresQueueStorage} so writes + * through one are observable through the other. + */ +export function createPostgresQueue( + queueName: string, + pool: Pool, + opts?: QueueStorageOptions +): { + messageQueue: PostgresMessageQueue; + jobStore: PostgresJobStore; + /** @internal — exposed for callers that still need the legacy storage object. */ + core: PostgresQueueStorage; +} { + const core = new PostgresQueueStorage(pool, queueName, opts); + const pending = new Map>(); + return { + messageQueue: new PostgresMessageQueue(core, pending), + jobStore: new PostgresJobStore(core, pending), + core, + }; +} diff --git a/providers/postgres/src/migrations/postgresQueueMigrations.ts b/providers/postgres/src/migrations/postgresQueueMigrations.ts index 44bed82a1..7594ebcaf 100644 --- a/providers/postgres/src/migrations/postgresQueueMigrations.ts +++ b/providers/postgres/src/migrations/postgresQueueMigrations.ts @@ -22,6 +22,10 @@ import type { Pool } from "../storage/_postgres/node-bun"; * would not — silently producing version-skewed enums and runtime errors on * insert. 
Adding a status requires a NEW migration that runs * `ALTER TYPE job_status ADD VALUE IF NOT EXISTS '...'`. + * + * ABORTING was present in v1 and removed from the application model in PR 2. + * It remains in the v1 enum literal so existing databases are not broken; + * the application simply no longer writes that value. */ const JOB_STATUS_V1: readonly string[] = [ "PENDING", @@ -33,9 +37,10 @@ const JOB_STATUS_V1: readonly string[] = [ ]; /** - * Sanity check: if a developer adds a status to {@link JobStatus} without - * also writing a follow-up migration that ALTER TYPE-adds it, queries that - * insert the new status will fail at runtime against any DB still on v1. + * Sanity check: every current {@link JobStatus} value must be covered by the + * v1 enum (or a subsequent ALTER TYPE migration). ABORTING was intentionally + * removed from the application model; it is still legal in the DB schema but + * we skip it here so the check does not reject a valid removal. * * Run lazily from {@link postgresQueueMigrations} (NOT at module import) so * that consumers re-exporting this module via barrel files don't crash on @@ -43,13 +48,7 @@ const JOB_STATUS_V1: readonly string[] = [ */ function assertJobStatusMatchesV1(): void { const current = new Set(Object.values(JobStatus)); - for (const v of JOB_STATUS_V1) { - if (!current.has(v as JobStatus)) { - throw new Error( - `JobStatus const is missing v1 enum value "${v}"; v1 migration values are frozen.` - ); - } - } + // Every current status must be present in the v1 enum (or added by a later migration). for (const v of current) { if (!JOB_STATUS_V1.includes(v)) { throw new Error( @@ -68,6 +67,14 @@ function assertJobStatusMatchesV1(): void { * table names get tracked independently in `_storage_migrations`. The v1 * payload covers schema + indexes + LISTEN/NOTIFY plumbing; the trigger is * idempotent (`CREATE OR REPLACE FUNCTION` + `DROP TRIGGER IF EXISTS`). 
+ * + * v1 is FROZEN byte-for-byte against the pre-PR shape — it MUST keep + * creating the `run_after`/`run_attempts`/`max_retries`/`last_ran_at`/ + * `worker_id` columns and the corresponding `run_after`-keyed indexes. + * Renames and index swaps live in v3 (with `IF EXISTS` guards so a fresh + * install — which still goes through v1 — can apply v3 without errors). + * Mutating v1 would silently produce divergent schemas between fresh and + * already-migrated DBs and break older deployments mid-rollout. */ export function postgresQueueMigrations( tableName: string, @@ -189,5 +196,68 @@ export function postgresQueueMigrations( } }, }, + { + component, + version: 2, + description: "Add abort_requested_at and lease_expires_at columns", + async up(db: Pool) { + await db.query(` + ALTER TABLE ${tableName} + ADD COLUMN IF NOT EXISTS abort_requested_at timestamp with time zone, + ADD COLUMN IF NOT EXISTS lease_expires_at timestamp with time zone + `); + }, + }, + { + component, + version: 3, + description: + "Rename run_after→visible_at, last_ran_at→last_attempted_at, run_attempts→attempts, max_retries→max_attempts, worker_id→lease_owner; drop run_after-keyed indexes and recreate visible_at-keyed", + async up(db: Pool) { + // Each rename is guarded by IF EXISTS so v3 is idempotent. Fresh installs + // (v1 → v2 → v3) and pre-PR installs both still have the v1 column names + // and are renamed in place; already-renamed tables skip every branch.
+ await db.query(` + DO $$ + BEGIN + IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='${tableName}' AND column_name='run_after' AND table_schema=current_schema()) THEN + EXECUTE 'ALTER TABLE ${tableName} RENAME COLUMN run_after TO visible_at'; + END IF; + IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='${tableName}' AND column_name='last_ran_at' AND table_schema=current_schema()) THEN + EXECUTE 'ALTER TABLE ${tableName} RENAME COLUMN last_ran_at TO last_attempted_at'; + END IF; + IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='${tableName}' AND column_name='run_attempts' AND table_schema=current_schema()) THEN + EXECUTE 'ALTER TABLE ${tableName} RENAME COLUMN run_attempts TO attempts'; + END IF; + IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='${tableName}' AND column_name='max_retries' AND table_schema=current_schema()) THEN + EXECUTE 'ALTER TABLE ${tableName} RENAME COLUMN max_retries TO max_attempts'; + EXECUTE 'ALTER TABLE ${tableName} ALTER COLUMN max_attempts SET DEFAULT 10'; + END IF; + IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='${tableName}' AND column_name='worker_id' AND table_schema=current_schema()) THEN + EXECUTE 'ALTER TABLE ${tableName} RENAME COLUMN worker_id TO lease_owner'; + END IF; + END $$ + `); + + // Drop the v1 run_after-keyed indexes and recreate them keyed on + // visible_at. CREATE INDEX CONCURRENTLY cannot be used here because + // the migration runs inside a transaction. PostgreSQL carries an + // index across RENAME COLUMN automatically, so the old indexes would + // keep working against `visible_at`; the explicit DROP + CREATE swap + // rebuilds them from the definitions below so fresh and upgraded + // databases end up with identical index definitions under the same + // names.
+ await db.query(`DROP INDEX IF EXISTS job_fetcher${indexSuffix}_idx`); + await db.query(`DROP INDEX IF EXISTS job_queue_fetcher${indexSuffix}_idx`); + await db.query(` + CREATE INDEX IF NOT EXISTS job_fetcher${indexSuffix}_idx + ON ${tableName} (${prefixIndexPrefix}id, status, visible_at) + `); + await db.query(` + CREATE INDEX IF NOT EXISTS job_queue_fetcher${indexSuffix}_idx + ON ${tableName} (${prefixIndexPrefix}queue, status, visible_at) + `); + }, + }, ]; } diff --git a/providers/sqlite/src/job-queue/SqliteJobStore.ts b/providers/sqlite/src/job-queue/SqliteJobStore.ts new file mode 100644 index 000000000..6845a5172 --- /dev/null +++ b/providers/sqlite/src/job-queue/SqliteJobStore.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobStore, JobRecord, JobStatus, MessageId } from "@workglow/job-queue"; +import type { SqlitePendingWrite } from "./SqliteMessageQueue"; +import { SqliteQueueStorage } from "./SqliteQueueStorage"; + +export class SqliteJobStore implements IJobStore { + /** @internal — shared with the paired message queue */ + public readonly core: SqliteQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. 
*/ + private readonly pending: Map>; + + constructor( + core: SqliteQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + get(id: MessageId): Promise | undefined> { + return this.core.get(id); + } + + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.core.peek(status, num); + } + + size(status?: JobStatus): Promise { + return this.core.size(status as any); + } + + async getByRunId(runId: string): Promise[]> { + return this.core.getByRunId(runId); + } + + outputForInput(input: Input): Promise { + return this.core.outputForInput(input); + } + + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.core.saveProgress(id, progress, message, details ?? {}); + } + + async saveResult(id: MessageId, output: Output): Promise { + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? null; + this.pending.set(id, buf); + } + + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? 
{}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.core.deleteJobsByStatusAndAge(status, olderThanMs); + } + + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.core.delete(id); + } + + async deleteAll(): Promise { + this.pending.clear(); + await this.core.deleteAll(); + } + + async abort(id: MessageId): Promise { + await this.core.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + await this.core.saveStatus(id, status); + } +} diff --git a/providers/sqlite/src/job-queue/SqliteMessageQueue.ts b/providers/sqlite/src/job-queue/SqliteMessageQueue.ts new file mode 100644 index 000000000..8358cf4ad --- /dev/null +++ b/providers/sqlite/src/job-queue/SqliteMessageQueue.ts @@ -0,0 +1,226 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + IClaim, + IMessageQueue, + JobStorageFormat, + MessageId, + QueueChangePayload, + QueueStorageScope, + QueueSubscribeOptions, + SendOptions, +} from "@workglow/job-queue"; +import { JobStatus } from "@workglow/job-queue"; +import { SqliteQueueStorage } from "./SqliteQueueStorage"; + +/** + * Per-id buffer that lets {@link IJobStore.saveResult}/{@link IJobStore.saveError} + * stage output/error until the terminal claim.ack()/fail() persists them in + * a single finalize() call (avoids double-bumping `attempts`).
+ */ +export type SqlitePendingWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class SqliteClaim implements IClaim> { + constructor( + private readonly core: SqliteQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.core.finalize(this.id, { + output: output as Output | null, + error: null, + error_code: null, + status: JobStatus.COMPLETED, + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const current = (await this.core.get(this.id)) ?? this.body; + await this.core.complete({ + ...current, + status: JobStatus.PENDING, + lease_owner: null, + lease_expires_at: null, + visible_at: new Date(Date.now() + delay * 1000).toISOString(), + progress: 0, + progress_message: "", + progress_details: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? 
buf.errorCode + : (current.error_code ?? null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.core.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: JobStatus.FAILED, + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.core.extendLease(this.id, this.workerId, ms); + } + + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.core.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.core.finalize(this.id, { + status: JobStatus.DISABLED, + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + } +} + +export class SqliteMessageQueue implements IMessageQueue< + JobStorageFormat +> { + public readonly scope: QueueStorageScope = "process"; + + /** @internal — shared with the paired job store */ + public readonly core: SqliteQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: SqliteQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + return this.core.add(applySendOptions(body, opts)); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 
1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const job = await this.core.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!job) break; + claims.push( + new SqliteClaim( + this.core, + this.pending, + job.id, + job, + job.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.core.releaseClaim(id); + } + + async migrate(): Promise { + await this.core.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.core.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.core.subscribeToChanges(callback, options); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) out.max_attempts = opts.maxAttempts; + return out; +} diff --git a/providers/sqlite/src/job-queue/SqliteQueueStorage.ts b/providers/sqlite/src/job-queue/SqliteQueueStorage.ts index 0972138a0..20345efee 100644 --- a/providers/sqlite/src/job-queue/SqliteQueueStorage.ts +++ b/providers/sqlite/src/job-queue/SqliteQueueStorage.ts @@ -126,7 +126,7 @@ export class SqliteQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage { + const now = new Date().toISOString(); const prefixConditions = this.buildPrefixWhereClause(); const prefixParams = this.getPrefixParamValues(); - 
const AbortQuery = ` + // Abort PENDING → FAILED immediately + const AbortPendingQuery = ` UPDATE ${this.tableName} - SET status = ? - WHERE id = ? AND queue = ?${prefixConditions}`; - const stmt = this.db.prepare(AbortQuery); - stmt.run(JobStatus.ABORTING, String(jobId), this.queueName, ...prefixParams); + SET status = ?, abort_requested_at = ?, completed_at = ? + WHERE id = ? AND queue = ? AND status = ?${prefixConditions}`; + const stmtPending = this.db.prepare(AbortPendingQuery); + stmtPending.run( + JobStatus.FAILED, + now, + now, + String(jobId), + this.queueName, + JobStatus.PENDING, + ...prefixParams + ); + + // Abort PROCESSING → set abort_requested_at only + const AbortProcessingQuery = ` + UPDATE ${this.tableName} + SET abort_requested_at = ? + WHERE id = ? AND queue = ? AND status = ?${prefixConditions}`; + const stmtProcessing = this.db.prepare(AbortProcessingQuery); + stmtProcessing.run(now, String(jobId), this.queueName, JobStatus.PROCESSING, ...prefixParams); } /** - * Releases a claimed job back to PENDING without incrementing run_attempts. + * Releases a claimed job back to PENDING without incrementing attempts. * @param jobId - The id of the claimed job to release. */ - public async release(jobId: unknown): Promise { + public async releaseClaim(jobId: unknown): Promise { const prefixConditions = this.buildPrefixWhereClause(); const prefixParams = this.getPrefixParamValues(); + // releaseClaim returns the row to PENDING without consuming an attempt. + // Clear abort_requested_at so an abort that was requested mid-claim does + // not survive the release and cancel the next worker that picks it up. const ReleaseQuery = ` UPDATE ${this.tableName} SET status = ?, - worker_id = NULL, + lease_owner = NULL, progress = 0, progress_message = '', - progress_details = NULL + progress_details = NULL, + abort_requested_at = NULL WHERE id = ? 
AND queue = ?${prefixConditions}`; const stmt = this.db.prepare(ReleaseQuery); stmt.run(JobStatus.PENDING, String(jobId), this.queueName, ...prefixParams); } + /** Force-overwrite status without touching attempts (used to persist DISABLED after lease release). */ + public saveStatus(jobId: unknown, status: string): void { + const prefixConditions = this.buildPrefixWhereClause(); + const prefixParams = this.getPrefixParamValues(); + const stmt = this.db.prepare( + `UPDATE ${this.tableName} SET status = ? WHERE id = ? AND queue = ?${prefixConditions}` + ); + stmt.run(status, String(jobId), this.queueName, ...prefixParams); + } + /** * Retrieves all jobs for a given job run ID. * @param job_run_id - The ID of the job run to retrieve @@ -312,18 +344,25 @@ export class SqliteQueueStorage implements IQueueStorage | undefined> { + public async next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined> { const now = new Date().toISOString(); + const leaseMs = opts?.leaseMs ?? 30000; + const leaseExpiry = new Date(Date.now() + leaseMs).toISOString(); const prefixConditions = this.buildPrefixWhereClause(); const prefixParams = this.getPrefixParamValues(); - // Then, get the next job to process + // Claim either a PENDING job ready to run, or a PROCESSING job with an expired lease const stmt = this.db.prepare< unknown[], JobStorageFormat & { @@ -332,28 +371,46 @@ export class SqliteQueueStorage implements IQueueStorage( + // The `CASE WHEN status = 'PROCESSING'` clause bumps attempts only for + // lease-expiry reclaim (a crashed-worker scenario — one used-up attempt + // against max_attempts). PENDING claims do not bump here; the worker's + // existing validateJobState() FAILs the job in the next-step branch when + // attempts >= max_attempts. abort_requested_at is always cleared on + // (re)claim so a stale flag from a previous worker can't immediately + // abort the new lease. 
` - UPDATE ${this.tableName} - SET status = ?, last_ran_at = ?, worker_id = ? + UPDATE ${this.tableName} + SET status = ?, + last_attempted_at = ?, + lease_owner = ?, + lease_expires_at = ?, + attempts = CASE WHEN status = ? THEN attempts + 1 ELSE attempts END, + abort_requested_at = NULL WHERE id = ( - SELECT id - FROM ${this.tableName} - WHERE queue = ? - AND status = ?${prefixConditions} - AND run_after <= ? - ORDER BY run_after ASC + SELECT id + FROM ${this.tableName} + WHERE queue = ? + AND ( + (status = ? AND visible_at <= ?) + OR (status = ? AND (lease_expires_at IS NULL OR lease_expires_at < ?)) + )${prefixConditions} + ORDER BY visible_at ASC LIMIT 1 ) RETURNING *` ); const result = stmt.get( - JobStatus.PROCESSING, - now, - workerId, - this.queueName, - JobStatus.PENDING, - ...prefixParams, - now + JobStatus.PROCESSING, // SET status = PROCESSING + now, // last_attempted_at + workerId, // lease_owner + leaseExpiry, // lease_expires_at + JobStatus.PROCESSING, // CASE WHEN status = PROCESSING (lease-expiry reclaim bumps attempts) + this.queueName, // WHERE queue = ? + JobStatus.PENDING, // status = PENDING + now, // visible_at <= now + JobStatus.PROCESSING, // status = PROCESSING (lease-expiry reclaim) + now, // lease_expires_at < now + ...prefixParams ); if (!result) return undefined; @@ -365,6 +422,37 @@ export class SqliteQueueStorage implements IQueueStorage { + const leaseExpiry = new Date(Date.now() + ms).toISOString(); + const prefixConditions = this.buildPrefixWhereClause(); + const prefixParams = this.getPrefixParamValues(); + + const stmt = this.db.prepare( + `UPDATE ${this.tableName} + SET lease_expires_at = ? + WHERE id = ? AND queue = ? AND lease_owner = ? 
AND status = ?${prefixConditions}` + ); + const info = stmt.run( + leaseExpiry, + String(id), + this.queueName, + workerId, + JobStatus.PROCESSING, + ...prefixParams + ) as { changes: number }; + if (info.changes === 0) { + throw new Error( + `extendLease failed: job ${String(id)} is not PROCESSING or lease is not owned by worker ${workerId}` + ); + } + } + /** * Retrieves the number of jobs in the queue with a specific status. * @param status - The status of the jobs to count @@ -388,7 +476,7 @@ export class SqliteQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage | null; + } + ): Promise { + const sets: string[] = []; + const params: Array = []; + const push = (col: string, value: unknown): void => { + sets.push(`${col} = ?`); + params.push(value); + }; + if ("output" in fields) { + push("output", fields.output != null ? JSON.stringify(fields.output) : null); + } + if ("error" in fields) push("error", fields.error ?? null); + if ("error_code" in fields) push("error_code", fields.error_code ?? null); + if ("status" in fields) push("status", fields.status); + if ("completed_at" in fields) push("completed_at", fields.completed_at ?? null); + if ("abort_requested_at" in fields) + push("abort_requested_at", fields.abort_requested_at ?? null); + if ("lease_owner" in fields) push("lease_owner", fields.lease_owner ?? null); + if ("progress" in fields) push("progress", fields.progress ?? 0); + if ("progress_message" in fields) push("progress_message", fields.progress_message ?? ""); + if ("progress_details" in fields) { + push( + "progress_details", + fields.progress_details != null ? JSON.stringify(fields.progress_details) : null + ); + } + if (sets.length === 0) return; + const prefixConditions = this.buildPrefixWhereClause(); + const prefixParams = this.getPrefixParamValues(); + const stmt = this.db.prepare( + `UPDATE ${this.tableName} + SET ${sets.join(", ")} + WHERE id = ? 
AND queue = ?${prefixConditions}` + ); + stmt.run(...(params as never[]), String(id), this.queueName, ...prefixParams); + } + public async deleteAll(): Promise { const prefixConditions = this.buildPrefixWhereClause(); const prefixParams = this.getPrefixParamValues(); diff --git a/providers/sqlite/src/job-queue/common.ts b/providers/sqlite/src/job-queue/common.ts index f5dda9d0f..cd0ff8c87 100644 --- a/providers/sqlite/src/job-queue/common.ts +++ b/providers/sqlite/src/job-queue/common.ts @@ -7,6 +7,9 @@ // organize-imports-ignore export * from "./SqliteQueueStorage"; +export * from "./SqliteMessageQueue"; +export * from "./SqliteJobStore"; +export * from "./createSqliteQueue"; export * from "./SqliteRateLimiterStorage"; // Versioned migration sets for the queue + rate-limiter tables, plus the diff --git a/providers/sqlite/src/job-queue/createSqliteQueue.ts b/providers/sqlite/src/job-queue/createSqliteQueue.ts new file mode 100644 index 000000000..0278b3ff8 --- /dev/null +++ b/providers/sqlite/src/job-queue/createSqliteQueue.ts @@ -0,0 +1,34 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { Sqlite } from "@workglow/sqlite/storage"; +import { SqliteJobStore } from "./SqliteJobStore"; +import { SqliteMessageQueue, type SqlitePendingWrite } from "./SqliteMessageQueue"; +import { SqliteQueueStorage, type SqliteQueueStorageOptions } from "./SqliteQueueStorage"; + +/** + * Factory for the paired SQLite message queue and job store. Both facades + * share a single underlying {@link SqliteQueueStorage} so writes through one + * are observable through the other. + */ +export function createSqliteQueue( + queueName: string, + db: Sqlite.Database, + opts?: SqliteQueueStorageOptions +): { + messageQueue: SqliteMessageQueue; + jobStore: SqliteJobStore; + /** @internal — exposed for callers that still need the legacy storage object. 
*/ + core: SqliteQueueStorage; +} { + const core = new SqliteQueueStorage(db, queueName, opts); + const pending = new Map>(); + return { + messageQueue: new SqliteMessageQueue(core, pending), + jobStore: new SqliteJobStore(core, pending), + core, + }; +} diff --git a/providers/sqlite/src/migrations/sqliteQueueMigrations.ts b/providers/sqlite/src/migrations/sqliteQueueMigrations.ts index 6bfb8a07c..78e174851 100644 --- a/providers/sqlite/src/migrations/sqliteQueueMigrations.ts +++ b/providers/sqlite/src/migrations/sqliteQueueMigrations.ts @@ -14,7 +14,16 @@ import { SqliteDialect, } from "@workglow/storage"; -/** Initial migration set for the SQLite queue table identified by `tableName`. */ +/** + * Initial migration set for the SQLite queue table identified by `tableName`. + * + * v1 is FROZEN byte-for-byte against the pre-PR shape — it creates the + * `run_after`/`run_attempts`/`max_retries`/`last_ran_at`/`worker_id` + * columns and the `run_after`-keyed index. Renames and the index swap + * live in v3, guarded by `PRAGMA table_info` lookups so fresh installs + * (which still run v1 → v2 → v3) end up at the same final schema as + * already-migrated DBs. 
+ */ export function sqliteQueueMigrations( tableName: string, prefixes: readonly PrefixColumn[] @@ -60,5 +69,52 @@ export function sqliteQueueMigrations( `); }, }, + { + component, + version: 2, + description: "Add abort_requested_at and lease_expires_at columns", + up(db: Sqlite.Database) { + db.exec(` + ALTER TABLE ${tableName} ADD COLUMN abort_requested_at TEXT; + ALTER TABLE ${tableName} ADD COLUMN lease_expires_at TEXT; + `); + }, + }, + { + component, + version: 3, + description: + "Rename run_after→visible_at, last_ran_at→last_attempted_at, run_attempts→attempts, max_retries→max_attempts, worker_id→lease_owner; drop run_after-keyed index and recreate visible_at-keyed", + up(db: Sqlite.Database) { + // PRAGMA table_info guards each rename so it only runs when the old + // column is still present. Fresh installs arrive at v3 with the v1 + // schema and are renamed here; already-renamed tables are no-ops. + const cols: string[] = db + .prepare<[], { name: string }>(`PRAGMA table_info(${tableName})`) + .all() + .map((r) => r.name); + const renames: [string, string][] = [ + ["run_after", "visible_at"], + ["last_ran_at", "last_attempted_at"], + ["run_attempts", "attempts"], + ["max_retries", "max_attempts"], + ["worker_id", "lease_owner"], + ]; + for (const [oldName, newName] of renames) { + if (cols.includes(oldName)) { + db.exec(`ALTER TABLE ${tableName} RENAME COLUMN ${oldName} TO ${newName};`); + } + } + + // SQLite carries indexes across RENAME COLUMN transparently, but the + // index name still encodes the old column intent. Drop the v1 + // run_after-keyed index and recreate it keyed on visible_at so the + // schema is self-describing. `IF EXISTS` covers fresh installs too.
+ db.exec(` + DROP INDEX IF EXISTS job_queue_fetcher${indexSuffix}_idx; + CREATE INDEX IF NOT EXISTS job_queue_fetcher${indexSuffix}_idx ON ${tableName} (${prefixIndexPrefix}queue, status, visible_at); + `); + }, + }, ]; } diff --git a/providers/supabase/src/job-queue/SupabaseJobStore.ts b/providers/supabase/src/job-queue/SupabaseJobStore.ts new file mode 100644 index 000000000..b1770fc1a --- /dev/null +++ b/providers/supabase/src/job-queue/SupabaseJobStore.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobStore, JobRecord, JobStatus, MessageId } from "@workglow/job-queue"; +import type { SupabasePendingWrite } from "./SupabaseMessageQueue"; +import type { SupabaseQueueStorage } from "./SupabaseQueueStorage"; + +export class SupabaseJobStore implements IJobStore { + /** @internal — shared with the paired message queue */ + public readonly core: SupabaseQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: SupabaseQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + } + + get(id: MessageId): Promise | undefined> { + return this.core.get(id); + } + + async peek(status?: JobStatus, num?: number): Promise[]> { + return this.core.peek(status as any, num); + } + + size(status?: JobStatus): Promise { + return this.core.size(status as any); + } + + async getByRunId(runId: string): Promise[]> { + return this.core.getByRunId(runId); + } + + outputForInput(input: Input): Promise { + return this.core.outputForInput(input); + } + + async saveProgress( + id: MessageId, + progress: number, + message: string, + details: Record | null + ): Promise { + await this.core.saveProgress(id, progress, message, details as Record); + } + + async saveResult(id: MessageId, output: Output): Promise { + const buf = this.pending.get(id) ?? {}; + buf.output = output ?? 
null; + this.pending.set(id, buf); + } + + async saveError( + id: MessageId, + error: string, + errorCode: string | null, + abortRequested: boolean + ): Promise { + const buf = this.pending.get(id) ?? {}; + buf.error = error; + buf.errorCode = errorCode; + buf.abortRequested = abortRequested; + this.pending.set(id, buf); + } + + async deleteByStatusAndAge(status: JobStatus, olderThanMs: number): Promise { + await this.core.deleteJobsByStatusAndAge(status, olderThanMs); + } + + async delete(id: MessageId): Promise { + this.pending.delete(id); + await this.core.delete(id); + } + + async deleteAll(): Promise { + this.pending.clear(); + await this.core.deleteAll(); + } + + async abort(id: MessageId): Promise { + await this.core.abort(id); + } + + async saveStatus(id: MessageId, status: JobStatus): Promise { + await this.core.saveStatus(id, status); + } +} diff --git a/providers/supabase/src/job-queue/SupabaseMessageQueue.ts b/providers/supabase/src/job-queue/SupabaseMessageQueue.ts new file mode 100644 index 000000000..b29bbb8b3 --- /dev/null +++ b/providers/supabase/src/job-queue/SupabaseMessageQueue.ts @@ -0,0 +1,226 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + IClaim, + IMessageQueue, + JobStorageFormat, + MessageId, + QueueChangePayload, + QueueStorageScope, + QueueSubscribeOptions, + SendOptions, +} from "@workglow/job-queue"; +import type { SupabaseQueueStorage } from "./SupabaseQueueStorage"; + +/** + * Per-id buffer that lets {@link IJobStore.saveResult}/{@link IJobStore.saveError} + * stage output/error until the terminal claim.ack()/fail() persists them in + * a single complete() call (avoids double-bumping `attempts`). 
+ */ +export type SupabasePendingWrite = { + output?: Output | null; + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; +}; + +class SupabaseClaim implements IClaim> { + constructor( + private readonly core: SupabaseQueueStorage, + private readonly pending: Map>, + public readonly id: MessageId, + public readonly body: JobStorageFormat, + public readonly attempts: number, + private readonly workerId: string + ) {} + + async ack(result?: unknown): Promise { + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const output = + result !== undefined + ? result + : buf?.output !== undefined + ? buf.output + : (current.output ?? null); + await this.core.finalize(this.id, { + output: output as Output | null, + error: null, + error_code: null, + status: "COMPLETED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async retry(opts?: { delaySeconds?: number }): Promise { + this.pending.delete(this.id); + const delay = opts?.delaySeconds ?? 0; + const current = (await this.core.get(this.id)) ?? this.body; + await this.core.complete({ + ...current, + status: "PENDING", + lease_owner: null, + lease_expires_at: null, + visible_at: new Date(Date.now() + delay * 1000).toISOString(), + progress: 0, + progress_message: "", + progress_details: null, + }); + } + + async fail(opts?: { + error?: string | null; + errorCode?: string | null; + abortRequested?: boolean; + permanent?: boolean; + }): Promise { + void opts?.permanent; + const buf = this.pending.get(this.id); + this.pending.delete(this.id); + const current = (await this.core.get(this.id)) ?? this.body; + const error = + opts?.error !== undefined + ? opts.error + : buf?.error !== undefined + ? buf.error + : (current.error ?? null); + const errorCode = + opts?.errorCode !== undefined + ? opts.errorCode + : buf?.errorCode !== undefined + ? buf.errorCode + : (current.error_code ?? 
null); + const abortRequested = + opts?.abortRequested !== undefined ? opts.abortRequested : (buf?.abortRequested ?? false); + await this.core.finalize(this.id, { + error, + error_code: errorCode, + abort_requested_at: abortRequested + ? (current.abort_requested_at ?? new Date().toISOString()) + : (current.abort_requested_at ?? null), + status: "FAILED", + completed_at: current.completed_at ?? new Date().toISOString(), + }); + } + + async extendLease(ms: number): Promise { + await this.core.extendLease(this.id, this.workerId, ms); + } + + async disable(): Promise { + this.pending.delete(this.id); + const current = await this.core.get(this.id); + const completedAt = current?.completed_at ?? new Date().toISOString(); + await this.core.finalize(this.id, { + status: "DISABLED", + completed_at: completedAt, + lease_owner: null, + progress: 0, + progress_message: "", + progress_details: null, + }); + } +} + +export class SupabaseMessageQueue implements IMessageQueue< + JobStorageFormat +> { + public readonly scope: QueueStorageScope; + + /** @internal — shared with the paired job store */ + public readonly core: SupabaseQueueStorage; + + /** @internal — shared transient buffer for saveResult/saveError. */ + private readonly pending: Map>; + + constructor( + core: SupabaseQueueStorage, + pending: Map> + ) { + this.core = core; + this.pending = pending; + this.scope = core.scope; + } + + async send(body: JobStorageFormat, opts?: SendOptions): Promise { + return this.core.add(applySendOptions(body, opts)); + } + + async sendBatch( + bodies: readonly JobStorageFormat[], + opts?: SendOptions + ): Promise { + const ids: MessageId[] = []; + for (const body of bodies) { + ids.push(await this.send(body, opts)); + } + return ids; + } + + async receive(opts: { + workerId: string; + leaseMs: number; + max?: number; + }): Promise>[]> { + const max = Math.max(1, opts.max ?? 
1); + const claims: IClaim>[] = []; + while (claims.length < max) { + const job = await this.core.next(opts.workerId, { leaseMs: opts.leaseMs }); + if (!job) break; + claims.push( + new SupabaseClaim( + this.core, + this.pending, + job.id, + job, + job.attempts ?? 0, + opts.workerId + ) + ); + } + return claims; + } + + async releaseClaim(id: MessageId): Promise { + this.pending.delete(id); + await this.core.releaseClaim(id); + } + + async migrate(): Promise { + await this.core.migrate(); + } + + getMigrations(): ReadonlyArray { + return this.core.getMigrations(); + } + + subscribeToChanges( + callback: (change: QueueChangePayload) => void, + options?: QueueSubscribeOptions + ): () => void { + return this.core.subscribeToChanges(callback, options); + } +} + +function applySendOptions( + body: JobStorageFormat, + opts?: SendOptions +): JobStorageFormat { + if (!opts) return body; + const out: JobStorageFormat = { ...body }; + if (opts.delaySeconds != null) { + out.visible_at = new Date(Date.now() + opts.delaySeconds * 1000).toISOString(); + } + if (opts.timeoutSeconds != null) { + out.deadline_at = new Date(Date.now() + opts.timeoutSeconds * 1000).toISOString(); + } + if (opts.fingerprint != null) out.fingerprint = opts.fingerprint; + if (opts.jobRunId != null) out.job_run_id = opts.jobRunId; + if (opts.maxAttempts != null) out.max_attempts = opts.maxAttempts; + return out; +} diff --git a/providers/supabase/src/job-queue/SupabaseQueueStorage.ts b/providers/supabase/src/job-queue/SupabaseQueueStorage.ts index e89b558ff..7aad321a6 100644 --- a/providers/supabase/src/job-queue/SupabaseQueueStorage.ts +++ b/providers/supabase/src/job-queue/SupabaseQueueStorage.ts @@ -168,10 +168,14 @@ export class SupabaseQueueStorage implements IQueueStorage { // Note: For Supabase, table creation should typically be done through migrations - // This setup assumes the table already exists or uses exec_sql RPC function - const createTypeSql = `CREATE TYPE job_status AS ENUM 
(${Object.values(JobStatus) + // This setup assumes the table already exists or uses exec_sql RPC function. + // ABORTING is included in the enum for backward compatibility with existing data, + // but the application no longer writes that value. + const enumValues = [...Object.values(JobStatus), "ABORTING"] + .filter((v, i, a) => a.indexOf(v) === i) .map((v) => `'${v}'`) - .join(",")})`; + .join(","); + const createTypeSql = `CREATE TYPE job_status AS ENUM (${enumValues})`; const { error: typeError } = await this.client.rpc("exec_sql", { query: createTypeSql }); // Ignore error if type already exists (code 42710) @@ -194,10 +198,10 @@ export class SupabaseQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage[]) ?? []; @@ -332,29 +358,49 @@ export class SupabaseQueueStorage implements IQueueStorage | undefined> { + public async next( + workerId: string, + opts?: { leaseMs?: number } + ): Promise | undefined> { + const leaseMs = opts?.leaseMs ?? 
30000; + if (!Number.isFinite(leaseMs) || leaseMs < 0) { + throw new Error(`Invalid leaseMs: ${leaseMs}`); + } const prefixConditions = this.buildPrefixWhereSql(); const validatedQueueName = this.validateSqlValue(this.queueName, "queueName"); const validatedWorkerId = this.validateSqlValue(workerId, "workerId"); const escapedQueueName = this.escapeSqlString(validatedQueueName); const escapedWorkerId = this.escapeSqlString(validatedWorkerId); - // Use the same atomic UPDATE...WHERE id = (SELECT...FOR UPDATE SKIP LOCKED) pattern as PostgresQueueStorage const sql = ` UPDATE ${this.tableName} - SET status = '${JobStatus.PROCESSING}', last_ran_at = NOW() AT TIME ZONE 'UTC', worker_id = '${escapedWorkerId}' + SET status = '${JobStatus.PROCESSING}', + last_attempted_at = NOW() AT TIME ZONE 'UTC', + lease_owner = '${escapedWorkerId}', + lease_expires_at = NOW() AT TIME ZONE 'UTC' + (${Number(leaseMs)} * INTERVAL '1 millisecond'), + -- Lease-expiry reclaim consumes one attempt against max_attempts; + -- PENDING claims do not (the worker's validateJobState will FAIL + -- the job when attempts >= max_attempts at next-step time). + attempts = CASE WHEN status = '${JobStatus.PROCESSING}' THEN attempts + 1 ELSE attempts END, + -- Always clear stale abort_requested_at on (re)claim so a flag set + -- by an earlier worker doesn't immediately abort the new lease. 
+ abort_requested_at = NULL WHERE id = ( SELECT id FROM ${this.tableName} WHERE queue = '${escapedQueueName}' - AND status = '${JobStatus.PENDING}' + AND ( + (status = '${JobStatus.PENDING}' AND visible_at <= NOW() AT TIME ZONE 'UTC') + OR (status = '${JobStatus.PROCESSING}' AND (lease_expires_at IS NULL OR lease_expires_at < NOW() AT TIME ZONE 'UTC')) + ) ${prefixConditions} - AND run_after <= NOW() AT TIME ZONE 'UTC' - ORDER BY run_after ASC + ORDER BY visible_at ASC FOR UPDATE SKIP LOCKED LIMIT 1 ) @@ -372,6 +418,46 @@ export class SupabaseQueueStorage implements IQueueStorage; } + /** + * Extend the lease on a currently PROCESSING job. + * @param id - The ID of the job to extend the lease for + * @param workerId - Worker ID that must match the current lease owner (lease_owner) + * @param ms - Number of milliseconds to extend the lease by + */ + public async extendLease(id: unknown, workerId: string, ms: number): Promise { + const validatedWorkerId = this.validateSqlValue(workerId, "workerId"); + const escapedWorkerId = this.escapeSqlString(validatedWorkerId); + const numericId = Number(id); + if (!Number.isFinite(numericId)) { + throw new Error(`Invalid job id: ${id}`); + } + if (!Number.isFinite(ms) || ms < 0) { + throw new Error(`Invalid lease extension ms: ${ms}`); + } + + const prefixConditions = this.buildPrefixWhereSql(); + + const sql = ` + UPDATE ${this.tableName} + SET lease_expires_at = NOW() AT TIME ZONE 'UTC' + (${Number(ms)} * INTERVAL '1 millisecond') + WHERE id = ${numericId} + AND queue = '${this.escapeSqlString(this.validateSqlValue(this.queueName, "queueName"))}' + AND lease_owner = '${escapedWorkerId}' + AND status = '${JobStatus.PROCESSING}' + ${prefixConditions} + RETURNING id`; + + const { data, error } = await this.client.rpc("exec_sql", { query: sql }); + if (error) throw error; + + // exec_sql returns affected rows; if empty, the lease was lost + if (!data || !Array.isArray(data) || data.length === 0) { + throw new Error( + `extendLease 
failed: job ${String(id)} is not PROCESSING or lease is not owned by worker ${workerId}` + ); + } + } + /** * Retrieves the number of jobs in the queue with a specific status. * @param status - The status of the jobs to count @@ -412,7 +498,7 @@ export class SupabaseQueueStorage implements IQueueStorage): Promise { @@ -428,7 +514,7 @@ export class SupabaseQueueStorage implements IQueueStorage implements IQueueStorage maxRetries) { + // Check if the next attempt would exceed max attempts + if (nextAttempts >= maxAttempts) { // Update to FAILED status instead of rescheduling let failQuery = this.client .from(this.tableName) .update({ status: JobStatus.FAILED, - error: "Max retries reached", - error_code: "MAX_RETRIES_REACHED", + error: "Max attempts reached", + error_code: "MAX_ATTEMPTS_REACHED", progress: 100, progress_message: "", progress_details: null, completed_at: now, - last_ran_at: now, + last_attempted_at: now, }) .eq("id", jobDetails.id) .eq("queue", this.queueName); @@ -475,19 +562,22 @@ export class SupabaseQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage implements IQueueStorage { + public async releaseClaim(jobId: unknown): Promise { + // releaseClaim returns the row to PENDING without consuming an attempt. + // Clear abort_requested_at so an abort that was requested mid-claim does + // not survive the release and immediately cancel the next claim. let query = this.client .from(this.tableName) .update({ status: JobStatus.PENDING, - worker_id: null, + lease_owner: null, progress: 0, progress_message: "", progress_details: null, + abort_requested_at: null, }) .eq("id", jobId) .eq("queue", this.queueName); @@ -560,6 +654,52 @@ export class SupabaseQueueStorage implements IQueueStorage | null; + } + ): Promise { + // Partial update — Supabase's PostgREST `update()` only writes the + // properties present on the object passed in. 
+ const patch: Record = {}; + if ("output" in fields) patch.output = fields.output ?? null; + if ("error" in fields) patch.error = fields.error ?? null; + if ("error_code" in fields) patch.error_code = fields.error_code ?? null; + if ("status" in fields) patch.status = fields.status; + if ("completed_at" in fields) patch.completed_at = fields.completed_at ?? null; + if ("abort_requested_at" in fields) { + patch.abort_requested_at = fields.abort_requested_at ?? null; + } + if ("lease_owner" in fields) patch.lease_owner = fields.lease_owner ?? null; + if ("progress" in fields) patch.progress = fields.progress ?? 0; + if ("progress_message" in fields) patch.progress_message = fields.progress_message ?? ""; + if ("progress_details" in fields) patch.progress_details = fields.progress_details ?? null; + if (Object.keys(patch).length === 0) return; + let query = this.client + .from(this.tableName) + .update(patch) + .eq("id", id as never) + .eq("queue", this.queueName); + query = this.applyPrefixFilters(query); + const { error } = await query; + if (error) throw error; + } + /** * Clears all jobs from the queue. 
*/ @@ -599,21 +739,54 @@ export class SupabaseQueueStorage implements IQueueStorage { + const now = new Date().toISOString(); + + // Abort PENDING → FAILED immediately + { + let query = this.client + .from(this.tableName) + .update({ + status: JobStatus.FAILED, + abort_requested_at: now, + completed_at: now, + }) + .eq("id", jobId) + .eq("queue", this.queueName) + .eq("status", JobStatus.PENDING); + query = this.applyPrefixFilters(query); + const { error } = await query; + if (error) throw error; + } + + // Abort PROCESSING → set abort_requested_at only + { + let query = this.client + .from(this.tableName) + .update({ abort_requested_at: now }) + .eq("id", jobId) + .eq("queue", this.queueName) + .eq("status", JobStatus.PROCESSING); + query = this.applyPrefixFilters(query); + const { error } = await query; + if (error) throw error; + } + } + + /** Force-overwrite status without touching attempts (used to persist DISABLED after lease release). */ + public async saveStatus(jobId: unknown, status: string): Promise { let query = this.client .from(this.tableName) - .update({ status: JobStatus.ABORTING }) + .update({ status }) .eq("id", jobId) .eq("queue", this.queueName); - - query = this.applyPrefixFilters(query); - const { error } = await query; - + query = this.applyPrefixFilters(query as any) as any; + const { error } = await (query as any); if (error) throw error; } diff --git a/providers/supabase/src/job-queue/common.ts b/providers/supabase/src/job-queue/common.ts index 8fd33b13c..bc778e30e 100644 --- a/providers/supabase/src/job-queue/common.ts +++ b/providers/supabase/src/job-queue/common.ts @@ -8,3 +8,6 @@ export * from "./SupabaseQueueStorage"; export * from "./SupabaseRateLimiterStorage"; +export * from "./SupabaseMessageQueue"; +export * from "./SupabaseJobStore"; +export * from "./createSupabaseQueue"; diff --git a/providers/supabase/src/job-queue/createSupabaseQueue.ts b/providers/supabase/src/job-queue/createSupabaseQueue.ts new file mode 100644 index 
000000000..9473ad60f --- /dev/null +++ b/providers/supabase/src/job-queue/createSupabaseQueue.ts @@ -0,0 +1,35 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { SupabaseClient } from "@supabase/supabase-js"; +import type { QueueStorageOptions } from "@workglow/job-queue"; +import { SupabaseJobStore } from "./SupabaseJobStore"; +import { SupabaseMessageQueue, type SupabasePendingWrite } from "./SupabaseMessageQueue"; +import { SupabaseQueueStorage } from "./SupabaseQueueStorage"; + +/** + * Factory for the paired Supabase message queue and job store. Both + * facades share a single underlying {@link SupabaseQueueStorage} so writes + * through one are observable through the other. + */ +export function createSupabaseQueue( + queueName: string, + client: SupabaseClient, + opts?: QueueStorageOptions +): { + messageQueue: SupabaseMessageQueue; + jobStore: SupabaseJobStore; + /** @internal — exposed for callers that still need the legacy storage object. */ + core: SupabaseQueueStorage; +} { + const core = new SupabaseQueueStorage(client, queueName, opts); + const pending = new Map>(); + return { + messageQueue: new SupabaseMessageQueue(core, pending), + jobStore: new SupabaseJobStore(core, pending), + core, + }; +}