From 875125b1cad4efdbf4636e73bc82d752b1381616 Mon Sep 17 00:00:00 2001 From: Pranay Prakash Date: Wed, 12 Nov 2025 20:53:02 -0500 Subject: [PATCH 1/2] Fix workflow failing because of function timeouts --- .changeset/cyan-ducks-wonder.md | 5 +++++ packages/core/src/runtime.ts | 30 ++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 .changeset/cyan-ducks-wonder.md diff --git a/.changeset/cyan-ducks-wonder.md b/.changeset/cyan-ducks-wonder.md new file mode 100644 index 000000000..123897616 --- /dev/null +++ b/.changeset/cyan-ducks-wonder.md @@ -0,0 +1,5 @@ +--- +"@workflow/core": patch +--- + +Allow step retrying if it fails without proper cleanup diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index 069accbb3..ba20bc08b 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -640,12 +640,14 @@ export const stepEntrypoint = let result: unknown; const attempt = step.attempt + 1; try { - if (step.status !== 'pending') { - // We should only be running the step if it's pending - // (initial state, or state set on re-try), so the step has been + if (!['pending', 'running'].includes(step.status)) { + // We should only be running the step if it's either + // a) pending - initial state, or state set on re-try + // b) running - if a step fails mid-execution, like a function timeout + // so the step has been // invoked erroneously. console.error( - `[Workflows] "${workflowRunId}" - Step invoked erroneously, expected status "pending", got "${step.status}" instead, skipping execution` + `[Workflows] "${workflowRunId}" - Step invoked erroneously, expected status "pending" or "running", got "${step.status}" instead, skipping execution` ); span?.setAttributes({ ...Attribute.StepSkipped(true), @@ -698,11 +700,24 @@ export const stepEntrypoint = () => stepFn(...args) ); + // NOTE: None of the code from this point is guaranteed to run + // Since the step might fail or cause a function timeout and the process might be SIGKILL'd + // The workflow runtime must be resilient to the below code not executing on a failed step + result = dehydrateStepReturnValue(result, ops); waitUntil(Promise.all(ops)); - // Update the event log with the step result + // Mark the step as completed first. This order is important. If a concurrent + // execution marked the step as complete, this request should throw, and + // this prevent the step_completed event in the event log + // TODO: this should really be atomic and handled by the world + await world.steps.update(workflowRunId, stepId, { + status: 'completed', + output: result as Serializable, + }); + + // Then, append the event log with the step result await world.events.create(workflowRunId, { eventType: 'step_completed', correlationId: stepId, @@ -711,11 +726,6 @@ export const stepEntrypoint = }, }); - await world.steps.update(workflowRunId, stepId, { - status: 'completed', - output: result as Serializable, - }); - span?.setAttributes({ ...Attribute.StepStatus('completed'), ...Attribute.StepResultType(typeof result), From a5f6eee077cd249b38ab17ce45efd97ae4a5b498 Mon Sep 17 00:00:00 2001 From: Pranay Prakash Date: Sun, 16 Nov 2025 12:03:39 -0800 Subject: [PATCH 2/2] Update packages/core/src/runtime.ts Co-authored-by: Peter Wielander --- packages/core/src/runtime.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index ba20bc08b..f9200535c 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -644,8 +644,7 @@ export const stepEntrypoint = // We should only be running the step if it's either // a) pending - initial state, or state set on re-try // b) running - if a step fails mid-execution, like a function timeout - // so the step has been - // invoked erroneously. + // otherwise, the step has been invoked erroneously console.error( `[Workflows] "${workflowRunId}" - Step invoked erroneously, expected status "pending" or "running", got "${step.status}" instead, skipping execution` );