From 55c45ab9b14f5bbf832d2e2f4d6508dd7db2366c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Nerl=C3=B8e?= Date: Wed, 22 Oct 2025 10:36:58 +0200 Subject: [PATCH 1/3] fix(supervisor): prevent escalating duplicate reconnections in failedPodHandler --- .../src/services/failedPodHandler.ts | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index 26a589e677..ff2de78ff7 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -25,6 +25,7 @@ export class FailedPodHandler { private readonly informer: Informer; private readonly reconnectIntervalMs: number; + private reconnecting = false; // Metrics private readonly register: Registry; @@ -250,21 +251,41 @@ export class FailedPodHandler { } private makeOnError(informerName: string) { - return () => this.onError(informerName); + return (err?: unknown) => this.onError(informerName, err); } - private async onError(informerName: string) { + private async onError(informerName: string, err?: unknown) { if (!this.isRunning) { this.logger.warn("onError: informer not running"); return; } - this.logger.error("error event fired", { informerName }); - this.informerEventsTotal.inc({ namespace: this.namespace, verb: "error" }); + // Guard against multiple simultaneous reconnections + if (this.reconnecting) { + this.logger.debug("onError: reconnection already in progress, skipping", { + informerName, + }); + return; + } - // Reconnect on errors - await setTimeout(this.reconnectIntervalMs); - await this.informer.start(); + this.reconnecting = true; + + try { + const error = err instanceof Error ? err : undefined; + this.logger.error("error event fired", { + informerName, + error: error?.message, + errorType: error?.name, + errorStack: error?.stack, + }); + this.informerEventsTotal.inc({ namespace: this.namespace, verb: "error" }); + + // Reconnect on errors + await setTimeout(this.reconnectIntervalMs); + await this.informer.start(); + } finally { + this.reconnecting = false; + } } private makeOnConnect(informerName: string) { From 47361ff047675cfd2de0376f6f7d81976a8c695f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Nerl=C3=B8e?= Date: Wed, 22 Oct 2025 18:18:49 +0200 Subject: [PATCH 2/3] fix: added catch handler for informer.start() failures --- apps/supervisor/src/services/failedPodHandler.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index ff2de78ff7..02b78362ba 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -283,6 +283,14 @@ export class FailedPodHandler { // Reconnect on errors await setTimeout(this.reconnectIntervalMs); await this.informer.start(); + } catch (handlerError) { + const error = handlerError instanceof Error ? handlerError : undefined; + this.logger.error("onError: reconnection attempt failed", { + informerName, + error: error?.message, + errorType: error?.name, + errorStack: error?.stack, + }); } finally { this.reconnecting = false; } From 1f0a5bb29f7f107acffcef9404ee8eb6f84887d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Nerl=C3=B8e?= Date: Wed, 22 Oct 2025 19:34:25 +0200 Subject: [PATCH 3/3] fix: removed 'errorStack' from error log --- apps/supervisor/src/services/failedPodHandler.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index 02b78362ba..0721724376 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -276,7 +276,6 @@ export class FailedPodHandler { informerName, error: error?.message, errorType: error?.name, - errorStack: error?.stack, }); this.informerEventsTotal.inc({ namespace: this.namespace, verb: "error" });