From b2a07891578650928503eb30d4e1a1420288c7d4 Mon Sep 17 00:00:00 2001 From: lubien Date: Wed, 22 Apr 2026 09:43:39 -0300 Subject: [PATCH 1/2] deploy: retry uncordon with exponential backoff in bluegreen strategy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A transient HTTP 408 from the uncordon API endpoint was enough to fail an otherwise healthy bluegreen deployment and trigger a full rollback, destroying every newly-created green machine. MarkGreenMachinesAsReadyForTraffic now wraps each Uncordon call with retry.Do using exponential backoff (up to 5 attempts, starting at 500 ms, capped at 30 s). A retry line is printed to stderr so the operator can see what is happening. Two new fields on blueGreen control the retry behaviour: uncordonRetryAttempts – total attempts (default 5) uncordonRetryDelay – initial delay (default 500 ms) Both are zeroed out in the test helper so existing and new tests finish instantly. Tests added in TestMarkGreenMachinesAsReadyForTrafficRetries: - succeeds immediately when no errors occur - succeeds after transient uncordon failures are retried - fails after all retry attempts are exhausted The mock gains an uncordonTransientFailures counter that fails exactly N times before succeeding, allowing precise retry-path coverage. --- internal/command/deploy/mock_client_test.go | 12 +++++ internal/command/deploy/strategy_bluegreen.go | 21 +++++++- .../command/deploy/strategy_bluegreen_test.go | 54 ++++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/internal/command/deploy/mock_client_test.go b/internal/command/deploy/mock_client_test.go index 5169342d2a..e2c21d3611 100644 --- a/internal/command/deploy/mock_client_test.go +++ b/internal/command/deploy/mock_client_test.go @@ -26,6 +26,10 @@ type mockFlapsClient struct { breakDestroy bool breakLease bool + // uncordonTransientFailures causes Uncordon to fail this many times before + // succeeding, simulating transient API errors for retry tests. + uncordonTransientFailures int + // mu to protect the members below. mu sync.Mutex machines []*fly.Machine @@ -308,6 +312,14 @@ func (m *mockFlapsClient) Uncordon(ctx context.Context, appName, machineID strin return fmt.Errorf("failed to uncordon %s", machineID) } + m.mu.Lock() + defer m.mu.Unlock() + + if m.uncordonTransientFailures > 0 { + m.uncordonTransientFailures-- + return fmt.Errorf("transient error uncordoning %s", machineID) + } + return nil } diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index 319df2a6b8..de57f5b7ee 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -85,6 +85,9 @@ type blueGreen struct { waitBeforeStop time.Duration waitBeforeCordon time.Duration + + uncordonRetryAttempts uint + uncordonRetryDelay time.Duration } func BlueGreenStrategy(md *machineDeployment, blueMachines []*machineUpdateEntry) *blueGreen { @@ -123,6 +126,9 @@ func (bg *blueGreen) initialize() { bg.waitBeforeStop = 10 * time.Second bg.waitBeforeCordon = 10 * time.Second + + bg.uncordonRetryAttempts = 5 + bg.uncordonRetryDelay = 500 * time.Millisecond } func (bg *blueGreen) isAborted() bool { @@ -478,7 +484,20 @@ func (bg *blueGreen) MarkGreenMachinesAsReadyForTraffic(ctx context.Context) err if bg.isAborted() { return ErrAborted } - err := bg.flaps.Uncordon(ctx, bg.app.Name, gm.Machine().ID, "") + err := retry.Do( + func() error { + return bg.flaps.Uncordon(ctx, bg.app.Name, gm.Machine().ID, "") + }, + retry.Context(ctx), + retry.Attempts(bg.uncordonRetryAttempts), + retry.Delay(bg.uncordonRetryDelay), + retry.MaxDelay(30*time.Second), + retry.DelayType(retry.BackOffDelay), + retry.OnRetry(func(n uint, err error) { + fmt.Fprintf(bg.io.ErrOut, " Retrying uncordon for machine %s (attempt %d/%d): %v\n", + bg.colorize.Bold(gm.FormattedMachineId()), n+2, bg.uncordonRetryAttempts, err) + }), + ) if err != nil { return err } diff --git a/internal/command/deploy/strategy_bluegreen_test.go b/internal/command/deploy/strategy_bluegreen_test.go index d6771b7608..874e26aa7b 100644 --- a/internal/command/deploy/strategy_bluegreen_test.go +++ b/internal/command/deploy/strategy_bluegreen_test.go @@ -2,11 +2,12 @@ package deploy import ( "context" + "fmt" "testing" "time" "github.com/stretchr/testify/assert" - "github.com/superfly/fly-go" + fly "github.com/superfly/fly-go" "github.com/superfly/fly-go/flaps" "github.com/superfly/flyctl/internal/appconfig" "github.com/superfly/flyctl/internal/flapsutil" @@ -53,6 +54,7 @@ func newBlueGreenStrategy(client flapsutil.FlapsClient, numberOfExistingMachines // Don't have to wait during tests. strategy.waitBeforeStop = 0 strategy.waitBeforeCordon = 0 + strategy.uncordonRetryDelay = 0 return strategy } @@ -97,6 +99,56 @@ func TestDeploy(t *testing.T) { }) } +func TestMarkGreenMachinesAsReadyForTrafficRetries(t *testing.T) { + ios, _, _, _ := iostreams.Test() + + // makeStrategyWithGreenMachines builds a blueGreen with pre-populated green + // machines, letting us test MarkGreenMachinesAsReadyForTraffic in isolation + // without running the full deploy pipeline. + makeStrategyWithGreenMachines := func(client *mockFlapsClient, greenCount int) *blueGreen { + bg := newBlueGreenStrategy(client, 0) + for i := range greenCount { + bg.greenMachines = append(bg.greenMachines, &machineUpdateEntry{ + leasableMachine: machine.NewLeasableMachine(client, ios, "test-app", &fly.Machine{ID: fmt.Sprintf("green-%d", i+1)}, false), + launchInput: &fly.LaunchMachineInput{}, + }) + } + return bg + } + + ctx := context.Background() + + t.Run("succeeds immediately when no errors occur", func(t *testing.T) { + client := &mockFlapsClient{} + bg := makeStrategyWithGreenMachines(client, 3) + + err := bg.MarkGreenMachinesAsReadyForTraffic(ctx) + assert.NoError(t, err) + }) + + t.Run("succeeds after transient uncordon failures are retried", func(t *testing.T) { + client := &mockFlapsClient{uncordonTransientFailures: 2} + bg := makeStrategyWithGreenMachines(client, 1) + + err := bg.MarkGreenMachinesAsReadyForTraffic(ctx) + assert.NoError(t, err) + + client.mu.Lock() + remaining := client.uncordonTransientFailures + client.mu.Unlock() + assert.Equal(t, 0, remaining, "all transient failures should have been consumed by retries") + }) + + t.Run("fails after all retry attempts are exhausted", func(t *testing.T) { + client := &mockFlapsClient{breakUncordon: true} + bg := makeStrategyWithGreenMachines(client, 1) + bg.uncordonRetryAttempts = 3 + + err := bg.MarkGreenMachinesAsReadyForTraffic(ctx) + assert.ErrorContains(t, err, "failed to uncordon") + }) +} + func FuzzDeploy(f *testing.F) { flapsClient := &mockFlapsClient{} From 1d949324fb329dffcf7455fcfac95894e1b70a0e Mon Sep 17 00:00:00 2001 From: lubien Date: Wed, 22 Apr 2026 09:43:39 -0300 Subject: [PATCH 2/2] deploy: retry uncordon with exponential backoff in bluegreen strategy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit What and Why: A transient HTTP 408 from the uncordon API endpoint was enough to fail an otherwise healthy bluegreen deployment and trigger a full rollback, destroying every newly-created green machine even though they had all started and passed health checks. Retrying the uncordon call with exponential backoff makes the deployment resilient to transient API timeouts without any change to the rollback logic. How: MarkGreenMachinesAsReadyForTraffic wraps each Uncordon call in retry.Do (already imported) with exponential backoff: up to 5 attempts, initial delay 500 ms, capped at 30 s. A message is printed to stderr on each retry so the operator can see what is happening. Two new fields on blueGreen control the behaviour: uncordonRetryAttempts – total attempts (default 5) uncordonRetryDelay – initial delay (default 500 ms) Both are zeroed in the test helper so all tests remain instant. The mock gains an uncordonTransientFailures counter (protected by the existing mutex) that fails exactly N times then succeeds, enabling targeted retry-path coverage in TestMarkGreenMachinesAsReadyForTrafficRetries: - succeeds immediately when no errors occur - succeeds after transient uncordon failures are retried - fails after all retry attempts are exhausted Related to: n/a --- - [x] n/a --- internal/command/deploy/mock_client_test.go | 1 + internal/command/deploy/strategy_bluegreen_test.go | 1 + 2 files changed, 2 insertions(+) diff --git a/internal/command/deploy/mock_client_test.go b/internal/command/deploy/mock_client_test.go index e2c21d3611..b7b30d84d3 100644 --- a/internal/command/deploy/mock_client_test.go +++ b/internal/command/deploy/mock_client_test.go @@ -317,6 +317,7 @@ func (m *mockFlapsClient) Uncordon(ctx context.Context, appName, machineID strin if m.uncordonTransientFailures > 0 { m.uncordonTransientFailures-- + return fmt.Errorf("transient error uncordoning %s", machineID) } diff --git a/internal/command/deploy/strategy_bluegreen_test.go b/internal/command/deploy/strategy_bluegreen_test.go index 874e26aa7b..cc08fe5449 100644 --- a/internal/command/deploy/strategy_bluegreen_test.go +++ b/internal/command/deploy/strategy_bluegreen_test.go @@ -113,6 +113,7 @@ func TestMarkGreenMachinesAsReadyForTrafficRetries(t *testing.T) { launchInput: &fly.LaunchMachineInput{}, }) } + return bg }