From 3c3a355aa25720a6bee8af2cf6340f24b0498b2b Mon Sep 17 00:00:00 2001
From: Allison Pierson
Date: Wed, 31 May 2023 19:36:10 -0500
Subject: [PATCH] `migrate-to-v2 debug`: tool to diagnose & repair failed migrations

---
 internal/command/migrate_to_v2/debug.go       | 366 ++++++++++++++++++
 .../command/migrate_to_v2/migrate_to_v2.go    |   3 +
 internal/command/migrate_to_v2/nomad.go       |  37 +-
 internal/command/status/machines.go           |   2 +-
 internal/command/status/status.go             |   2 +-
 5 files changed, 397 insertions(+), 13 deletions(-)
 create mode 100644 internal/command/migrate_to_v2/debug.go

diff --git a/internal/command/migrate_to_v2/debug.go b/internal/command/migrate_to_v2/debug.go
new file mode 100644
index 0000000000..297d4dc11e
--- /dev/null
+++ b/internal/command/migrate_to_v2/debug.go
@@ -0,0 +1,366 @@
+package migrate_to_v2
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/AlecAivazis/survey/v2"
+	"github.com/jpillora/backoff"
+	"github.com/samber/lo"
+	"github.com/spf13/cobra"
+	"github.com/superfly/flyctl/api"
+	"github.com/superfly/flyctl/client"
+	"github.com/superfly/flyctl/flaps"
+	"github.com/superfly/flyctl/internal/appconfig"
+	"github.com/superfly/flyctl/internal/command"
+	"github.com/superfly/flyctl/internal/command/apps"
+	"github.com/superfly/flyctl/internal/command/machine"
+	"github.com/superfly/flyctl/internal/command/status"
+	"github.com/superfly/flyctl/internal/flag"
+	"github.com/superfly/flyctl/internal/render"
+	"github.com/superfly/flyctl/iostreams"
+)
+
+// newDebug builds the `debug` subcommand, which runs runDebug. It passes the
+// RequireSession, RequireAppName and LoadAppConfigIfPresent options to
+// command.New, rejects positional arguments, and adds the app flags.
+func newDebug() *cobra.Command {
+	const (
+		usage = `debug`
+		long  = `Debug an app that has been migrated to Apps V2`
+		short = long
+	)
+	cmd := command.New(
+		usage, short, long, runDebug,
+		command.RequireSession,
+		command.RequireAppName,
+		command.LoadAppConfigIfPresent,
+	)
+	cmd.Args = cobra.NoArgs
+	flag.Add(cmd,
+		flag.AppConfig(),
+		flag.App(),
+	)
+	return cmd
+}
+
+// hasMigrated reports whether app looks like it has been through the Apps V2
+// migration. A detached platform version counts as migrated and a nomad one
+// as not migrated; otherwise it looks for a machine carrying previous-alloc
+// metadata and then for a recent release authored by admin-bot@fly.io.
+// Failing to list releases is treated as "not migrated".
+func hasMigrated(ctx context.Context, app *api.AppCompact, machines []*api.Machine) bool {
+	client := client.FromContext(ctx).API()
+
+	if app.PlatformVersion == appconfig.DetachedPlatform {
+		return true
+	}
+
+	// Check that the app is not currently on nomad
+	if app.PlatformVersion == appconfig.NomadPlatform {
+		return false
+	}
+
+	// Look for a machine tied to a previous alloc
+	for _, machine := range machines {
+		if machine.Config != nil && machine.Config.Metadata != nil {
+			if _, ok := machine.Config.Metadata[api.MachineConfigMetadataKeyFlyPreviousAlloc]; ok {
+				return true
+			}
+		}
+	}
+
+	// Look for a release created by admin-bot@fly.io
+	releases, err := client.GetAppReleasesMachines(ctx, app.Name, "", 25)
+	if err != nil {
+		return false
+	}
+	for _, release := range releases {
+		// Technically, I don't think this is the only time we could use admin-bot@fly.io,
+		// but we use it infrequently and soon we'll be done dealing with this,
+		// so it's probably an acceptable way to determine this for now.
+		if release.User.Email == "admin-bot@fly.io" {
+			return true
+		}
+	}
+	return false
+}
+
+const timedOutErr = "timed out"
+
+// errTimedOut is what backoffWait returns when its cutoff elapses before the
+// callback reports completion. Match it with errors.Is.
+var errTimedOut = errors.New(timedOutErr)
+
+// backoffWait calls cb until it reports done or returns an error, pausing
+// between attempts with a jittered backoff (Min 1s, Max 10s, Factor 1.2).
+// It returns errTimedOut once cutoff has elapsed, or the context's error if
+// ctx is cancelled first.
+func backoffWait(ctx context.Context, cutoff time.Duration, cb func() (bool, error)) error {
+	ctx, cancelFn := context.WithTimeout(ctx, cutoff)
+	defer cancelFn()
+	b := &backoff.Backoff{
+		Min:    1 * time.Second,
+		Max:    10 * time.Second,
+		Factor: 1.2,
+		Jitter: true,
+	}
+	for {
+		// Check for deadline or cancellation before every attempt.
+		if err := ctx.Err(); err != nil {
+			if errors.Is(err, context.DeadlineExceeded) {
+				return errTimedOut
+			}
+			return err
+		}
+		done, err := cb()
+		if err != nil {
+			return err
+		}
+		if done {
+			return nil
+		}
+		// Wait on the context too, so the pause cannot overshoot the cutoff
+		// and a cancellation is noticed right away; the check at the top of
+		// the loop turns an expired context into the returned error.
+		select {
+		case <-ctx.Done():
+		case <-time.After(b.Duration()):
+		}
+	}
+}
+
+// unsuspend resumes app when its status is "suspended" and then polls for up
+// to one minute until the API stops reporting it as suspended. Apps in any
+// other status are left untouched.
+func unsuspend(ctx context.Context, app *api.AppCompact) error {
+	if app.Status != "suspended" {
+		return nil
+	}
+
+	client := client.FromContext(ctx).API()
+	if _, err := client.ResumeApp(ctx, app.Name); err != nil {
+		return err
+	}
+
+	err := backoffWait(ctx, 1*time.Minute, func() (bool, error) {
+		current, err := client.GetAppCompact(ctx, app.Name)
+		if err != nil {
+			return false, err
+		}
+		return current.Status != "suspended", nil
+	})
+	if errors.Is(err, errTimedOut) {
+		return errors.New("timed out waiting for app to unsuspend")
+	}
+	return err
+}
+
+// runDebug is the entry point of the `debug` subcommand. It refuses to run on
+// apps that do not appear to have been migrated, resumes the app if it is
+// suspended (skipped when the platform version is already machines), and then
+// either walks the user through repairing a detached app or removes nomad
+// allocs left behind on a V2 app.
+func runDebug(ctx context.Context) error {
+	io := iostreams.FromContext(ctx)
+	client := client.FromContext(ctx).API()
+	app, err := client.GetAppCompact(ctx, flag.GetApp(ctx))
+	if err != nil {
+		return err
+	}
+	flapsClient, err := flaps.NewFromAppName(ctx, app.Name)
+	if err != nil {
+		return fmt.Errorf("could not create flaps client: %w", err)
+	}
+	ctx = flaps.NewContext(ctx, flapsClient)
+
+	// Grab the list of machines
+	// Useful to have, but also used to determine if the app has been migrated
+	var machines []*api.Machine
+	if app.PlatformVersion != appconfig.NomadPlatform {
+		machines, err = flapsClient.List(ctx, "")
+		if err != nil {
+			return fmt.Errorf("could not list machines: %w", err)
+		}
+	}
+
+	if !hasMigrated(ctx, app, machines) {
+		return errors.New("app has not been migrated to Apps V2")
+	}
+
+	if app.PlatformVersion != appconfig.MachinesPlatform {
+		err := unsuspend(ctx, app)
+		if err != nil {
+			return fmt.Errorf("could not unsuspend app: %w", err)
+		}
+	}
+
+	// Grab nomad allocs now that we know the app has been migrated
+	allocs, err := client.GetAllocations(ctx, app.Name, false)
+	if err != nil {
+		return fmt.Errorf("could not list nomad allocs: %w", err)
+	}
+
+	if app.PlatformVersion == appconfig.DetachedPlatform {
+		fmt.Fprintf(io.Out, `The app's platform version is 'detached'
+This means that the app is stuck in a half-migrated state, and wasn't able to
+be fully recovered during the migration error rollback process.
+
+Fixing this depends on how far the app got in the migration process.
+Please use these tools to troubleshoot and attempt to repair the app.
+`)
+		return fixDetachedApp(ctx, app, machines, allocs)
+	}
+
+	if app.PlatformVersion == appconfig.MachinesPlatform {
+		if len(allocs) != 0 {
+			fmt.Fprintf(io.Out, "Detected nomad allocs running on V2 app, cleaning up.\n")
+			return zeroNomadUseMachines(ctx, app, allocs)
+		}
+	}
+
+	fmt.Fprintf(io.Out, "No issues detected.\n")
+
+	return nil
+}
+
+// zeroNomadUseMachines switches app to the detached platform version (unless
+// it is already detached), scales every VM group named by allocs down to zero
+// on nomad, and then sets the platform version to machines.
+func zeroNomadUseMachines(ctx context.Context, app *api.AppCompact, allocs []*api.AllocationStatus) error {
+
+	io := iostreams.FromContext(ctx)
+
+	if app.PlatformVersion != appconfig.DetachedPlatform {
+		err := setPlatformVersion(ctx, appconfig.DetachedPlatform)
+		if err != nil {
+			return err
+		}
+	}
+
+	fmt.Fprintf(io.Out, "Destroying nomad allocs and setting platform version to machines/Apps V2.\n")
+	vmGroups := lo.Uniq(lo.Map(allocs, func(alloc *api.AllocationStatus, _ int) string {
+		return alloc.TaskName
+	}))
+	err := scaleNomadToZero(ctx, app, "", vmGroups)
+	if err != nil {
+		return err
+	}
+	err = setPlatformVersion(ctx, appconfig.MachinesPlatform)
+	if err != nil {
+		return err
+	}
+	fmt.Fprint(io.Out, "Done!\n")
+	return nil
+}
+
+// setPlatformVersion calls apps.UpdateAppPlatformVersion with ver for the app
+// returned by flag.GetApp(ctx).
+func setPlatformVersion(ctx context.Context, ver string) error {
+	return apps.UpdateAppPlatformVersion(ctx, flag.GetApp(ctx), ver)
+}
+
+// fixDetachedApp repairs an app whose platform version is detached. When only
+// one of machines/allocs is populated (or neither), the matching platform
+// version is set and the function returns. When both exist, the user is
+// prompted in a loop to inspect either side or to destroy one of them.
+func fixDetachedApp(
+	ctx context.Context,
+	app *api.AppCompact,
+	machines []*api.Machine,
+	allocs []*api.AllocationStatus,
+) error {
+	io := iostreams.FromContext(ctx)
+	client := client.FromContext(ctx).API()
+	if len(machines) == 0 && len(allocs) == 0 {
+		fmt.Fprintf(io.Out, "No machines or allocs found. Setting platform version to machines/Apps V2.\n")
+		return setPlatformVersion(ctx, appconfig.MachinesPlatform)
+	}
+
+	if len(machines) == 0 {
+		fmt.Fprintf(io.Out, "No Apps v2 machines found. Setting platform version to nomad.\n")
+		// Return here: there is nothing left to choose between, and the
+		// result of the update must reach the caller.
+		return setPlatformVersion(ctx, appconfig.NomadPlatform)
+	}
+
+	if len(allocs) == 0 {
+		fmt.Fprintf(io.Out, "No legacy nomad allocs found. Setting platform version to machines/Apps V2.\n")
+		return setPlatformVersion(ctx, appconfig.MachinesPlatform)
+	}
+
+	const (
+		PrintNomad              = "List Nomad allocs"
+		PrintMachines           = "List Machines"
+		DestroyNomadUseMachines = "Destroy remaining Nomad allocs and use Apps V2"
+		DestroyMachinesUseNomad = "Destroy existing machines and use Nomad"
+		Exit                    = "Exit"
+	)
+
+	// Lifted from command/status/status.go
+	var appStatus *api.AppStatus
+	var err error
+	if appStatus, err = client.GetAppStatus(ctx, app.Name, false); err != nil {
+		return fmt.Errorf("failed retrieving app %s: %w", app.Name, err)
+	}
+	var backupRegions []api.Region
+	if appStatus.Deployed {
+		if _, backupRegions, err = client.ListAppRegions(ctx, app.Name); err != nil {
+			return fmt.Errorf("failed retrieving backup regions for %s: %w", app.Name, err)
+		}
+	}
+
+	for {
+		var opt struct{ Choice string }
+		err := survey.Ask([]*survey.Question{{
+			Name: "choice",
+			Prompt: &survey.Select{
+				Message: "What would you like to do?",
+				Options: []string{
+					PrintNomad,
+					PrintMachines,
+					DestroyNomadUseMachines,
+					DestroyMachinesUseNomad,
+					Exit,
+				},
+				Default: PrintMachines,
+			},
+		}}, &opt)
+		if err != nil {
+			return err
+		}
+		switch opt.Choice {
+		case PrintNomad:
+			fmt.Fprintf(io.Out, "Nomad allocs:\n")
+			err = render.AllocationStatuses(io.Out, "Nomad Allocs", backupRegions, appStatus.Allocations...)
+			if err != nil {
+				return err
+			}
+		case PrintMachines:
+			if err := status.RenderMachineStatus(ctx, app, io.Out); err != nil {
+				return err
+			}
+		case DestroyNomadUseMachines:
+			return zeroNomadUseMachines(ctx, app, allocs)
+		case DestroyMachinesUseNomad:
+			fmt.Fprintf(io.Out, "Destroying machines and setting platform version to nomad.\n")
+
+			for _, mach := range machines {
+				err := machine.Destroy(ctx, app, mach, true)
+				if err != nil {
+					return fmt.Errorf("could not destroy machine: %w", err)
+				}
+			}
+			err = setPlatformVersion(ctx, appconfig.NomadPlatform)
+			if err != nil {
+				return err
+			}
+			fmt.Fprint(io.Out, "Done!\n")
+			return nil
+		case Exit:
+			return nil
+		}
+	}
+}
diff --git a/internal/command/migrate_to_v2/migrate_to_v2.go b/internal/command/migrate_to_v2/migrate_to_v2.go
index ad4f972b56..8a081a4102 100644
--- a/internal/command/migrate_to_v2/migrate_to_v2.go
+++ b/internal/command/migrate_to_v2/migrate_to_v2.go
@@ -76,6 +76,9 @@ func newMigrateToV2() *cobra.Command {
 			Description: "Specify primary region if one is not set in fly.toml",
 		},
 	)
+
+	cmd.AddCommand(newDebug())
+
 	return cmd
 }
 
diff --git a/internal/command/migrate_to_v2/nomad.go b/internal/command/migrate_to_v2/nomad.go
index 0dd94ef864..8adacee9d8 100644
--- a/internal/command/migrate_to_v2/nomad.go
+++ b/internal/command/migrate_to_v2/nomad.go
@@ -9,7 +9,10 @@ import (
 	"github.com/briandowns/spinner"
 	"github.com/jpillora/backoff"
 	"github.com/samber/lo"
+	"github.com/superfly/flyctl/api"
+	"github.com/superfly/flyctl/client"
 	"github.com/superfly/flyctl/gql"
+	"github.com/superfly/flyctl/iostreams"
 )
 
 func (m *v2PlatformMigrator) lockApp(ctx context.Context) error {
@@ -56,34 +59,46 @@ func (m *v2PlatformMigrator) unlockApp(ctx context.Context) error {
 }
 
 func (m *v2PlatformMigrator) scaleNomadToZero(ctx context.Context) error {
+	err := scaleNomadToZero(ctx, m.appCompact, m.appLock, lo.Keys(m.oldVmCounts))
+	if err != nil {
+		return err
+	}
+	m.recovery.scaledToZero = true
+	return nil
+}
+
+func scaleNomadToZero(ctx context.Context, app *api.AppCompact, lock string, vmGroups []string) error {
+
+	gqlClient := client.FromContext(ctx).API().GenqClient
+
 	input := gql.SetVMCountInput{
-		AppId:  m.appConfig.AppName,
-		LockId: m.appLock,
-		GroupCounts: lo.MapToSlice(m.oldVmCounts, func(name string, count int) gql.VMCountInput {
+		AppId:  app.Name,
+		LockId: lock,
+		GroupCounts: lo.Map(vmGroups, func(name string, _ int) gql.VMCountInput {
 			return gql.VMCountInput{Group: name, Count: 0}
 		}),
 	}
 
 	if len(input.GroupCounts) > 0 {
-
-		_, err := gql.SetNomadVMCount(ctx, m.gqlClient, input)
+		_, err := gql.SetNomadVMCount(ctx, gqlClient, input)
 		if err != nil {
 			return err
 		}
 	}
 
-	err := m.waitForAllocsZero(ctx)
+	err := waitForAllocsZero(ctx, app)
 	if err != nil {
 		return err
 	}
-	m.recovery.scaledToZero = true
 	return nil
 }
 
-func (m *v2PlatformMigrator) waitForAllocsZero(ctx context.Context) error {
+func waitForAllocsZero(ctx context.Context, app *api.AppCompact) error {
+	io := iostreams.FromContext(ctx)
+	apiClient := client.FromContext(ctx).API()
 	s := spinner.New(spinner.CharSets[9], 200*time.Millisecond)
-	s.Writer = m.io.ErrOut
-	s.Prefix = fmt.Sprintf("Waiting for nomad allocs for '%s' to be destroyed ", m.appCompact.Name)
+	s.Writer = io.ErrOut
+	s.Prefix = fmt.Sprintf("Waiting for nomad allocs for '%s' to be destroyed ", app.Name)
 	s.Start()
 	defer s.Stop()
 
@@ -97,7 +112,7 @@ func (m *v2PlatformMigrator) waitForAllocsZero(ctx context.Context) error {
 	for {
 		select {
 		case <-time.After(b.Duration()):
-			currentAllocs, err := m.apiClient.GetAllocations(ctx, m.appCompact.Name, false)
+			currentAllocs, err := apiClient.GetAllocations(ctx, app.Name, false)
 			if err != nil {
 				return err
 			}
diff --git a/internal/command/status/machines.go b/internal/command/status/machines.go
index 62580348d7..61befabf52 100644
--- a/internal/command/status/machines.go
+++ b/internal/command/status/machines.go
@@ -72,7 +72,7 @@ func getImage(machines []*api.Machine) (string, error) {
 	return latestImage, nil
 }
 
-func renderMachineStatus(ctx context.Context, app *api.AppCompact, out io.Writer) error {
+func RenderMachineStatus(ctx context.Context, app *api.AppCompact, out io.Writer) error {
 	var (
 		io       = iostreams.FromContext(ctx)
 		colorize = io.ColorScheme()
diff --git a/internal/command/status/status.go b/internal/command/status/status.go
index d47ad78a0c..00da332315 100644
--- a/internal/command/status/status.go
+++ b/internal/command/status/status.go
@@ -106,7 +106,7 @@ func once(ctx context.Context, out io.Writer) (err error) {
 
 	platformVersion := app.PlatformVersion
 	if platformVersion == "machines" {
-		err = renderMachineStatus(ctx, app, out)
+		err = RenderMachineStatus(ctx, app, out)
 		return
 	} else {
 		command.PromptToMigrate(ctx, app)