Skip to content

Commit

Permalink
feat: support etcd recovery from snapshot on bootstrap
Browse files Browse the repository at this point in the history
When a Talos `controlplane` node is waiting for bootstrap, `etcd`
contents can be recovered from a snapshot created with
`talosctl etcd snapshot` on a healthy cluster.

The bootstrap process proceeds the same way as before, but the etcd data
directory is recovered from the snapshot.

This flow enables disaster recovery for the control plane: given that
periodic backups are available, destroy control plane nodes, re-create
them with the same config, and bootstrap one node with the saved
snapshot to recover etcd state at the time of the snapshot.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
  • Loading branch information
smira authored and talos-bot committed Apr 8, 2021
1 parent 247bd50 commit e065021
Show file tree
Hide file tree
Showing 19 changed files with 2,158 additions and 1,397 deletions.
24 changes: 23 additions & 1 deletion api/machine/machine.proto
Expand Up @@ -16,7 +16,16 @@ import "common/common.proto";
service MachineService {
rpc ApplyConfiguration(ApplyConfigurationRequest)
returns (ApplyConfigurationResponse);

// Bootstrap method makes control plane node enter etcd bootstrap mode.
//
// Node aborts etcd join sequence and creates single-node etcd cluster.
//
// If recover_etcd argument is specified, etcd is recovered from a snapshot
// uploaded with EtcdRecover.

rpc Bootstrap(BootstrapRequest) returns (BootstrapResponse);

rpc Containers(ContainersRequest) returns (ContainersResponse);
rpc Copy(CopyRequest) returns (stream common.Data);
rpc CPUInfo(google.protobuf.Empty) returns (CPUInfoResponse);
Expand All @@ -31,6 +40,12 @@ service MachineService {
rpc EtcdForfeitLeadership(EtcdForfeitLeadershipRequest)
returns (EtcdForfeitLeadershipResponse);

// EtcdRecover method uploads etcd data snapshot created with EtcdSnapshot
// to the node.
//
// Snapshot can be later used to recover the cluster via Bootstrap method.
rpc EtcdRecover(stream common.Data) returns (EtcdRecoverResponse);

// EtcdSnapshot method creates etcd data snapshot (backup) from the local etcd instance
// and streams it back to the client.
//
Expand Down Expand Up @@ -88,7 +103,9 @@ message Reboot { common.Metadata metadata = 1; }
message RebootResponse { repeated Reboot messages = 1; }

// rpc bootstrap
message BootstrapRequest {}
message BootstrapRequest {
bool recover_etcd = 1;
}

// The bootstrap message containing the bootstrap status.
message Bootstrap { common.Metadata metadata = 1; }
Expand Down Expand Up @@ -771,6 +788,11 @@ message EtcdMemberListResponse { repeated EtcdMemberList messages = 1; }

message EtcdSnapshotRequest {}

// EtcdRecover is the element type of EtcdRecoverResponse, the reply of the
// EtcdRecover rpc. Its only field is the common response metadata.
message EtcdRecover {
common.Metadata metadata = 1;
}
// EtcdRecoverResponse is the reply of the EtcdRecover rpc: a list of EtcdRecover messages.
message EtcdRecoverResponse { repeated EtcdRecover messages = 1; }

// rpc generateConfiguration

message RouteConfig {
Expand Down
49 changes: 45 additions & 4 deletions cmd/talosctl/cmd/talos/bootstrap.go
Expand Up @@ -7,21 +7,61 @@ package talos
import (
"context"
"fmt"
"os"

"github.com/spf13/cobra"
"go.etcd.io/etcd/etcdctl/v3/snapshot"

machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
"github.com/talos-systems/talos/pkg/machinery/client"
)

// bootstrapCmdFlags holds the command-line flag values of the bootstrap command.
var bootstrapCmdFlags struct {
// recoverFrom is the value of the --recover-from flag: the path of the etcd
// snapshot file to upload before bootstrapping. Empty (the default) means
// no recovery from a snapshot is requested.
recoverFrom string
}

// bootstrapCmd represents the bootstrap command.
var bootstrapCmd = &cobra.Command{
Use: "bootstrap",
Short: "Bootstrap the cluster",
Long: ``,
Args: cobra.NoArgs,
Short: "Bootstrap the etcd cluster on the specified node.",
Long: `When Talos cluster is created etcd service on control plane nodes enters the join loop waiting
to join etcd peers from other control plane nodes. One node should be picked as the bootstrap node.
When bootstrap command is issued, the node aborts join process and bootstraps etcd cluster as a single node cluster.
Other control plane nodes will join etcd cluster once Kubernetes is bootstrapped on the bootstrap node.
This command should not be used when "init" type nodes are used.
Talos etcd cluster can be recovered from a known snapshot with '--recover-from=' flag.`,
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
return WithClient(func(ctx context.Context, c *client.Client) error {
if err := c.Bootstrap(ctx); err != nil {
if bootstrapCmdFlags.recoverFrom != "" {
manager := snapshot.NewV3(nil)

status, err := manager.Status(bootstrapCmdFlags.recoverFrom)
if err != nil {
return err
}

fmt.Printf("recovering from snapshot %q: hash %08x, revision %d, total keys %d, total size %d\n",
bootstrapCmdFlags.recoverFrom, status.Hash, status.Revision, status.TotalKey, status.TotalSize)

snapshot, err := os.Open(bootstrapCmdFlags.recoverFrom)
if err != nil {
return fmt.Errorf("error opening snapshot file: %w", err)
}

defer snapshot.Close() //nolint: errcheck

_, err = c.EtcdRecover(ctx, snapshot)
if err != nil {
return fmt.Errorf("error uploading snapshot: %w", err)
}
}

if err := c.Bootstrap(ctx, &machineapi.BootstrapRequest{
RecoverEtcd: bootstrapCmdFlags.recoverFrom != "",
}); err != nil {
return fmt.Errorf("error executing bootstrap: %s", err)
}

Expand All @@ -31,5 +71,6 @@ var bootstrapCmd = &cobra.Command{
}

// init declares the --recover-from string flag on bootstrapCmd (stored in
// bootstrapCmdFlags.recoverFrom, empty by default) and then passes the
// command to addCommand.
func init() {
	flags := bootstrapCmd.Flags()
	flags.StringVar(&bootstrapCmdFlags.recoverFrom, "recover-from", "", "recover etcd cluster from the snapshot")

	addCommand(bootstrapCmd)
}
11 changes: 11 additions & 0 deletions cmd/talosctl/cmd/talos/etcd.go
Expand Up @@ -15,6 +15,7 @@ import (
"text/tabwriter"

"github.com/spf13/cobra"
"go.etcd.io/etcd/etcdctl/v3/snapshot"

"github.com/talos-systems/talos/cmd/talosctl/pkg/talos/helpers"
"github.com/talos-systems/talos/pkg/cli"
Expand Down Expand Up @@ -183,6 +184,16 @@ var etcdSnapshotCmd = &cobra.Command{

fmt.Printf("etcd snapshot saved to %q (%d bytes)\n", dbPath, size)

manager := snapshot.NewV3(nil)

status, err := manager.Status(dbPath)
if err != nil {
return err
}

fmt.Printf("snapshot info: hash %08x, revision %d, total keys %d, total size %d\n",
status.Hash, status.Revision, status.TotalKey, status.TotalSize)

return nil
})
},
Expand Down
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -81,6 +81,7 @@ require (
github.com/vmware/vmw-guestinfo v0.0.0-20200218095840-687661b8bd8e
go.etcd.io/etcd/api/v3 v3.5.0-alpha.0
go.etcd.io/etcd/client/v3 v3.5.0-alpha.0
go.etcd.io/etcd/etcdctl/v3 v3.5.0-alpha.0
go.etcd.io/etcd/pkg/v3 v3.5.0-alpha.0
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
Expand Down

0 comments on commit e065021

Please sign in to comment.