Skip to content

Commit

Permalink
feat: provide an option to recover etcd from data directory copy
Browse files Browse the repository at this point in the history
Sometimes `talosctl etcd snapshot` might not be available, for example
when etcd is not healthy. In that case it's possible to copy the raw etcd
data directory with `talosctl cp /var/lib/etcd .` and use
`member/snap/db` to recover the cluster. But such a copy won't pass the
snapshot integrity checks, so the checks have to be disabled explicitly.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
(cherry picked from commit 0bd8b0e)
  • Loading branch information
smira committed Apr 19, 2021
1 parent 54f243c commit 9fde88c
Show file tree
Hide file tree
Showing 7 changed files with 1,172 additions and 1,134 deletions.
9 changes: 8 additions & 1 deletion api/machine/machine.proto
Expand Up @@ -106,9 +106,16 @@ message ApplyConfigurationResponse { repeated ApplyConfiguration messages = 1; }
message Reboot { common.Metadata metadata = 1; }
message RebootResponse { repeated Reboot messages = 1; }

// rpc bootstrap
// rpc Bootstrap
message BootstrapRequest {
// Enable etcd recovery from a snapshot.
//
// The snapshot should be uploaded via the EtcdRecover RPC before this call.
bool recover_etcd = 1;
// Skip the hash check on the etcd snapshot.
//
// Enable this when recovering from a copy of the etcd data directory:
// such a copy does not pass the integrity check.
bool recover_skip_hash_check = 2;
}

// The bootstrap message containing the bootstrap status.
Expand Down
7 changes: 5 additions & 2 deletions cmd/talosctl/cmd/talos/bootstrap.go
Expand Up @@ -17,7 +17,8 @@ import (
)

var bootstrapCmdFlags struct {
recoverFrom string
recoverFrom string
recoverSkipHashCheck bool
}

// bootstrapCmd represents the bootstrap command.
Expand Down Expand Up @@ -60,7 +61,8 @@ Talos etcd cluster can be recovered from a known snapshot with '--recover-from='
}

if err := c.Bootstrap(ctx, &machineapi.BootstrapRequest{
RecoverEtcd: bootstrapCmdFlags.recoverFrom != "",
RecoverEtcd: bootstrapCmdFlags.recoverFrom != "",
RecoverSkipHashCheck: bootstrapCmdFlags.recoverSkipHashCheck,
}); err != nil {
return fmt.Errorf("error executing bootstrap: %s", err)
}
Expand All @@ -72,5 +74,6 @@ Talos etcd cluster can be recovered from a known snapshot with '--recover-from='

// init declares the command-line flags of the bootstrap command, binding them
// to the fields of bootstrapCmdFlags, and then hands the command to addCommand.
func init() {
	flagSet := bootstrapCmd.Flags()

	// Both flags are optional: an empty --recover-from and a false
	// --recover-skip-hash-check are the defaults.
	flagSet.StringVar(&bootstrapCmdFlags.recoverFrom, "recover-from", "", "recover etcd cluster from the snapshot")
	flagSet.BoolVar(&bootstrapCmdFlags.recoverSkipHashCheck, "recover-skip-hash-check", false, "skip integrity check when recovering etcd (use when recovering from data directory copy)")

	addCommand(bootstrapCmd)
}
Expand Up @@ -1731,8 +1731,9 @@ func BootstrapEtcd(seq runtime.Sequence, data interface{}) (runtime.TaskExecutio
}

svc := &services.Etcd{
Bootstrap: true,
RecoverFromSnapshot: req.RecoverEtcd,
Bootstrap: true,
RecoverFromSnapshot: req.RecoverEtcd,
RecoverSkipHashCheck: req.RecoverSkipHashCheck,
}

if err = system.Services(r).Unload(ctx, svc.ID(r)); err != nil {
Expand Down
7 changes: 5 additions & 2 deletions internal/app/machined/pkg/system/services/etcd.go
Expand Up @@ -46,8 +46,9 @@ import (
// Etcd implements the Service interface. It serves as the concrete type with
// the required methods.
type Etcd struct {
Bootstrap bool
RecoverFromSnapshot bool
Bootstrap bool
RecoverFromSnapshot bool
RecoverSkipHashCheck bool

args []string
}
Expand Down Expand Up @@ -503,6 +504,8 @@ func (e *Etcd) recoverFromSnapshot(hostname, primaryAddr string) error {
PeerURLs: []string{"https://" + net.FormatAddress(primaryAddr) + ":2380"},

InitialCluster: fmt.Sprintf("%s=https://%s:2380", hostname, net.FormatAddress(primaryAddr)),

SkipHashCheck: e.RecoverSkipHashCheck,
}); err != nil {
return fmt.Errorf("error recovering from the snapshot: %w", err)
}
Expand Down
2,264 changes: 1,141 additions & 1,123 deletions pkg/machinery/api/machine/machine.pb.go

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions website/content/docs/v0.10/Reference/api.md
Expand Up @@ -669,12 +669,17 @@ The bootstrap message containing the bootstrap status.
<a name="machine.BootstrapRequest"></a>

### BootstrapRequest
rpc bootstrap
rpc Bootstrap


| Field | Type | Label | Description |
| ----- | ---- | ----- | ----------- |
| recover_etcd | [bool](#bool) | | |
| recover_etcd | [bool](#bool) | | Enable etcd recovery from the snapshot.

Snapshot should be uploaded before this call via EtcdRecover RPC. |
| recover_skip_hash_check | [bool](#bool) | | Skip hash check on the snapshot (etcd).

Enable this when recovering from data directory copy to skip integrity check. |



Expand Down
5 changes: 3 additions & 2 deletions website/content/docs/v0.10/Reference/cli.md
Expand Up @@ -60,8 +60,9 @@ talosctl bootstrap [flags]
### Options

```
-h, --help help for bootstrap
--recover-from string recover etcd cluster from the snapshot
-h, --help help for bootstrap
--recover-from string recover etcd cluster from the snapshot
--recover-skip-hash-check skip integrity check when recovering etcd (use when recovering from data directory copy)
```

### Options inherited from parent commands
Expand Down

0 comments on commit 9fde88c

Please sign in to comment.