Skip to content

Commit

Permalink
feat: audit and remove etcd members if machines no longer exist
Browse files Browse the repository at this point in the history
This PR introduces auditing of etcd membership in the workload
clusters. If the workload cluster's etcd contains members that don't
correspond to any control plane (CP) node, those members are removed.
This helps in the case where a machine is deleted out from under us and
a new CP machine comes up but can't join etcd.

Signed-off-by: Spencer Smith <robertspencersmith@gmail.com>
  • Loading branch information
rsmitty authored and talos-bot committed Jun 7, 2021
1 parent 182f656 commit fb0257d
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 28 deletions.
107 changes: 107 additions & 0 deletions controllers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ package controllers

import (
"context"
"fmt"
"strings"

"github.com/talos-systems/talos/pkg/machinery/api/machine"
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
Expand Down Expand Up @@ -45,3 +47,108 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *

return nil
}

// forceEtcdLeave issues an EtcdRemoveMember request for memberName through the
// Talos client c, using ctx as the request context. It is meant for the case
// where the member's machine was deleted before it could leave etcd on its own.
//
// The cluster argument is currently not read by this function.
// Any error returned by the remove request is passed back unchanged.
func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
	r.Log.Info("Removing etcd member", "memberName", memberName)

	req := &machine.EtcdRemoveMemberRequest{Member: memberName}

	return c.EtcdRemoveMember(ctx, req)
}

// auditEtcd walks the etcd member list of the workload cluster and removes every
// member that has no matching control plane machine.
//
// The first control plane machine that is not being deleted is used as the etcd
// endpoint; its first internal IP becomes the node target of the Talos API calls.
//
// It returns nil when there are no control plane machines or no etcd members.
// It returns an error when:
//   - the machine list, kubeconfig or Talos client cannot be obtained,
//   - any control plane machine has no noderef yet (the audit is postponed),
//   - no machine is usable as an endpoint or it has no internal IP,
//   - the member list request fails or comes back without any message,
//   - removing a stale member fails.
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster client.ObjectKey, cpName string) error {
	machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
	if err != nil {
		return err
	}

	if len(machines) == 0 {
		return nil
	}

	// Collect the short node name of every control plane machine once, so the
	// member check below is a single lookup per member.
	nodeNames := make(map[string]struct{}, len(machines))

	for i := range machines {
		nodeRef := machines[i].Status.NodeRef

		// nb: we'll assume any machine that doesn't have a noderef is new and we can audit later because
		// otherwise a new etcd member can get removed before even getting the noderef set by the CAPI controllers.
		if nodeRef == nil {
			return fmt.Errorf("some CP machines do not have a noderef")
		}

		// Keep only the first label of the noderef name in case it's an fqdn (like in AWS).
		nodeNames[strings.SplitN(nodeRef.Name, ".", 2)[0]] = struct{}{}
	}

	// Select the first CP machine that's not being deleted. Every machine is
	// known to have a noderef at this point.
	var (
		designatedCPMachine capiv1.Machine
		found               bool
	)

	for i := range machines {
		if machines[i].ObjectMeta.DeletionTimestamp.IsZero() {
			designatedCPMachine = machines[i]
			found = true

			break
		}
	}

	if !found {
		return fmt.Errorf("no CP machine available to use as etcd endpoint: all machines are being deleted")
	}

	clientset, err := r.kubeconfigForCluster(ctx, cluster)
	if err != nil {
		return err
	}

	c, err := r.talosconfigForMachine(ctx, clientset, designatedCPMachine)
	if err != nil {
		return err
	}

	// Release the client's connection once the audit is done.
	defer c.Close() //nolint:errcheck

	// Save the first internal IP of the designated machine to use as our node target
	// and setup the ctx to target it.
	var firstIntAddr string

	for _, addr := range designatedCPMachine.Status.Addresses {
		if addr.Type == capiv1.MachineInternalIP {
			firstIntAddr = addr.Address

			break
		}
	}

	if firstIntAddr == "" {
		return fmt.Errorf("no internal IP found for CP machine %q", designatedCPMachine.Name)
	}

	nodeCtx := talosclient.WithNodes(ctx, firstIntAddr)

	response, err := c.EtcdMemberList(nodeCtx, &machine.EtcdMemberListRequest{})
	if err != nil {
		return err
	}

	// Only querying one CP node, so only 1 message should return; guard against
	// an empty response instead of indexing blindly.
	if len(response.Messages) == 0 {
		return fmt.Errorf("etcd member list response from %q contained no messages", firstIntAddr)
	}

	memberList := response.Messages[0]

	// Remove every etcd member whose name doesn't match any machine's node name.
	for _, member := range memberList.Members {
		if _, present := nodeNames[member]; present {
			continue
		}

		r.Log.Info("found etcd member that doesn't exist as controlplane machine", "member", member)

		if err := r.forceEtcdLeave(nodeCtx, c, cluster, member); err != nil {
			return fmt.Errorf("removing etcd member %q: %w", member, err)
		}
	}

	return nil
}
6 changes: 6 additions & 0 deletions controllers/taloscontrolplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,12 @@ func (r *TalosControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Resu
return ctrl.Result{}, err
}

// Audit the etcd member list to remove any nodes that no longer exist
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name); err != nil {
logger.Info("failed to check etcd membership list", "error", err)
return ctrl.Result{Requeue: true}, nil
}

return ctrl.Result{}, nil
}

Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ go 1.16
require (
cloud.google.com/go v0.47.0 // indirect
github.com/go-logr/logr v0.1.0
github.com/onsi/ginkgo v1.12.1
github.com/onsi/ginkgo v1.15.0
github.com/onsi/gomega v1.10.1
github.com/pkg/errors v0.9.1
github.com/talos-systems/cluster-api-bootstrap-provider-talos v0.2.0-alpha.10
github.com/talos-systems/talos/pkg/machinery v0.0.0-20210218160848-32d25885288f
github.com/talos-systems/cluster-api-bootstrap-provider-talos v0.2.0-alpha.12
github.com/talos-systems/talos/pkg/machinery v0.0.0-20210520203624-828772cec9a3
k8s.io/api v0.17.9
k8s.io/apimachinery v0.17.9
k8s.io/apiserver v0.17.9
Expand Down
Loading

0 comments on commit fb0257d

Please sign in to comment.