role_manager.go

package manager

import (
	"context"
	"time"

	"code.cloudfoundry.org/clock"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/state/raft"
	"github.com/docker/swarmkit/manager/state/raft/membership"
	"github.com/docker/swarmkit/manager/state/store"
)

const (
	// roleReconcileInterval is how often to retry reconciling a node's role or
	// removing a node, if a previous attempt failed
	roleReconcileInterval = 5 * time.Second

	// removalTimeout is how long to wait for a raft member removal to be applied
	// to the store before giving up
	removalTimeout = 5 * time.Second
)

// roleManager reconciles the raft member list with desired role changes.
type roleManager struct {
	ctx    context.Context
	cancel func()

	store    *store.MemoryStore
	raft     *raft.Node
	doneChan chan struct{}

	// pendingReconciliation contains changed nodes that have not yet been
	// reconciled in the raft member list.
	pendingReconciliation map[string]*api.Node

	// pendingRemoval contains the IDs of nodes that have been deleted - if these
	// correspond to members in the raft cluster, those members need to be removed
	// from raft
	pendingRemoval map[string]struct{}

	// leave this nil except for tests, which need to inject a fake time source
	clocksource clock.Clock
}

// newRoleManager creates a new roleManager.
func newRoleManager(store *store.MemoryStore, raftNode *raft.Node) *roleManager {
	ctx, cancel := context.WithCancel(context.Background())
	return &roleManager{
		ctx:                   ctx,
		cancel:                cancel,
		store:                 store,
		raft:                  raftNode,
		doneChan:              make(chan struct{}),
		pendingReconciliation: make(map[string]*api.Node),
		pendingRemoval:        make(map[string]struct{}),
	}
}
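
// A minimal lifecycle sketch (hypothetical caller; memoryStore and raftNode
// stand in for whatever the surrounding manager wires up):
//
//	rm := newRoleManager(memoryStore, raftNode)
//	go rm.Run(ctx) // reconcile roles until stopped
//	// ... later, during shutdown:
//	rm.Stop() // cancels rm.ctx and waits for Run to exit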

// getTicker returns a ticker based on the configured clock source
func (rm *roleManager) getTicker(interval time.Duration) clock.Ticker {
	if rm.clocksource == nil {
		return clock.NewClock().NewTicker(interval)
	}
	return rm.clocksource.NewTicker(interval)
}
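
// In a test, a fake time source can be injected before Run starts so the retry
// tick can be advanced manually. A sketch, assuming the fakeclock helper that
// ships alongside code.cloudfoundry.org/clock:
//
//	fakeClock := fakeclock.NewFakeClock(time.Now())
//	rm.clocksource = fakeClock
//	go rm.Run(ctx)
//	fakeClock.Increment(roleReconcileInterval) // fire the next retry tick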

// Run is roleManager's main loop. On startup, it looks at every node object in the
// cluster and attempts to reconcile the raft member list with all the nodes' desired
// roles. If any nodes need to be demoted or promoted, it adds them to a reconciliation
// queue, and if any raft member's node has been deleted, it adds that member to a
// removal queue.
// These queues are processed immediately, and any nodes that failed to be processed
// are retried on the next reconciliation interval, so that nodes will eventually be
// reconciled. As node updates come in, any promotions or demotions are also added to
// the reconciliation queue and reconciled. As node removals come in, they are added
// to the removal queue to be removed from the raft cluster.
// Removal from a raft cluster is idempotent (and it's the only raft cluster change
// that occurs during reconciliation or removal), so it's fine if a node is in both
// the removal and reconciliation queues.
// The ctx param is only used for logging.
func (rm *roleManager) Run(ctx context.Context) {
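	// close doneChan when the main loop exits, so that Stop can return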
	defer close(rm.doneChan)

	var (
		nodes []*api.Node

		// ticker and tickerCh are used to time the reconciliation interval, which
		// will periodically attempt to re-reconcile nodes that failed to reconcile
		// the first time through
		ticker   clock.Ticker
		tickerCh <-chan time.Time
	)
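
	// ViewAndWatch takes a snapshot of all nodes and subscribes to subsequent
	// node update/delete events in one step, so changes between the snapshot
	// and the watch are not missed.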
	watcher, cancelWatch, err := store.ViewAndWatch(rm.store,
		func(readTx store.ReadTx) error {
			var err error
			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		api.EventUpdateNode{},
		api.EventDeleteNode{})
	defer cancelWatch()

	if err != nil {
		log.G(ctx).WithError(err).Error("failed to check nodes for role changes")
	} else {
		// Assume all raft members have been deleted from the cluster, until the node
		// list tells us otherwise. We can make this assumption because the node object
		// must exist before the raft member object.
		// Background on the life-cycle of a manager: it joins the cluster and gets a
		// new TLS certificate. To get a TLS certificate, it makes an RPC call to the
		// CA server, which on a successful join adds its information to the cluster
		// node list and eventually generates a TLS certificate for it. Once it has a
		// TLS certificate, it can contact the other nodes, and it makes an RPC call
		// to request to join the raft cluster. The node it contacts adds the new node
		// to the raft membership.
		for _, member := range rm.raft.GetMemberlist() {
			rm.pendingRemoval[member.NodeID] = struct{}{}
		}
		for _, node := range nodes {
			// if the node still exists, it should not necessarily be removed from
			// the raft membership
			delete(rm.pendingRemoval, node.ID)
			// reconcile each existing node
			rm.pendingReconciliation[node.ID] = node
			rm.reconcileRole(ctx, node)
		}
		for nodeID := range rm.pendingRemoval {
			rm.evictRemovedNode(ctx, nodeID)
		}
		// If any reconciliations or member removals failed, start the ticker so
		// that they are retried every roleReconcileInterval until both queues are
		// empty.
		if len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0 {
			ticker = rm.getTicker(roleReconcileInterval)
			tickerCh = ticker.C()
		}
	}

	for {
		select {
		case event := <-watcher:
			switch ev := event.(type) {
			case api.EventUpdateNode:
				rm.pendingReconciliation[ev.Node.ID] = ev.Node
				rm.reconcileRole(ctx, ev.Node)
			case api.EventDeleteNode:
				rm.pendingRemoval[ev.Node.ID] = struct{}{}
				rm.evictRemovedNode(ctx, ev.Node.ID)
			}
			// If any reconciliations or member removals failed, start the ticker so
			// that they are retried every roleReconcileInterval until both queues
			// are empty.
			if (len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0) && ticker == nil {
				ticker = rm.getTicker(roleReconcileInterval)
				tickerCh = ticker.C()
			}
		case <-tickerCh:
			for _, node := range rm.pendingReconciliation {
				rm.reconcileRole(ctx, node)
			}
			for nodeID := range rm.pendingRemoval {
				rm.evictRemovedNode(ctx, nodeID)
			}
			if len(rm.pendingReconciliation) == 0 && len(rm.pendingRemoval) == 0 {
				ticker.Stop()
				ticker = nil
				tickerCh = nil
			}
		case <-rm.ctx.Done():
			if ticker != nil {
				ticker.Stop()
			}
			return
		}
	}
}

// evictRemovedNode evicts a removed node from the raft cluster membership. This
// covers an edge case in which a node was removed but its role was somehow not
// reconciled (possibly a demotion and a removal happened in rapid succession
// before the raft membership configuration change went through).
func (rm *roleManager) evictRemovedNode(ctx context.Context, nodeID string) {
	// Check if the member still exists in the membership
	member := rm.raft.GetMemberByNodeID(nodeID)
	if member != nil {
		// We first try to remove the raft node from the raft cluster. On the next
		// tick, once the node has been removed from the cluster membership, we
		// delete it from the pendingRemoval list.
		rm.removeMember(ctx, member)
		return
	}
	delete(rm.pendingRemoval, nodeID)
}

// removeMember removes a member from the raft cluster membership.
func (rm *roleManager) removeMember(ctx context.Context, member *membership.Member) {
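	// Worked example of the quorum arithmetic below (hypothetical numbers): in a
	// cluster of 3 raft members, quorum is 2. If one member is already unreachable
	// and we removed another, the cluster would shrink to 2 members with only 1
	// reachable - below the new quorum of 2 - so CanRemoveMember refuses and the
	// removal is retried on a later pass.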
	// Quorum safeguard - quorum should have been checked before a node was allowed
	// to be demoted, but if some other node disconnected in the intervening time,
	// removing this node would result in a loss of cluster quorum. In that case we
	// leave the member in place; the removal stays queued and is retried on a later
	// reconciliation pass.
	if !rm.raft.CanRemoveMember(member.RaftID) {
		// TODO(aaronl): Retry later
		log.G(ctx).Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", member.NodeID)
		return
	}

	rmCtx, rmCancel := context.WithTimeout(rm.ctx, removalTimeout)
	defer rmCancel()

	if member.RaftID == rm.raft.Config.ID {
		// Don't use rmCtx, because we expect to lose leadership, which will cancel
		// this context.
		log.G(ctx).Info("demoted; transferring leadership")
		err := rm.raft.TransferLeadership(context.Background())
		if err == nil {
			return
		}
		log.G(ctx).WithError(err).Info("failed to transfer leadership")
	}
	if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil {
		// TODO(aaronl): Retry later
		log.G(ctx).WithError(err).Debugf("can't demote node %s at this time", member.NodeID)
	}
}

// reconcileRole looks at the desired role for a node, and if it is being demoted
// or promoted, updates the node role accordingly. If the node is being demoted,
// it also removes the node from the raft cluster membership.
func (rm *roleManager) reconcileRole(ctx context.Context, node *api.Node) {
	if node.Role == node.Spec.DesiredRole {
		// Nothing to do.
		delete(rm.pendingReconciliation, node.ID)
		return
	}

	// Promotion can proceed right away.
	if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker {
		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
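			// Re-read inside the transaction and only proceed if the node still
			// matches what we observed; if another writer changed it, skip - the
			// update event for the newer state will trigger a fresh reconciliation.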
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleManager
			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.G(ctx).WithError(err).Errorf("failed to promote node %s", node.ID)
		} else {
			delete(rm.pendingReconciliation, node.ID)
		}
	} else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager {
		// Check for the node in the raft memberlist
		member := rm.raft.GetMemberByNodeID(node.ID)
		if member != nil {
			// We first try to remove the raft node from the raft cluster. On the
			// next tick, once the node has been removed from the cluster membership,
			// we update the store to reflect the fact that it has been successfully
			// demoted, and if that works, remove it from the pending list.
			rm.removeMember(ctx, member)
			return
		}
		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleWorker
			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.G(ctx).WithError(err).Errorf("failed to demote node %s", node.ID)
		} else {
			delete(rm.pendingReconciliation, node.ID)
		}
	}
}

// Stop stops the roleManager and waits for the main loop to exit.
func (rm *roleManager) Stop() {
	rm.cancel()
	<-rm.doneChan
}