Skip to content

Commit 650dda7

Browse files
authoredSep 27, 2024
Implement Force Removal Taint Logic (#246)
* Add force taint logic * Comment updates * Update controller_test * Update controller_scale_node_group_test * Update controller filter logic * Update scale_down_test * Add logic for forceTaints and dryMode in controller filter * Add force tainted node check to controller tests * Add force taint removal unit tests * Add force taint docs
1 parent f5e40a7 commit 650dda7

9 files changed

+250
-44
lines changed
 

‎docs/node-termination.md

+12
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,15 @@ metadata:
5252
annotations:
5353
atlassian.com/no-delete: "testing some long running bpf tracing"
5454
```
55+
56+
## Force Tainting
57+
58+
Force Tainting provides a mechanism for Escalator to promptly remove nodes tainted by an external system, or administrator.
59+
60+
This is implemented by applying a taint to the node, with the following key:
61+
62+
```
63+
atlassian.com/escalator-force
64+
```
65+
66+
The node will be removed as soon as all non-daemonset pods are completed. This is useful when nodes must be removed asap, but running pods should not be killed.

‎pkg/controller/controller.go

+60-28
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ type NodeGroupState struct {
3131
scaleUpLock scaleLock
3232

3333
// used for tracking which nodes are tainted. testing when in dry mode
34-
taintTracker []string
34+
taintTracker []string
35+
forceTaintTracker []string
3536

3637
// used for tracking scale delta across runs, useful for reducing hysteresis
3738
scaleDelta int
@@ -54,11 +55,12 @@ type Opts struct {
5455
// scaleOpts provides options for a scale function
5556
// wraps options that would be passed as args
5657
type scaleOpts struct {
57-
nodes []*v1.Node
58-
taintedNodes []*v1.Node
59-
untaintedNodes []*v1.Node
60-
nodeGroup *NodeGroupState
61-
nodesDelta int
58+
nodes []*v1.Node
59+
taintedNodes []*v1.Node
60+
forceTaintedNodes []*v1.Node
61+
untaintedNodes []*v1.Node
62+
nodeGroup *NodeGroupState
63+
nodesDelta int
6264
}
6365

6466
// NewController creates a new controller with the specified options
@@ -116,36 +118,52 @@ func (c *Controller) dryMode(nodeGroup *NodeGroupState) bool {
116118
}
117119

118120
// filterNodes separates nodes between tainted and untainted nodes
119-
func (c *Controller) filterNodes(nodeGroup *NodeGroupState, allNodes []*v1.Node) (untaintedNodes, taintedNodes, cordonedNodes []*v1.Node) {
121+
func (c *Controller) filterNodes(nodeGroup *NodeGroupState, allNodes []*v1.Node) (untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes []*v1.Node) {
120122
untaintedNodes = make([]*v1.Node, 0, len(allNodes))
121123
taintedNodes = make([]*v1.Node, 0, len(allNodes))
124+
forceTaintedNodes = make([]*v1.Node, 0, len(allNodes))
122125
cordonedNodes = make([]*v1.Node, 0, len(allNodes))
123126

124127
for _, node := range allNodes {
128+
var tainted bool
129+
var forceTainted bool
130+
var untainted bool
131+
125132
if c.dryMode(nodeGroup) {
126-
var contains bool
127133
for _, name := range nodeGroup.taintTracker {
128134
if node.Name == name {
129-
contains = true
135+
tainted = true
130136
break
131137
}
132138
}
133-
if !contains {
134-
untaintedNodes = append(untaintedNodes, node)
135-
} else {
136-
taintedNodes = append(taintedNodes, node)
139+
140+
for _, name := range nodeGroup.forceTaintTracker {
141+
if node.Name == name {
142+
forceTainted = true
143+
break
144+
}
137145
}
146+
147+
untainted = !forceTainted && !tainted
148+
138149
} else {
139150
// If the node is Unschedulable (cordoned), separate it out from the tainted/untainted
140151
if node.Spec.Unschedulable {
141152
cordonedNodes = append(cordonedNodes, node)
142153
continue
143154
}
144-
if _, tainted := k8s.GetToBeRemovedTaint(node); !tainted {
145-
untaintedNodes = append(untaintedNodes, node)
146-
} else {
147-
taintedNodes = append(taintedNodes, node)
148-
}
155+
156+
_, forceTainted = k8s.GetToBeForceRemovedTaint(node)
157+
_, tainted = k8s.GetToBeRemovedTaint(node)
158+
untainted = !forceTainted && !tainted
159+
}
160+
161+
if forceTainted {
162+
forceTaintedNodes = append(forceTaintedNodes, node)
163+
} else if tainted {
164+
taintedNodes = append(taintedNodes, node)
165+
} else if untainted {
166+
untaintedNodes = append(untaintedNodes, node)
149167
}
150168
}
151169

@@ -210,20 +228,22 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
210228
}
211229

212230
// Filter into untainted and tainted nodes
213-
untaintedNodes, taintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)
231+
untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)
214232

215233
// Metrics and Logs
216234
log.WithField("nodegroup", nodegroup).Infof("pods total: %v", len(pods))
217235
log.WithField("nodegroup", nodegroup).Infof("nodes remaining total: %v", len(allNodes))
218236
log.WithField("nodegroup", nodegroup).Infof("cordoned nodes remaining total: %v", len(cordonedNodes))
219237
log.WithField("nodegroup", nodegroup).Infof("nodes remaining untainted: %v", len(untaintedNodes))
220238
log.WithField("nodegroup", nodegroup).Infof("nodes remaining tainted: %v", len(taintedNodes))
239+
log.WithField("nodegroup", nodegroup).Infof("nodes remaining force tainted: %v", len(forceTaintedNodes))
221240
log.WithField("nodegroup", nodegroup).Infof("Minimum Node: %v", nodeGroup.Opts.MinNodes)
222241
log.WithField("nodegroup", nodegroup).Infof("Maximum Node: %v", nodeGroup.Opts.MaxNodes)
223242
metrics.NodeGroupNodes.WithLabelValues(nodegroup).Set(float64(len(allNodes)))
224243
metrics.NodeGroupNodesCordoned.WithLabelValues(nodegroup).Set(float64(len(cordonedNodes)))
225244
metrics.NodeGroupNodesUntainted.WithLabelValues(nodegroup).Set(float64(len(untaintedNodes)))
226245
metrics.NodeGroupNodesTainted.WithLabelValues(nodegroup).Set(float64(len(taintedNodes)))
246+
metrics.NodeGroupNodesForceTainted.WithLabelValues(nodegroup).Set(float64(len(forceTaintedNodes)))
227247
metrics.NodeGroupPods.WithLabelValues(nodegroup).Set(float64(len(pods)))
228248

229249
// We want to be really simple right now so we don't do anything if we are outside the range of allowed nodes
@@ -288,11 +308,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
288308
if len(untaintedNodes) < nodeGroup.Opts.MinNodes {
289309
log.WithField("nodegroup", nodegroup).Warn("There are less untainted nodes than the minimum")
290310
result, err := c.ScaleUp(scaleOpts{
291-
nodes: allNodes,
292-
nodesDelta: nodeGroup.Opts.MinNodes - len(untaintedNodes),
293-
nodeGroup: nodeGroup,
294-
taintedNodes: taintedNodes,
295-
untaintedNodes: untaintedNodes,
311+
nodes: allNodes,
312+
nodesDelta: nodeGroup.Opts.MinNodes - len(untaintedNodes),
313+
nodeGroup: nodeGroup,
314+
taintedNodes: taintedNodes,
315+
forceTaintedNodes: forceTaintedNodes,
316+
untaintedNodes: untaintedNodes,
296317
})
297318
if err != nil {
298319
log.WithField("nodegroup", nodegroup).Error(err)
@@ -382,10 +403,21 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
382403
log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta)
383404

384405
scaleOptions := scaleOpts{
385-
nodes: allNodes,
386-
taintedNodes: taintedNodes,
387-
untaintedNodes: untaintedNodes,
388-
nodeGroup: nodeGroup,
406+
nodes: allNodes,
407+
taintedNodes: taintedNodes,
408+
forceTaintedNodes: forceTaintedNodes,
409+
untaintedNodes: untaintedNodes,
410+
nodeGroup: nodeGroup,
411+
}
412+
413+
// Check for nodes tainted for force removal
414+
var forceRemoved int
415+
var forceActionErr error
416+
forceRemoved, forceActionErr = c.TryRemoveForceTaintedNodes(scaleOptions)
417+
log.WithField("nodegroup", nodegroup).Infof("Reaper: There were %v empty nodes force deleted this round", forceRemoved)
418+
419+
if forceActionErr != nil {
420+
log.WithField("nodegroup", nodegroup).Error(forceActionErr)
389421
}
390422

391423
// Perform a scale up, do nothing or scale down based on the nodes delta

‎pkg/controller/controller_scale_node_group_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func TestUntaintNodeGroupMinNodes(t *testing.T) {
126126
_, err = controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name])
127127
assert.NoError(t, err)
128128

129-
untainted, tainted, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes)
129+
untainted, tainted, _, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes)
130130
// Ensure that the tainted nodes where untainted
131131
assert.Equal(t, minNodes, len(untainted))
132132
assert.Equal(t, 0, len(tainted))
@@ -193,7 +193,7 @@ func TestUntaintNodeGroupMaxNodes(t *testing.T) {
193193
_, err = controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name])
194194
require.NoError(t, err)
195195

196-
untainted, tainted, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes)
196+
untainted, tainted, _, _ := controller.filterNodes(nodeGroupsState[nodeGroup.Name], nodes)
197197
// Ensure that the tainted nodes where untainted
198198
assert.Equal(t, maxNodes, len(untainted))
199199
assert.Equal(t, 0, len(tainted))

‎pkg/controller/controller_test.go

+16-7
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ func TestControllerFilterNodes(t *testing.T) {
105105
Name: "n6",
106106
Tainted: false,
107107
}),
108+
6: test.BuildTestNode(test.NodeOpts{
109+
Name: "n7",
110+
ForceTainted: true,
111+
}),
108112
}
109113

110114
type args struct {
@@ -113,11 +117,12 @@ func TestControllerFilterNodes(t *testing.T) {
113117
master bool
114118
}
115119
tests := []struct {
116-
name string
117-
args args
118-
wantUntaintedNodes []*v1.Node
119-
wantTaintedNodes []*v1.Node
120-
wantCordonedNodes []*v1.Node
120+
name string
121+
args args
122+
wantUntaintedNodes []*v1.Node
123+
wantTaintedNodes []*v1.Node
124+
wantForceTaintedNodes []*v1.Node
125+
wantCordonedNodes []*v1.Node
121126
}{
122127
{
123128
"basic filter not drymode",
@@ -132,6 +137,7 @@ func TestControllerFilterNodes(t *testing.T) {
132137
},
133138
[]*v1.Node{nodes[1], nodes[3], nodes[5]},
134139
[]*v1.Node{nodes[0], nodes[2], nodes[4]},
140+
[]*v1.Node{nodes[6]},
135141
[]*v1.Node{},
136142
},
137143
{
@@ -141,13 +147,15 @@ func TestControllerFilterNodes(t *testing.T) {
141147
Opts: NodeGroupOptions{
142148
DryMode: true,
143149
},
144-
taintTracker: []string{"n1", "n3", "n5"},
150+
taintTracker: []string{"n1", "n3", "n5"},
151+
forceTaintTracker: []string{"n7"},
145152
},
146153
nodes,
147154
true,
148155
},
149156
[]*v1.Node{nodes[1], nodes[3], nodes[5]},
150157
[]*v1.Node{nodes[0], nodes[2], nodes[4]},
158+
[]*v1.Node{nodes[6]},
151159
[]*v1.Node{},
152160
},
153161
}
@@ -158,9 +166,10 @@ func TestControllerFilterNodes(t *testing.T) {
158166
DryMode: tt.args.master,
159167
},
160168
}
161-
gotUntaintedNodes, gotTaintedNodes, gotCordonedNodes := c.filterNodes(tt.args.nodeGroup, tt.args.allNodes)
169+
gotUntaintedNodes, gotTaintedNodes, gotForceTaintedNodes, gotCordonedNodes := c.filterNodes(tt.args.nodeGroup, tt.args.allNodes)
162170
assert.Equal(t, tt.wantUntaintedNodes, gotUntaintedNodes)
163171
assert.Equal(t, tt.wantTaintedNodes, gotTaintedNodes)
172+
assert.Equal(t, tt.wantForceTaintedNodes, gotForceTaintedNodes)
164173
assert.Equal(t, tt.wantCordonedNodes, gotCordonedNodes)
165174
})
166175
}

‎pkg/controller/scale_down.go

+24
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,26 @@ func safeFromDeletion(node *v1.Node) (string, bool) {
4545
return "", false
4646
}
4747

48+
// TryRemoveForceTaintedNodes attempts to remove nodes that are
49+
// * Tainted with force remove and empty
50+
func (c *Controller) TryRemoveForceTaintedNodes(opts scaleOpts) (int, error) {
51+
var toBeDeleted []*v1.Node
52+
drymode := c.dryMode(opts.nodeGroup)
53+
54+
for _, candidate := range opts.forceTaintedNodes {
55+
if k8s.NodeEmpty(candidate, opts.nodeGroup.NodeInfoMap) {
56+
if !drymode {
57+
toBeDeleted = append(toBeDeleted, candidate)
58+
}
59+
log.WithField("drymode", drymode).WithField("nodegroup", opts.nodeGroup.Opts.Name).Infof("Node %v, %v ready to be force deleted", candidate.Name, candidate.Spec.ProviderID)
60+
} else {
61+
log.WithField("drymode", drymode).WithField("nodegroup", opts.nodeGroup.Opts.Name).Infof("Node %v, %v not ready to be force deleted", candidate.Name, candidate.Spec.ProviderID)
62+
}
63+
}
64+
65+
return TryDeleteNodes(c, opts, toBeDeleted)
66+
}
67+
4868
// TryRemoveTaintedNodes attempts to remove nodes are
4969
// * tainted and empty
5070
// * have passed their grace period
@@ -98,6 +118,10 @@ func (c *Controller) TryRemoveTaintedNodes(opts scaleOpts) (int, error) {
98118
}
99119
}
100120

121+
return TryDeleteNodes(c, opts, toBeDeleted)
122+
}
123+
124+
func TryDeleteNodes(c *Controller, opts scaleOpts, toBeDeleted []*v1.Node) (int, error) {
101125
if len(toBeDeleted) > 0 {
102126
podsRemaining := 0
103127
for _, nodeToBeDeleted := range toBeDeleted {

0 commit comments

Comments
 (0)
Failed to load comments.