-
Notifications
You must be signed in to change notification settings - Fork 1
/
kube-aws-updater
executable file
·458 lines (395 loc) · 15.1 KB
/
kube-aws-updater
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
#!/usr/bin/env bash
# Shell style guide: https://google.github.io/styleguide/shell.xml
# ## (-) master and worker steps
# ## (*) worker specific steps
#
# - label nodes
# - disable scheduling
# - double ASG size
# - wait for 2x group
# - stop ASG actions
# - drain/terminate labelled nodes (sequentially)
# - resize ASG to original size
# - re-enable ASG actions
# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Static vars
# File that the script output is tee'd into when a promtail config is given.
LOG_FILE='kube-updater.log'

# Command used to obtain the local host name. It is expanded unquoted by its
# consumer so that the two-word fallback splits into command + argument.
# If there is no "hostname" command, try "hostnamectl hostname".
hn=hostname
# Discard the resolved path: only the exit status matters here.
command -v hostname > /dev/null || hn="hostnamectl hostname"
source ./report.sh
# Write a timestamped message to stderr.
#
# Arguments: the message words; they are joined via "${*}" and backslash
#            escapes (e.g. "\n") inside the message are expanded.
# Outputs:   "[YYYY-MM-DDTHH:MM:SS]: <message>" on stderr, nothing on stdout.
log() {
  printf '[%s]: %b\n' "$(date +'%Y-%m-%dT%H:%M:%S')" "${*}" >&2
}
# Print the command-line help text to stderr.
#
# Globals read: timeout, node_batch_size (interpolated into the text as the
#               documented defaults, so both must be set before calling).
usage() {
  cat >&2 <<EOF
Usage: $0 -c <kube_context> -p <aws_profile> -r <role> -s [retire_time_label]
-t [timeout] -n [node_batch_size] -e [true] -l [promtail_cfg]
-s Resume node rolling. Argument is retire_time label
-t Node drain timeout. How long to wait for the node to drain before
shutting it down (in seconds, default ${timeout}s)
-l Path to promtail config (when set stdout will be streamed to loki by proxy of a log file)
-n Node batch size. How many nodes to add to the ASG. Also, how many nodes
will be drained before waiting for the ASG to stabilise again. It needs
to be a multiple of 3 (default ${node_batch_size})
-e Enable reports. If "true", show reports on cluster state on every
step (default "false")
EOF
}
# Parse and validate the command-line options.
#
# Globals written: aws_opts, aws_profile, kube_context, role, resume,
#                  retire_time, timeout, promtail_cfg, node_batch_size and
#                  report (the latter only when -e is passed).
# Globals read:    LOG_FILE and hn (only when -l is passed).
# Arguments:       the script's command-line arguments.
# Side effects:    with -l, starts promtail in the background and redirects
#                  stdout and stderr of the rest of the run through tee into
#                  LOG_FILE.
# Exits:           0 after printing usage for -h; 1 on a missing kube
#                  context, a resume without role, an invalid node batch
#                  size, an unusable AWS profile or a missing promtail.
parse_opts() {
  # flags
  aws_opts=()
  aws_profile=''
  kube_context=''
  role=''
  resume=''
  retire_time=$(date +"%Y-%m-%dT%H-%M-%SZ")
  timeout=600
  promtail_cfg=''
  node_batch_size=3

  local flag
  while getopts 'c:p:r:hs:t:n:e:l:' flag; do
    case "${flag}" in
      c) kube_context="${OPTARG}" ;;
      p) aws_profile="${OPTARG}" ;;
      r) role="${OPTARG}" ;;
      s) resume="${OPTARG}" ;;
      t) timeout="${OPTARG}" ;;
      l) promtail_cfg="${OPTARG}" ;;
      n) node_batch_size="${OPTARG}" ;;
      e) report="${OPTARG}" ;;
      h) usage && exit 0 ;;
      *) log "Unexpected option: ${flag}" && usage && exit 1 ;;
    esac
  done

  ### Validation
  if [[ -z "${kube_context}" ]]; then
    usage
    exit 1
  fi

  if [[ -n "${resume}" ]] && [[ -z "${role}" ]]; then
    log "If you are resuming, you need to provide a role"
    usage
    exit 1
  fi

  # When resuming, reuse the label of the interrupted run instead of the
  # freshly generated timestamp.
  if [[ -n "${resume}" ]]; then
    retire_time=${resume}
  fi

  # Only plain decimal digits are accepted: anything else (negative numbers,
  # trailing garbage, expressions) must not reach arithmetic evaluation.
  # The 10# prefix keeps values with leading zeros from being read as octal.
  if [[ ! "${node_batch_size}" =~ ^[0-9]+$ ]] || (( 10#${node_batch_size} == 0 )); then
    log "Unsupported node_batch_size value '${node_batch_size}'. Please use a value greater than 0"
    exit 1
  fi
  node_batch_size=$(( 10#${node_batch_size} ))

  if (( node_batch_size % 3 != 0 )); then
    log "Unsupported node_batch_size value '${node_batch_size}'. Please use a multiple of 3"
    exit 1
  fi

  if [[ -n "${aws_profile}" ]]; then
    if ! aws configure list --profile "${aws_profile}" &> /dev/null; then
      log "Invalid profile: ${aws_profile}"
      exit 1
    fi
    aws_opts=("--profile=${aws_profile}")
    # Verify that the profile's credentials actually work. The exit is done
    # in the current shell rather than in a subshell so that it does not
    # depend on errexit to stop the script.
    if ! aws "${aws_opts[@]}" sts get-caller-identity &> /dev/null; then
      log "failed to get-caller-identity profile=${aws_profile}"
      exit 1
    fi
  fi

  if [[ -n "${promtail_cfg}" ]]; then
    if ! command -v promtail &> /dev/null; then
      log "Promtail not installed"
      exit 1
    fi
    # ${hn} is intentionally unquoted: it may hold "hostnamectl hostname".
    promtail --config.file="${promtail_cfg}" --client.external-labels=context="${kube_context}",host="$($hn)" &
    touch "${LOG_FILE}"
    sleep 2 # give promtail time to start
    exec &> >(tee "${LOG_FILE}") # redirect execution through tee
  fi
}
# Verify that every named command (external program, builtin or function) is
# resolvable via `command -v`.
#
# Arguments: names of the commands to look up.
# Exits:     1 after logging the space-separated list of unresolved names.
checkdeps() {
  local tool
  local absent=""

  for tool in "$@"; do
    command -v "${tool}" &> /dev/null || absent+="${tool} "
  done

  [[ -z "${absent}" ]] && return 0

  log "Missing dependencies: ${absent}"
  exit 1
}
# Run a command until it succeeds, pausing between attempts.
#
# Arguments:   the command and its arguments.
# Environment: RETRY_MAX_ATTEMPTS  total number of attempts (default 12)
#              RETRY_DELAY_SECONDS pause between attempts (default 8)
# Returns:     0 as soon as one attempt succeeds.
# Exits:       1 once the final attempt has failed. When called inside a
#              command substitution this only terminates that subshell.
retry() {
  local -r max="${RETRY_MAX_ATTEMPTS:-12}"
  local -r delay="${RETRY_DELAY_SECONDS:-8}"
  local n=1

  until "$@"; do
    if (( n >= max )); then
      log "the command has failed after $n attempts"
      exit 1
    fi
    n=$(( n + 1 ))
    # n is the number of the attempt that is about to start.
    log "command failed: attempt=$n max=$max"
    sleep "${delay}"
  done
  return 0
}
# Mark every node of a role for retirement: label it with
# "retiring=<retire_time>" and cordon it.
#
# Globals read: kube_context, retire_time.
# Arguments:    $1 - value of the "role" node label to select nodes by.
label_for_cycling() {
  local role=$1
  local node_list
  local -a nodes=()
  local node

  # Kept as a plain assignment so that a failing lookup still aborts the
  # script under errexit/pipefail.
  node_list=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role}" -o json | jq -r '.items[].metadata.name')
  if [[ -n "${node_list}" ]]; then
    readarray -t nodes <<< "${node_list}"
  fi

  log "${kube_context}: nodes=${#nodes[@]} role=${role}"
  log "labelling for retirement: role=${role}"

  # Nothing to label; also avoids expanding an empty array.
  if (( ${#nodes[@]} == 0 )); then
    return 0
  fi

  for node in "${nodes[@]}"; do
    retry kubectl --context="${kube_context}" label node "${node}" "retiring=${retire_time}" --overwrite=true
    retry kubectl --context="${kube_context}" cordon "${node}"
  done
}
# Drain a node and terminate the EC2 instance backing it.
#
# The drain is best effort: a drain failure or timeout is logged and the
# instance is terminated regardless.
#
# Globals read: kube_context, timeout, aws_opts.
# Arguments:    $1 - node name; it is matched against the private DNS name
#                    of the instance's network interface.
# Exits:        1 when no instance id can be found for the node.
kill_node() {
  local node=$1
  local rc=0

  # Capture the status explicitly instead of toggling errexit around the
  # call. The timeout command reports an expired time limit as status 124.
  time timeout "${timeout}" kubectl --context="${kube_context}" drain "${node}" --ignore-daemonsets --force --delete-emptydir-data || rc=$?

  if [[ ${rc} -eq 0 ]]; then
    log "drained successfully"
  elif [[ ${rc} -eq 124 ]]; then
    log "timeout reached, continuing: timeout=${timeout}"
  else
    log "kubectl drain error: rc=${rc}"
  fi

  local instance_id
  instance_id=$(retry aws "${aws_opts[@]}" --output=json ec2 describe-instances --filters "Name=network-interface.private-dns-name,Values=${node}" |
    jq -r '.Reservations[].Instances[].InstanceId')

  # Fail fast: terminate-instances with an empty id cannot succeed and would
  # only be retried until the retry budget is exhausted.
  if [[ -z "${instance_id}" ]]; then
    log "error: instance_id is empty: node=${node}"
    exit 1
  fi

  log "aws terminating: node=${node} instance-id=${instance_id}"
  retry aws "${aws_opts[@]}" ec2 terminate-instances --instance-ids="${instance_id}"
}
# Grow an auto-scaling group by one node batch, wait for the new nodes to be
# ready and then suspend the ASG processes that could interfere with the
# roll.
#
# Globals read: aws_opts, node_batch_size.
# Arguments:    $1 - node role
#               $2 - auto-scaling group name
#               $3 - original size of the group
upscale_asg() {
  local role=$1
  local asg_name=$2
  local asg_count=$3
  local target_size=$(( asg_count + node_batch_size ))
  local -a paused_processes=("AZRebalance" "AlarmNotification" "ScheduledActions")

  # - increase ASG size (desired capacity and maximum move together)
  retry aws "${aws_opts[@]}" --output=json autoscaling update-auto-scaling-group \
    --auto-scaling-group-name "${asg_name}" \
    --desired-capacity "${target_size}" \
    --max-size "${target_size}"

  # - wait for new nodes
  wait_for_ready_nodes "${role}" "${node_batch_size}"

  # - stop ASG actions
  ## auto-scaling processes:
  # https://docs.aws.amazon.com/autoscaling/ec2/userguide/as-suspend-resume-processes.html#process-types
  #
  # - Launch            # Do not suspend (suspended before draining the final batch of nodes)
  # - Terminate         # Do not suspend
  # - HealthCheck       # Do not suspend
  # - ReplaceUnhealthy  # Do not suspend
  # - AZRebalance
  # - AlarmNotification
  # - ScheduledActions
  # - AddToLoadBalancer # Do not suspend
  #
  # While instances are drained and terminated the ASG must not start
  # replacements or try to get back to the desired capacity on its own.
  #
  # It should, however, still notice that instances are being terminated, so
  # the instance health check and "Terminate" (removal from the group) stay
  # active.
  retry aws "${aws_opts[@]}" --output=json autoscaling suspend-processes \
    --auto-scaling-group-name "${asg_name}" \
    --scaling-processes "${paused_processes[@]}"
}
# Check that the nodes of a role are spread evenly across exactly three
# availability zones.
#
# Globals read: kube_context.
# Arguments:    $1 - node role to inspect.
# Exits:        1 when the nodes are not spread over exactly three zones, or
#               when the node counts of two zones differ by more than
#               grace_delta.
assert_asg_balance() {
  local role=$1
  local grace_delta=1
  local zone_column=':.metadata.labels.topology\.kubernetes\.io\/zone'

  # Check that topology node labels are populated by the respective
  # controller. The inspected nodes are those of the requested role, i.e. the
  # same set that is counted per zone below.
  while [[ $(kubectl --context "${kube_context}" get nodes -l "role=${role}" --no-headers "-ocustom-columns=${zone_column}" | grep -c "<none>") -gt 0 ]]; do
    echo "Waiting for cloud controller to populate topology.kubernetes.io/zone label on all nodes"
    sleep 1
  done

  # check that nodes are balanced across three AZs
  local nodes_per_zone
  nodes_per_zone=$(kubectl --context "${kube_context}" get nodes -l "role=${role}" --no-headers "-ocustom-columns=${zone_column}" |
    sort |
    uniq -c)

  # One entry per zone, holding the number of nodes in that zone.
  local -a npz
  readarray -t npz < <(awk '{print $1}' <<< "${nodes_per_zone}")

  if (( ${#npz[@]} != 3 )); then
    log "Expected nodes across three zones. Node distribution:\n$nodes_per_zone\nCannot proceed, exiting"
    exit 1
  fi

  # The largest pairwise difference equals max - min, so a single spread
  # comparison covers every pair of zones.
  local count
  local min=${npz[0]}
  local max=${npz[0]}
  for count in "${npz[@]}"; do
    if (( count < min )); then
      min=${count}
    fi
    if (( count > max )); then
      max=${count}
    fi
  done

  if (( max != min )); then
    log "Nodes are not balanced across zones. Node distribution:\n$nodes_per_zone\nWe can tolerate deviations of up to ${grace_delta} nodes"
    if (( max - min > grace_delta )); then
      log "Exiting"
      exit 1
    fi
  fi
}
# Bring an auto-scaling group back to its original size once the old nodes
# are gone, then resume the ASG processes suspended during the roll.
#
# Globals read: aws_opts.
# Arguments:    $1 - auto-scaling group name
#               $2 - original size of the group
#               $3 - node role (used for the zone balance check)
downscale_asg() {
  local asg_name=$1
  local asg_count=$2
  local role=$3
  local current_size=0
  local -a resumed_processes=("Launch" "AZRebalance" "AlarmNotification" "ScheduledActions")

  # give the ASG some time to notice the "Terminating" instance
  sleep 64

  # Poll until the ASG reports the desired number of instances. errexit is
  # off for the duration of the polling, so a failed query does not stop the
  # script.
  set +e
  while [[ ${current_size} -ne ${asg_count} ]]; do
    sleep 32
    current_size=$(retry aws "${aws_opts[@]}" --output=json autoscaling describe-auto-scaling-groups \
      --auto-scaling-group-names "${asg_name}" |
      jq -r -e '.AutoScalingGroups[0].Instances | length')
    log "waiting ASG to scale down: actual=${current_size} desired=${asg_count} asg=\"${asg_name}\""
  done
  set -e

  # - resize ASG to original size
  retry aws "${aws_opts[@]}" --output=json autoscaling update-auto-scaling-group \
    --auto-scaling-group-name "${asg_name}" \
    --desired-capacity "${asg_count}" \
    --max-size "${asg_count}"

  # The zones have to be balanced before the ASG is allowed to act again
  assert_asg_balance "${role}"

  # - re-enable ASG actions
  retry aws "${aws_opts[@]}" --output=json autoscaling resume-processes \
    --auto-scaling-group-name "${asg_name}" \
    --scaling-processes "${resumed_processes[@]}"
}
# Block until enough nodes of a role are Ready and not marked for retirement.
#
# Globals read: kube_context.
# Arguments:    $1 - node role
#               $2 - minimum number of ready nodes to wait for
wait_for_ready_nodes() {
  local role=$1
  local node_count=$2
  local ready=0
  # Counts the nodes that have a condition of type "Ready" with status "True".
  local ready_filter='[.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "True" ))] | length'

  # errexit is off while polling, so a failed query does not stop the script.
  set +e
  until
    ready=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},!retiring" -o json |
      jq "${ready_filter}")
    log "waiting for ready nodes: actual=${ready} desired=${node_count} role=\"${role}\""
    [[ ${ready} -ge ${node_count} ]]
  do
    sleep 32
  done
  set -e
}
# Drain and terminate, one by one, every node labelled for retirement.
#
# Globals read: kube_context, node_batch_size, aws_opts.
# Arguments:    $1 - node role
#               $2 - retire_time label value that marks the old nodes
#               $3 - auto-scaling group name
#               $4 - original size of the group
drain_nodes() {
  local role=$1
  local retire_time=$2
  local asg_name=$3
  local asg_count=$4

  # Rearrange the order of the nodes based on their availability zone. Given
  # six nodes spread evenly in zones A, B and C, we want the resulting array to
  # list the nodes in this order:
  # node-a-0, node-b-0, node-c-0, node-a-1, node-b-1, node-c-1
  # This way, we avoid draining too many nodes in the same zone sequentially,
  # which can seriously unbalance the ASG and cause scheduling issues, since we
  # don't strictly control in which AZ any new nodes will be launched.

  # One line per AZ, each a space-separated list of node names. The query is
  # retried and assigned directly so that a failing lookup aborts the script
  # instead of looking like "no nodes to drain".
  local zone_groups
  zone_groups=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},retiring=${retire_time}" -ojson |
    jq -r '.items | group_by(.metadata.labels["topology.kubernetes.io/zone"])[] | [.[].metadata.name] | join(" ")')

  # Each item in this array is a space-separated list of node names for a given
  # AZ. For example:
  # [ "node-a-0 node-a-1" "node-b-0 node-b-1" "node-c-0 node-c-1" ]
  local -a old_nodes_by_zone=()
  if [[ -n "${zone_groups}" ]]; then
    readarray -t old_nodes_by_zone <<< "${zone_groups}"
  fi

  local zc="${#old_nodes_by_zone[@]}"
  if (( zc == 0 )); then
    log "no nodes to drain: role=${role} retiring=${retire_time}"
    return 0
  fi

  local -a old_nodes=()
  local i=0
  local j z n old_node

  # This nested loop creates the interleaved list of nodes in old_nodes. For
  # example, given the node array from above, the resulting array will be:
  # [node-a-0 node-b-0 node-c-0 node-a-1 node-b-1 node-c-1]
  # Zones with fewer nodes leave gaps in the index range; those are skipped
  # when the array is expanded below.
  for z in "${old_nodes_by_zone[@]}"; do
    j=0
    # Unquoted on purpose: split the zone's line into node names.
    for n in ${z}; do
      old_nodes[$(( i + j * zc ))]=${n}
      j=$(( j + 1 ))
    done
    i=$(( i + 1 ))
  done

  local old_nodes_count="${#old_nodes[@]}"

  # - drain/terminate labelled nodes (sequentially)
  # Under normal operation, nc == node_batch_size. When resuming, it's adjusted
  # to take the already terminated old nodes under consideration.
  local nc=$(( node_batch_size + asg_count - old_nodes_count ))
  for old_node in "${old_nodes[@]}"; do
    # If we just finished draining the last node of a batch and the next batch
    # is not the last one (old_nodes_count - node_batch_size), wait for any
    # pending nodes to become ready. This ensures we operate on a balanced
    # cluster, if the ASG is slow to react.
    if (( nc < old_nodes_count )) && (( nc % node_batch_size == 0 )); then
      log "waiting for new nodes to become ready before starting to drain the next node batch"
      wait_for_ready_nodes "${role}" "${nc}"
    # When we reach the last node batch (ready nodes == asg_count), we should
    # wait for any pending new nodes to become ready and then pause the
    # "Launch" action. This ensures that the ASG size at the end is the desired
    # one and that the ASG will not terminate any instances when it is resized
    # and all operations are resumed.
    elif (( nc == asg_count )); then
      run_report
      log "final node batch, waiting for final nodes to become ready and disabling the ASG 'Launch' process"
      wait_for_ready_nodes "${role}" "${nc}"
      retry aws "${aws_opts[@]}" --output=json autoscaling suspend-processes \
        --auto-scaling-group-name "${asg_name}" --scaling-processes "Launch"
      run_report
    fi
    kill_node "${old_node}"
    nc=$(( nc + 1 ))
  done
}
# Roll all nodes of a role: label the old nodes, discover their auto-scaling
# group, grow it, drain the old nodes and shrink it back.
#
# Globals read: kube_context, resume, retire_time, node_batch_size, aws_opts.
# Arguments:    $1 - node role
# Exits:        1 when no labelled node, instance id, ASG listing or ASG
#               size can be determined, or when node_batch_size exceeds the
#               ASG size.
update() {
  local role=$1

  # On resume the nodes already carry the retirement label.
  if [[ -z "${resume}" ]]; then
    label_for_cycling "${role}"
  fi

  local instance_address instance_id asgs asg_name asg_count
  while : ; do
    # "// empty" turns a missing first item into empty output instead of the
    # literal string "null", so the check below can catch it.
    instance_address=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},retiring=${retire_time}" -o json |
      jq -r '.items[0].metadata.name // empty')
    if [[ -z "${instance_address}" ]]; then
      log "error: instance_address is empty"
      exit 1
    fi

    instance_id=$(retry aws "${aws_opts[@]}" --output=json ec2 describe-instances --filters "Name=network-interface.private-dns-name,Values=${instance_address}" |
      jq -r '.Reservations[].Instances[].InstanceId')
    if [[ -z "${instance_id}" ]]; then
      log "error: instance_id is empty"
      exit 1
    fi

    asgs=$(retry aws "${aws_opts[@]}" --output=json autoscaling describe-auto-scaling-groups)
    if [[ -z "${asgs}" ]]; then
      log "error: asgs is empty"
      exit 1
    fi

    # Values are handed to jq with --arg rather than being spliced into the
    # filter text.
    asg_name=$(jq -r --arg instance_id "${instance_id}" \
      '.AutoScalingGroups[] | select(.Instances[].InstanceId == $instance_id) | .AutoScalingGroupName' <<< "${asgs}")
    if [[ -n "${asg_name}" ]]; then
      break
    fi

    # The name has to be checked before the size is looked up: without a
    # name the size lookup is always empty and would end the script before
    # this retry could happen.
    log "could not discover the ASG name, a node might be terminating, sleeping and trying again..."
    sleep 60
  done

  # The group's MinSize is what the rest of the roll treats as its size.
  asg_count=$(jq -r --arg asg_name "${asg_name}" \
    '.AutoScalingGroups[] | select(.AutoScalingGroupName == $asg_name) | .MinSize' <<< "${asgs}")
  if [[ -z "${asg_count}" ]]; then
    log "error: asg_count is empty"
    exit 1
  fi

  log "asg_name: asg_name=\"${asg_name}\" asg_count=${asg_count}"

  if (( node_batch_size > asg_count )); then
    log "node_batch_size cannot be greater than the asg size!"
    exit 1
  fi

  run_report
  # On resume the group has already been grown by the interrupted run.
  if [[ -z "${resume}" ]]; then
    upscale_asg "${role}" "${asg_name}" "${asg_count}"
    assert_asg_balance "${role}"
  fi
  run_report
  drain_nodes "${role}" "${retire_time}" "${asg_name}" "${asg_count}"
  run_report
  downscale_asg "${asg_name}" "${asg_count}" "${role}"
  run_report
}
# Cleanup child processes on exit. `kill 0` signals every process in the
# current process group, so background children started by this script are
# stopped whenever the script exits.
# NOTE(review): the process group includes this script itself and possibly
# the process that invoked it — confirm this is intended.
trap "kill 0" EXIT
# Stop early if any of the commands/builtins the script relies on is missing.
checkdeps jq kubectl aws timeout xargs readarray
# Parse and validate the command-line options into global variables.
parse_opts "${@}"
# NOTE(review): initialize_report is not defined in this file; presumably it
# comes from the sourced ./report.sh — confirm.
initialize_report
log "kube cluster: ${kube_context}"
# Roll the nodes of the requested role.
update "${role}"
log "run: result=\"success\""