kube-aws-updater

#!/usr/bin/env bash

# Shell style guide: https://google.github.io/styleguide/shell.xml

# ## (-) master and worker steps
# ## (*) worker specific steps
#
# - label nodes
# - disable scheduling
# - double ASG size
# - wait for 2x group
# - stop ASG actions
# - drain/terminate labelled nodes (sequentially)
# - resize ASG to original size
# - re-enable ASG actions

# Strict mode: abort on unhandled errors, on unset variables and on failures
# in any stage of a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Static vars
# File that receives a copy of the script output when promtail streaming is
# enabled (see parse_opts).
LOG_FILE='kube-updater.log'

# Command used to obtain the local host name. It is stored as a plain string
# and expanded unquoted by the caller, so it may contain arguments.
# If there is no "hostname" command, fall back to "hostnamectl hostname".
# The lookup output is discarded so the resolved path is not printed on
# every run.
hn=hostname
command -v hostname > /dev/null 2>&1 || hn="hostnamectl hostname"

source ./report.sh

#######################################
# Print a timestamped message on stderr.
# Arguments: the message words; they are joined into a single line. Backslash
#            escapes in the message (e.g. \n) are interpreted by `echo -e`.
# Outputs:   "[<timestamp>]: <message>" on stderr, nothing on stdout.
#######################################
log() {
  local stamp
  # Logging stays best-effort: a failing `date` must not abort the caller.
  stamp="$(date +'%Y-%m-%dT%H:%M:%S')" || true
  echo -e "[${stamp}]: $*" 1>&2
}

#######################################
# Print the command-line help on stderr.
# Globals:   timeout, node_batch_size (read). They are normally set by
#            parse_opts; the fallbacks below mirror its defaults so this
#            function is safe to call under `nounset` before they exist.
# Outputs:   help text on stderr (the whole function body is redirected).
#######################################
usage() {
  cat <<EOF
Usage: $0 -c <kube_context> -p <aws_profile> -r <role> -s [retire_time_label]
        -t [timeout] -n [node_batch_size] -e [true] -l [promtail_cfg]

  -c    Kubernetes context to operate on (required)
  -p    AWS CLI profile to use
  -r    Value of the "role" node label selecting the nodes to roll (required
        when resuming)
  -s    Resume node rolling. Argument is retire_time label
  -t    Node drain timeout. How long to wait for the node to drain before
        shutting it down (in seconds, default ${timeout:-600}s)
  -l    Path to promtail config (when set stdout will be streamed to loki by proxy of a log file)
  -n    Node batch size. How many nodes to add to the ASG. Also, how many nodes
        will be drained before waiting for the ASG to stabilise again. It needs
        to be a multiple of 3 (default ${node_batch_size:-3})
  -e    Enable reports. If "true", show reports on cluster state on every
        step (default "false")
  -h    Show this help and exit
EOF
} >&2

#######################################
# Parse and validate the command-line flags, then optionally start promtail.
# Globals written: aws_opts, aws_profile, kube_context, role, resume,
#                  retire_time, timeout, promtail_cfg, node_batch_size and,
#                  only when -e is given, report.
# Globals read:    hn, LOG_FILE.
# Arguments:       the script's command-line arguments.
# Exits:           0 after -h; 1 on any invalid or missing option.
#######################################
parse_opts() {
  # Keep getopts state private so the function always starts from the first
  # argument and does not leak its loop variable.
  local OPTIND=1
  local flag

  # flags
  aws_opts=()
  aws_profile=''
  kube_context=''
  role=''
  resume=''
  retire_time=$(date +"%Y-%m-%dT%H-%M-%SZ")
  timeout=600
  promtail_cfg=''
  node_batch_size=3

  while getopts 'c:p:r:hs:t:n:e:l:' flag; do
    case "${flag}" in
      c) kube_context="${OPTARG}" ;;
      p) aws_profile="${OPTARG}" ;;
      r) role="${OPTARG}" ;;
      s) resume="${OPTARG}" ;;
      t) timeout="${OPTARG}" ;;
      l) promtail_cfg="${OPTARG}" ;;
      n) node_batch_size="${OPTARG}" ;;
      # NOTE(review): "report" has no default here; presumably it is
      # initialised by the sourced report.sh - confirm.
      e) report="${OPTARG}" ;;
      h)
        usage
        exit 0
        ;;
      *)
        log "Unexpected option: ${flag}"
        usage
        exit 1
        ;;
    esac
  done

  ### Validation
  if [[ -z "${kube_context}" ]]; then
    usage
    exit 1
  fi

  if [[ -n "${resume}" ]] && [[ -z "${role}" ]]; then
    log "If you are resuming, you need to provide a role"
    usage
    exit 1
  fi

  # When resuming, reuse the label value of the interrupted run instead of
  # the freshly generated timestamp.
  if [[ -n "${resume}" ]]; then
    retire_time=${resume}
  fi

  # Require a plain positive decimal integer before the value is used in
  # arithmetic: this rejects 0, negative numbers (which would otherwise pass
  # the "multiple of 3" check) and arbitrary strings.
  if ! [[ "${node_batch_size}" =~ ^[1-9][0-9]*$ ]]; then
    log "Unsupported node_batch_size value '${node_batch_size}'. Please use a value greater than 0"
    exit 1
  fi
  if (( node_batch_size % 3 != 0 )); then
    log "Unsupported node_batch_size value '${node_batch_size}'. Please use a multiple of 3"
    exit 1
  fi

  if [[ -n "${aws_profile}" ]]; then
    if ! aws configure list --profile "${aws_profile}" &> /dev/null; then
      log "Invalid profile: ${aws_profile}"
      exit 1
    fi
    aws_opts=("--profile=${aws_profile}")
    # Verify the credentials actually work. The exit happens in the current
    # shell, so it does not rely on errexit to propagate a subshell status.
    if ! aws "${aws_opts[@]}" sts get-caller-identity &> /dev/null; then
      log "failed to get-caller-identity profile=${aws_profile}"
      exit 1
    fi
  fi

  if [[ -n "${promtail_cfg}" ]]; then
    if ! command -v promtail &> /dev/null; then
      log "Promtail not installed"
      exit 1
    fi

    # ${hn} is intentionally unquoted: it may hold a command with arguments.
    promtail --config.file="${promtail_cfg}" --client.external-labels=context="${kube_context}",host="$($hn)" &
    touch "${LOG_FILE}"
    sleep 2 # give promtail time to start
    exec &> >(tee "${LOG_FILE}") # redirect execution through tee
  fi
}

#######################################
# Verify that every named command is available.
# Arguments: names of commands (executables, builtins or functions).
# Outputs:   logs the list of missing commands, if any.
# Exits:     1 when at least one command cannot be found.
#######################################
checkdeps() {
  local dep # kept local so the loop does not clobber a caller's variable
  local missing_deps=""
  for dep in "$@"; do
    command -v "${dep}" &> /dev/null || missing_deps+="${dep} "
  done

  if [[ -n "${missing_deps}" ]]; then
    log "Missing dependencies: ${missing_deps}"
    exit 1
  fi
}

#######################################
# Run a command until it succeeds, sleeping between attempts.
# Globals:   RETRY_MAX_ATTEMPTS  (read, optional) - attempts, default 12
#            RETRY_DELAY_SECONDS (read, optional) - pause, default 8
# Arguments: the command to run and its arguments.
# Outputs:   whatever the command prints; failures are logged.
# Returns:   0 as soon as one attempt succeeds.
# Exits:     1 once all attempts have failed. Callers that run this inside
#            a command substitution only terminate that subshell and must
#            check the resulting status themselves.
#######################################
retry() {
  local -r max="${RETRY_MAX_ATTEMPTS:-12}"
  local -r delay="${RETRY_DELAY_SECONDS:-8}"
  local n=1

  until "$@"; do
    if (( n >= max )); then
      log "the command has failed after $n attempts"
      exit 1
    fi
    # Report the attempt that just failed, then move on to the next one.
    log "command failed: attempt=$n max=$max"
    n=$((n + 1))
    sleep "${delay}"
  done
  return 0
}

#######################################
# Mark every node of a role for retirement: label it with the retire time
# and cordon it so nothing new is scheduled there.
# Globals:   kube_context, retire_time (read)
# Arguments: $1 - value of the "role" node label
#######################################
label_for_cycling() {
  local role=$1
  local nodes node
  local -a node_list=()

  nodes=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role}" -o json | jq -r '.items[].metadata.name')
  # Split on newlines only when there is output, so an empty result counts
  # as zero nodes instead of one empty entry.
  if [[ -n "${nodes}" ]]; then
    readarray -t node_list <<<"${nodes}"
  fi

  # Count from the array: log() writes to stderr, so piping it into wc
  # can never produce the node count.
  log "${kube_context}: nodes=${#node_list[@]} role=${role}"
  log "labelling for retirement: role=${role}"
  for node in "${node_list[@]}"; do
    retry kubectl --context="${kube_context}" label node "${node}" "retiring=${retire_time}" --overwrite=true
    retry kubectl --context="${kube_context}" cordon "${node}"
  done
}

#######################################
# Drain a node (bounded by the drain timeout) and terminate its EC2 instance.
# A failed or timed-out drain is only logged; the instance is terminated
# regardless.
# Globals:   timeout, kube_context, aws_opts (read)
# Arguments: $1 - node name; it is also used as the private DNS name when
#                 looking up the instance
# Exits:     1 when no instance id can be found for the node, or when an
#            AWS call keeps failing (via retry).
#######################################
kill_node() {
  local node=$1

  # Capture the drain status without tripping errexit.
  local rc=0
  time timeout "${timeout}" kubectl --context="${kube_context}" drain "${node}" --ignore-daemonsets --force --delete-emptydir-data || rc=$?
  if [[ ${rc} -eq 0 ]]; then
    log "drained successfully"
  elif [[ ${rc} -eq 124 ]]; then
    # 124 is the status reported by timeout(1) when the time limit is hit.
    log "timeout reached, continuing: timeout=${timeout}"
  else
    log "kubectl drain error: rc=${rc}"
  fi

  local instance_id
  instance_id=$(retry aws "${aws_opts[@]}" --output=json ec2 describe-instances --filters "Name=network-interface.private-dns-name,Values=${node}" |\
    jq -r '.Reservations[].Instances[].InstanceId')
  # Fail fast with a clear message instead of retrying a terminate call
  # that cannot succeed without an instance id.
  if [[ -z "${instance_id}" ]]; then
    log "error: instance_id is empty: node=${node}"
    exit 1
  fi
  log "aws terminating: node=${node} instance-id=${instance_id}"
  retry aws "${aws_opts[@]}" ec2 terminate-instances --instance-ids="${instance_id}"
}

#######################################
# Grow the ASG by one node batch, wait for the new nodes to become ready and
# then suspend the ASG processes that could interfere with the roll.
# Globals:   aws_opts, node_batch_size (read)
# Arguments: $1 - node role
#            $2 - ASG name
#            $3 - ASG size before the roll
#######################################
upscale_asg() {
  local role=$1
  local asg_name=$2
  local asg_count=$3
  local enlarged_size=$(( asg_count + node_batch_size ))

  # ASG process types:
  # https://docs.aws.amazon.com/autoscaling/ec2/userguide/as-suspend-resume-processes.html#process-types
  #
  # Suspended here:
  #   AZRebalance, AlarmNotification, ScheduledActions
  # Deliberately left running:
  #   Launch (it is suspended later, before draining the final batch of
  #   nodes), Terminate, HealthCheck, ReplaceUnhealthy, AddToLoadBalancer
  #
  # While nodes are drained and terminated the ASG must not spin up
  # replacements or try to get back to the desired capacity. It does however
  # have to notice the terminations, so it keeps checking instance health and
  # keeps "Terminate" (removal from the group) enabled.
  local -a paused_processes=(
    "AZRebalance"
    "AlarmNotification"
    "ScheduledActions"
  )

  # - increase ASG size
  retry aws "${aws_opts[@]}" --output=json autoscaling update-auto-scaling-group \
    --auto-scaling-group-name "${asg_name}" \
    --desired-capacity "${enlarged_size}" \
    --max-size "${enlarged_size}"

  # - wait for new nodes
  wait_for_ready_nodes "${role}" "${node_batch_size}"

  # - stop ASG actions
  retry aws "${aws_opts[@]}" --output=json autoscaling suspend-processes \
    --auto-scaling-group-name "${asg_name}" \
    --scaling-processes "${paused_processes[@]}"
}

#######################################
# Abort unless the nodes of a role are spread over exactly three zones with
# per-zone counts that differ by at most grace_delta.
# Globals:   kube_context (read)
# Arguments: $1 - value of the "role" node label
# Exits:     1 when the nodes are not in three zones or are too unbalanced.
#######################################
assert_asg_balance() {
  local role=$1
  local grace_delta=1
  local zone_column=':.metadata.labels.topology\.kubernetes\.io\/zone'

  # Check that topology node labels are populated by the respective
  # controller. The check covers the nodes of the requested role, i.e. the
  # same set whose distribution is evaluated below.
  while [[ $(kubectl --context "${kube_context}" get nodes -l "role=${role}" --no-headers -ocustom-columns="${zone_column}" | grep -c "<none>") -gt 0 ]]; do
    log "Waiting for cloud controller to populate topology.kubernetes.io/zone label on all nodes"
    sleep 1
  done

  # check that nodes are balanced across three AZs
  local nodes_per_zone
  nodes_per_zone=$(kubectl --context "${kube_context}" get nodes -l "role=${role}" --no-headers -ocustom-columns="${zone_column}" |\
      sort |\
      uniq -c)
  # npz holds the node count of each zone (first column of `uniq -c`).
  local npz
  readarray -t npz < <(awk '{print $1}' <<<"${nodes_per_zone}")
  if [[ "${#npz[@]}" -ne 3 ]]; then
      log "Expected nodes across three zones. Node distribution:\n$nodes_per_zone\nCannot proceed, exiting"
      exit 1
  fi
  if [[ "${npz[0]}" != "${npz[1]}" ]] || [[ "${npz[0]}" != "${npz[2]}" ]] || [[ "${npz[1]}" != "${npz[2]}" ]]; then
      log "Nodes are not balanced across zones. Node distribution:\n$nodes_per_zone\nWe can tolerate deviations of up to ${grace_delta} nodes"
      # Bash has no method to get the absolute value of a number so instead of
      # abs(x0 - x1) > grace_delta we can check (x0 - x1) ^ 2 > grace_delta ^ 2
      if (( (npz[0] - npz[1]) ** 2 > grace_delta ** 2 )) || (( (npz[0] - npz[2]) ** 2 > grace_delta ** 2 )) || (( (npz[1] - npz[2]) ** 2 > grace_delta ** 2 )); then
        log "Exiting"
        exit 1
      fi
  fi
}

#######################################
# Wait until the ASG is back at its original instance count, pin its size to
# that count, verify the zone balance and resume the suspended ASG processes.
# Globals:   aws_opts (read)
# Arguments: $1 - ASG name
#            $2 - ASG size to return to
#            $3 - node role (used for the balance check)
#######################################
downscale_asg() {
  local asg_name=$1
  local asg_count=$2
  local role=$3
  local -a resumed_processes=(
    "Launch"
    "AZRebalance"
    "AlarmNotification"
    "ScheduledActions"
  )

  # Give the ASG some time to notice the "Terminating" instance.
  sleep 64

  # Poll until the ASG reports the desired number of instances. errexit is
  # switched off for the duration of the polling loop.
  set +e
  local current_size=0
  while [[ ${current_size} -ne ${asg_count} ]]; do
    sleep 32
    current_size=$(retry aws "${aws_opts[@]}" --output=json autoscaling describe-auto-scaling-groups \
      --auto-scaling-group-names "${asg_name}" |\
      jq -r -e '.AutoScalingGroups[0].Instances | length')
    log "waiting ASG to scale down: actual=${current_size} desired=${asg_count} asg=\"${asg_name}\""
  done
  set -e

  # - resize ASG to original size
  retry aws "${aws_opts[@]}" --output=json autoscaling update-auto-scaling-group \
    --auto-scaling-group-name "${asg_name}" \
    --desired-capacity "${asg_count}" \
    --max-size "${asg_count}"

  # The zones have to be balanced before ASG actions are switched back on.
  assert_asg_balance "${role}"

  # - re-enable ASG actions
  retry aws "${aws_opts[@]}" --output=json autoscaling resume-processes \
    --auto-scaling-group-name "${asg_name}" \
    --scaling-processes "${resumed_processes[@]}"
}

#######################################
# Block until enough nodes of a role are Ready and not marked as retiring.
# Globals:   kube_context (read)
# Arguments: $1 - node role
#            $2 - minimum number of ready nodes to wait for
#######################################
wait_for_ready_nodes() {
  local role=$1
  local node_count=$2
  local ready_nodes=0

  # Poll every 32 seconds; errexit is switched off while polling.
  set +e
  while true; do
    # Count nodes that carry no "retiring" label and report Ready=True.
    ready_nodes=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},!retiring" -o json |\
      jq '[.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "True" ))] | length')
    log "waiting for ready nodes: actual=${ready_nodes} desired=${node_count} role=\"${role}\""
    [[ ${ready_nodes} -ge ${node_count} ]] && break
    sleep 32
  done
  set -e
}

#######################################
# Drain and terminate all nodes labelled for retirement, interleaved by
# availability zone and paced in batches.
# Globals:   kube_context, node_batch_size, aws_opts (read)
# Arguments: $1 - node role
#            $2 - value of the "retiring" label that selects the old nodes
#            $3 - ASG name
#            $4 - original ASG size
#######################################
drain_nodes() {
  local role=$1
  local retire_time=$2
  local asg_name=$3
  local asg_count=$4

  # Rearrange the order of the nodes based on their availability zone. Given
  # six nodes spread evenly in zones A, B and C, we want the resulting array to
  # list the nodes in this order:
  #   node-a-0, node-b-0, node-c-0, node-a-1, node-b-1, node-c-1
  # This way, we avoid draining too many nodes in the same zone sequentially,
  # which can seriously unbalance the ASG and cause scheduling issues, since we
  # don't strictly control in which AZ any new nodes will be launched.
  #
  # The listing goes through retry and a plain assignment (not a process
  # substitution) so that a failing kubectl/jq aborts the run instead of
  # silently yielding "no nodes to drain".
  local zone_lists
  zone_lists=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},retiring=${retire_time}" -ojson |\
    jq -r '.items | group_by(.metadata.labels["topology.kubernetes.io/zone"])[] | [.[].metadata.name] | join(" ")')

  # Each item in this array is a space-separated list of node names for a given
  # AZ. For example:
  #  [ "node-a-0 node-a-1" "node-b-0 node-b-1" "node-c-0 node-c-1" ]
  local -a old_nodes_by_zone=()
  if [[ -n "${zone_lists}" ]]; then
    readarray -t old_nodes_by_zone <<<"${zone_lists}"
  fi

  local -a old_nodes=()
  local zc="${#old_nodes_by_zone[@]}"
  local i=0
  local j idx z n
  # This nested loop creates the interleaved list of nodes in old_nodes. For
  # example, given the node array from above, the resulting array will be:
  #   [node-a-0 node-b-0 node-c-0 node-a-1 node-b-1 node-c-1]
  # Node j of zone i lands at index i + j * zc. With unevenly sized zones the
  # array is sparse; iteration below still follows index order.
  for z in "${old_nodes_by_zone[@]}"; do
    j=0
    for n in ${z}; do
      idx=$(( i + j * zc ))
      old_nodes[idx]=${n}
      j=$((j + 1))
    done
    i=$((i + 1))
  done
  local old_nodes_count="${#old_nodes[@]}"

  # - drain/terminate labelled nodes (sequentially)
  # Under normal operation, nc == node_batch_size. When resuming, it's adjusted
  # to take the already terminated old nodes under consideration.
  local nc=$(( node_batch_size + asg_count - old_nodes_count ))
  local old_node
  for old_node in "${old_nodes[@]}"; do
    # If we just finished draining the last node of a batch and the next batch
    # is not the last one (old_nodes_count - node_batch_size), wait for any
    # pending nodes to become ready. This ensures we operate on a balanced
    # cluster, if the ASG is slow to react.
    if (( nc < old_nodes_count )) && (( nc % node_batch_size == 0 )); then
      log "waiting for new nodes to become ready before starting to drain the next node batch"
      wait_for_ready_nodes "${role}" "${nc}"
    # When we reach the last node batch (ready nodes == asg_count), we should
    # wait for any pending new nodes to become ready and then pause the
    # "Launch" action. This ensures that the ASG size at the end is the desired
    # and that the ASG will not terminate any instances resizing and resuming
    # all operations.
    elif (( nc == asg_count )); then
      run_report
      log "final node batch, waiting for final nodes to become ready and disabling the ASG 'Launch' process"
      wait_for_ready_nodes "${role}" "${nc}"
      retry aws "${aws_opts[@]}" --output=json autoscaling suspend-processes \
        --auto-scaling-group-name "${asg_name}" --scaling-processes "Launch"
      run_report
    fi
    kill_node "${old_node}"
    nc=$((nc + 1))
  done
}

#######################################
# Roll all nodes of a role: label them (unless resuming), discover their ASG,
# scale it up, drain the old nodes and scale back down.
# Globals:   resume, kube_context, retire_time, aws_opts, node_batch_size
#            (read)
# Arguments: $1 - value of the "role" node label
# Exits:     1 when a required lookup returns nothing or the batch size
#            exceeds the ASG size.
#######################################
update() {
  local role=$1
  if [[ -z "${resume}" ]]; then
    label_for_cycling "${role}"
  fi

  # Discover the ASG through one of the retiring nodes: node name ->
  # instance id -> ASG containing that instance.
  local instance_address instance_id asgs asg_name asg_count
  while : ; do
    # "// empty" turns a missing node (JSON null) into empty output so the
    # check below can catch it.
    instance_address=$(retry kubectl --context="${kube_context}" get nodes -l "role=${role},retiring=${retire_time}" -o json | jq -r '.items[0].metadata.name // empty')
    if [[ -z "${instance_address}" ]]; then
      log "error: instance_address is empty"
      exit 1
    fi

    instance_id=$(retry aws "${aws_opts[@]}" --output=json ec2 describe-instances --filters "Name=network-interface.private-dns-name,Values=${instance_address}" |\
      jq -r '.Reservations[].Instances[].InstanceId')
    if [[ -z "${instance_id}" ]]; then
      log "error: instance_id is empty"
      exit 1
    fi

    asgs=$(retry aws "${aws_opts[@]}" --output=json autoscaling describe-auto-scaling-groups)
    if [[ -z "${asgs}" ]]; then
      log "error: asgs is empty"
      exit 1
    fi

    asg_name=$(echo "${asgs}" | jq -r --arg id "${instance_id}" \
      '.AutoScalingGroups[] | select(.Instances[].InstanceId == $id) | .AutoScalingGroupName')
    if [[ -n "${asg_name}" ]]; then
      break
    fi

    # The size lookup is done only after a name was found; otherwise its
    # empty result would abort the run before this retry could happen.
    log "could not discover the ASG name, a node might be terminating, sleeping and trying again..."
    sleep 60
  done

  # The ASG's MinSize is taken as the size to roll and to return to.
  asg_count=$(echo "${asgs}" | jq -r --arg name "${asg_name}" \
    '.AutoScalingGroups[] | select(.AutoScalingGroupName == $name) | .MinSize')
  if [[ -z "${asg_count}" ]]; then
    log "error: asg_count is empty"
    exit 1
  fi

  log "asg_name: asg_name=\"${asg_name}\" asg_count=${asg_count}"
  if (( node_batch_size > asg_count )); then
      log "node_batch_size cannot be greater than the asg size!"
      exit 1
  fi

  run_report

  # When resuming, the ASG was already scaled up by the interrupted run.
  if [[ -z "${resume}" ]]; then
    upscale_asg "${role}" "${asg_name}" "${asg_count}"

    assert_asg_balance "${role}"
  fi

  run_report

  drain_nodes "${role}" "${retire_time}" "${asg_name}" "${asg_count}"

  run_report

  downscale_asg "${asg_name}" "${asg_count}" "${role}"

  run_report
}

# Cleanup child processes on exit.
# Only the background jobs started by this shell (e.g. promtail) are
# signalled. Signalling the whole process group would also hit this script
# itself, turning a successful run into one that ends by SIGTERM, as well as
# any caller that shares the process group.
kill_background_jobs() {
  local pids
  pids=$(jobs -p)
  if [[ -n "${pids}" ]]; then
    # shellcheck disable=SC2086 # the PID list must be word-split
    kill ${pids} 2> /dev/null || true
  fi
}
trap kill_background_jobs EXIT

# Entry point: verify the required tools, parse the flags, then roll the
# nodes of the requested role.
checkdeps jq kubectl aws timeout xargs readarray
parse_opts "${@}"

initialize_report

log "kube cluster: ${kube_context}"

update "${role}"

log "run: result=\"success\""