Implement backup retention based on number of days (#241)

The old retention policy (retain a fixed number of base backups) worked poorly on k8s, where failovers/switchovers happen rather often.
zalando · Jun 7, 2018 · 675d3a4 · 675d3a4
1 parent 6f9156c
commit 675d3a4
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 23 deletions.
diff --git a/postgres-appliance/launch.sh b/postgres-appliance/launch.sh
@@ -33,7 +33,7 @@ if [ "$DEMO" = "true" ]; then
 else
     if python3 /scripts/configure_spilo.py all; then
         (
-            su postgres -c "PATH=$PATH /scripts/patroni_wait.sh -t 3600 -- /scripts/postgres_backup.sh $WALE_ENV_DIR $PGDATA"
+            su postgres -c "PATH=$PATH /scripts/patroni_wait.sh -t 3600 -- /scripts/postgres_backup.sh $WALE_ENV_DIR $PGDATA $BACKUP_NUM_TO_RETAIN"
         ) &
     fi
     exec supervisord --configuration=/etc/supervisor/supervisord.conf --nodaemon

diff --git a/postgres-appliance/scripts/patroni_wait.sh b/postgres-appliance/scripts/patroni_wait.sh
@@ -59,14 +59,13 @@ do
     shift
 done
 
-[ ! -z "$TIMEOUT" ] && CUTOFF=$(($(date +%s)+$TIMEOUT))
+if [ $# -gt 0 ]; then
+    [ ! -z "$TIMEOUT" ] && CUTOFF=$(($(date +%s)+TIMEOUT))
 
-while :
-do
-    [ $(curl -o /dev/null --silent --write-out '%{http_code}\n' "localhost:8008/$ROLE") -eq 200 ] && break
-    [ ! -z "$TIMEOUT" ] && [ $CUTOFF -le $(date +%s) ] && exit 2
-    sleep $INTERVAL
-done
+    while [ "$(curl -so /dev/null -w '%{http_code}' localhost:8008/$ROLE)" != "200" ]; do
+        [ ! -z "$TIMEOUT" ] && [ $CUTOFF -le $(date +%s) ] && exit 2
+        sleep $INTERVAL
+    done
 
-## Execute the command that was specified
-[ $# -gt 0 ] && exec "$@"
+    exec "$@"  # Execute the command that was specified
+fi
diff --git a/postgres-appliance/scripts/postgres_backup.sh b/postgres-appliance/scripts/postgres_backup.sh
@@ -10,23 +10,34 @@ function log
 log "I was called as: $0 $@"
 
 
-WALE_ENV_DIR=$1
-shift
+readonly WALE_ENV_DIR=$1
+readonly PGDATA=$2
+DAYS_TO_RETAIN=$3
 
-PGDATA=$1
-shift
+readonly IN_RECOVERY=$(psql -tXqAc "select pg_is_in_recovery()")
+[[ $IN_RECOVERY != "f" ]] && log "Cluster is in recovery, not running backup" && exit 0
 
-NUM_TO_RETAIN=$1
-shift
+# keep at least 2 days' worth of base backups before creating a new one
+[[ "$DAYS_TO_RETAIN" -lt 2 ]] && DAYS_TO_RETAIN=2
 
-IN_RECOVERY=$(psql -tXqAc "select pg_is_in_recovery()")
-[[ $IN_RECOVERY != "f" ]] && log "Cluster is in recovery, not running backup" && exit 0
+readonly WAL_E="envdir $WALE_ENV_DIR wal-e --aws-instance-profile"
+
+BEFORE=""
 
-# leave at least 2 base backups before creating a new one
-[[ "$NUM_TO_RETAIN" -lt 2 ]] && NUM_TO_RETAIN=2
+readonly NOW=$(date +%s -u)
+while read name last_modified rest; do
+    last_modified=$(date +%s -ud "$last_modified")
+    if [ $(((NOW-last_modified)/86400)) -gt $DAYS_TO_RETAIN ]; then
+        if [ -z "$BEFORE" ] || [ "$last_modified" -gt "$BEFORE_TIME" ]; then
+            BEFORE_TIME=$last_modified
+            BEFORE=$name
+        fi
+    fi
+done < <($WAL_E backup-list 2> /dev/null | sed '0,/^name\s*last_modified\s*/d')
 
-# --aws-instance-profile flag is just ignored when running in GCE.
-envdir "${WALE_ENV_DIR}" wal-e --aws-instance-profile delete --confirm retain "${NUM_TO_RETAIN}"
+if [ ! -z "$BEFORE" ]; then
+    $WAL_E delete --confirm before "$BEFORE"
+fi
 
 # Ensure we don't have more workes than CPU's
 POOL_SIZE=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || 1)
@@ -35,4 +46,4 @@ POOL_SIZE=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || 1)
 # push a new base backup
 log "producing a new backup"
 # We reduce the priority of the backup for CPU consumption
-exec nice -n 5 envdir "${WALE_ENV_DIR}" wal-e --aws-instance-profile backup-push "${PGDATA}" --pool-size ${POOL_SIZE}
+exec nice -n 5 $WAL_E backup-push "${PGDATA}" --pool-size ${POOL_SIZE}