Skip to content

Commit

Permalink
Implement backup retention based on amount of days (#241)
Browse files Browse the repository at this point in the history
The old retention policy worked poorly on k8s, where failovers/switchovers happen rather often.
  • Loading branch information
CyberDem0n committed Jun 7, 2018
1 parent 6f9156c commit 675d3a4
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 23 deletions.
2 changes: 1 addition & 1 deletion postgres-appliance/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ if [ "$DEMO" = "true" ]; then
else
if python3 /scripts/configure_spilo.py all; then
(
su postgres -c "PATH=$PATH /scripts/patroni_wait.sh -t 3600 -- /scripts/postgres_backup.sh $WALE_ENV_DIR $PGDATA"
su postgres -c "PATH=$PATH /scripts/patroni_wait.sh -t 3600 -- /scripts/postgres_backup.sh $WALE_ENV_DIR $PGDATA $BACKUP_NUM_TO_RETAIN"
) &
fi
exec supervisord --configuration=/etc/supervisor/supervisord.conf --nodaemon
Expand Down
17 changes: 8 additions & 9 deletions postgres-appliance/scripts/patroni_wait.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,13 @@ do
shift
done

[ ! -z "$TIMEOUT" ] && CUTOFF=$(($(date +%s)+$TIMEOUT))
if [ $# -gt 0 ]; then
[ ! -z "$TIMEOUT" ] && CUTOFF=$(($(date +%s)+TIMEOUT))

while :
do
[ $(curl -o /dev/null --silent --write-out '%{http_code}\n' "localhost:8008/$ROLE") -eq 200 ] && break
[ ! -z "$TIMEOUT" ] && [ $CUTOFF -le $(date +%s) ] && exit 2
sleep $INTERVAL
done
while [ "$(curl -so /dev/null -w '%{http_code}' localhost:8008/$ROLE)" != "200" ]; do
[ ! -z "$TIMEOUT" ] && [ $CUTOFF -le $(date +%s) ] && exit 2
sleep $INTERVAL
done

## Execute the command that was specified
[ $# -gt 0 ] && exec "$@"
exec "$@" # Execute the command that was specified
fi
37 changes: 24 additions & 13 deletions postgres-appliance/scripts/postgres_backup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,34 @@ function log
log "I was called as: $0 $@"


WALE_ENV_DIR=$1
shift
readonly WALE_ENV_DIR=$1
readonly PGDATA=$2
DAYS_TO_RETAIN=$3

PGDATA=$1
shift
readonly IN_RECOVERY=$(psql -tXqAc "select pg_is_in_recovery()")
[[ $IN_RECOVERY != "f" ]] && log "Cluster is in recovery, not running backup" && exit 0

NUM_TO_RETAIN=$1
shift
# keep at least 2 days' worth of base backups before creating a new one
[[ "$DAYS_TO_RETAIN" -lt 2 ]] && DAYS_TO_RETAIN=2

IN_RECOVERY=$(psql -tXqAc "select pg_is_in_recovery()")
[[ $IN_RECOVERY != "f" ]] && log "Cluster is in recovery, not running backup" && exit 0
readonly WAL_E="envdir $WALE_ENV_DIR wal-e --aws-instance-profile"

BEFORE=""

# leave at least 2 base backups before creating a new one
[[ "$NUM_TO_RETAIN" -lt 2 ]] && NUM_TO_RETAIN=2
# Current time in seconds since the epoch; the reference point for backup age.
# Assigned separately from `readonly` so a failing `date` is not masked.
NOW=$(date +%s -u)
readonly NOW

#######################################
# Select the newest base backup that is more than DAYS_TO_RETAIN whole days
# old. Everything before that backup may then be deleted.
# Globals:   NOW, DAYS_TO_RETAIN (read); BEFORE, BEFORE_TIME (written)
# Arguments: none
# Input:     stdin - one backup per line: "<name> <last_modified> <rest...>",
#            whitespace separated, header row already removed
# Outputs:   BEFORE is set to the selected backup name and BEFORE_TIME to its
#            epoch timestamp; both are left untouched when nothing qualifies
#######################################
find_backup_cutoff() {
  local name last_modified rest
  while read -r name last_modified rest; do
    # Skip rows whose timestamp cannot be parsed. An empty value would count
    # as 0 in the arithmetic below, making the row look decades old and
    # letting a bogus name be chosen as the deletion cutoff.
    last_modified=$(date +%s -ud "$last_modified") || continue
    # Age in whole days (integer division); must be strictly greater than
    # the retention period for the backup to be a cutoff candidate.
    if (( (NOW - last_modified) / 86400 > DAYS_TO_RETAIN )); then
      # Among the candidates keep the most recent one.
      if [[ -z "$BEFORE" ]] || (( last_modified > BEFORE_TIME )); then
        BEFORE_TIME=$last_modified
        BEFORE=$name
      fi
    fi
  done
}

# The sed expression drops everything up to and including the header row
# (name, last_modified, ...). Process substitution keeps the loop in the
# current shell so the BEFORE/BEFORE_TIME assignments survive.
find_backup_cutoff < <($WAL_E backup-list 2> /dev/null | sed '0,/^name\s*last_modified\s*/d')

# --aws-instance-profile flag is just ignored when running in GCE.
envdir "${WALE_ENV_DIR}" wal-e --aws-instance-profile delete --confirm retain "${NUM_TO_RETAIN}"
# Prune the base backups that precede the selected cutoff; when no cutoff
# was selected (BEFORE is empty) nothing is deleted.
[[ -z "$BEFORE" ]] || $WAL_E delete --confirm before "$BEFORE"

# Ensure we don't have more workers than CPUs
POOL_SIZE=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || 1)
Expand All @@ -35,4 +46,4 @@ POOL_SIZE=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || 1)
# push a new base backup
log "producing a new backup"
# We reduce the priority of the backup for CPU consumption
exec nice -n 5 envdir "${WALE_ENV_DIR}" wal-e --aws-instance-profile backup-push "${PGDATA}" --pool-size ${POOL_SIZE}
exec nice -n 5 $WAL_E backup-push "${PGDATA}" --pool-size ${POOL_SIZE}

0 comments on commit 675d3a4

Please sign in to comment.