/
postgres_ha.sh
187 lines (169 loc) · 5.66 KB
/
postgres_ha.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/bin/bash
PATH=$PATH:/usr/lib/postgresql/${PGVERSION}/bin
SSL_CERTIFICATE="$PGHOME/dummy.crt"
SSL_PRIVATE_KEY="$PGHOME/dummy.key"
BACKUP_INTERVAL=3600
function write_postgres_yaml
{
local aws_private_ip=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)
local pg_port=5432
local api_port=8008
cat > postgres.yml <<__EOF__
ttl: &ttl 30
loop_wait: &loop_wait 10
scope: &scope $SCOPE
restapi:
listen: 0.0.0.0:${api_port}
connect_address: ${aws_private_ip}:${api_port}
__EOF__
if [[ -n $ZOOKEEPER_HOSTS || -n $EXHIBITOR_HOSTS && -n $EXHIBITOR_PORT ]]; then
cat >> postgres.yml <<__EOF__
zookeeper:
scope: *scope
session_timeout: *ttl
reconnect_timeout: *loop_wait
__EOF__
[[ -n $ZOOKEEPER_HOSTS ]] && echo " hosts: ${ZOOKEEPER_HOSTS}" >> postgres.yml
if [[ -n $EXHIBITOR_HOSTS && -n $EXHIBITOR_PORT ]]; then
cat >> postgres.yml <<__EOF__
exhibitor:
poll_interval: 300
port: ${EXHIBITOR_PORT}
hosts: ${EXHIBITOR_HOSTS}
__EOF__
fi
elif [[ -n $ETCD_HOST ]]; then
cat >> postgres.yml <<__EOF__
etcd:
scope: *scope
ttl: *ttl
host: ${ETCD_HOST}
__EOF__
elif [[ -n $ETCD_DISCOVERY_DOMAIN ]]; then
cat >> postgres.yml <<__EOF__
etcd:
scope: *scope
ttl: *ttl
discovery_srv: ${ETCD_DISCOVERY_DOMAIN}
__EOF__
else
>&2 echo "Can not find suitable distributed configuration store."
exit 1
fi
cat >> postgres.yml <<__EOF__
postgresql:
name: postgresql_${HOSTNAME}
scope: *scope
listen: 0.0.0.0:${pg_port}
connect_address: ${aws_private_ip}:${pg_port}
data_dir: $PGDATA
pg_hba:
- hostssl all all 0.0.0.0/0 md5
- host all all 0.0.0.0/0 md5
replication:
username: standby
password: standby
network: 0.0.0.0/0
superuser:
password: zalando
admin:
username: admin
password: admin
wal_e:
env_dir: $WALE_ENV_DIR
threshold_megabytes: ${WALE_BACKUP_THRESHOLD_MEGABYTES}
threshold_backup_size_percentage: ${WALE_BACKUP_THRESHOLD_PERCENTAGE}
restore: patroni/patroni/scripts/restore.py
callbacks:
on_start: patroni/patroni/scripts/aws.py
on_stop: patroni/patroni/scripts/aws.py
on_restart: patroni/patroni/scripts/aws.py
on_role_change: patroni/patroni/scripts/aws.py
pg_rewind:
username: postgres
password: zalando
parameters:
archive_mode: "on"
wal_level: hot_standby
archive_command: "envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile wal-push \"%p\" -p 1"
max_wal_senders: 5
wal_keep_segments: 8
archive_timeout: 1800s
max_replication_slots: 5
hot_standby: "on"
ssl: "on"
ssl_cert_file: "$SSL_CERTIFICATE"
ssl_key_file: "$SSL_PRIVATE_KEY"
wal_log_hints: 'on'
recovery_conf:
restore_command: "envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile wal-fetch \"%f\" \"%p\" -p 1"
__EOF__
}
function write_archive_command_environment
{
# get current AWS region
region=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone|head -c -1)
mkdir -p ${WALE_ENV_DIR}
echo "s3://${WAL_S3_BUCKET}/spilo/${SCOPE}/wal/" > ${WALE_ENV_DIR}/WALE_S3_PREFIX
echo "https+path://s3-$region.amazonaws.com:443" > ${WALE_ENV_DIR}/WALE_S3_ENDPOINT
}
write_postgres_yaml
write_archive_command_environment
# run wal-e s3 backup periodically
(
INITIAL=1
RETRY=0
LAST_BACKUP_TS=0
while true
do
sleep 5
CURRENT_TS=$(date +%s)
CURRENT_HOUR=$(date +%H)
pg_isready >/dev/null 2>&2 || continue
IN_RECOVERY=$(psql -tqAc "select pg_is_in_recovery()")
[[ $IN_RECOVERY != "f" ]] && echo "still in recovery" && continue
# during initial run, count the number of backup lines. If there are
# no backup (only line with backup-list header is returned), or there
# is an error, try to produce a backup. Otherwise, stick to the regular
# schedule, since we might run the backups on a freshly promoted replica.
if [[ $INITIAL = 1 ]]
then
BACKUPS_LINES=$(envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile backup-list 2>/dev/null|wc -l)
[[ $PIPESTATUS[0] = 0 ]] && [[ $BACKUPS_LINES -ge 2 ]] && INITIAL=0
fi
# produce backup only at a given hour, unless it's set to *, which means
# that only backup_interval is taken into account. We also skip all checks
# when the backup is forced because of previous attempt's failure or because
# it's going to be a very first backup, in which case we create it unconditionally.
if [[ $RETRY = 0 ]] && [[ $INITIAL = 0 ]]
then
# check that enough time has passed since the previous backup
[[ $BACKUP_HOUR != '*' ]] && [[ $CURRENT_HOUR != $BACKUP_HOUR ]] && continue
# get the time since the last backup. Do it only one when the hour
# matches the backup hour.
[[ $LAST_BACKUP_TS = 0 ]] && LAST_BACKUP_TS=$(envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile backup-list LATEST 2>/dev/null | tail -n1 | awk '{print $2}' | xargs date +%s --date)
# LAST_BACKUP_TS will be empty on error.
if [[ -z $LAST_BACKUP_TS ]]
then
LAST_BACKUP_TS=0
echo "could not obtain latest backup timestamp"
fi
ELAPSED_TIME=$((CURRENT_TS-LAST_BACKUP_TS))
[[ $ELAPSED_TIME -lt $BACKUP_INTERVAL ]] && continue
fi
# leave only 2 base backups before creating a new one
envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile delete --confirm retain 2
# push a new base backup
echo "producing a new backup at $(date)"
envdir ${WALE_ENV_DIR} wal-e --aws-instance-profile backup-push ${PGDATA}
RETRY=$?
# re-examine last backup timestamp if a new backup has been created
if [[ $RETRY = 0 ]]
then
INITIAL=0
LAST_BACKUP_TS=0
fi
done
) &
[[ "$DEBUG" == 1 ]] && exec /bin/bash
exec patroni/patroni.py "$PGHOME/postgres.yml"