Skip to content

Commit

Permalink
Custom bootstrap scripts for Patroni (#181)
Browse files Browse the repository at this point in the history
Scripts to perform a custom bootstrap with pg_basebackup and WAL-E.
  • Loading branch information
alexeyklyukin authored and CyberDem0n committed Nov 29, 2017
1 parent bce46c4 commit 7516aff
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 16 deletions.
2 changes: 1 addition & 1 deletion postgres-appliance/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ ENV WALE_ENV_DIR=$PGHOME/etc/wal-e.d/env
# Set PGHOME as a login directory for the PostgreSQL user.
RUN usermod -d $PGHOME -m postgres

ADD scm-source.json configure_spilo.py launch.sh postgres_backup.sh patroni_wait.sh post_init.sh _zmon_schema.dump callback_role.py basebackup.sh wale_restore_command.sh wal-e-wal-fetch.sh callback_aws.py /
ADD scm-source.json configure_spilo.py launch.sh postgres_backup.sh patroni_wait.sh post_init.sh _zmon_schema.dump callback_role.py basebackup.sh wale_restore_command.sh wal-e-wal-fetch.sh callback_aws.py bootstrap /
ADD supervisor.d /etc/supervisor/conf.d/
ADD pgq_ticker.ini $PGHOME
ADD motd /etc/
Expand Down
65 changes: 65 additions & 0 deletions postgres-appliance/bootstrap/clone_with_basebackup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python

import argparse
import logging
import subprocess

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


def read_configuration():
    """Parse command-line options for cloning a cluster with pg_basebackup."""
    argp = argparse.ArgumentParser(description="Script to clone from another cluster using pg_basebackup")
    # (flag, keyword arguments) pairs; registered in order below
    arguments = (
        ('--scope', dict(required=True, help='target cluster name', dest='name')),
        ('--datadir', dict(required=True, help='target cluster postgres data directory')),
        ('--pgpass', dict(required=True,
                          help='path to the pgpass file containing credentials for the instance to be cloned')),
        ('--host', dict(required=True, help='hostname or IP address of the master to connect to')),
        ('--port', dict(required=False, help='PostgreSQL port master listens to', default=5432)),
        ('--dbname', dict(required=False, help='PostgreSQL database to connect to', default='postgres')),
        ('--user', dict(required=True, help='PostgreSQL user to connect with')),
    )
    for flag, params in arguments:
        argp.add_argument(flag, **params)
    return argp.parse_args()


def escape_value(val):
    """Escape a value for use in a libpq keyword/value connection string.

    Single quotes and backslashes are prefixed with a backslash; the result is
    wrapped in single quotes when it contains whitespace or is empty (libpq
    requires an empty value to be written as '').
    """
    quote = val == ''  # fix: an empty value must be quoted to survive parsing
    temp = []
    for c in val:
        if c.isspace():
            quote = True
        elif c in ('\'', '\\'):
            temp.append('\\')
        temp.append(c)
    result = ''.join(temp)
    return result if not quote else '\'{0}\''.format(result)


def prepare_connection(options):
    """Build the libpq connection string and environment for pg_basebackup.

    Returns a tuple of (connection string, environment dict) where the
    environment points PGPASSFILE at the credentials of the source cluster.
    """
    connection = []
    for attname in ('host', 'port', 'user', 'dbname'):
        # str() is required: --port defaults to the integer 5432, and
        # escape_value iterates over the value character by character.
        attvalue = str(getattr(options, attname))
        connection.append('{0}={1}'.format(attname, escape_value(attvalue)))

    return ' '.join(connection), {'PGPASSFILE': options.pgpass}


def run_basebackup(options):
    """Stream a base backup of the source cluster into the target datadir.

    Raises when pg_basebackup exits with a non-zero status; returns 0 otherwise.
    """
    connstr, env = prepare_connection(options)
    logger.info('cloning cluster %s from "%s"', options.name, connstr)
    # -X stream: include the WAL needed for consistency; -w: never prompt for a password
    cmd = ['pg_basebackup', '-D', options.datadir, '-X', 'stream', '-d', connstr, '-w']
    returncode = subprocess.call(cmd, env=env)
    if returncode != 0:
        raise Exception("pg_basebackup exited with code={0}".format(returncode))
    return 0


def main():
    """Entry point: return 0 on success and 1 when the clone fails.

    Failures are logged with a traceback rather than propagated.
    """
    # argparse errors raise SystemExit before the try block, on purpose
    options = read_configuration()
    try:
        return run_basebackup(options)
    except Exception:
        logger.exception("Clone failed")
        return 1


if __name__ == '__main__':
    # Propagate main()'s status code; previously it was discarded, so a failed
    # clone exited 0 and the caller would treat the bootstrap as successful.
    import sys
    sys.exit(main())
94 changes: 94 additions & 0 deletions postgres-appliance/bootstrap/clone_with_s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python

import argparse
from collections import namedtuple
from dateutil.parser import parse
import csv
import logging
import subprocess

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


def read_configuration():
    """Parse command-line options for cloning from S3 (optionally to a point in time).

    Returns an Options namedtuple; recovery_target_time is a timezone-aware
    datetime or None when no target time was requested.
    """
    argp = argparse.ArgumentParser(description="Script to clone from S3 with support for point-in-time-recovery")
    argp.add_argument('--scope', required=True, help='target cluster name')
    argp.add_argument('--datadir', required=True, help='target cluster postgres data directory')
    argp.add_argument('--envdir', required=True,
                      help='path to the pgpass file containing credentials for the instance to be cloned')
    argp.add_argument('--recovery-target-time',
                      help='the time stamp up to which recovery will proceed (including time zone)',
                      dest='recovery_target_time_string')
    argp.add_argument('--dry-run', action='store_true', help='find a matching backup and build the wal-e '
                      'command to fetch that backup without running it')
    args = argp.parse_args()

    target_time = None
    if args.recovery_target_time_string:
        target_time = parse(args.recovery_target_time_string)
        # a naive timestamp could not be compared against the aware backup times
        if target_time.tzinfo is None:
            raise Exception("recovery target time must contain a timezone")

    options = namedtuple('Options', 'name datadir wale_envdir recovery_target_time dry_run')
    return options(args.scope, args.datadir, args.envdir, target_time, args.dry_run)


def build_wale_command(envdir, command, datadir=None, backup=None):
    """Assemble a wal-e command line, run through envdir with instance-profile creds.

    Supported commands: 'backup-list' (detailed listing) and 'backup-fetch'
    (requires both datadir and backup). Any other command raises.
    """
    base = ['envdir', envdir, 'wal-e', '--aws-instance-profile']
    if command == 'backup-list':
        return base + [command, '--detail']
    if command == 'backup-fetch':
        if datadir is None or backup is None:
            raise Exception("backup-fetch requires datadir and backup arguments")
        return base + [command, datadir, backup]
    raise Exception("invalid wal-e command {0}".format(command))


def choose_backup(output, recovery_target_time):
    """Pick the latest backup that started before recovery_target_time.

    output is the raw bytes output of `wal-e backup-list --detail`
    (tab-separated with a header row). Returns the backup name.
    Raises when the listing is empty or no backup predates the target time.
    """
    reader = csv.DictReader(output.decode('utf-8').splitlines(), dialect='excel-tab')
    backup_list = list(reader)
    if not backup_list:
        raise Exception("wal-e could not find any backups")
    match_timestamp = match = None
    for backup in backup_list:
        # NOTE(review): assumes last_modified carries a timezone; comparing a
        # naive value with the aware recovery_target_time would raise TypeError
        last_modified = parse(backup['last_modified'])
        if last_modified < recovery_target_time:
            if match is None or last_modified > match_timestamp:
                match = backup
                match_timestamp = last_modified
    if match is None:
        raise Exception("wal-e could not find any backups prior to the point in time {0}".format(recovery_target_time))
    return match['name']


def run_clone_from_s3(options):
    """Fetch a base backup from S3 into the data directory using wal-e.

    When a recovery target time is set, the newest backup taken before that
    time is selected; otherwise the LATEST backup is fetched. In dry-run mode
    the fetch command is only logged, never executed.
    """
    if options.recovery_target_time:
        listing = subprocess.check_output(build_wale_command(options.wale_envdir, 'backup-list'))
        backup_name = choose_backup(listing, options.recovery_target_time)
    else:
        backup_name = 'LATEST'
    fetch_cmd = build_wale_command(options.wale_envdir, 'backup-fetch', options.datadir, backup_name)
    logger.info("cloning cluster %s using %s", options.name, ' '.join(fetch_cmd))
    if not options.dry_run:
        status = subprocess.call(fetch_cmd)
        if status != 0:
            raise Exception("wal-e backup-fetch exited with exit code {0}".format(status))
    return 0


def main():
    """Entry point: return 0 on success and 1 when the clone fails.

    Failures are logged with a traceback rather than propagated.
    """
    # argparse errors raise SystemExit before the try block, on purpose
    options = read_configuration()
    try:
        return run_clone_from_s3(options)
    except Exception:
        logger.exception("Clone failed")
        return 1


if __name__ == '__main__':
    # Propagate main()'s status code; previously it was discarded, so a failed
    # clone exited 0 and the caller would treat the bootstrap as successful.
    import sys
    sys.exit(main())
138 changes: 123 additions & 15 deletions postgres-appliance/configure_spilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@


def parse_args():
sections = ['all', 'patroni', 'patronictl', 'certificate', 'wal-e', 'crontab', 'ldap', 'pam-oauth2', 'pgbouncer']
sections = ['all', 'patroni', 'patronictl', 'certificate', 'wal-e', 'crontab',
'ldap', 'pam-oauth2', 'pgbouncer', 'bootstrap']
argp = argparse.ArgumentParser(description='Configures Spilo',
epilog="Choose from the following sections:\n\t{}".format('\n\t'.join(sections)),
formatter_class=argparse.RawDescriptionHelpFormatter)
Expand Down Expand Up @@ -102,6 +103,7 @@ def deep_update(a, b):

return a if a is not None else b


TEMPLATE = \
'''
bootstrap:
Expand Down Expand Up @@ -141,6 +143,31 @@ def deep_update(a, b):
autovacuum_max_workers: 5
autovacuum_vacuum_scale_factor: 0.05
autovacuum_analyze_scale_factor: 0.02
{{#CLONE_WITH_WALE}}
method: clone_with_wale
clone_with_wale:
command: python3 /clone_with_s3.py --envdir "{{CLONE_WALE_ENV_DIR}}" --recovery-target-time="{{CLONE_TARGET_TIME}}"
recovery_conf:
restore_command: envdir "{{CLONE_WALE_ENV_DIR}}" /wale_restore_command.sh "%f" "%p"
recovery_target_timeline: latest
{{#USE_PAUSE_AT_RECOVERY_TARGET}}
pause_at_recovery_target: false
{{/USE_PAUSE_AT_RECOVERY_TARGET}}
{{^USE_PAUSE_AT_RECOVERY_TARGET}}
recovery_target_action: promote
{{/USE_PAUSE_AT_RECOVERY_TARGET}}
{{#CLONE_TARGET_TIME}}
recovery_target_time: "{{CLONE_TARGET_TIME}}"
{{/CLONE_TARGET_TIME}}
{{^CLONE_TARGET_INCLUSIVE}}
recovery_target_inclusive: false
{{/CLONE_TARGET_INCLUSIVE}}
{{/CLONE_WITH_WALE}}
{{#CLONE_WITH_BASEBACKUP}}
method: clone_with_basebackup
clone_with_basebackup:
command: python3 /clone_with_basebackup.py --pgpass={{CLONE_PGPASS}} --host={{CLONE_HOST}} --port={{CLONE_PORT}} --user="{{CLONE_USER}}"
{{/CLONE_WITH_BASEBACKUP}}
initdb:
- encoding: UTF8
- locale: en_US.UTF-8
Expand Down Expand Up @@ -267,6 +294,27 @@ def get_instance_metadata(provider):
return metadata


def set_clone_with_wale_placeholders(placeholders, provider):
    """Check that enough parameters are provided to configure cloning with WAL-E.

    On success sets placeholders['CLONE_WITH_WALE'] and a default
    CLONE_WALE_ENV_DIR; on any missing prerequisite only logs a warning and
    leaves the placeholders untouched.
    """
    if provider == PROVIDER_AWS:
        clone_bucket_placeholder = 'CLONE_WAL_S3_BUCKET'
    elif provider == PROVIDER_GOOGLE:
        # fix: was 'CLONE_WAL_GSC_BUCKET' (typo); write_wale_environment looks
        # up 'WAL_GCS_BUCKET' with the 'CLONE_' prefix, so the old name could
        # never be matched and the Google clone env dir creation failed later
        clone_bucket_placeholder = 'CLONE_WAL_GCS_BUCKET'
    else:
        logging.warning('Cloning with WAL-E is only possible when running on AWS or GCP')
        return
    # XXX: Cloning from one provider into another (i.e. Google from Amazon) is not possible.
    # No WAL-E related limitations, but credentials would have to be passed explicitly.
    clone_cluster = placeholders.get('CLONE_SCOPE')
    if placeholders.get(clone_bucket_placeholder) and clone_cluster:
        placeholders['CLONE_WITH_WALE'] = True
        placeholders.setdefault('CLONE_WALE_ENV_DIR', os.path.join(placeholders['PGHOME'], 'etc', 'wal-e.d',
                                                                   'env-clone-{0}'.format(clone_cluster)))
    else:
        logging.warning("Clone method is set to WAL-E, but no '%s' or 'CLONE_SCOPE' specified",
                        clone_bucket_placeholder)


def get_placeholders(provider):
placeholders = dict(os.environ)

Expand All @@ -293,7 +341,28 @@ def get_placeholders(provider):
placeholders.setdefault('WALE_BACKUP_THRESHOLD_PERCENTAGE', 30)
placeholders.setdefault('WALE_ENV_DIR', os.path.join(placeholders['PGHOME'], 'etc', 'wal-e.d', 'env'))
placeholders.setdefault('USE_WALE', False)
placeholders.setdefault('USE_PAUSE_AT_RECOVERY_TARGET', False)
placeholders.setdefault('CALLBACK_SCRIPT', '')
placeholders.setdefault('CLONE_METHOD', '')
placeholders.setdefault('CLONE_WITH_WALE', '')
placeholders.setdefault('CLONE_WITH_BASEBACKUP', '')
placeholders.setdefault('CLONE_TARGET_TIME', '')
placeholders.setdefault('CLONE_TARGET_INCLUSIVE', True)

if placeholders['CLONE_METHOD'] == 'CLONE_WITH_WALE':
# set_clone_with_wale_placeholders would modify placeholders and take care of error cases
set_clone_with_wale_placeholders(placeholders, provider)
elif placeholders['CLONE_METHOD'] == 'CLONE_WITH_BASEBACKUP':
clone_scope = placeholders.get('CLONE_SCOPE')
if clone_scope and placeholders.get('CLONE_HOST') \
and placeholders.get('CLONE_USER') and placeholders.get('CLONE_PASSWORD'):
placeholders['CLONE_WITH_BASEBACKUP'] = True
placeholders.setdefault('CLONE_PGPASS', os.path.join(placeholders['PGHOME'],
'.pgpass_{0}'.format(clone_scope)))
placeholders.setdefault('CLONE_PORT', 5432)
else:
logging.warning("Clone method is set to basebackup, but no 'CLONE_SCOPE' "
"or 'CLONE_HOST' or 'CLONE_USER' or 'CLONE_PASSWORD' specified")

if provider == PROVIDER_AWS:
if 'WAL_S3_BUCKET' in placeholders:
Expand Down Expand Up @@ -363,36 +432,63 @@ def get_dcs_config(config, placeholders):
return config


def write_wale_command_environment(placeholders, overwrite, provider):
if not placeholders['USE_WALE']:
return
def write_wale_environment(placeholders, provider, prefix, overwrite):
wale = {}

if not os.path.exists(placeholders['WALE_ENV_DIR']):
os.makedirs(placeholders['WALE_ENV_DIR'])
for name in ('SCOPE', 'WALE_ENV_DIR', 'WAL_S3_BUCKET', 'WAL_GCS_BUCKET'):
rename = prefix + name
if rename in placeholders:
wale[name] = placeholders[rename]

if not os.path.exists(wale['WALE_ENV_DIR']):
os.makedirs(wale['WALE_ENV_DIR'])

if provider == PROVIDER_AWS:
write_file('s3://{WAL_S3_BUCKET}/spilo/{SCOPE}/wal/'.format(**placeholders),
os.path.join(placeholders['WALE_ENV_DIR'], 'WALE_S3_PREFIX'), overwrite)
match = re.search(r'.*(eu-\w+-\d+)-.*', placeholders['WAL_S3_BUCKET'])
write_file('s3://{WAL_S3_BUCKET}/spilo/{SCOPE}/wal/'.format(**wale),
os.path.join(wale['WALE_ENV_DIR'], 'WALE_S3_PREFIX'), overwrite)
match = re.search(r'.*(eu-\w+-\d+)-.*', wale['WAL_S3_BUCKET'])
if match:
region = match.group(1)
else:
region = placeholders['instance_data']['zone'][:-1]
write_file('https+path://s3-{}.amazonaws.com:443'.format(region),
os.path.join(placeholders['WALE_ENV_DIR'], 'WALE_S3_ENDPOINT'), overwrite)
os.path.join(wale['WALE_ENV_DIR'], 'WALE_S3_ENDPOINT'), overwrite)
elif provider == PROVIDER_GOOGLE:
write_file('gs://{WAL_GCS_BUCKET}/spilo/{SCOPE}/wal/'.format(**placeholders),
os.path.join(placeholders['WALE_ENV_DIR'], 'WALE_GS_PREFIX'), overwrite)
write_file('gs://{WAL_GCS_BUCKET}/spilo/{SCOPE}/wal/'.format(**wale),
os.path.join(wale['WALE_ENV_DIR'], 'WALE_GS_PREFIX'), overwrite)
if placeholders['GOOGLE_APPLICATION_CREDENTIALS']:
write_file('{GOOGLE_APPLICATION_CREDENTIALS}'.format(**placeholders),
os.path.join(placeholders['WALE_ENV_DIR'], 'GOOGLE_APPLICATION_CREDENTIALS'), overwrite)
os.path.join(wale['WALE_ENV_DIR'], 'GOOGLE_APPLICATION_CREDENTIALS'), overwrite)
else:
return

if not os.path.exists(placeholders['WALE_TMPDIR']):
os.makedirs(placeholders['WALE_TMPDIR'])
os.chmod(placeholders['WALE_TMPDIR'], 0o1777)

write_file(placeholders['WALE_TMPDIR'], os.path.join(placeholders['WALE_ENV_DIR'], 'TMPDIR'), True)
write_file(placeholders['WALE_TMPDIR'], os.path.join(wale['WALE_ENV_DIR'], 'TMPDIR'), True)


def write_bootstrap_configuration(placeholders, provider, overwrite):
    # Write the auxiliary files needed by the configured clone bootstrap method:
    # a CLONE_-prefixed WAL-E environment directory for wal-e based cloning,
    # and/or a .pgpass file for pg_basebackup based cloning.
    if placeholders['CLONE_WITH_WALE']:
        write_wale_environment(placeholders, provider, 'CLONE_', overwrite)
    if placeholders['CLONE_WITH_BASEBACKUP']:
        write_clone_pgpass(placeholders, overwrite)


def write_clone_pgpass(placeholders, overwrite):
    """Write the .pgpass file used when cloning with pg_basebackup.

    The file is restricted to 0600 and chowned to the owner of PGHOME
    (the postgres user), since libpq ignores group/world-accessible
    password files.
    """
    pgpassfile = placeholders['CLONE_PGPASS']
    # pgpass line format: host:port:database:user:password
    fields = [escape_pgpass_value(placeholders['CLONE_HOST']),
              '{0}'.format(placeholders['CLONE_PORT']),
              '*',  # match any database
              escape_pgpass_value(placeholders['CLONE_USER']),
              escape_pgpass_value(placeholders['CLONE_PASSWORD'])]
    write_file(':'.join(fields), pgpassfile, overwrite)
    uid = os.stat(placeholders['PGHOME']).st_uid
    os.chmod(pgpassfile, 0o600)
    os.chown(pgpassfile, uid, -1)


def write_crontab(placeholders, path, overwrite):
Expand Down Expand Up @@ -558,7 +654,8 @@ def main():
os.makedirs(os.path.dirname(patronictl_configfile))
write_file(yaml.dump(patronictl_config), patronictl_configfile, args['force'])
elif section == 'wal-e':
write_wale_command_environment(placeholders, args['force'], provider)
if placeholders['USE_WALE']:
write_wale_environment(placeholders, provider, '', args['force'])
elif section == 'certificate':
write_certificates(placeholders, args['force'])
elif section == 'crontab':
Expand All @@ -570,12 +667,23 @@ def main():
write_pam_oauth2_configuration(placeholders, args['force'])
elif section == 'pgbouncer':
write_pgbouncer_configuration(placeholders, args['force'])
elif section == 'bootstrap':
write_bootstrap_configuration(placeholders, provider, args['force'])
else:
raise Exception('Unknown section: {}'.format(section))

# We will abuse non zero exit code as an indicator for the launch.sh that it should not even try to create a backup
sys.exit(int(not placeholders['USE_WALE']))


def escape_pgpass_value(val):
    """Backslash-escape ':' and '\\' so val can be embedded in a .pgpass field."""
    return ''.join('\\' + c if c in ('\\', ':') else c for c in val)


if __name__ == '__main__':
main()

0 comments on commit 7516aff

Please sign in to comment.