Skip to content

Commit

Permalink
Make spilo more k8s-compatible (#105)
Browse files Browse the repository at this point in the history
* Make spilo more k8s-compatible
* Make it possible to work correctly on minikube
* K8S -> Kubernetes. app -> application.
* Don't rely on labels and do direct update of Endpoints instead
  • Loading branch information
CyberDem0n committed Nov 21, 2016
1 parent 2cb7cd8 commit 7a5ef9e
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 110 deletions.
2 changes: 1 addition & 1 deletion postgres-appliance/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ ENV PATH=$PATH:/usr/lib/postgresql/${PGVERSION}/bin
## 2 Build
## 3 Remove tools only required for build
ENV PATRONIVERSION=1.1
ENV WALE_VERSION=1.0.0b1
ENV WALE_VERSION=1.0.1
RUN export DEBIAN_FRONTEND=noninteractive \
export BUILD_PACKAGES="postgresql-server-dev-${PGVERSION} python3-pip python3-dev build-essential pgxnclient" \
export PGXN_EXTENSIONS="quantile trimmed_aggregates" \
Expand Down
77 changes: 50 additions & 27 deletions postgres-appliance/callback_role.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,60 +5,83 @@
import requests
import requests.exceptions
import os
import socket
import sys
import time

# Locations of the Kubernetes service-account credentials that are mounted
# into every pod by the kubelet.
KUBE_SERVICE_DIR = '/var/run/secrets/kubernetes.io/serviceaccount/'
KUBE_NAMESPACE_FILENAME = KUBE_SERVICE_DIR + 'namespace'
KUBE_TOKEN_FILENAME = KUBE_SERVICE_DIR + 'token'
KUBE_CA_CERT = KUBE_SERVICE_DIR + 'ca.crt'

# In-cluster API server endpoint; object URLs are built by appending
# /{namespace}/{kind}/{name}.
KUBE_API_URL = 'https://kubernetes.default.svc.cluster.local/api/v1/namespaces'

logger = logging.getLogger(__name__)

# How many times an API PATCH is retried before giving up.
NUM_ATTEMPTS = 10
# Pod label used by the service selector to route traffic to the master.
LABEL = 'spilo-role'


def read_first_line(filename):
    """Return the first line of *filename* with trailing whitespace stripped.

    Returns None when the file cannot be opened, so callers can treat a
    missing service-account secret as "not running on Kubernetes" instead
    of crashing.
    """
    try:
        with open(filename) as f:
            return f.readline().rstrip()
    except IOError:
        return None


def read_token():
    """Read the pod's service-account bearer token, or None if unavailable."""
    token_path = KUBE_TOKEN_FILENAME
    return read_first_line(token_path)


def api_patch(namespace, kind, name, entity_name, body):
    """PATCH a Kubernetes object, retrying up to NUM_ATTEMPTS times.

    :param namespace: Kubernetes namespace holding the object
    :param kind: API collection name, e.g. 'pods' or 'endpoints'
    :param name: object name within the collection
    :param entity_name: human-readable description used in log messages
    :param body: JSON-encoded strategic-merge-patch payload
    """
    api_url = '/'.join([KUBE_API_URL, namespace, kind, name])
    for _ in range(NUM_ATTEMPTS):
        try:
            # Re-read the token on every attempt: it is rotated by Kubernetes.
            token = read_token()
            if token:
                r = requests.patch(api_url, data=body, verify=KUBE_CA_CERT,
                                   headers={'Content-Type': 'application/strategic-merge-patch+json',
                                            'Authorization': 'Bearer {0}'.format(token)})
                if r.status_code >= 300:
                    logger.warning('Unable to change %s: %s', entity_name, r.text)
                else:
                    break
            else:
                logger.warning('Unable to read Kubernetes authorization token')
        except requests.exceptions.RequestException as e:
            logger.warning('Exception when executing PATCH on %s: %s', api_url, e)
        time.sleep(1)
    else:
        # for/else: only reached when no attempt succeeded (no break).
        logger.error('Unable to change %s after %s attempts', entity_name, NUM_ATTEMPTS)


def change_pod_role_label(namespace, new_role):
    """Set this pod's spilo-role label to *new_role* via a merge patch."""
    patch = {'metadata': {'labels': {LABEL: new_role}}}
    entity = '{} label'.format(LABEL)
    api_patch(namespace, 'pods', os.environ['HOSTNAME'], entity, json.dumps(patch))


def change_endpoints(namespace, cluster):
    """Point the cluster service's Endpoints at this pod's IP on port 5432."""
    # POD_IP is set by the downward API; fall back to resolving our hostname.
    fallback_ip = socket.gethostbyname(socket.gethostname())
    pod_ip = os.environ.get('POD_IP', fallback_ip)
    subset = {'addresses': [{'ip': pod_ip}],
              'ports': [{'port': 5432, 'protocol': 'TCP'}]}
    api_patch(namespace, 'endpoints', cluster, 'service endpoints', json.dumps({'subsets': [subset]}))


def record_role_change(action, new_role, cluster):
    """Apply a Patroni role change to this pod's Kubernetes metadata.

    :param action: Patroni callback action ('on_start', 'on_stop', ...)
    :param new_role: 'master' or 'replica'
    :param cluster: cluster (scope) name, used as the Endpoints object name
    """
    # On stop the role is cleared so the load balancer stops directing
    # connections to a host whose database is down.
    new_role = None if action == 'on_stop' else new_role
    logger.debug("Changing the pod's role to %s", new_role)
    pod_namespace = os.environ.get('POD_NAMESPACE', read_first_line(KUBE_NAMESPACE_FILENAME)) or 'default'
    if new_role == 'master':
        # Update the service Endpoints directly instead of relying on labels.
        change_endpoints(pod_namespace, cluster)
    change_pod_role_label(pod_namespace, new_role)


def main():
    """Entry point: validate the callback arguments and record the role change.

    Exits with a usage message unless called as: <script> <action> <role> <cluster>.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
    if len(sys.argv) == 4 and sys.argv[1] in ('on_start', 'on_stop', 'on_role_change', 'on_restart'):
        record_role_change(action=sys.argv[1], new_role=sys.argv[2], cluster=sys.argv[3])
    else:
        # BUG FIX: sys.exit() accepts a single argument; passing the format
        # string and sys.argv[0] separately raised TypeError instead of
        # printing the usage message. Format the string first.
        sys.exit('Usage: %s <action> <role> <cluster_name>' % sys.argv[0])
    return 0

if __name__ == '__main__':
Expand Down
62 changes: 35 additions & 27 deletions postgres-appliance/configure_spilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import socket
import subprocess
import sys

from six.moves.urllib_parse import urlparse

Expand All @@ -19,7 +20,7 @@
PROVIDER_GOOGLE = "google"
PROVIDER_LOCAL = "local"
PROVIDER_UNSUPPORTED = "unsupported"
USE_K8S = os.environ.get('KUBERNETES_SERVICE_HOST') is not None
USE_KUBERNETES = os.environ.get('KUBERNETES_SERVICE_HOST') is not None


def parse_args():
Expand Down Expand Up @@ -111,11 +112,9 @@ def deep_update(a, b):
use_pg_rewind: true
use_slots: true
parameters:
{{#USE_WALE}}
archive_mode: "on"
archive_timeout: 1800s
archive_command: envdir "{{WALE_ENV_DIR}}" wal-e --aws-instance-profile wal-push "%p" -p 1
{{/USE_WALE}}
archive_command: {{{postgresql.parameters.archive_command}}}
wal_level: hot_standby
wal_keep_segments: 8
wal_log_hints: 'on'
Expand Down Expand Up @@ -219,14 +218,21 @@ def get_instance_metadata(provider):
'id': socket.gethostname(),
'zone': 'local'}

if USE_KUBERNETES:
metadata['ip'] = os.environ.get('POD_IP', metadata['ip'])

headers = {}
if provider == PROVIDER_GOOGLE:
headers['Metadata-Flavor'] = 'Google'
url = 'http://metadata.google.internal/computeMetadata/v1/instance'
mapping = {'zone': 'zone', 'id': 'id'}
mapping = {'zone': 'zone'}
if not USE_KUBERNETES:
mapping.update({'id': 'id'})
elif provider == PROVIDER_AWS:
url = 'http://instance-data/latest/meta-data'
mapping = {'ip': 'local-ipv4', 'id': 'instance-id', 'zone': 'placement/availability-zone'}
mapping = {'zone': 'placement/availability-zone'}
if not USE_KUBERNETES:
mapping.update({'ip': 'local-ipv4', 'id': 'instance-id'})
else:
logging.info("No meta-data available for this provider")
return metadata
Expand Down Expand Up @@ -256,26 +262,28 @@ def get_placeholders(provider):
placeholders.setdefault('WALE_BACKUP_THRESHOLD_MEGABYTES', 1024)
placeholders.setdefault('WALE_BACKUP_THRESHOLD_PERCENTAGE', 30)
placeholders.setdefault('WALE_ENV_DIR', os.path.join(placeholders['PGHOME'], 'etc', 'wal-e.d', 'env'))
placeholders.setdefault('USE_WALE', False)
placeholders.setdefault('CALLBACK_SCRIPT', '')

if provider in (PROVIDER_AWS, PROVIDER_GOOGLE):
placeholders.setdefault('USE_WALE', True)
if provider == PROVIDER_AWS:
placeholders.setdefault('WAL_S3_BUCKET', 'spilo-example-com')
elif provider == PROVIDER_GOOGLE:
placeholders.setdefault('WAL_GCS_BUCKET', 'spilo-example-com')
if 'WAL_S3_BUCKET' in placeholders:
placeholders['USE_WALE'] = True
if not USE_KUBERNETES: # AWS specific callback to tag the instances with roles
placeholders['CALLBACK_SCRIPT'] = 'patroni_aws'
elif provider == PROVIDER_GOOGLE and 'WAL_GCS_BUCKET' in placeholders:
placeholders['USE_WALE'] = True
placeholders.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '')
# Kubernetes requires a callback to change the labels in order to point to the new master
if USE_K8S:
placeholders.setdefault('CALLBACK_SCRIPT', '/callback_role.py')
elif provider == PROVIDER_AWS: # AWS specific callback to tag the instances with roles
placeholders.setdefault('CALLBACK_SCRIPT', 'patroni_aws')

else: # avoid setting WAL-E archive command and callback script for unknown providers (i.e local docker)
placeholders.setdefault('USE_WALE', False)
placeholders.setdefault('CALLBACK_SCRIPT', '')
# Kubernetes requires a callback to change the labels in order to point to the new master
if USE_KUBERNETES:
placeholders['CALLBACK_SCRIPT'] = '/callback_role.py'

placeholders.setdefault('postgresql', {})
placeholders['postgresql'].setdefault('parameters', {})
placeholders['postgresql']['parameters']['archive_command'] = \
'envdir "{0}" wal-e --aws-instance-profile wal-push "%p" -p 1'.format(placeholders['WALE_ENV_DIR']) \
if placeholders['USE_WALE'] else '/bin/true'

os_memory_mb = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / 1024 / 1024

Expand All @@ -285,11 +293,7 @@ def get_placeholders(provider):
placeholders['postgresql']['parameters']['max_connections'] = min(max(100, int(os_memory_mb/30)), 1000)

placeholders['instance_data'] = get_instance_metadata(provider)
if provider in (PROVIDER_AWS, PROVIDER_GOOGLE):
if USE_K8S:
# id is not unique per container, we use the hostname instead
placeholders['instance_data']['id'] = os.environ.get('HOSTNAME')
placeholders['instance_data']['id'] = re.sub(r'\W+', '_', placeholders['instance_data']['id'])
placeholders['instance_data']['id'] = re.sub(r'\W+', '_', placeholders['instance_data']['id'])
return placeholders


Expand Down Expand Up @@ -339,7 +343,7 @@ def get_dcs_config(config, placeholders):


def write_wale_command_environment(placeholders, overwrite, provider):
if provider not in (PROVIDER_AWS, PROVIDER_GOOGLE):
if not placeholders['USE_WALE']:
return

if not os.path.exists(placeholders['WALE_ENV_DIR']):
Expand Down Expand Up @@ -384,7 +388,7 @@ def write_crontab(placeholders, path, overwrite):
def write_etcd_configuration(placeholders, overwrite=False):
placeholders.setdefault('ETCD_HOST', '127.0.0.1:2379')

etcd_config="""\
etcd_config = """\
[program:etcd]
user=postgres
autostart=1
Expand Down Expand Up @@ -453,7 +457,7 @@ def main():
provider = os.environ.get('DEVELOP', '').lower() in ['1', 'true', 'on'] and PROVIDER_LOCAL or get_provider()
placeholders = get_placeholders(provider)

if provider == PROVIDER_LOCAL:
if provider == PROVIDER_LOCAL and not USE_KUBERNETES:
write_etcd_configuration(placeholders)

config = yaml.load(pystache_render(TEMPLATE, placeholders))
Expand Down Expand Up @@ -488,12 +492,16 @@ def main():
elif section == 'certificate':
write_certificates(placeholders, args['force'])
elif section == 'crontab':
write_crontab(placeholders, os.environ.get('PATH'), args['force'])
if placeholders['USE_WALE']:
write_crontab(placeholders, os.environ.get('PATH'), args['force'])
elif section == 'ldap':
write_ldap_configuration(placeholders, args['force'])
else:
raise Exception('Unknown section: {}'.format(section))

# We will abuse non zero exit code as an indicator for the launch.sh that it should not even try to create a backup
sys.exit(int(not placeholders['USE_WALE']))


if __name__ == '__main__':
main()
13 changes: 7 additions & 6 deletions postgres-appliance/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
# Refuse to start twice: pgrep returns 1 when no supervisord is running.
pgrep supervisord > /dev/null
if [ $? -ne 1 ]; then echo "ERROR: Supervisord is already running"; exit 1; fi

mkdir -p "$PGLOG" && chown -R postgres:postgres "$PGROOT"

## Ensure all logfiles exist, most appliances will have
## a foreign data wrapper pointing to these files
for i in $(seq 0 7); do touch "${PGLOG}/postgresql-$i.csv"; done

# configure_spilo.py exits non-zero when WAL-E is not configured; in that
# case there is nothing to back up, so skip launching the backup waiter.
if python3 /configure_spilo.py all; then
    (
        sudo PATH="$PATH" -u postgres /patroni_wait.sh -t 3600 -- /postgres_backup.sh "$WALE_ENV_DIR" "$PGDATA"
    ) &
fi

exec supervisord --configuration=/etc/supervisor/supervisord.conf --nodaemon

0 comments on commit 7a5ef9e

Please sign in to comment.