From 7d3d02b88c536f5dd1ff920f9b04d690b2a78f7c Mon Sep 17 00:00:00 2001 From: Vitaliy Kukharik <37010174+vitabaks@users.noreply.github.com> Date: Thu, 17 Aug 2023 18:32:27 +0300 Subject: [PATCH] Automate in place major upgrade for PostgreSQL (#301) --- .github/workflows/molecule_pg_upgrade.yml | 81 ++++ README.md | 20 +- molecule/default/converge.yml | 6 +- molecule/pg_upgrade/converge.yml | 83 +++++ molecule/pg_upgrade/molecule.yml | 94 +++++ molecule/pg_upgrade/prepare.yml | 28 ++ molecule/tests/etcd/etcd.yml | 2 +- molecule/tests/patroni/patroni.yml | 2 +- molecule/tests/postgres/postgres.yml | 6 +- molecule/tests/postgres/replication.yml | 6 +- pg_upgrade.yml | 284 ++++++++++++++ pg_upgrade_rollback.yml | 55 +++ roles/packages/tasks/main.yml | 11 +- roles/upgrade/README.md | 339 +++++++++++++++++ roles/upgrade/tasks/checkpoint_location.yml | 82 ++++ roles/upgrade/tasks/custom_wal_dir.yml | 48 +++ roles/upgrade/tasks/dcs_remove_cluster.yml | 16 + roles/upgrade/tasks/extensions.yml | 19 + roles/upgrade/tasks/initdb.yml | 106 ++++++ roles/upgrade/tasks/maintenance_disable.yml | 59 +++ roles/upgrade/tasks/maintenance_enable.yml | 102 +++++ roles/upgrade/tasks/packages.yml | 66 ++++ roles/upgrade/tasks/pgbouncer_pause.yml | 119 ++++++ roles/upgrade/tasks/pgbouncer_resume.yml | 11 + roles/upgrade/tasks/post_checks.yml | 75 ++++ roles/upgrade/tasks/post_upgrade.yml | 199 ++++++++++ roles/upgrade/tasks/pre_checks.yml | 350 ++++++++++++++++++ roles/upgrade/tasks/rollback.yml | 215 +++++++++++ roles/upgrade/tasks/schema_compatibility.yml | 139 +++++++ roles/upgrade/tasks/ssh-keys.yml | 67 ++++ roles/upgrade/tasks/start_services.yml | 87 +++++ roles/upgrade/tasks/statistics.yml | 72 ++++ roles/upgrade/tasks/stop_services.yml | 130 +++++++ roles/upgrade/tasks/update_config.yml | 162 ++++++++ roles/upgrade/tasks/update_extensions.yml | 79 ++++ roles/upgrade/tasks/upgrade_check.yml | 48 +++ roles/upgrade/tasks/upgrade_primary.yml | 49 +++ roles/upgrade/tasks/upgrade_secondary.yml | 134 +++++++ .../templates/haproxy-no-http-checks.cfg.j2 | 77 ++++ vars/Debian.yml | 8 +- vars/RedHat.yml | 18 +- vars/main.yml | 1 + vars/upgrade.yml | 92 +++++ 43 files changed, 3621 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/molecule_pg_upgrade.yml create mode 100644 molecule/pg_upgrade/converge.yml create mode 100644 molecule/pg_upgrade/molecule.yml create mode 100644 molecule/pg_upgrade/prepare.yml create mode 100644 pg_upgrade.yml create mode 100644 pg_upgrade_rollback.yml create mode 100644 roles/upgrade/README.md create mode 100644 roles/upgrade/tasks/checkpoint_location.yml create mode 100644 roles/upgrade/tasks/custom_wal_dir.yml create mode 100644 roles/upgrade/tasks/dcs_remove_cluster.yml create mode 100644 roles/upgrade/tasks/extensions.yml create mode 100644 roles/upgrade/tasks/initdb.yml create mode 100644 roles/upgrade/tasks/maintenance_disable.yml create mode 100644 roles/upgrade/tasks/maintenance_enable.yml create mode 100644 roles/upgrade/tasks/packages.yml create mode 100644 roles/upgrade/tasks/pgbouncer_pause.yml create mode 100644 roles/upgrade/tasks/pgbouncer_resume.yml create mode 100644 roles/upgrade/tasks/post_checks.yml create mode 100644 roles/upgrade/tasks/post_upgrade.yml create mode 100644 roles/upgrade/tasks/pre_checks.yml create mode 100644 roles/upgrade/tasks/rollback.yml create mode 100644 roles/upgrade/tasks/schema_compatibility.yml create mode 100644 roles/upgrade/tasks/ssh-keys.yml create mode 100644 roles/upgrade/tasks/start_services.yml create mode 100644 
roles/upgrade/tasks/statistics.yml create mode 100644 roles/upgrade/tasks/stop_services.yml create mode 100644 roles/upgrade/tasks/update_config.yml create mode 100644 roles/upgrade/tasks/update_extensions.yml create mode 100644 roles/upgrade/tasks/upgrade_check.yml create mode 100644 roles/upgrade/tasks/upgrade_primary.yml create mode 100644 roles/upgrade/tasks/upgrade_secondary.yml create mode 100644 roles/upgrade/templates/haproxy-no-http-checks.cfg.j2 create mode 100644 vars/upgrade.yml diff --git a/.github/workflows/molecule_pg_upgrade.yml b/.github/workflows/molecule_pg_upgrade.yml new file mode 100644 index 000000000..7d19dad45 --- /dev/null +++ b/.github/workflows/molecule_pg_upgrade.yml @@ -0,0 +1,81 @@ +--- +name: Molecule pg_upgrade + +on: + schedule: + - cron: "0 0 * * 6" + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + config: + - distro: debian12 + tag: latest + namespace: geerlingguy + - distro: debian11 + tag: latest + namespace: geerlingguy + - distro: debian10 + tag: latest + namespace: geerlingguy + - distro: ubuntu2204 + tag: latest + namespace: geerlingguy + - distro: ubuntu2004 + tag: latest + namespace: geerlingguy + - distro: ubuntu1804 + tag: latest + namespace: geerlingguy + - distro: rockylinux8 + tag: latest + namespace: geerlingguy + - distro: rockylinux9 + tag: latest + namespace: geerlingguy + - distro: almalinux8 + tag: latest + namespace: glillico + - distro: almalinux9 + tag: latest + namespace: glillico + - distro: oraclelinux8 + tag: latest + namespace: glillico + - distro: oraclelinux9 + tag: latest + namespace: glillico + - distro: centosstream9 + tag: latest + namespace: glillico + - distro: centosstream8 + tag: latest + namespace: glillico + + steps: + - name: Set TERM environment variable + run: echo "TERM=xterm" >> $GITHUB_ENV + + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: make bootstrap-dev + + - name: Run Molecule tests for pg_upgrade + run: make molecule-test-scenario + env: + PY_COLORS: "1" + ANSIBLE_FORCE_COLOR: "1" + IMAGE_DISTRO: ${{ matrix.config.distro }} + IMAGE_TAG: ${{ matrix.config.tag }} + IMAGE_NAMESPACE: ${{ matrix.config.namespace }} + MOLECULE_SCENARIO: "pg_upgrade" diff --git a/README.md b/README.md index 52ccaf4d5..67f0598eb 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ In addition to deploying new clusters, this playbook also supports the deployment - [Point-In-Time-Recovery:](#point-in-time-recovery) - [Maintenance](#maintenance) - [Update the PostgreSQL HA Cluster](#update-the-postgresql-ha-cluster) + - [PostgreSQL major upgrade](#postgresql-major-upgrade) - [Using Git for cluster configuration management](#using-git-for-cluster-configuration-management-iacgitops) - [Disaster Recovery](#disaster-recovery) - [etcd](#etcd) @@ -490,7 +491,7 @@ I recommend that you study the following materials for further maintenance of th - [Patroni documentation](https://patroni.readthedocs.io/en/latest/) - [etcd operations guide](https://etcd.io/docs/v3.5/op-guide/) -#### Update the PostgreSQL HA Cluster +### Update the PostgreSQL HA Cluster Use the `update_pgcluster.yml` playbook to update the PostgreSQL HA Cluster to a new minor version (for example, 15.1->15.2).
@@ -523,7 +524,22 @@ ansible-playbook update_pgcluster.yml -e target=system More details [here](roles/update/README.md) -#### Using Git for cluster configuration management (IaC/GitOps) +### PostgreSQL major upgrade + +Use the `pg_upgrade.yml` playbook to upgrade PostgreSQL to a new major version (for example, 14->15). + +
Upgrade PostgreSQL + +``` +ansible-playbook pg_upgrade.yml -e "pg_old_version=14 pg_new_version=15" +``` + +
+ +More details [here](roles/upgrade/README.md) + + +### Using Git for cluster configuration management (IaC/GitOps) Infrastructure as Code (IaC) is the managing and provisioning of infrastructure through code instead of through manual processes. \ GitOps automates infrastructure updates using a Git workflow with continuous integration (CI) and continuous delivery (CI/CD). When new code is merged, the CI/CD pipeline enacts the change in the environment. Any configuration drift, such as manual changes or errors, is overwritten by GitOps automation so the environment converges on the desired state defined in Git. diff --git a/molecule/default/converge.yml b/molecule/default/converge.yml index 341455250..81172b94e 100644 --- a/molecule/default/converge.yml +++ b/molecule/default/converge.yml @@ -15,13 +15,13 @@ with_haproxy_load_balancing: true consul_node_role: server # if dcs_type: "consul" consul_bootstrap_expect: true # if dcs_type: "consul" - postgresql_version: "15" + postgresql_version: "15" # to test custom WAL dir cacheable: true - name: Set variables for custom PostgreSQL data and WAL directory test ansible.builtin.set_fact: - postgresql_data_dir: "/data/{{ postgresql_version }}/main" - postgresql_wal_dir: "/wal/{{ postgresql_version }}/pg_wal" + postgresql_data_dir: "/pgdata/{{ postgresql_version }}/main" + postgresql_wal_dir: "/pgwal/{{ postgresql_version }}/pg_wal" - name: Set variables for TimescaleDB cluster deployment test ansible.builtin.set_fact: diff --git a/molecule/pg_upgrade/converge.yml b/molecule/pg_upgrade/converge.yml new file mode 100644 index 000000000..9e76007e0 --- /dev/null +++ b/molecule/pg_upgrade/converge.yml @@ -0,0 +1,83 @@ +--- +- name: Converge + hosts: all + gather_facts: true + + tasks: + - name: Set variables for Patroni cluster deployment test + ansible.builtin.set_fact: + firewall_enabled_at_boot: false + firewall_enable_ipv6: false # Added to prevent test failures in CI. + swap_file_create: false # Added to prevent test failures in CI. + sysctl_set: false # Added to prevent test failures in CI. 
+ nameservers: ["8.8.8.8", "9.9.9.9"] + timezone: "Etc/UTC" + with_haproxy_load_balancing: true + consul_node_role: server # if dcs_type: "consul" + consul_bootstrap_expect: true # if dcs_type: "consul" + postgresql_version: "12" # redefine the version to install for the upgrade test + cacheable: true + + - name: Set variables for custom PostgreSQL data and WAL directory test + ansible.builtin.set_fact: + postgresql_data_dir: "/pgdata/{{ postgresql_version }}/main" + postgresql_wal_dir: "/pgwal/{{ postgresql_version }}/pg_wal" + + - name: Set variables for TimescaleDB cluster deployment test + ansible.builtin.set_fact: + enable_timescale: true + when: + - not (ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('20.04', '<')) + - not (ansible_distribution == 'Debian' and ansible_distribution_version is version('11', '>')) # TODO Debian 12 + + - name: Set variables for PostgreSQL upgrade test + ansible.builtin.set_fact: + pg_old_version: "12" + pg_new_version: "15" + + - name: Clean yum cache (molecule containers) + ansible.builtin.command: yum clean all + when: + - ansible_os_family == "RedHat" + - ansible_distribution_major_version == '7' + + - name: Clean dnf cache (molecule containers) + ansible.builtin.command: dnf clean all + when: + - ansible_os_family == "RedHat" + - ansible_distribution_major_version is version('8', '>=') + + - name: Update apt cache (molecule containers) + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + register: apt_status + until: apt_status is success + delay: 5 + retries: 3 + when: ansible_os_family == "Debian" + + - name: Install openssh-server package (molecule containers) + become: true + ansible.builtin.package: + name: openssh-server + state: present + + - name: Start ssh service (molecule containers) + become: true + ansible.builtin.systemd: + name: "{{ 'ssh' if ansible_os_family == 'Debian' else 'sshd' }}" + state: started + enabled: true + + - name: Delete "/run/nologin" file (if exists) + become: true + ansible.builtin.file: + path: /run/nologin + state: absent + +- name: Deploy PostgreSQL Cluster + ansible.builtin.import_playbook: ../../deploy_pgcluster.yml + +- name: PostgreSQL upgrade test + ansible.builtin.import_playbook: ../../pg_upgrade.yml diff --git a/molecule/pg_upgrade/molecule.yml b/molecule/pg_upgrade/molecule.yml new file mode 100644 index 000000000..22b008efa --- /dev/null +++ b/molecule/pg_upgrade/molecule.yml @@ -0,0 +1,94 @@ +--- +platforms: + - name: 10.172.2.20 + hostname: pgnode01 + image: "${IMAGE_NAMESPACE:-geerlingguy}/docker-${IMAGE_DISTRO:-ubuntu2204}-ansible:${IMAGE_TAG:-latest}" + networks: + - name: upgrade_test_docker_network + ipv4_address: 10.172.2.20 + exposed_ports: + - 22/tcp + - 2379/tcp # if dcs_type: "etcd" + - 2380/tcp # if dcs_type: "etcd" + - 8300/tcp # if dcs_type: "consul" + - 8301/tcp # if dcs_type: "consul" + - 8302/tcp # if dcs_type: "consul" + - 8500/tcp # if dcs_type: "consul" + - 8600/tcp # if dcs_type: "consul" + - 8008/tcp + - 5432/tcp + - 6432/tcp + command: "" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + cgroupns_mode: host + privileged: true + pre_build_image: true + groups: + - etcd_cluster # if dcs_type: "etcd" + - consul_instances # if dcs_type: "consul" + - master + - postgres_cluster + - balancers + + - name: 10.172.2.21 + hostname: pgnode02 + image: "${IMAGE_NAMESPACE:-geerlingguy}/docker-${IMAGE_DISTRO:-ubuntu2204}-ansible:${IMAGE_TAG:-latest}" + networks: + - name: upgrade_test_docker_network + ipv4_address: 10.172.2.21 + exposed_ports: + - 
22/tcp + - 2379/tcp + - 2380/tcp + - 8300/tcp + - 8301/tcp + - 8302/tcp + - 8500/tcp + - 8600/tcp + - 8008/tcp + - 5432/tcp + - 6432/tcp + command: "" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + cgroupns_mode: host + privileged: true + pre_build_image: true + groups: + - etcd_cluster + - consul_instances + - replica + - postgres_cluster + - balancers + + - name: 10.172.2.22 + hostname: pgnode03 + image: "${IMAGE_NAMESPACE:-geerlingguy}/docker-${IMAGE_DISTRO:-ubuntu2204}-ansible:${IMAGE_TAG:-latest}" + networks: + - name: upgrade_test_docker_network + ipv4_address: 10.172.2.22 + exposed_ports: + - 22/tcp + - 2379/tcp + - 2380/tcp + - 8300/tcp + - 8301/tcp + - 8302/tcp + - 8500/tcp + - 8600/tcp + - 8008/tcp + - 5432/tcp + - 6432/tcp + command: "" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + cgroupns_mode: host + privileged: true + pre_build_image: true + groups: + - etcd_cluster + - consul_instances + - replica + - postgres_cluster + - balancers diff --git a/molecule/pg_upgrade/prepare.yml b/molecule/pg_upgrade/prepare.yml new file mode 100644 index 000000000..01f658ddf --- /dev/null +++ b/molecule/pg_upgrade/prepare.yml @@ -0,0 +1,28 @@ +--- +- name: "Update docker network(s)" + hosts: localhost + gather_facts: false + become: false + tasks: + - name: "Create docker network: upgrade_test_docker_network" + community.docker.docker_network: + name: upgrade_test_docker_network + driver: bridge + driver_options: + com.docker.network.driver.mtu: 1440 + enable_ipv6: false + internal: false + ipam_config: + - subnet: 10.172.2.0/24 + gateway: 10.172.2.1 + force: true + state: present + labels: + owner: molecule + + - name: "Install netaddr dependency on controlling host" + ansible.builtin.pip: + name: netaddr + become: false + +... diff --git a/molecule/tests/etcd/etcd.yml b/molecule/tests/etcd/etcd.yml index e95693cd7..7a83fbd2a 100644 --- a/molecule/tests/etcd/etcd.yml +++ b/molecule/tests/etcd/etcd.yml @@ -1,6 +1,6 @@ --- - name: Check etcd health - uri: + ansible.builtin.uri: url: "http://{{ inventory_hostname }}:2379/health" return_content: true register: etcd_health_status diff --git a/molecule/tests/patroni/patroni.yml b/molecule/tests/patroni/patroni.yml index 833c7bff6..0af690219 100644 --- a/molecule/tests/patroni/patroni.yml +++ b/molecule/tests/patroni/patroni.yml @@ -1,6 +1,6 @@ --- - name: Check Patroni status - uri: + ansible.builtin.uri: url: "http://{{ inventory_hostname }}:8008/patroni" return_content: true register: patroni_status diff --git a/molecule/tests/postgres/postgres.yml b/molecule/tests/postgres/postgres.yml index b3020cf3b..7f46a410e 100644 --- a/molecule/tests/postgres/postgres.yml +++ b/molecule/tests/postgres/postgres.yml @@ -5,7 +5,7 @@ failed_when: result.rc != 0 - name: Check if PostgreSQL is listening on the default port - wait_for: + ansible.builtin.wait_for: port: 5432 timeout: 5 register: is_listening @@ -13,6 +13,8 @@ - name: Try to connect to PostgreSQL postgresql_ping: + login_host: "127.0.0.1" + login_port: "{{ postgresql_port }}" login_user: "{{ patroni_superuser_username }}" login_password: "{{ patroni_superuser_password }}" - db: template1 + login_db: template1 diff --git a/molecule/tests/postgres/replication.yml b/molecule/tests/postgres/replication.yml index ffe85b284..b4c369669 100644 --- a/molecule/tests/postgres/replication.yml +++ b/molecule/tests/postgres/replication.yml @@ -1,10 +1,12 @@ --- - name: Check PostgreSQL replication status postgresql_query: - db: postgres + query: "SELECT * FROM pg_stat_wal_receiver;" + login_host: 
"127.0.0.1" + login_port: "{{ postgresql_port }}" login_user: "{{ patroni_superuser_username }}" login_password: "{{ patroni_superuser_password }}" - query: "SELECT * FROM pg_stat_wal_receiver;" + login_db: template1 register: pg_replication_status failed_when: "pg_replication_status.rowcount == 0" when: "'replica' in group_names" diff --git a/pg_upgrade.yml b/pg_upgrade.yml new file mode 100644 index 000000000..1de434d0c --- /dev/null +++ b/pg_upgrade.yml @@ -0,0 +1,284 @@ +--- +# TODO: +# - Citus support +- name: "Upgrade PostgreSQL {{ pg_old_version }} to version {{ pg_new_version }} (PostgreSQL HA Cluster: {{ patroni_cluster_name }})" + hosts: postgres_cluster + gather_facts: true + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + - vars/upgrade.yml + pre_tasks: + - name: '[Prepare] Get Patroni Cluster Leader Node' + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/leader + status_code: 200 + register: patroni_leader_result + changed_when: false + failed_when: false + + # Stop, if Patroni is unavailable + - name: The Patroni cluster is unhealthy + ansible.builtin.fail: + msg: "Patroni is unavailable on {{ ansible_hostname }}. Please check the cluster status." + changed_when: false + when: patroni_leader_result is undefined or patroni_leader_result.status == -1 + + - name: '[Prepare] Add host to group "primary" (in-memory inventory)' + ansible.builtin.add_host: + name: "{{ item }}" + groups: primary + when: hostvars[item]['patroni_leader_result']['status'] == 200 + loop: "{{ groups['postgres_cluster'] }}" + changed_when: false + + - name: '[Prepare] Add hosts to group "secondary" (in-memory inventory)' + ansible.builtin.add_host: + name: "{{ item }}" + groups: secondary + when: hostvars[item]['patroni_leader_result']['status'] != 200 + loop: "{{ groups['postgres_cluster'] }}" + changed_when: false + + - name: "Print Patroni Cluster info" + ansible.builtin.debug: + msg: + - "Cluster Name: {{ patroni_cluster_name }}" + - "Cluster Leader: {{ ansible_hostname }}" + when: inventory_hostname in groups['primary'] + tags: + - always + +- name: "(1/7) PRE-UPGRADE: Perform Pre-Checks" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + environment: "{{ proxy_env | default({}) }}" + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Running Pre-Checks + ansible.builtin.include_role: + name: upgrade + tasks_from: pre_checks + tags: + - upgrade + - pre-checks + +- name: "(2/7) PRE-UPGRADE: Install new PostgreSQL packages" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: root + any_errors_fatal: true + environment: "{{ proxy_env | default({}) }}" + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Install packages + ansible.builtin.include_role: + name: upgrade + tasks_from: packages + tags: + - upgrade + - upgrade-check + - packages + +- name: "(3/7) PRE-UPGRADE: Initialize new db, schema compatibility check, and pg_upgrade --check" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: 
postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Create Data directory and initdb + ansible.builtin.include_role: + name: upgrade + tasks_from: initdb + + # (optional) copy files specified in variable: + # 'copy_files_to_all_server' + - name: Copy files + ansible.builtin.include_role: + name: copy + + - name: Check Schema Compatibility + ansible.builtin.import_role: + name: upgrade + tasks_from: schema_compatibility + when: schema_compatibility_check | bool + + - name: Check pg_upgrade + ansible.builtin.import_role: + name: upgrade + tasks_from: upgrade_check + tags: + - upgrade + - upgrade-check + - schema-compatibility-check + +- name: "(4/7) PRE-UPGRADE: Prepare the Patroni configuration" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Patroni config + ansible.builtin.include_role: + name: upgrade + tasks_from: update_config + tags: + - upgrade + - update-config + +- name: "(5/7) UPGRADE: Upgrade PostgreSQL" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Enable maintenance mode + ansible.builtin.include_role: + name: upgrade + tasks_from: maintenance_enable + + - name: Stop Services + ansible.builtin.include_role: + name: upgrade + tasks_from: stop_services + + - name: Check 'Latest checkpoint location' + ansible.builtin.include_role: + name: upgrade + tasks_from: checkpoint_location + + - name: Upgrade Primary + ansible.builtin.include_role: + name: upgrade + tasks_from: upgrade_primary + + - name: Upgrade Secondary + ansible.builtin.include_role: + name: upgrade + tasks_from: upgrade_secondary + + # if pg_new_wal_dir is defined + - name: Create WAL dir symlink + ansible.builtin.include_role: + name: upgrade + tasks_from: custom_wal_dir + when: pg_new_wal_dir | length > 0 + + - name: Remove old cluster from DCS + ansible.builtin.include_role: + name: upgrade + tasks_from: dcs_remove_cluster + + - name: Start Services + ansible.builtin.include_role: + name: upgrade + tasks_from: start_services + + - name: Disable maintenance mode + ansible.builtin.include_role: + name: upgrade + tasks_from: maintenance_disable + tags: + - upgrade + +- name: "(6/7) POST-UPGRADE: Perform Post-Checks and Update extensions" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Running Post-Checks + ansible.builtin.include_role: + name: upgrade + tasks_from: post_checks + + - name: Update extensions + 
ansible.builtin.include_role: + name: upgrade + tasks_from: extensions + when: update_extensions | bool + tags: + - upgrade + - post-checks + - update-extensions + +- name: "(7/7) POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tasks: + - name: Analyze database + ansible.builtin.include_role: + name: upgrade + tasks_from: statistics + + - name: Running Post-Upgrade tasks + ansible.builtin.include_role: + name: upgrade + tasks_from: post_upgrade + tags: + - upgrade + - post-upgrade + - analyze + - statistics + +... diff --git a/pg_upgrade_rollback.yml b/pg_upgrade_rollback.yml new file mode 100644 index 000000000..bd18c3cec --- /dev/null +++ b/pg_upgrade_rollback.yml @@ -0,0 +1,55 @@ +--- +# This playbook performs a rollback of a PostgreSQL upgrade. +# It's designed to be used when a PostgreSQL upgrade hasn't been fully completed and the new version hasn't been started. +# The rollback operation is performed by starting the Patroni cluster with the old version of PostgreSQL using the same PGDATA. +# The playbook first checks the health of the current cluster, verifies the version of PostgreSQL, and ensures the new PostgreSQL is not running. +# If these checks pass, the playbook switches back to the old PostgreSQL paths and restarts the Patroni service. + +- name: "Rollback PostgreSQL {{ pg_new_version }} to version {{ pg_old_version }} (PostgreSQL HA Cluster: {{ patroni_cluster_name }})" + hosts: postgres_cluster + gather_facts: true + any_errors_fatal: true + vars_files: + - vars/main.yml + - vars/upgrade.yml + tasks: + - name: '[Prepare] Add host to group "primary" (in-memory inventory)' + ansible.builtin.add_host: + name: "{{ item }}" + groups: primary + # As Primary we specify the host in the 'master' group in the inventory file. + loop: "{{ groups['master'] }}" + changed_when: false + + - name: '[Prepare] Add hosts to group "secondary" (in-memory inventory)' + ansible.builtin.add_host: + name: "{{ item }}" + groups: secondary + # As Secondary we specify the hosts in the 'replica' group in the inventory file. 
+ loop: "{{ groups['replica'] }}" + changed_when: false + tags: + - always + +- name: "Perform Rollback" + hosts: 'primary:secondary' + gather_facts: false + become: true + become_user: postgres + any_errors_fatal: true + vars_files: + - vars/main.yml + pre_tasks: + - name: Include OS-specific variables + ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" + tags: always + - name: Include upgrade variables + ansible.builtin.include_vars: "vars/upgrade.yml" + tags: always + tasks: + - name: Running rollback.yml + ansible.builtin.include_role: + name: upgrade + tasks_from: rollback + tags: + - rollback diff --git a/roles/packages/tasks/main.yml b/roles/packages/tasks/main.yml index da61afe86..3cf0b8a9b 100644 --- a/roles/packages/tasks/main.yml +++ b/roles/packages/tasks/main.yml @@ -150,8 +150,15 @@ ansible.builtin.package: name: "{{ item }}" state: present - loop: - - timescaledb-2-postgresql-{{ postgresql_version }} + loop: "{{ timescaledb_package }}" + vars: + timescaledb_package: + - > + {% if postgresql_version | int >= 11 %} + timescaledb-2-postgresql-{{ postgresql_version }} + {% else %} + timescaledb-postgresql-{{ postgresql_version }} + {% endif %} register: package_status until: package_status is success delay: 5 diff --git a/roles/upgrade/README.md b/roles/upgrade/README.md new file mode 100644 index 000000000..637be66ea --- /dev/null +++ b/roles/upgrade/README.md @@ -0,0 +1,339 @@ +## PostgreSQL in-place major upgrade + +This role is designed for in-place major upgrades of PostgreSQL, e.g., from version 14 to 15. + +#### Compatibility + +The upgrade is supported starting from PostgreSQL 9.3 and up to the latest PostgreSQL version. + +#### Requirements + +There is no need to plan for additional disk space, because the upgrade uses hard links (`pg_upgrade --link`) instead of copying files. However, it is required that the `pg_old_datadir` and `pg_new_datadir` are located within the same top-level directory (`pg_upper_datadir` variable). + +Specify the current (old) version of PostgreSQL in the `pg_old_version` variable and the target version of PostgreSQL for the upgrade in the `pg_new_version` variable. + +#### Recommendations + +1. Before upgrading to a new major version, it's recommended to update PostgreSQL and its extensions. Additionally, consider updating Patroni and the entire system. + + To achieve this, use the `update_pgcluster.yml` playbook. More details can be found [here](../update/README.md). + +2. Before moving forward, execute preliminary checks to ensure that your database schema is compatible with the upcoming PostgreSQL version and that the cluster is ready for the upgrade. + + To do this, run the `pg_upgrade.yml` playbook using the tags '`pre-checks,upgrade-check`'. + + If any errors arise, such as schema object incompatibilities, resolve these issues and repeat the checks. + + Once the playbook completes the pre-checks without any errors, you should see the following messages in the Ansible log: + - "`The database schema is compatible with PostgreSQL `" + - "`Clusters are compatible`" + + Upon seeing these messages, proceed to run the playbook without any tags to initiate the upgrade. + +### Upgrade + +```bash +ansible-playbook pg_upgrade.yml -e "pg_old_version=14 pg_new_version=15" +``` + +#### Database Downtime Considerations + +To minimize or even eliminate errors during database upgrades (depending on the workload and timeouts), we pause the PgBouncer pools. From an application's perspective, this does not result in terminated database connections.
Instead, applications might experience a temporary increase in query latency while the PgBouncer pools are paused. + +On average, the PgBouncer pause duration is approximately 30 seconds. However, for larger databases, this pause might be extended due to longer `pg_upgrade` and `rsync` procedures. The default maximum wait time for a request during a pause is set to 2 minutes (controlled by the `query_wait_timeout` pgbouncer parameter). If the pause exceeds this duration, connections will be terminated with a timeout error. + +### Rollback + +This playbook performs a rollback of a PostgreSQL upgrade. + +```bash +ansible-playbook pg_upgrade_rollback.yml +``` + +It's designed to be used when a PostgreSQL upgrade hasn't been fully completed and the new version hasn't been started. +The rollback operation is performed by starting the Patroni cluster with the old version of PostgreSQL using the same PGDATA. +The playbook first checks the health of the current cluster, verifies the version of PostgreSQL, and ensures the new PostgreSQL is not running. +If these checks pass, the playbook switches back to the old PostgreSQL paths and restarts the Patroni service. + +### Variables + +| Variable Name | Description | Default Value | +|---------------|-------------|--------------:| +| `pg_old_version` | Current (old) version of PostgreSQL. | `""` | +| `pg_new_version` | Target version of PostgreSQL for the upgrade. | `""` | +| `pg_old_bindir` | Directory containing binaries for the old PostgreSQL version. | Derived value | +| `pg_old_datadir` | Data directory path for the old PostgreSQL version. | Derived value | +| `pg_old_confdir` | Configuration directory path for the old PostgreSQL version. | Derived value | +| `pg_new_bindir` | Directory containing binaries for the new PostgreSQL version. | Derived value | +| `pg_new_datadir` | Data directory path for the new PostgreSQL version. | Derived value | +| `pg_new_confdir` | Configuration directory path for the new PostgreSQL version. | Derived value | +| `pg_new_wal_dir` | Custom WAL directory for the new PostgreSQL version. | Derived value | +| `pg_upper_datadir` | Top-level directory containing both old and new PostgreSQL data directories. | Derived value | +| `pg_new_packages` | List of package names for the new PostgreSQL version to be installed. | Derived value | +| `pg_old_packages_remove` | Whether to remove old PostgreSQL packages after the upgrade. | `true` | +| `pg_start_stop_timeout` | Timeout when starting/stopping PostgreSQL during the upgrade (in seconds). | `1800` | +| `schema_compatibility_check` | Check database schema compatibility with the new PostgreSQL version before upgrading. | `true` | +| `schema_compatibility_check_port` | Port for temporary PostgreSQL instance for schema compatibility checking. | Derived value | +| `schema_compatibility_check_timeout` | Max duration for compatibility check (pg_dumpall --schema-only) in seconds. | `3600` | +| `vacuumdb_parallel_jobs` | Execute the analyze command in parallel by running `njobs` commands simultaneously. This option may reduce the processing time but it also increases the load on the database server. | all CPU cores | +| `vacuumdb_analyze_timeout` | Max duration of analyze command in seconds. | `3600` | +| `update_extensions` | Automatically update all PostgreSQL extensions. | `true` | +| `max_replication_lag_bytes` | Maximum allowed replication lag in bytes. | `10485760` | +| `max_transaction_sec` | Maximum allowed duration for a transaction in seconds. 
| `15` | +| `copy_files_to_all_server` | Copy files located in the "files" directory to all servers. (optional) | `[]` | +| `pgbouncer_pool_pause` | Pause pgbouncer pools during upgrade. | `true` | +| `pgbouncer_pool_pause_terminate_after` | Time in seconds after which script terminates slow active queries. | `30` | +| `pgbouncer_pool_pause_stop_after` | Time in seconds after which the script exits with an error if unable to pause all pgbouncer pools. | `60` | +| `pg_slow_active_query_treshold` | Time in milliseconds to wait for active queries before trying to pause the pool. | `1000` | +| `pg_slow_active_query_treshold_to_terminate` | Time in milliseconds after reaching "pgbouncer_pool_pause_terminate_after" before the script terminates active queries. | `100` | + +Note: For variables marked as "Derived value", the default value is determined based on other variables. \ +Please see the variable file vars/[upgrade.yml](../../vars/upgrade.yml) + +--- + +### Upgrade Plan: + +#### 1. PRE-UPGRADE: Perform Pre-Checks +- **Make sure that the required variables are specified** + - Notes: `pg_old_version` and `pg_new_version` variables + - Stop, if one or more required variables have empty values. +- **Make sure that the old and new data and config directories do not match** + - Stop, if `pg_old_datadir` and `pg_new_datadir`, or `pg_old_confdir` and `pg_new_confdir` match. +- **Make sure the required Python library for Ansible is installed** + - Notes: Install the 'pexpect' package if missing +- **Test PostgreSQL database access using a unix socket** + - if there is an error (no pg_hba.conf entry): + - Add temporary local access rule (during the upgrade) + - Update the PostgreSQL configuration +- **Check the current version of PostgreSQL** + - Stop, if the current version does not match `pg_old_version` + - Stop, if the current version is greater than or equal to `pg_new_version`. No upgrade is needed. +- **Ensure new data directory is different from the current one** + - Note: This check is necessary to avoid the risk of deleting the current data directory + - Stop, if the current data directory is the same as `pg_new_datadir`. + - Stop, if the current WAL directory is the same as `pg_new_wal_dir` (if a custom wal dir is used). +- **Make sure that physical replication is active** + - Stop, if there are no active replicas +- **Make sure there is no high replication lag** + - Stop, if replication lag is high (more than `max_replication_lag_bytes`) +- **Make sure there are no long-running transactions** + - Stop, if long-running transactions are detected (more than `max_transaction_sec`) +- **Make sure that SSH key-based authentication is configured between cluster nodes** + - Create and copy ssh keys between database servers (if not configured) +- **Perform Rsync Checks** + - Make sure that the rsync package is installed + - Create 'testrsync' file on Primary + - Test rsync and ssh key access + - Cleanup 'testrsync' file +- **Check if PostgreSQL tablespaces exist** + - Print tablespace location (if exists) + - Note: If tablespaces are present they will be upgraded (step 5) on replicas using rsync +- **Test PgBouncer access via localhost** + - Test access via 'localhost' to be able to perform the 'PAUSE' command +- **Make sure that the cluster ip address (VIP) is running** + - Notes: if 'cluster_vip' is defined + +#### 2.
PRE-UPGRADE: Install new PostgreSQL packages +- Clean yum/dnf cache (for RedHat based) or update apt cache (for Debian based) +- Install new PostgreSQL packages +- Install TimescaleDB package for new PostgreSQL + - Note: if 'enable_timescale' is 'true' + +#### 3. PRE-UPGRADE: Initialize new db, schema compatibility check, and pg_upgrade --check +- **Initialize new PostgreSQL** + - Make sure new PostgreSQL data directory exists + - Make sure new PostgreSQL data directory is not initialized + - If already initialized: + - Perform pg_dropcluster (for Debian based) + - Clear the new PostgreSQL data directory + - Get the current install user (rolname with oid = 10) + - Get the current encoding and data_checksums settings + - Initialize new PostgreSQL data directory + - for Debian based: on all database servers to create default config files + - for RedHat based: on the Primary only +- **Copy files specified in the `copy_files_to_all_server` variable** (vars/upgrade.yml), [optional] + - Notes: for example, it may be necessary for Postgres Full-Text Search (FTS) files +- **Schema compatibility check** + - Get the current `shared_preload_libraries` settings + - Get the current `cron.database_name` settings + - Notes: if 'pg_cron' is defined in 'pg_shared_preload_libraries' + - Start new PostgreSQL to check the schema compatibility + - Note: on the port specified in the `schema_compatibility_check_port` variable + - Wait for PostgreSQL to start + - Check the compatibility of the database schema with the new PostgreSQL + - Notes: uses `pg_dumpall` with the `--schema-only` option + - Wait for the schema compatibility check to complete + - Check the result of the schema compatibility check + - Note: checks for errors in `/tmp/pg_schema_compatibility_check.log` + - Stop, if the schema is not compatible (there are errors) + - Print the result of the schema compatibility check + - Stop new PostgreSQL to re-initdb + - Drop new PostgreSQL to re-initdb (perform pg_dropcluster for Debian based) + - Reinitialize the database after checking schema compatibility +- **Perform pg_upgrade check** + - Get the current `shared_preload_libraries` settings + - Verify the two clusters are compatible (`pg_upgrade --check`) + - Print the result of the pg_upgrade check + +#### 4.
PRE-UPGRADE: Prepare the Patroni configuration +- Edit patroni.yml + - **Update parameters**: `data_dir`, `bin_dir`, `config_dir` + - **Check if the 'standby_cluster' parameter is specified** + - Remove parameters: `standby_cluster` (if exists) + - Notes: To support upgrades in the Patroni Standby Cluster + - **Prepare the PostgreSQL parameters** (removed or renamed parameters) + - Check if the '`replacement_sort_tuples`' parameter is specified (removed in PG 11) + - remove parameter: 'replacement_sort_tuples' (if exists) + - Check if the '`default_with_oids`' parameter is specified (removed in PG 12) + - remove parameter: 'default_with_oids' (if exists) + - Check if the '`wal_keep_segments`' parameter is specified (removed in PG 13) + - replace parameter: 'wal_keep_segments' with '`wal_keep_size`' + - Check if the '`operator_precedence_warning`' parameter is specified (removed in PG 14) + - remove parameter: 'operator_precedence_warning' (if exists) + - Check if the '`vacuum_cleanup_index_scale_factor`' parameter is specified (removed in PG 14) + - remove parameter: 'vacuum_cleanup_index_scale_factor' (if exists) + - Check if the '`stats_temp_directory`' parameter is specified (removed in PG 15) + - remove parameter: 'stats_temp_directory' (if exists) +- **Copy pg_hba.conf to `pg_new_confdir`** + - Notes: to save pg_hba rules + +#### 5. UPGRADE: Upgrade PostgreSQL +- **Enable maintenance mode for Patroni cluster** (pause) +- **Enable maintenance mode for HAProxy** (for 'Type A' scheme) + - Notes: if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' + - Stop confd service + - Update haproxy conf file + - Notes: Temporarily disable http-checks in order to keep database connections after stopping the Patroni service + - Reload haproxy service +- **Enable maintenance mode for vip-manager** (for 'Type B' scheme) + - Notes: if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' + - Update vip-manager service file (comment out 'ExecStopPost') + - Notes: Temporarily disable vip-manager service to keep database connections after stopping the Patroni service + - Stop vip-manager service + - Notes: This prevents the VIP from being removed when the Patroni leader is unavailable during maintenance + - Make sure that the cluster ip address (VIP) is running +- **Stop Patroni service** + - Wait until the Patroni cluster is stopped +- **Execute CHECKPOINT before stopping PostgreSQL** + - Wait for the CHECKPOINT to complete +- **Wait until replication lag is less than `max_replication_lag_bytes`** + - Notes: max wait time: 2 minutes + - Stop, if replication lag is high + - Perform rollback + - Print error message: "There's a replication lag in the PostgreSQL Cluster. Please try again later" +- **Perform PAUSE on all pgbouncer servers** + - Notes: if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' + - Notes: pgbouncer pause script (details in [pgbouncer_pause.yml](tasks/pgbouncer_pause.yml)) performs the following actions: + - Waits for active queries on the database servers to complete (with a runtime of more than `pg_slow_active_query_treshold`). + - If there are no active queries, sends a `PAUSE` command to each pgbouncer server in parallel (using `xargs` and ssh connections). + - If all pgbouncers are successfully paused, the script exits with code 0 (successful).
+ - If active queries do not complete within 30 seconds (`pgbouncer_pool_pause_terminate_after` variable), the script terminates slow active queries (longer than `pg_slow_active_query_treshold_to_terminate`). + - If after that it is still not possible to pause the pgbouncer servers within 60 seconds (`pgbouncer_pool_pause_stop_after` variable) from the start of the script, the script exits with an error. + - Perform rollback + - Print error message: "PgBouncer pools could not be paused, please try again later." +- **Stop PostgreSQL** on the Leader and Replicas + - Check if old PostgreSQL is stopped + - Check if new PostgreSQL is stopped +- **Get 'Latest checkpoint location'** on the Leader and Replicas + - Print 'Latest checkpoint location' for the Leader and Replicas +- **Check if all 'Latest checkpoint location' values match** + - if 'Latest checkpoint location' values match + - Print info message: + - "'Latest checkpoint location' is the same on the leader and its standbys" + - if 'Latest checkpoint location' values don't match + - Perform rollback + - Stop with error message: + - "'Latest checkpoint location' doesn't match on leader and its standbys. Please try again later" +- **Upgrade PostgreSQL on the Primary** (using pg_upgrade --link) + - Print the result of the pg_upgrade +- **Make sure that the new data directory is empty on the Replica** +- **Upgrade PostgreSQL on the Replica** (using rsync --hard-links) + - Wait for the rsync to complete +- **Upgrade the PostgreSQL tablespaces on the Replica** (using rsync --hard-links) + - Notes: if tablespaces exist + - Wait for the tablespaces rsync to complete +- **Synchronize WAL directory** (if `pg_new_wal_dir` is defined) [optional] + - Make sure new pg_wal directory is not a symlink + - Make sure the custom WAL directory exists and is empty + - Synchronize new pg_wal to 'pg_new_wal_dir' path + - Rename pg_wal to pg_wal_old + - Create symlink + - Remove 'pg_wal_old' directory +- **Remove existing cluster from DCS** +- **Start Patroni service on the Cluster Leader** + - Wait for Patroni port to become open on the host + - Check Patroni is healthy on the Leader +- **Perform RESUME PgBouncer pools on the Leader** + - Notes: if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- **Start Patroni service on the Cluster Replica** + - Wait for Patroni port to become open on the host + - Check Patroni is healthy on the Replica +- **Perform RESUME PgBouncer pools on the Replica** + - Notes: if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- **Check PostgreSQL is started and accepting connections** +- **Disable maintenance mode for HAProxy** (for 'Type A' scheme) + - Update haproxy conf file + - Notes: Enable http-checks + - Reload haproxy service + - Start confd service +- **Disable maintenance mode for vip-manager** (for 'Type B' scheme) + - Update vip-manager service file (uncomment 'ExecStopPost') + - Start vip-manager service + - Make sure that the cluster ip address (VIP) is running + +#### 6. POST-UPGRADE: Perform Post-Checks and Update extensions +- **Make sure that physical replication is active** + - if no active replication connections found, print error message: + - "No active replication connections found. Please check the replication status and PostgreSQL logs."
+- **Create a table "test_replication" with 10000 rows on the Primary** +- **Wait until the PostgreSQL replica is synchronized** + - Notes: max wait time: 2 minutes +- **Drop a table "test_replication"** +- **Print the result of checking the number of records** + - if the number of rows matches, print info message: + - "The PostgreSQL Replication is OK. The number of records in the 'test_replication' table is the same as on the Primary." + - if the number of rows does not match, print error message: + - "The number of records in the 'test_replication' table does not match the Primary. Please check the replication status and PostgreSQL logs." +- **Get a list of databases** +- **Update extensions in each database** + - Get list of installed PostgreSQL extensions + - Get list of old PostgreSQL extensions + - Update old PostgreSQL extensions + - Notes: excluding 'pg_repack' and 'pg_stat_kcache' (if exists), as they require re-creation to update + - Recreate old pg_stat_statements and pg_stat_kcache extensions to update + - Notes: if pg_stat_kcache is installed + - Recreate old pg_repack extension to update + - Notes: if pg_repack is installed + - Notes: if there are no old extensions, print message: + - "The extension versions are up-to-date for the database. No update is required." + +#### 7. POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks +- **Run vacuumdb to analyze the PostgreSQL databases** + - Notes: Uses parallel processes equal to CPU cores ('`vacuumdb_parallel_jobs`' variable) + - Notes: Before collecting statistics, the 'pg_terminator' script is launched to monitor and terminate any 'ANALYZE' blockers. Once statistics collection is complete, the script is stopped. + - Wait for the analyze to complete. + - Notes: max wait time: 1 hour ('`vacuumdb_analyze_timeout`' variable) +- **Ensure the current data directory is the new data directory** + - Notes: to prevent deletion of the old directory if it is in use +- **Delete the old PostgreSQL data directory** + - Notes: perform pg_dropcluster for Debian based +- **Delete the old PostgreSQL WAL directory** + - Notes: if 'pg_new_wal_dir' is defined +- **Remove old PostgreSQL packages** + - Notes: if 'pg_old_packages_remove' is 'true' +- **pgBackRest** (if 'pgbackrest_install' is 'true') + - Check pg-path option + - Update pg-path in pgbackrest.conf + - Upgrade stanza +- **WAL-G** (if 'wal_g_install' is 'true') + - Update PostgreSQL data directory path in .walg.json + - Update PostgreSQL data directory path in cron jobs +- **Check the Patroni cluster state** +- **Check the current PostgreSQL version** +- **Remove temporary local access rule from pg_hba.conf** + - Notes: if it has been changed + - Update the PostgreSQL configuration +- **Print info messages** + - List the Patroni cluster members + - Upgrade completed diff --git a/roles/upgrade/tasks/checkpoint_location.yml b/roles/upgrade/tasks/checkpoint_location.yml new file mode 100644 index 000000000..c79f048b7 --- /dev/null +++ b/roles/upgrade/tasks/checkpoint_location.yml @@ -0,0 +1,82 @@ +--- +# This playbook performs several tasks related to PostgreSQL's "Latest checkpoint location": +# 1. Retrieves the value from the cluster leader and its replicas. +# 2. Debugs this value for both leader and replicas. +# 3. Determines if the values match across the leader and replicas, setting 'pg_checkpoint_location_match' accordingly. +# 4. If the values match across all nodes, a success message is displayed and the update procedure continues. +# 5.
If there's a mismatch, the previously stopped cluster starts (rollback), and the playbook stops with an error message. + +- name: Get 'Latest checkpoint location' on the Leader + ansible.builtin.shell: | + set -o pipefail; + {{ pg_old_bindir }}/pg_controldata {{ pg_old_datadir }} | grep 'Latest checkpoint location' | awk '{print $4}' + args: + executable: /bin/bash + changed_when: false + register: pg_checkpoint_location_leader + when: + - inventory_hostname in groups['primary'] + +- name: Get 'Latest checkpoint location' on the Replicas + ansible.builtin.shell: | + set -o pipefail; + {{ pg_old_bindir }}/pg_controldata {{ pg_old_datadir }} | grep 'Latest checkpoint location' | awk '{print $4}' + args: + executable: /bin/bash + changed_when: false + register: pg_checkpoint_location_replica + when: + - inventory_hostname in groups['secondary'] + +- name: Print 'Latest checkpoint location' for the Leader + ansible.builtin.debug: + msg: "Leader's latest checkpoint location: {{ pg_checkpoint_location_leader.stdout }}" + when: + - inventory_hostname in groups['primary'] + +- name: Print 'Latest checkpoint location' for the Replica + ansible.builtin.debug: + msg: "Replica: {{ inventory_hostname }}, latest checkpoint location: {{ pg_checkpoint_location_replica.stdout }}" + when: + - inventory_hostname in groups['secondary'] + +- name: Check if all 'Latest checkpoint location' values match + ansible.builtin.set_fact: + pg_checkpoint_location_match: "{{ pg_checkpoint_location_replica.stdout == hostvars[groups['primary'][0]]['pg_checkpoint_location_leader']['stdout'] }}" + when: + - inventory_hostname in groups['secondary'] + +- name: "SUCCESS: 'Latest checkpoint location' values match on all cluster nodes" + ansible.builtin.debug: + msg: "'Latest checkpoint location' is the same on the leader and its standbys" + run_once: true + when: + # This condition retrieves the 'pg_checkpoint_location_match' value for each node in the 'secondary' group. + # The 'select' filter selects all nodes whose 'pg_checkpoint_location_match' is 'False'. + # If no such nodes exist (i.e., the length of the resulting list is less than 1), it means that the 'pg_checkpoint_location_match' is 'True' for all nodes. + - groups['secondary'] | map('extract', hostvars, 'pg_checkpoint_location_match') | select('equalto', False) | list | length < 1 + +# Stop, if 'Latest checkpoint location' doesn't match +- block: + - name: "'Latest checkpoint location' doesn't match" + ansible.builtin.debug: + msg: "'Latest checkpoint location' doesn't match on leader and its standbys" + run_once: true + + # rollback + - name: Perform rollback + ansible.builtin.include_tasks: rollback.yml + + - name: "ERROR: 'Latest checkpoint location' doesn't match" + ansible.builtin.fail: + msg: "'Latest checkpoint location' doesn't match on leader and its standbys. Please try again later" + run_once: true + when: + # This condition retrieves the 'pg_checkpoint_location_match' value for each node in the 'secondary' group. + # The 'select' filter selects all nodes whose 'pg_checkpoint_location_match' is 'False'. + # If there is at least one such node (i.e., the length of the resulting list is greater than 0), + # it means that the 'pg_checkpoint_location_match' is not 'True' for all nodes, + # and the block of tasks is executed, including cleanup and throwing an error. + - groups['secondary'] | map('extract', hostvars, 'pg_checkpoint_location_match') | select('equalto', False) | list | length > 0 + +... 
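For reference, the comparison these tasks automate can be reproduced by hand with `pg_controldata`, the same tool the tasks above call. A minimal sketch, assuming illustrative paths (substitute your actual `pg_old_bindir` and `pg_old_datadir`):

```bash
# Run on the leader and on every replica after PostgreSQL has been cleanly stopped.
/usr/lib/postgresql/14/bin/pg_controldata /var/lib/postgresql/14/main \
  | grep 'Latest checkpoint location'
# Example output: Latest checkpoint location:   0/3000060
# The upgrade may only proceed if the value is identical on all nodes;
# a mismatch means a standby has not yet replayed all WAL.
```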
diff --git a/roles/upgrade/tasks/custom_wal_dir.yml b/roles/upgrade/tasks/custom_wal_dir.yml new file mode 100644 index 000000000..1cf2819bd --- /dev/null +++ b/roles/upgrade/tasks/custom_wal_dir.yml @@ -0,0 +1,48 @@ +--- + +- name: "Make sure {{ pg_new_datadir }}/pg_wal is not symlink" + ansible.builtin.stat: + path: "{{ pg_new_datadir }}/pg_wal" + register: sym + +# Synchronize WALs (if wal dir is not a symlink) +- block: + - name: Make sure the custom WAL directory "{{ pg_new_wal_dir }}" exists and is empty + become: true + become_user: root + ansible.builtin.file: + path: "{{ pg_new_wal_dir }}" + state: "{{ item }}" + owner: postgres + group: postgres + mode: "0700" + loop: + - absent + - directory + + - name: "Synchronize {{ pg_new_datadir }}/pg_wal to {{ pg_new_wal_dir }}" + become: true + become_user: postgres + ansible.posix.synchronize: + src: "{{ pg_new_datadir }}/pg_wal/" + dest: "{{ pg_new_wal_dir }}/" + delegate_to: "{{ inventory_hostname }}" + + - name: "Rename pg_wal to pg_wal_old" + ansible.builtin.command: mv {{ pg_new_datadir }}/pg_wal {{ pg_new_datadir }}/pg_wal_old + + - name: "Create symlink {{ pg_new_datadir }}/pg_wal -> {{ pg_new_wal_dir }}" + become: true + become_user: postgres + ansible.builtin.file: + src: "{{ pg_new_wal_dir }}" + dest: "{{ pg_new_datadir }}/pg_wal" + state: link + + - name: "Remove pg_wal_old directory" + ansible.builtin.file: + path: "{{ pg_new_datadir }}/pg_wal_old" + state: absent + when: sym.stat.exists and not sym.stat.islnk | bool + +... diff --git a/roles/upgrade/tasks/dcs_remove_cluster.yml b/roles/upgrade/tasks/dcs_remove_cluster.yml new file mode 100644 index 000000000..286ff5671 --- /dev/null +++ b/roles/upgrade/tasks/dcs_remove_cluster.yml @@ -0,0 +1,16 @@ +--- + +- name: Remove existing cluster "{{ patroni_cluster_name }}" from DCS + ansible.builtin.expect: + command: "patronictl -c {{ patroni_config_file }} remove {{ patroni_cluster_name }}" + responses: + (.*)Please confirm the cluster name to remove: "{{ patroni_cluster_name }}" + (.*)"Yes I am aware": "Yes I am aware" + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + vars: + ansible_python_interpreter: /usr/bin/python3 + when: + - inventory_hostname in groups['primary'] + +... diff --git a/roles/upgrade/tasks/extensions.yml b/roles/upgrade/tasks/extensions.yml new file mode 100644 index 000000000..1751fbe5f --- /dev/null +++ b/roles/upgrade/tasks/extensions.yml @@ -0,0 +1,19 @@ +--- + +- name: Get a list of databases + ansible.builtin.command: psql -tAXc "select datname from pg_catalog.pg_database where datname <> 'template0'" + register: databases_list + changed_when: false + when: + - inventory_hostname in groups['primary'] + +- name: Update extensions in each database + ansible.builtin.include_tasks: update_extensions.yml + loop: "{{ databases_list.stdout_lines }}" + loop_control: + loop_var: pg_target_dbname + when: + - databases_list.stdout_lines is defined + - databases_list.stdout_lines | length > 0 + +...
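The per-database work delegated to `update_extensions.yml` above amounts to running `ALTER EXTENSION ... UPDATE` for each extension whose installed version is older than the version shipped with the new PostgreSQL packages. A hedged sketch of the equivalent manual procedure (the database and extension names are illustrative):

```bash
# List extensions in a database whose installed version lags the available default.
psql -d mydb -tAXc "SELECT e.extname, e.extversion, a.default_version
  FROM pg_extension e
  JOIN pg_available_extensions a ON a.name = e.extname
  WHERE a.default_version <> e.extversion"

# Update an outdated extension to its default version.
psql -d mydb -c "ALTER EXTENSION pg_trgm UPDATE;"
```

Note that, per the role README, 'pg_repack' and 'pg_stat_kcache' are handled separately: they are re-created rather than updated in place.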
diff --git a/roles/upgrade/tasks/initdb.yml b/roles/upgrade/tasks/initdb.yml new file mode 100644 index 000000000..3db2fd4c2 --- /dev/null +++ b/roles/upgrade/tasks/initdb.yml @@ -0,0 +1,106 @@ +--- + +- name: Make sure new PostgreSQL data directory "{{ pg_new_datadir }}" exists + become: true + become_user: root + ansible.builtin.file: + path: "{{ pg_new_datadir }}" + state: directory + mode: "0700" + group: postgres + owner: postgres + +- name: Make sure new PostgreSQL data directory "{{ pg_new_datadir }}" is not initialized + ansible.builtin.stat: + path: "{{ pg_new_datadir }}/PG_VERSION" + register: pgdata_initialized + +- block: # if already initialized + - name: Perform pg_dropcluster + ansible.builtin.command: > + /usr/bin/pg_dropcluster --stop {{ pg_new_version }} {{ postgresql_cluster_name }} + failed_when: false + when: + - ansible_os_family == "Debian" + - pg_new_confdir != pg_new_datadir + + - name: Clear the new PostgreSQL data directory "{{ pg_new_datadir }}" + ansible.builtin.file: + path: "{{ pg_new_datadir }}" + state: "{{ item }}" + mode: "0700" + group: postgres + owner: postgres + loop: + - absent + - directory + when: + - pgdata_initialized.stat.exists is defined + - pgdata_initialized.stat.exists + +- name: Get the current install user + ansible.builtin.command: psql -tAXc "select rolname from pg_roles where oid = 10" + changed_when: false + register: pg_install_user + when: + - inventory_hostname in groups['primary'] + +- name: Get the current encoding and data_checksums settings + ansible.builtin.command: psql -tAXc "show {{ item }}" + changed_when: false + register: pg_settings + loop: + - server_encoding + - lc_collate + - lc_ctype + - lc_messages + - lc_monetary + - lc_numeric + - lc_time + - data_checksums + when: + - inventory_hostname in groups['primary'] + +# for Debian based use pg_createcluster, if the PostgreSQL configuration is not located in the data directory. +# Note: Patroni failure is possible if the default postgresql config files are missing in the /etc/postgresql/... +- name: Initialize new PostgreSQL data directory with default config files + ansible.builtin.command: > + /usr/bin/pg_createcluster {{ pg_new_version }} {{ postgresql_cluster_name }} + --user={{ hostvars[groups['primary'][0]].pg_install_user.stdout }} + --datadir={{ pg_new_datadir }} + --encoding={{ hostvars[groups['primary'][0]].pg_settings.results[0].stdout }} + --lc-collate={{ hostvars[groups['primary'][0]].pg_settings.results[1].stdout }} + --lc-ctype={{ hostvars[groups['primary'][0]].pg_settings.results[2].stdout }} + --lc-messages={{ hostvars[groups['primary'][0]].pg_settings.results[3].stdout }} + --lc-monetary={{ hostvars[groups['primary'][0]].pg_settings.results[4].stdout }} + --lc-numeric={{ hostvars[groups['primary'][0]].pg_settings.results[5].stdout }} + --lc-time={{ hostvars[groups['primary'][0]].pg_settings.results[6].stdout }} + --start-conf=manual + {% if hostvars[groups['primary'][0]].pg_settings.results[7].stdout == 'on' %} + -- --data-checksums + {% endif %} + when: + - ansible_os_family == "Debian" + - pg_new_confdir != pg_new_datadir + +# Use initdb, if the PostgreSQL configuration is located in the data directory.
+- name: Initialize new PostgreSQL data directory on the Primary + ansible.builtin.command: > + {{ pg_new_bindir }}/initdb + --username={{ pg_install_user.stdout }} + --pgdata={{ pg_new_datadir }} + --encoding={{ pg_settings.results[0].stdout }} + --lc-collate={{ pg_settings.results[1].stdout }} + --lc-ctype={{ pg_settings.results[2].stdout }} + --lc-messages={{ pg_settings.results[3].stdout }} + --lc-monetary={{ pg_settings.results[4].stdout }} + --lc-numeric={{ pg_settings.results[5].stdout }} + --lc-time={{ pg_settings.results[6].stdout }} + {% if pg_settings.results[7].stdout == 'on' %} + --data-checksums + {% endif %} + when: + - inventory_hostname in groups['primary'] + - pg_new_confdir == pg_new_datadir + +... diff --git a/roles/upgrade/tasks/maintenance_disable.yml b/roles/upgrade/tasks/maintenance_disable.yml new file mode 100644 index 000000000..38a80bbd4 --- /dev/null +++ b/roles/upgrade/tasks/maintenance_disable.yml @@ -0,0 +1,59 @@ +--- +# Disable maintenance mode for HAProxy (Type A scheme) +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- block: + - name: Update haproxy conf file (enable http-checks) + ansible.builtin.template: + src: ../haproxy/templates/haproxy.cfg.j2 # use the haproxy role template + dest: /etc/haproxy/haproxy.cfg + owner: haproxy + group: haproxy + + - name: Reload haproxy service + ansible.builtin.systemd: + name: haproxy + state: reloaded + + - name: Start confd service + ansible.builtin.service: + name: confd + state: started + become: true + become_user: root + ignore_errors: true # show the error and continue the playbook execution + when: + - with_haproxy_load_balancing | bool + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +# Disable maintenance mode for vip-manager (Type B scheme) +- block: + - name: Update vip-manager service file (uncomment 'ExecStopPost') + ansible.builtin.replace: + path: /etc/systemd/system/vip-manager.service + regexp: '#ExecStopPost=/sbin/ip addr del {{ vip_manager_ip }}/{{ vip_manager_mask }} dev {{ vip_manager_iface }}' + replace: "ExecStopPost=/sbin/ip addr del {{ vip_manager_ip }}/{{ vip_manager_mask }} dev {{ vip_manager_iface }}" + + - name: Start vip-manager service + ansible.builtin.service: + name: vip-manager + daemon_reload: true + state: started + + - name: Make sure that the cluster ip address (VIP) "{{ cluster_vip }}" is running + ansible.builtin.wait_for: + host: "{{ cluster_vip }}" + port: "{{ pgbouncer_listen_port }}" + state: started + timeout: 30 + delay: 2 + become: true + become_user: root + ignore_errors: true # show the error and continue the playbook execution + when: + - not with_haproxy_load_balancing | bool + - cluster_vip | length > 0 + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +... 
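+# A quick manual check once maintenance mode is disabled (a sketch, assuming +# pgbouncer fronts the cluster): the VIP should accept connections again, e.g. +#   psql -h <cluster_vip> -p <pgbouncer_listen_port> -U postgres -tAXc "select 1"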
diff --git a/roles/upgrade/tasks/maintenance_enable.yml b/roles/upgrade/tasks/maintenance_enable.yml new file mode 100644 index 000000000..7f7cf15fb --- /dev/null +++ b/roles/upgrade/tasks/maintenance_enable.yml @@ -0,0 +1,102 @@ +--- +# Disable auto failover: we need this to be able to stop the leader before its standbys +- name: Pause Patroni cluster (enable maintenance mode) + become: true + become_user: postgres + ansible.builtin.command: "patronictl -c {{ patroni_config_file }} pause --wait {{ patroni_cluster_name }}" + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + vars: + ansible_python_interpreter: /usr/bin/python3 + register: pause_result + failed_when: "'Cluster is already paused' not in pause_result.stderr and pause_result.rc != 0" + when: + - inventory_hostname in groups['primary'] + +# Enable maintenance mode for HAProxy (Type A scheme) +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +# Temporarily disable http-checks in order to keep database connections after stopping the Patroni service +# and then pause the pgbouncer pools. +- block: + - name: Stop confd service + ansible.builtin.service: + name: confd + state: stopped + + - name: Update haproxy conf file (disable http-checks) + ansible.builtin.template: + src: templates/haproxy-no-http-checks.cfg.j2 + dest: /etc/haproxy/haproxy.cfg + owner: haproxy + group: haproxy + + - name: Reload haproxy service + ansible.builtin.systemd: + name: haproxy + state: reloaded + become: true + become_user: root + when: + - with_haproxy_load_balancing | bool + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +# Enable maintenance mode for vip-manager (Type B scheme) +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +# Temporarily disable vip-manager service to keep database connections after stopping the Patroni service +# and then pause the pgbouncer pools. +# This prevents the VIP from being removed when the Patroni leader is unavailable during maintenance.
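+# The unit-file line toggled below looks like this once the vip-manager role +# has substituted its variables (hypothetical address and interface): +#   ExecStopPost=/sbin/ip addr del 10.0.0.10/24 dev eth0 +# Commenting it out keeps the VIP assigned while the service is stopped.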
+- block: + - name: Update vip-manager service file (comment out 'ExecStopPost') + ansible.builtin.replace: + path: /etc/systemd/system/vip-manager.service + regexp: 'ExecStopPost=/sbin/ip addr del {{ vip_manager_ip }}/{{ vip_manager_mask }} dev {{ vip_manager_iface }}' + replace: "#ExecStopPost=/sbin/ip addr del {{ vip_manager_ip }}/{{ vip_manager_mask }} dev {{ vip_manager_iface }}" + + - name: Stop vip-manager service + ansible.builtin.service: + name: vip-manager + daemon_reload: true + state: stopped + + - name: Make sure that the cluster ip address (VIP) "{{ cluster_vip }}" is running + ansible.builtin.wait_for: + host: "{{ cluster_vip }}" + port: "{{ pgbouncer_listen_port }}" + state: started + timeout: 30 + delay: 2 + become: true + become_user: root + when: + - not with_haproxy_load_balancing | bool + - cluster_vip | length > 0 + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +# Stop Patroni +- name: Stop Patroni service + become: true + become_user: root + ansible.builtin.service: + name: patroni + state: stopped + +- name: Wait until the Patroni cluster is stopped + ansible.builtin.shell: | + set -o pipefail; + patronictl -c {{ patroni_config_file }} list -f json | grep -cv '^\[\]$' + args: + executable: /bin/bash + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + register: patronictl_result + until: patronictl_result.stdout|int == 0 + retries: 30 # max duration 5 minutes + delay: 10 + changed_when: false + failed_when: false + when: + - inventory_hostname in groups['primary'] + +... diff --git a/roles/upgrade/tasks/packages.yml b/roles/upgrade/tasks/packages.yml new file mode 100644 index 000000000..da3b116f0 --- /dev/null +++ b/roles/upgrade/tasks/packages.yml @@ -0,0 +1,66 @@ +--- + +- name: Clean yum cache + ansible.builtin.command: yum clean all + register: yum_status + until: yum_status is success + delay: 5 + retries: 3 + when: + - ansible_os_family == "RedHat" + - ansible_distribution_major_version == '7' + +- name: Clean dnf cache + ansible.builtin.command: dnf clean all + register: dnf_status + until: dnf_status is success + delay: 5 + retries: 3 + when: + - ansible_os_family == "RedHat" + - ansible_distribution_major_version is version('8', '>=') + +- name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + register: apt_status + until: apt_status is success + delay: 5 + retries: 3 + when: + - ansible_os_family == "Debian" + +- name: "Install PostgreSQL {{ pg_new_version }} packages" + ansible.builtin.package: + name: "{{ item }}" + state: latest + loop: "{{ pg_new_packages }}" + register: package_status + until: package_status is success + delay: 5 + retries: 3 + +# timescaledb (if enable_timescale is defined) +- name: Install TimescaleDB package for PostgreSQL {{ pg_new_version }} + ansible.builtin.package: + name: "{{ item }}" + state: latest + loop: "{{ timescaledb_package }}" + vars: + timescaledb_package: + - > + {% if pg_new_version | int >= 11 %} + timescaledb-2-postgresql-{{ pg_new_version }} + {% else %} + timescaledb-postgresql-{{ pg_new_version }} + {% endif %} + register: package_status + until: package_status is success + delay: 5 + retries: 3 + when: + - enable_timescale is defined + - enable_timescale | bool + +... 
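+# For example, with pg_new_version=15 the template above resolves to the +# "timescaledb-2-postgresql-15" package, while for PostgreSQL 10 and older it +# falls back to the legacy "timescaledb-postgresql-<version>" naming.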
diff --git a/roles/upgrade/tasks/pgbouncer_pause.yml b/roles/upgrade/tasks/pgbouncer_pause.yml new file mode 100644 index 000000000..30bf47007 --- /dev/null +++ b/roles/upgrade/tasks/pgbouncer_pause.yml @@ -0,0 +1,119 @@ +# yamllint disable rule:line-length +--- +# Perform PAUSE in parallel on all pgbouncer servers +# +# This script performs the following actions: +# 1. Waits for active queries on the database servers to complete (with a runtime more than 'pg_slow_active_query_treshold'). +# 2. If there are no active queries, sends a PAUSE command to each pgbouncer server in the pgb_servers list (in parallel to all servers). +# 3. If all pgbouncers are successfully paused, the script exits. +# 4. If active queries do not complete within 30 seconds, the script forcibly terminates slow active queries using pg_slow_active_terminate_query. +# 5. If after that it is still not possible to pause the pgbouncer servers within 60 seconds from the start of the script, the script exits with an error. +# +# The script uses the 'pause_results' array to track the results of executing the PAUSE command on each pgbouncer server. +# The 'timeout 2' command is used to set a timeout for the execution of the 'pgb_pause_command'. +# If the execution of the 'pgb_pause_command' does not finish within 2 seconds, the timeout command will interrupt it, +# and the script will then execute 'pgb_resume_command' to remove the pause and ensure atomicity. +# +# Finally, the script checks whether all servers have been successfully paused by comparing the number of successful PAUSE executions to the total number of pgbouncer servers. +- name: PAUSE PgBouncer pools + become: true + become_user: postgres + vars: + pg_slow_active_count_query: >- + select count(*) from pg_stat_activity + where pid <> pg_backend_pid() + and state <> 'idle' + and query_start < clock_timestamp() - interval '{{ pg_slow_active_query_treshold }} ms' + {{ "and backend_type = 'client backend'" if pg_old_version is version('10', '>=') else '' }} + pg_slow_active_terminate_query: >- + select + clock_timestamp(), + pg_terminate_backend(pid), + clock_timestamp() - query_start as query_age, + left(regexp_replace(query, E'[ \\t\\n\\r]+', ' ', 'g'),150) as query + from pg_stat_activity + where pid <> pg_backend_pid() + and state <> 'idle' + and query_start < clock_timestamp() - interval '{{ pg_slow_active_query_treshold_to_terminate }} ms' + {{ "and backend_type = 'client backend'" if pg_old_version is version('10', '>=') else '' }} + ansible.builtin.shell: | + set -o pipefail; + + pg_servers="{{ (groups['primary'] + groups['secondary']) | join('\n') }}" + pg_count=$(echo -e "$pg_servers" | wc -l) + pg_slow_active_count_query="{{ pg_slow_active_count_query }}" + pg_slow_active_terminate_query="{{ pg_slow_active_terminate_query }}" + # it is assumed that pgbouncer is installed on database servers + pgb_servers="$pg_servers" + pgb_count="$pg_count" + pgb_pause_command="psql -h localhost -p {{ pgbouncer_listen_port }} -U {{ patroni_superuser_username }} -d pgbouncer -tAXc \"PAUSE\"" + pgb_resume_command='kill -SIGUSR2 $(pidof pgbouncer)' + + start_time=$(date +%s) + while true; do + current_time=$(date +%s) + # initialize pgb_paused_count to 0 (we assume that all pgbouncers are not paused) + pgb_paused_count=0 + + # wait for the active queries to complete on pg_servers + IFS=$'\n' pg_slow_active_counts=($(echo -e "$pg_servers" | xargs -I {} -P "$pg_count" -n 1 ssh -o StrictHostKeyChecking=no {} "psql -tAXc \"$pg_slow_active_count_query\""))
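+      # (idiom note: xargs -P "$pg_count" runs one ssh per server in parallel, +      # and IFS=$'\n' splits the combined output into one array element per server, +      # i.e. one slow-query count per host)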
+ + # sum up all the values in the array + total_pg_slow_active_count=0 + for count in "${pg_slow_active_counts[@]}"; do + total_pg_slow_active_count=$((total_pg_slow_active_count + count)) + done + + echo "$(date): total pg_slow_active_count: $total_pg_slow_active_count" + + if [[ "$total_pg_slow_active_count" == 0 ]]; then + # pause pgbouncer on all pgb_servers. We send via ssh to all pgbouncers in parallel and collect results from all (maximum wait time 2 seconds) + IFS=$'\n' pause_results=($(echo -e "$pgb_servers" | xargs -I {} -P "$pgb_count" -n 1 ssh -o StrictHostKeyChecking=no {} "timeout 2 $pgb_pause_command 2>&1 || true")) + echo "${pause_results[*]}" + # analyze the pause_results array to count the number of paused pgbouncers + pgb_paused_count=$(echo "${pause_results[*]}" | grep -o -e "PAUSE" -e "already suspended/paused" | wc -l) + echo "$(date): pgb_count: $pgb_count, pgb_paused: $pgb_paused_count" + fi + + # make sure that the pause is performed on all pgbouncer servers, to ensure atomicity + if [[ "$pgb_paused_count" -eq "$pgb_count" ]]; then + break # pause is performed on all pgb_servers, exit from the loop + elif [[ "$pgb_paused_count" -gt 0 && "$pgb_paused_count" -ne "$pgb_count" ]]; then + # pause is not performed on all pgb_servers, perform resume (we do not use a timeout because we must resume all pgbouncers) + IFS=$'\n' resume_results=($(echo -e "$pgb_servers" | xargs -I {} -P "$pgb_count" -n 1 ssh -o StrictHostKeyChecking=no {} "$pgb_resume_command 2>&1 || true")) + echo "${resume_results[*]}" + fi + + # after 'pgbouncer_pool_pause_terminate_after' seconds of waiting, terminate active sessions on pg_servers and try pausing again + if (( current_time - start_time >= {{ pgbouncer_pool_pause_terminate_after }} )); then + echo "$(date): terminate active queries" + echo -e "$pg_servers" | xargs -I {} -P "$pg_count" -n 1 ssh -o StrictHostKeyChecking=no {} "psql -tAXc \"$pg_slow_active_terminate_query\"" + fi + + # if it was not possible to pause within 'pgbouncer_pool_pause_stop_after' seconds, exit with an error + if (( current_time - start_time >= {{ pgbouncer_pool_pause_stop_after }} )); then + echo "$(date): it was not possible to pause (exit by timeout)" + exit 1 + fi + done > /tmp/pgbouncer_pool_pause_{{ ansible_date_time.date }}.log + args: + executable: /bin/bash + register: pgbouncer_pool_pause_result + ignore_errors: true + when: inventory_hostname in groups['primary'] + +# Stop, if it was not possible to put the pools on pause +- block: + - name: Perform rollback + ansible.builtin.include_tasks: rollback.yml + + - name: "ERROR: PgBouncer pools cannot be paused" + ansible.builtin.fail: + msg: + - "PgBouncer pools could not be paused, please try again later." + - "The log is available on the path: /tmp/pgbouncer_pool_pause_{{ ansible_date_time.date }}.log" + - "on the {{ hostvars[groups['primary'][0]]['ansible_hostname'] }} server." + run_once: true + when: hostvars[groups['primary'][0]].pgbouncer_pool_pause_result is failed + +... diff --git a/roles/upgrade/tasks/pgbouncer_resume.yml b/roles/upgrade/tasks/pgbouncer_resume.yml new file mode 100644 index 000000000..433ca5fef --- /dev/null +++ b/roles/upgrade/tasks/pgbouncer_resume.yml @@ -0,0 +1,11 @@ +--- +# Perform RESUME on the pgbouncer servers +- name: RESUME PgBouncer pools + become: true + become_user: postgres + ansible.builtin.shell: kill -SIGUSR2 $(pidof pgbouncer) + args: + executable: /bin/bash + ignore_errors: true # if there is an error, show the message and continue + +...
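+# Sending SIGUSR2 to pgbouncer is equivalent to issuing "RESUME" on its admin +# console, e.g.: +#   psql -h localhost -p <pgbouncer_listen_port> -U <superuser> -d pgbouncer -tAXc "RESUME" +# (the signal form resumes all pools at once without needing a connection).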
diff --git a/roles/upgrade/tasks/post_checks.yml b/roles/upgrade/tasks/post_checks.yml new file mode 100644 index 000000000..abc659c55 --- /dev/null +++ b/roles/upgrade/tasks/post_checks.yml @@ -0,0 +1,75 @@ +--- + +- name: Make sure that physical replication is active + ansible.builtin.command: >- + psql -tAXc "select count(*) from pg_stat_replication + where application_name != 'pg_basebackup'" + register: pg_replication_state + until: pg_replication_state.stdout | int > 0 + retries: 30 # max wait time: 1 minute + delay: 2 + changed_when: false + failed_when: false + when: + - inventory_hostname in groups['primary'] + +# Error, if no active replication connections are found. +- name: "Post-Check error. No active replication connections found." + ansible.builtin.debug: + msg: + - "No active replication connections found." + - "Please check the replication status and PostgreSQL logs." + failed_when: pg_replication_state.stdout | int == 0 + ignore_errors: true # show the error and continue the playbook execution + when: + - inventory_hostname in groups['primary'] + - pg_replication_state.stdout | int == 0 + +- name: Create a table "test_replication" with 10000 rows on the Primary + ansible.builtin.command: >- + psql -tAXc "drop table IF EXISTS test_replication; + create table test_replication as select generate_series(1, 10000)" + when: + - inventory_hostname in groups['primary'] + +- name: Wait until the PostgreSQL replica is synchronized + ansible.builtin.command: >- + psql -tAXc "select count(*) from test_replication" + register: count_test + until: count_test.stdout | int == 10000 + retries: 60 # max wait time: 2 minutes + delay: 2 + changed_when: false + failed_when: false + when: + - inventory_hostname in groups['secondary'] + +- name: Drop a table "test_replication" + ansible.builtin.command: >- + psql -tAXc "drop table IF EXISTS test_replication" + when: + - inventory_hostname in groups['primary'] + +- name: Print the result of checking the number of records + ansible.builtin.debug: + msg: + - "The PostgreSQL Replication is OK for replica {{ ansible_hostname }}" + - "The number of records in the test_replication table is the same as on the Primary ({{ count_test.stdout }} rows)" + when: + - inventory_hostname in groups['secondary'] + - count_test.stdout | int == 10000 + +# Error, if the number of records in the "test_replication" table does not match the Primary. +- name: "Post-Check error. The number of records does not match" + ansible.builtin.debug: + msg: + - "The PostgreSQL Replication is NOT OK for replica {{ ansible_hostname }}" + - "The number of records in the test_replication table does not match the Primary ({{ count_test.stdout }} rows)." + - "Please check the replication status and PostgreSQL logs." + failed_when: count_test.stdout | int != 10000 + ignore_errors: true # show the error and continue the playbook execution + when: + - inventory_hostname in groups['secondary'] + - count_test.stdout | int != 10000 + +...
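+# Equivalent manual check (a sketch): on the primary, +#   psql -tAXc "select application_name, state, sync_state from pg_stat_replication" +# should list every replica with state 'streaming' once the upgrade is complete.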
diff --git a/roles/upgrade/tasks/post_upgrade.yml b/roles/upgrade/tasks/post_upgrade.yml new file mode 100644 index 000000000..680f58e9e --- /dev/null +++ b/roles/upgrade/tasks/post_upgrade.yml @@ -0,0 +1,199 @@ +--- + +- name: Ensure the current data directory is the new data directory + ansible.builtin.command: psql -tAXc "show data_directory" + changed_when: false + register: pg_current_datadir + +# RedHat based +- name: Delete the old PostgreSQL data directory + ansible.builtin.file: + path: "{{ pg_old_datadir }}" + state: absent + when: + - pg_new_datadir == pg_current_datadir.stdout | trim + - ansible_os_family == "RedHat" + +# Debian based (use pg_dropcluster) +- name: Delete the old PostgreSQL data directory (perform pg_dropcluster) + ansible.builtin.command: > + /usr/bin/pg_dropcluster {{ pg_old_version }} {{ postgresql_cluster_name }} + failed_when: false + when: + - pg_new_datadir == pg_current_datadir.stdout | trim + - ansible_os_family == "Debian" + +# if pg_new_wal_dir is defined +- name: Delete the old PostgreSQL WAL directory + ansible.builtin.file: + path: "{{ postgresql_wal_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_old_version) }}" + state: absent + when: + - postgresql_wal_dir | length > 0 + - pg_new_wal_dir | length > 0 + +# RedHat based +- name: Remove old PostgreSQL packages + become: true + become_user: root + ansible.builtin.package: + name: "{{ item }}" + state: absent + loop: "{{ postgresql_packages | regex_replace(postgresql_version, pg_old_version) }}" + register: package_remove + until: package_remove is success + delay: 5 + retries: 3 + ignore_errors: true + when: + - item is search(pg_old_version) + - pg_old_packages_remove | bool + - ansible_os_family == "RedHat" + +# Debian based (use purge option) +- name: Remove old PostgreSQL packages + become: true + become_user: root + ansible.builtin.apt: + name: "{{ item }}" + state: absent + purge: true + loop: "{{ postgresql_packages | regex_replace(postgresql_version, pg_old_version) }}" + register: apt_remove + until: apt_remove is success + delay: 5 + retries: 3 + ignore_errors: true + when: + - item is search(pg_old_version) + - pg_old_packages_remove | bool + - ansible_os_family == "Debian" + +# pgbackrest (local) +- block: + - name: pgbackrest | Check pg-path option + ansible.builtin.command: "grep -c '^pg.*-path=' {{ pgbackrest_conf_file }}" + register: pg_path_count + changed_when: false + + - name: pgbackrest | Update pg-path in pgbackrest.conf + ansible.builtin.lineinfile: + path: "{{ pgbackrest_conf_file }}" + regexp: '^pg{{ idx + 1 }}-path=' + line: 'pg{{ idx + 1 }}-path={{ pg_new_datadir }}' + loop: "{{ range(0, pg_path_count.stdout | int) | list }}" + loop_control: + index_var: idx + label: "pg{{ idx + 1 }}-path={{ pg_new_datadir }}" + when: pg_path_count.stdout | length > 0 + + - name: pgbackrest | Upgrade stanza "{{ pgbackrest_stanza }}" + ansible.builtin.command: "pgbackrest --stanza={{ pgbackrest_stanza }} --no-online stanza-upgrade" + when: pg_path_count.stdout | length > 0 + become: true + become_user: postgres + ignore_errors: true + when: + - pgbackrest_install | bool + - pgbackrest_repo_host | length < 1 + +# pgbackrest (dedicated) +- block: + - name: pgbackrest | Check pg-path option + delegate_to: "{{ groups['pgbackrest'][0] }}" + run_once: true + ansible.builtin.command: "grep -c '^pg.*-path=' {{ pgbackrest_conf_file | dirname }}/conf.d/{{ pgbackrest_stanza }}.conf" + register: pg_path_count + changed_when: false + + - name: pgbackrest | Update pg-path in 
pgbackrest.conf + delegate_to: "{{ groups['pgbackrest'][0] }}" + run_once: true + ansible.builtin.lineinfile: + path: "{{ pgbackrest_conf_file | dirname }}/conf.d/{{ pgbackrest_stanza }}.conf" + regexp: '^pg{{ idx + 1 }}-path=' + line: 'pg{{ idx + 1 }}-path={{ pg_new_datadir }}' + loop: "{{ range(0, pg_path_count.stdout | int) | list }}" + loop_control: + index_var: idx + label: "pg{{ idx + 1 }}-path={{ pg_new_datadir }}" + when: pg_path_count.stdout | length > 0 + + - name: pgbackrest | Upgrade stanza "{{ pgbackrest_stanza }}" + delegate_to: "{{ groups['pgbackrest'][0] }}" + run_once: true + ansible.builtin.command: "pgbackrest --stanza={{ pgbackrest_stanza }} --no-online stanza-upgrade" + when: pg_path_count.stdout | length > 0 + become: true + become_user: "{{ pgbackrest_repo_user }}" + ignore_errors: true + when: + - pgbackrest_install | bool + - pgbackrest_repo_host | length > 0 + +# WAL-G +- block: + - name: "WAL-G | Update PostgreSQL data directory path in .walg.json" + ansible.builtin.replace: + path: "{{ postgresql_home_dir }}/.walg.json" + regexp: "{{ postgresql_data_dir | regex_replace(postgresql_version, pg_old_version) }}" + replace: "{{ postgresql_data_dir | regex_replace(postgresql_version, pg_new_version) }}" + + - name: "WAL-G | Update PostgreSQL data directory path in cron jobs" + ansible.builtin.replace: + path: "{{ wal_g_cron_jobs[0].file | default('/etc/cron.d/walg') }}" + regexp: "{{ postgresql_data_dir | regex_replace(postgresql_version, pg_old_version) }}" + replace: "{{ postgresql_data_dir | regex_replace(postgresql_version, pg_new_version) }}" + become: true + become_user: root + ignore_errors: true + when: wal_g_install | bool + +- name: Check the Patroni cluster state + run_once: true + become: true + become_user: postgres + ansible.builtin.command: "patronictl -c {{ patroni_config_file }} list" + register: patronictl_result + changed_when: false + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + when: inventory_hostname in groups['primary'] + +- name: Check the current PostgreSQL version + run_once: true + ansible.builtin.command: psql -tAXc "select current_setting('server_version')" + register: postgres_version + changed_when: false + when: inventory_hostname in groups['primary'] + +# Return the pg_hba.conf file to its original state (if it has been changed) +- block: + - name: Remove temporary local access rule from pg_hba.conf + ansible.builtin.blockinfile: + path: "{{ pg_new_confdir }}/pg_hba.conf" + marker: "# {mark} ANSIBLE TEMPORARY pg_upgrade RULE" + state: absent + + - name: Update the PostgreSQL configuration + ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl reload -D {{ pg_new_datadir }}" + when: + - socket_access_result.stderr is defined + - "'no pg_hba.conf entry' in socket_access_result.stderr" + +# finish (info) +- name: List the Patroni cluster members + run_once: true + ansible.builtin.debug: + msg: "{{ patronictl_result.stdout_lines }}" + when: patronictl_result.stdout_lines is defined + +- name: Upgrade completed + run_once: true + ansible.builtin.debug: + msg: + - "PostgreSQL upgrade completed." + - "Current version: {{ postgres_version.stdout }}" + when: postgres_version.stdout is defined + +... 
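+# A manual verification sketch: after the stanza-upgrade above, running +#   pgbackrest --stanza=<stanza_name> info +# on the repository host should report the new PostgreSQL version for the stanza; +# taking a fresh full backup after a major upgrade is also advisable.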
diff --git a/roles/upgrade/tasks/pre_checks.yml b/roles/upgrade/tasks/pre_checks.yml new file mode 100644 index 000000000..1b2727dfa --- /dev/null +++ b/roles/upgrade/tasks/pre_checks.yml @@ -0,0 +1,350 @@ +--- + +# Stop, if pg_old_version or pg_new_version is not defined +- name: Make sure that the required variables are specified + run_once: true + ansible.builtin.debug: + msg: + - "One or more required variables have empty values." + - "Please specify a value for the variables: pg_old_version, pg_new_version" + failed_when: pg_old_version | length < 1 or pg_new_version | length < 1 + when: pg_old_version | length < 1 or pg_new_version | length < 1 + +# Stop, if the directories of the old and new versions are the same +- name: "Make sure that the old and new data and config directories do not match" + run_once: true + ansible.builtin.debug: + msg: + - "pg_old_datadir and pg_new_datadir, pg_old_confdir and pg_new_confdir must not be the same." + - "Please check your configuration (vars/upgrade.yml)" + failed_when: (pg_old_datadir == pg_new_datadir) or (pg_old_confdir == pg_new_confdir) + when: (pg_old_datadir == pg_new_datadir) or (pg_old_confdir == pg_new_confdir) + +# required to perform dcs_remove_cluster.yml +- name: '[Pre-Check] Make sure the Python library required by Ansible is installed' + ansible.builtin.pip: + name: "{{ item }}" + state: present + executable: pip3 + extra_args: "--trusted-host=pypi.python.org --trusted-host=pypi.org --trusted-host=files.pythonhosted.org" + umask: "0022" + loop: + - pexpect + environment: + PATH: "{{ ansible_env.PATH }}:/usr/local/bin:/usr/bin" + vars: + ansible_python_interpreter: /usr/bin/python3 + +- name: '[Pre-Check] Test PostgreSQL database access using a unix socket' + ansible.builtin.command: psql -tAXc 'select 1' + register: socket_access_result + changed_when: false + failed_when: "socket_access_result.rc != 0 and 'no pg_hba.conf entry' not in socket_access_result.stderr" + +# if 'no pg_hba.conf entry' +- block: + # Add a temporary local access rule for pg_upgrade to allow the upgrade process to proceed without authentication issues. + # This is necessary to ensure a smooth upgrade process and will be removed after the upgrade is complete.
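+  # The temporary rule prepended to pg_hba.conf below is simply: +  #   local  all  all  trust +  # i.e. passwordless access via the unix socket only; it is removed again in post_upgrade.yml.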
+ - name: Add temporary local access rule (during the upgrade) + ansible.builtin.blockinfile: + path: "{{ pg_old_confdir }}/pg_hba.conf" + marker: "# {mark} ANSIBLE TEMPORARY pg_upgrade RULE" + insertbefore: BOF + content: "local all all trust" + + - name: Update the PostgreSQL configuration + ansible.builtin.command: "{{ pg_old_bindir }}/pg_ctl reload -D {{ pg_old_datadir }}" + when: + - socket_access_result.stderr is defined + - "'no pg_hba.conf entry' in socket_access_result.stderr" + +- name: '[Pre-Check] Check the current version of PostgreSQL' + ansible.builtin.command: psql -tAXc "select setting::integer/10000 from pg_settings where name = 'server_version_num'" + register: pg_current_version + changed_when: false + when: + - inventory_hostname in groups['primary'] + - pg_old_version is version('10', '>=') + +# for compatibility with Postgres 9.x +- name: '[Pre-Check] Check the current version of PostgreSQL' + ansible.builtin.command: psql -tAXc "select substring(setting from '^[0-9]+\.[0-9]+') from pg_settings where name = 'server_version'" + register: pg_current_version_9x + changed_when: false + when: + - inventory_hostname in groups['primary'] + - pg_old_version is version('10', '<') + +- name: "Set variable 'current_pg_version'" + ansible.builtin.set_fact: + current_pg_version: "{{ pg_current_version.stdout if pg_old_version is version('10', '>=') else pg_current_version_9x.stdout }}" + when: + - inventory_hostname in groups['primary'] + +# Stop, if the current version does not match pg_old_version +- name: "Pre-Check error. An incorrect version of PostgreSQL may have been specified" + ansible.builtin.fail: + msg: + - "The current version of PostgreSQL is {{ current_pg_version }}" + - "Make sure that you have specified the correct version in the pg_old_version variable." + when: + - inventory_hostname in groups['primary'] + - current_pg_version is not version(pg_old_version, '==') + +# Stop, if the current version is greater than or equal to pg_new_version +- name: "Pre-Check error. An incorrect target version of PostgreSQL may have been specified" + ansible.builtin.fail: + msg: + - "The current version of PostgreSQL is {{ current_pg_version }}, no upgrade is needed." + - "Or, make sure that you have specified the correct version in the pg_new_version variable." + when: + - inventory_hostname in groups['primary'] + - current_pg_version is version(pg_new_version, '>=') + +# This check is necessary to avoid the risk of deleting the current data directory: +# the current directory must not be equal to the path specified in the pg_new_datadir variable, +# which will later be cleaned up before executing initdb for a new version of PostgreSQL +- name: '[Pre-Check] Ensure new data directory is different from the current one' + ansible.builtin.command: psql -tAXc "show data_directory" + changed_when: false + register: pg_current_datadir + when: + - inventory_hostname in groups['primary'] + +# Stop, if the current data directory is the same as pg_new_datadir +- name: "Pre-Check error. The current data directory is the same as new data directory" + ansible.builtin.fail: + msg: + - "The new data directory ({{ pg_new_datadir }}) must be different from the current one ({{ pg_current_datadir.stdout | trim }})" + when: + - inventory_hostname in groups['primary'] + - pg_new_datadir == pg_current_datadir.stdout | trim + +# Stop, if the current WAL directory is the same as pg_new_wal_dir +- name: "Pre-Check error. The current WAL directory is the same as the new WAL directory" + ansible.builtin.fail: + msg: + - "The new WAL directory ({{ pg_new_wal_dir }}) must be different from the current one ({{ postgresql_wal_dir }})" + - "Please specify a different path for the 'pg_new_wal_dir' variable." + when: + - inventory_hostname in groups['primary'] + - pg_new_wal_dir | length > 0 and pg_new_wal_dir == postgresql_wal_dir + +- name: '[Pre-Check] Make sure that physical replication is active' + ansible.builtin.command: >- + psql -tAXc "select count(*) from pg_stat_replication + where application_name != 'pg_basebackup'" + register: pg_replication_state + changed_when: false + when: + - inventory_hostname in groups['primary'] + +# Stop, if there are no active replicas +- name: "Pre-Check error. Print physical replication state" + ansible.builtin.fail: + msg: "There are no active replica servers (pg_stat_replication returned 0 entries)." + when: + - inventory_hostname in groups['primary'] + - pg_replication_state.stdout | int == 0 + +- name: '[Pre-Check] Make sure there is no high replication lag (more than {{ max_replication_lag_bytes | human_readable }})' + ansible.builtin.command: >- + psql -tAXc "select + pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn) as pg_lag_bytes + from pg_stat_replication + order by pg_lag_bytes desc limit 1" + register: pg_lag_bytes + changed_when: false + failed_when: false + until: pg_lag_bytes.stdout|int < max_replication_lag_bytes|int + retries: 30 # max wait time: 2.5 minutes + delay: 5 + when: + - inventory_hostname in groups['primary'] + - pg_old_version is version('10', '>=') + +# Stop, if replication lag is high +- name: "Pre-Check error. High replication lag" + ansible.builtin.fail: + msg: + - "High replication lag ({{ pg_lag_bytes.stdout | int | human_readable }}) on the Patroni Cluster" + - "Please try again later." + when: + - pg_lag_bytes.stdout is defined + - pg_lag_bytes.stdout|int >= max_replication_lag_bytes|int + +# for compatibility with Postgres 9.x +- name: '[Pre-Check] Make sure there is no high replication lag (more than {{ max_replication_lag_bytes | human_readable }})' + ansible.builtin.command: >- + psql -tAXc "select + pg_xlog_location_diff(pg_current_xlog_location(),replay_location) as pg_lag_bytes + from pg_stat_replication + order by pg_lag_bytes desc limit 1" + register: pg_lag_bytes_9x + changed_when: false + failed_when: false + until: pg_lag_bytes_9x.stdout|int < max_replication_lag_bytes|int + retries: 30 # max wait time: 2.5 minutes + delay: 5 + when: + - inventory_hostname in groups['primary'] + - pg_old_version is version('10', '<') + +# Stop, if replication lag is high (for 9x) +- name: "Pre-Check error. High replication lag" + ansible.builtin.fail: + msg: + - "High replication lag ({{ pg_lag_bytes_9x.stdout | int | human_readable }}) on the Patroni Cluster" + - "Please try again later."
+ when: + - pg_lag_bytes_9x.stdout is defined + - pg_lag_bytes_9x.stdout|int >= max_replication_lag_bytes|int + +- name: '[Pre-Check] Make sure there are no long-running transactions (more than {{ max_transaction_sec }} seconds)' + ansible.builtin.command: >- + psql -tAXc "select pid, usename, client_addr, clock_timestamp() - xact_start as xact_age, + state, wait_event_type ||':'|| wait_event as wait_events, + left(regexp_replace(query, E'[ \\t\\n\\r]+', ' ', 'g'),100) as query + from pg_stat_activity + where clock_timestamp() - xact_start > '{{ max_transaction_sec }} seconds'::interval + and backend_type = 'client backend' and pid <> pg_backend_pid() + order by xact_age desc limit 10" + register: pg_long_transactions + changed_when: false + failed_when: false + until: pg_long_transactions.stdout | length < 1 + retries: 30 # 1 minute + delay: 2 + when: pg_old_version is version('10', '>=') + +# Stop, if long-running transactions detected +- block: + - name: "Print long-running (>{{ max_transaction_sec }}s) transactions" + ansible.builtin.debug: + msg: "{{ pg_long_transactions.stdout_lines }}" + + - name: "Pre-Check error. Long-running transactions detected" + ansible.builtin.fail: + msg: long-running transactions detected (more than {{ max_transaction_sec }} seconds), please try again later. + run_once: true + when: + - pg_long_transactions.stdout is defined + - pg_long_transactions.stdout | length > 0 + +# for compatibility with Postgres 9.x +- name: '[Pre-Check] Make sure there are no long-running transactions (more than {{ max_transaction_sec }} seconds)' + ansible.builtin.command: >- + psql -tAXc "select pid, usename, client_addr, clock_timestamp() - xact_start as xact_age, + state, left(regexp_replace(query, E'[ \\t\\n\\r]+', ' ', 'g'),100) as query + from pg_stat_activity + where clock_timestamp() - xact_start > '{{ max_transaction_sec }} seconds'::interval + order by xact_age desc limit 10" + register: pg_long_transactions_9x + changed_when: false + failed_when: false + until: pg_long_transactions_9x.stdout | length < 1 + retries: 30 # 1 minute + delay: 2 + when: pg_old_version is version('10', '<') + +# Stop, if long-running transactions detected (for 9x) +- block: + - name: "Print long-running (>{{ max_transaction_sec }}s) transactions" + ansible.builtin.debug: + msg: "{{ pg_long_transactions_9x.stdout_lines }}" + + - name: "Pre-Check error. Long-running transactions detected" + ansible.builtin.fail: + msg: long-running transactions detected (more than {{ max_transaction_sec }} seconds), please try again later. 
+ run_once: true + when: + - pg_long_transactions_9x.stdout is defined + - pg_long_transactions_9x.stdout | length > 0 + +# SSH Keys (required to upgrade replicas with rsync) +- name: '[Pre-Check] Make sure that SSH key-based authentication is configured between cluster nodes' + ansible.builtin.include_tasks: ssh-keys.yml + vars: + ssh_key_user: postgres + +# Rsync Checks +- name: '[Pre-Check] Make sure that the rsync package is installed' + become: true + become_user: root + ansible.builtin.package: + name: rsync + state: present + +- name: '[Pre-Check] Rsync Checks: create testrsync file on Primary' + become: true + become_user: postgres + ansible.builtin.file: + path: /tmp/testrsync + state: touch + when: + - inventory_hostname in groups['primary'] + +- name: '[Pre-Check] Rsync Checks: test rsync and ssh key access' + become: true + become_user: postgres + ansible.builtin.shell: > + rsync -e "ssh -o StrictHostKeyChecking=no" --archive --delete --hard-links --size-only --no-inc-recursive --omit-dir-times + /tmp/testrsync {{ item }}:/tmp/ + args: + executable: /bin/bash + loop: "{{ groups.secondary | list }}" + when: + - inventory_hostname in groups['primary'] + +- name: "[Pre-Check] Cleanup testrsync file" + become: true + become_user: postgres + ansible.builtin.file: + path: /tmp/testrsync + state: absent + +# Tablespaces +- name: '[Pre-Check] Check if PostgreSQL tablespaces exist' + ansible.builtin.command: >- + psql -tAXc "select + pg_tablespace_location(oid) as tablespace_location + from pg_tablespace + where spcname not in ('pg_default','pg_global')" + register: tablespace_location + changed_when: false + when: + - inventory_hostname in groups['primary'] + +- name: "Print tablespace location" + ansible.builtin.debug: + var: tablespace_location.stdout_lines + when: + - inventory_hostname in groups['primary'] + - tablespace_location.stdout_lines | length > 0 + +# PgBouncer (if 'pgbouncer_pool_pause' is 'true') +# test access via localhost to be able to perform 'PAUSE' command +- name: '[Pre-Check] Test PgBouncer access via localhost' + ansible.builtin.command: > + psql -h localhost -p {{ pgbouncer_listen_port }} -U {{ patroni_superuser_username }} -d pgbouncer -tAXc "SHOW POOLS" + changed_when: false + when: + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +# Check the VIP address +- name: Make sure that the cluster ip address (VIP) "{{ cluster_vip }}" is running + ansible.builtin.wait_for: + host: "{{ cluster_vip }}" + port: "{{ target_port }}" + state: started + timeout: 3 + delay: 2 + vars: + target_port: >- + {{ pgbouncer_listen_port if pgbouncer_install | bool + else (ansible_ssh_port | default(22)) }} + when: + - cluster_vip | length > 0 + +... diff --git a/roles/upgrade/tasks/rollback.yml b/roles/upgrade/tasks/rollback.yml new file mode 100644 index 000000000..9c208b62f --- /dev/null +++ b/roles/upgrade/tasks/rollback.yml @@ -0,0 +1,215 @@ +--- +# This playbook performs a rollback of a PostgreSQL database cluster upgrade. +# It's designed to be used when a PostgreSQL upgrade hasn't been fully completed and the new version hasn't been started. +# The rollback operation is performed by starting the Patroni cluster with the old version of PostgreSQL using the same PGDATA. +# The playbook first checks the health of the current cluster, verifies the version of PostgreSQL, and ensures the new PostgreSQL is not running. +# If these checks pass, the playbook switches back to the old PostgreSQL paths and restarts the Patroni service.
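+# (The Patroni "/cluster" REST endpoint used below returns JSON such as +#  {"members": [{"name": "pgnode01", "role": "leader", "state": "running"}, ...]}, +#  with hypothetical host names; the member states determine whether a rollback is needed at all.)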
+# Notes: +# If pg_upgrade aborted before linking started, the old cluster was unmodified; it can be restarted. +# If you did not start the new cluster, the old cluster was unmodified except that, +# when linking started, a .old suffix was appended to $PGDATA/global/pg_control. +# To reuse the old cluster, remove the .old suffix from $PGDATA/global/pg_control; you can then restart the old cluster. +# If you did start the new cluster, it has written to shared files and it is unsafe to use the old cluster. +# The old cluster will need to be restored from backup in this case. + +# If the cluster is already healthy, the process stops to avoid unnecessary actions. +- name: '[Rollback] Check Patroni cluster state' + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/cluster + return_content: true + register: patroni_cluster_result + failed_when: false + changed_when: false + when: + - inventory_hostname in groups['primary'] + +# Stop, if the Patroni cluster is already healthy +- name: '[Rollback] Abort if the Patroni cluster is already running' + ansible.builtin.fail: + msg: "The Patroni cluster is already running. Stop rollback." + vars: + cluster_members: "{{ patroni_cluster_result.json.members | default([]) | rejectattr('state', 'equalto', 'stopped') | list | length }}" + total_nodes: "{{ groups['primary'] | length + groups['secondary'] | length }}" + when: + - inventory_hostname in groups['primary'] + # Check if the cluster members (excluding 'stopped') equals the total number of nodes + - cluster_members == total_nodes + +- name: '[Rollback] Make sure the new PostgreSQL is not running' + ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl status -D {{ pg_new_datadir }}" + register: pg_ctl_status_new_result + failed_when: false + changed_when: false + +# Stop, if new PostgreSQL is running +# "If you did start the new cluster, it has written to shared files and it is unsafe to use the old cluster." +- name: '[Rollback] Abort if the new PostgreSQL cluster is already running' + ansible.builtin.fail: + msg: + - "The PostgreSQL {{ pg_new_version }} is running on host {{ ansible_hostname }}. We can't rollback." + when: + - pg_ctl_status_new_result is defined + - pg_ctl_status_new_result.rc == 0 + +- name: Check if pg_control.old exists + ansible.builtin.stat: + path: "{{ pg_old_datadir }}/global/pg_control.old" + register: pg_control_old + +# if 'pg_control.old' exists +# "To reuse the old cluster, remove the .old suffix from $PGDATA/global/pg_control" +- name: '[Rollback] Rename pg_control.old to pg_control' + ansible.builtin.command: mv "{{ pg_old_datadir }}/global/pg_control.old" "{{ pg_old_datadir }}/global/pg_control" + when: pg_control_old.stat.exists + +- name: '[Rollback] Check PostgreSQL version in pg_control' + ansible.builtin.shell: | + set -o pipefail; + {{ pg_old_bindir }}/pg_controldata {{ pg_old_datadir }} | grep 'pg_control version number' | awk '{print substr($4, 1, 2)}' + args: + executable: /bin/bash + changed_when: false + register: pg_control_version + when: + - inventory_hostname in groups['primary'] + +# Stop, if 'pg_control version number' is equal to the new PostgreSQL version +- name: '[Rollback] Abort if the PostgreSQL version does not match expected version' + ansible.builtin.fail: + msg: + - "The version in pg_control ({{ pg_control_version.stdout }}) is not equal to the PostgreSQL {{ pg_old_version }}. We can't rollback." + - "The old cluster will need to be restored from backup." 
+ when: + - inventory_hostname in groups['primary'] + - pg_control_version.stdout == pg_new_version | replace('.', '') + +# Revert the paths to the old PostgreSQL +- name: '[Rollback] Revert the paths to the old PostgreSQL in patroni.yml' + ansible.builtin.replace: + path: "{{ patroni_config_file }}" + regexp: "{{ item.regexp }}" + replace: "{{ item.replace }}" + loop: + - { regexp: 'data_dir: {{ pg_new_datadir }}', replace: 'data_dir: {{ pg_old_datadir }}' } + - { regexp: 'bin_dir: {{ pg_new_bindir }}', replace: 'bin_dir: {{ pg_old_bindir }}' } + - { regexp: 'config_dir: {{ pg_new_confdir }}', replace: 'config_dir: {{ pg_old_confdir }}' } + loop_control: + label: '{{ item.replace }}' + +# Start Patroni cluster +- name: '[Rollback] Start Patroni service on the Cluster Leader' + become: true + become_user: root + ansible.builtin.service: + name: patroni + state: started + when: + - inventory_hostname in groups['primary'] + +- name: '[Rollback] Wait for port "{{ patroni_restapi_port }}" to become open on the host' + ansible.builtin.wait_for: + port: '{{ patroni_restapi_port }}' + host: '{{ inventory_hostname }}' + state: started + timeout: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['primary'] + +- name: '[Rollback] Resume Patroni (disable maintenance mode)' + run_once: true + ansible.builtin.command: "patronictl -c {{ patroni_config_file }} resume --wait {{ patroni_cluster_name }}" + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + vars: + ansible_python_interpreter: /usr/bin/python3 + failed_when: false + +- name: '[Rollback] Check Patroni is healthy on the Leader' + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/leader + status_code: 200 + register: patroni_leader_result + until: patroni_leader_result.status == 200 + retries: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['primary'] + +- name: '[Rollback] Start Patroni service on the Cluster Replica' + become: true + become_user: root + ansible.builtin.service: + name: patroni + state: started + when: + - inventory_hostname in groups['secondary'] + +- name: '[Rollback] Wait for port "{{ patroni_restapi_port }}" to become open on the host' + ansible.builtin.wait_for: + port: '{{ patroni_restapi_port }}' + host: '{{ inventory_hostname }}' + state: started + timeout: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['secondary'] + +- name: '[Rollback] Check Patroni is healthy on the Replica' + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/health + status_code: 200 + register: patroni_replica_result + until: patroni_replica_result.status == 200 + retries: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['secondary'] + +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- block: + - name: '[Rollback] Disable maintenance mode' + ansible.builtin.include_tasks: maintenance_disable.yml + + - name: '[Rollback] Perform RESUME PgBouncer pools (if paused)' + ansible.builtin.include_tasks: pgbouncer_resume.yml + when: + - pgbouncer_install | bool + - pgbouncer_pool_pause | bool + +- name: '[Rollback] Check PostgreSQL is started and accepting connections' + ansible.builtin.command: "{{ pg_old_bindir }}/pg_isready -p {{ postgresql_port }}" + register: pg_isready_result + until: pg_isready_result.rc == 0 + retries: 300 # max duration 10 minutes 
+ delay: 2 + changed_when: false + +# Info +- block: + - name: Check the PostgreSQL version + ansible.builtin.command: psql -tAXc "select current_setting('server_version')" + register: postgres_version + changed_when: false + + - name: Get the Patroni cluster members + become: true + become_user: postgres + ansible.builtin.command: "patronictl -c {{ patroni_config_file }} list" + register: patronictl_result + changed_when: false + environment: + PATH: "{{ ansible_env.PATH }}:/usr/bin:/usr/local/bin" + + - name: Print the Patroni cluster state + ansible.builtin.debug: + msg: "{{ patronictl_result.stdout_lines }}" + + - name: Rollback completed + ansible.builtin.debug: + msg: + - "Rollback to the old PostgreSQL is completed." + - "Current version: {{ postgres_version.stdout }}" + when: inventory_hostname in groups['primary'] + +... diff --git a/roles/upgrade/tasks/schema_compatibility.yml b/roles/upgrade/tasks/schema_compatibility.yml new file mode 100644 index 000000000..5cae40a7d --- /dev/null +++ b/roles/upgrade/tasks/schema_compatibility.yml @@ -0,0 +1,139 @@ +--- + +- name: Get the current shared_preload_libraries settings + ansible.builtin.command: psql -tAXc "show shared_preload_libraries" + changed_when: false + register: pg_shared_preload_libraries + when: + - inventory_hostname in groups['primary'] + +- name: Get the current cron.database_name settings + ansible.builtin.command: psql -tAXc "select current_setting('cron.database_name', true)" + changed_when: false + register: pg_cron_database_name + when: + - inventory_hostname in groups['primary'] + - "'pg_cron' in pg_shared_preload_libraries.stdout" + +- name: Check if PostgreSQL is running + ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl status -D {{ pg_new_datadir }}" + register: pg_ctl_status_result + failed_when: false + changed_when: false + when: + - inventory_hostname in groups['primary'] + +- name: "Start new PostgreSQL on port {{ schema_compatibility_check_port }} to check the schema compatibility" + ansible.builtin.command: > + {{ pg_new_bindir }}/pg_ctl -D {{ pg_new_datadir }} + -o "-p {{ schema_compatibility_check_port }} + -c unix_socket_directories='/tmp' + -c shared_preload_libraries='{{ pg_shared_preload_libraries.stdout }}' + {% if pg_cron_database_name.stdout | default('') | length > 0 %} + -c cron.database_name='{{ pg_cron_database_name.stdout }}' + {% endif %} + -c config_file='{{ pg_new_confdir }}/postgresql.conf'" + start -w -t {{ pg_start_stop_timeout }} -l /tmp/pg_tmp_start.log + async: "{{ pg_start_stop_timeout }}" # run the command asynchronously + poll: 0 + register: pg_ctl_start_result + when: + - inventory_hostname in groups['primary'] + - pg_ctl_status_result.rc != 0 + +- name: Wait for PostgreSQL to start + ansible.builtin.async_status: + jid: "{{ pg_ctl_start_result.ansible_job_id }}" + register: pg_ctl_start_job_result + until: pg_ctl_start_job_result.finished + retries: "{{ (pg_start_stop_timeout | int) // 10 }}" + delay: 10 + when: + - pg_ctl_start_result.ansible_job_id is defined + - inventory_hostname in groups['primary'] + +- name: "Check the compatibility of the database schema with the PostgreSQL {{ pg_new_version }}" + ansible.builtin.shell: | + set -o pipefail; + {{ pg_new_bindir }}/pg_dumpall \ + -h {{ postgresql_unix_socket_dir }} \ + -p {{ postgresql_port }} \ + -U {{ pg_install_user.stdout }} \ + --schema-only | {{ pg_new_bindir }}/psql \ + -U {{ pg_install_user.stdout }} \ + -d postgres \ + -h /tmp \ + -p {{ schema_compatibility_check_port }} \ + > 
/tmp/pg_schema_compatibility_check.log 2>&1 + args: + executable: /bin/bash + async: "{{ schema_compatibility_check_timeout }}" # run the command asynchronously + poll: 0 + register: pg_dumpall_result + when: + - inventory_hostname in groups['primary'] + +- name: Wait for the schema compatibility check to complete. + ansible.builtin.async_status: + jid: "{{ pg_dumpall_result.ansible_job_id }}" + register: pg_dumpall_job_result + until: pg_dumpall_job_result.finished + retries: "{{ (schema_compatibility_check_timeout | int) // 10 }}" + delay: 10 + when: + - inventory_hostname in groups['primary'] + +- name: Check the schema compatibility result + ansible.builtin.shell: > + set -o pipefail; + grep ERROR /tmp/pg_schema_compatibility_check.log | grep -v "already exists" + args: + executable: /bin/bash + register: pg_schema_compatibility_check_result + changed_when: false + failed_when: false + when: + - inventory_hostname in groups['primary'] + +- name: "Result of the schema compatibility check - success" + ansible.builtin.debug: + msg: "The database schema is compatible with PostgreSQL {{ pg_new_version }}" + when: + - inventory_hostname in groups['primary'] + - pg_schema_compatibility_check_result.stdout | length < 1 + +# Stop, if the schema is not compatible (there are errors) +- name: "Result of the schema compatibility check - error" + ansible.builtin.debug: + msg: + - "{{ pg_schema_compatibility_check_result.stdout_lines }}" + - "The database schema is not compatible with PostgreSQL {{ pg_new_version }}" + - "Please check the /tmp/pg_schema_compatibility_check.log on the Primary" + failed_when: true + when: + - inventory_hostname in groups['primary'] + - pg_schema_compatibility_check_result.stdout | length > 0 + +- name: Stop new PostgreSQL to re-initdb + ansible.builtin.command: > + {{ pg_new_bindir }}/pg_ctl -D {{ pg_new_datadir }} stop -w -t {{ pg_start_stop_timeout }} + when: + - inventory_hostname in groups['primary'] + - pg_new_confdir == pg_new_datadir + +# For Debian-based systems, drop the cluster to perform re-init +- name: Drop new PostgreSQL to re-initdb (perform pg_dropcluster) + ansible.builtin.command: > + /usr/bin/pg_dropcluster --stop {{ pg_new_version }} {{ postgresql_cluster_name }} + failed_when: false + when: + - inventory_hostname in groups['primary'] + - pg_new_confdir != pg_new_datadir + - ansible_os_family == "Debian" + +- name: Reinitialize the database after checking schema compatibility + ansible.builtin.include_tasks: "{{ role_path }}/tasks/initdb.yml" + when: + - inventory_hostname in groups['primary'] + +...
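+# In short, the compatibility check boils down to: +#   pg_dumpall --schema-only (old cluster) | psql (temporary new-version instance) +# so any "ERROR" in the log other than "already exists" means some object +# (an extension, type, or function) cannot be re-created on the new major version.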
diff --git a/roles/upgrade/tasks/ssh-keys.yml b/roles/upgrade/tasks/ssh-keys.yml new file mode 100644 index 000000000..0e34821c8 --- /dev/null +++ b/roles/upgrade/tasks/ssh-keys.yml @@ -0,0 +1,67 @@ +--- +# Configure SSH Key-Based Authentication between cluster nodes + +- name: Make sure that the openssh-client package is installed + become: true + become_user: root + ansible.builtin.package: + name: openssh-client + state: present + when: ansible_os_family == "Debian" + +- name: Make sure that the openssh-clients package is installed + become: true + become_user: root + ansible.builtin.package: + name: openssh-clients + state: present + when: ansible_os_family == "RedHat" + +- name: Make sure the SSH key for user "{{ ssh_key_user }}" exists + ansible.builtin.user: + name: "{{ ssh_key_user }}" + generate_ssh_key: true + ssh_key_file: .ssh/id_rsa + +- name: Fetch public SSH keys from database servers + ansible.builtin.fetch: + src: "~{{ ssh_key_user }}/.ssh/id_rsa.pub" + dest: "files/{{ inventory_hostname }}-id_rsa.pub" + flat: true + changed_when: false + +- name: Add public SSH keys to authorized_keys + ansible.posix.authorized_key: + user: "{{ ssh_key_user }}" + state: present + key: "{{ lookup('pipe', 'cat files/*id_rsa.pub') }}" + exclusive: false + +- name: Remove public SSH keys from localhost + run_once: true + become: false + ansible.builtin.file: + path: files/{{ item }}-id_rsa.pub + state: absent + loop: "{{ groups['postgres_cluster'] }}" + delegate_to: localhost + changed_when: false + +# known_hosts +- name: known_hosts | for each host, scan for its ssh public key + ansible.builtin.command: "ssh-keyscan -trsa -p {{ ansible_ssh_port | default(22) }} {{ item }}" + loop: "{{ groups['postgres_cluster'] }}" + register: ssh_known_host_results + changed_when: false + +- name: known_hosts | for each host, add/update the public key in the "~{{ ssh_key_user }}/.ssh/known_hosts" + become: true + become_user: "{{ ssh_key_user }}" + ansible.builtin.known_hosts: + name: "{{ item.item }}" + key: "{{ item.stdout }}" + path: "~{{ ssh_key_user }}/.ssh/known_hosts" + loop: "{{ ssh_known_host_results.results }}" + no_log: true # don't show public keys + +... 
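+# Manual verification (a sketch): from any cluster node, +#   sudo -u postgres ssh -o StrictHostKeyChecking=no <replica_host> hostname +# should print the replica's host name without prompting for a password.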
diff --git a/roles/upgrade/tasks/start_services.yml b/roles/upgrade/tasks/start_services.yml new file mode 100644 index 000000000..71c9507c0 --- /dev/null +++ b/roles/upgrade/tasks/start_services.yml @@ -0,0 +1,87 @@ +--- + +- name: Start Patroni service on the Cluster Leader + become: true + become_user: root + ansible.builtin.service: + name: patroni + state: started + when: + - inventory_hostname in groups['primary'] + +- name: Wait for Patroni port "{{ patroni_restapi_port }}" to become open on the host + ansible.builtin.wait_for: + port: '{{ patroni_restapi_port }}' + host: '{{ inventory_hostname }}' + state: started + timeout: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['primary'] + +- name: Check Patroni is healthy on the Leader + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/leader + status_code: 200 + register: patroni_leader_result + until: patroni_leader_result.status == 200 + retries: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['primary'] + +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- name: Perform RESUME PgBouncer pools on the Leader + ansible.builtin.include_tasks: pgbouncer_resume.yml + when: + - inventory_hostname in groups['primary'] + - hostvars[groups['primary'][0]].pgbouncer_pool_pause_result is defined + - hostvars[groups['primary'][0]].pgbouncer_pool_pause_result is succeeded + +- name: Start Patroni service on the Cluster Replica + become: true + become_user: root + ansible.builtin.service: + name: patroni + state: started + when: + - inventory_hostname in groups['secondary'] + +- name: Wait for Patroni port "{{ patroni_restapi_port }}" to become open on the host + ansible.builtin.wait_for: + port: '{{ patroni_restapi_port }}' + host: '{{ inventory_hostname }}' + state: started + timeout: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['secondary'] + +- name: Check Patroni is healthy on the Replica + ansible.builtin.uri: + url: http://{{ inventory_hostname }}:{{ patroni_restapi_port }}/health + status_code: 200 + register: patroni_replica_result + until: patroni_replica_result.status == 200 + retries: "{{ (pg_start_stop_timeout | int) // 2 }}" + delay: 2 + when: + - inventory_hostname in groups['secondary'] + +# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true' +- name: Perform RESUME PgBouncer pools on the Replica + ansible.builtin.include_tasks: pgbouncer_resume.yml + when: + - inventory_hostname in groups['secondary'] + - hostvars[groups['primary'][0]].pgbouncer_pool_pause_result is defined + - hostvars[groups['primary'][0]].pgbouncer_pool_pause_result is succeeded + +- name: Check PostgreSQL is started and accepting connections + ansible.builtin.command: "{{ pg_new_bindir }}/pg_isready -p {{ postgresql_port }}" + register: pg_isready_result + until: pg_isready_result.rc == 0 + retries: 300 # max duration 10 minutes + delay: 2 + changed_when: false + +... 
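+# Note: the Patroni REST API returns 200 on "/leader" only for the current leader, +# while "/health" returns 200 for any running member; hence the different +# endpoints used above for the primary and the replicas.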
diff --git a/roles/upgrade/tasks/statistics.yml b/roles/upgrade/tasks/statistics.yml
new file mode 100644
index 000000000..ad280f891
--- /dev/null
+++ b/roles/upgrade/tasks/statistics.yml
@@ -0,0 +1,72 @@
+---
+# ANALYZE: Update optimizer statistics
+#
+# While statistics are being collected, if the autovacuum process starts an aggressive (anti transaction ID wraparound) vacuum,
+# the 'ANALYZE' command may become blocked, waiting for the lock to be released until the autovacuum process completes.
+# For large tables, this wait can span from several minutes to hours.
+#
+# To prevent 'ANALYZE' from being blocked, we execute the 'pg_terminator' script during statistics collection.
+
+- block:
+    # Monitor the locks and terminate any backend blocking the 'ANALYZE' query (for more than 15 seconds)
+    - name: "Start pg_terminator script: Monitor locks and terminate the 'ANALYZE' blockers"
+      ansible.builtin.shell: |
+        echo $$ > /tmp/pg_terminator.pid
+        for i in {1..{{ vacuumdb_analyze_timeout // 10 }}}; do
+          psql -tAXc "
+            with blocker_pids(pid) as (
+              select unnest(pg_blocking_pids(pid))
+              from pg_stat_activity
+              where
+                query ilike 'ANALYZE %'
+                and wait_event_type = 'Lock'
+            )
+            select
+              clock_timestamp(),
+              pg_terminate_backend(pid),
+              pid,
+              clock_timestamp() - xact_start as xact_age,
+              left(regexp_replace(query, E'[ \\t\\n\\r]+', ' ', 'g'),150) as query
+            from pg_stat_activity
+            where
+              pid in (select pid from blocker_pids)
+              and xact_start < clock_timestamp() - interval '15s';" >> /tmp/pg_terminator.log
+          sleep 10
+        done
+      args:
+        executable: /bin/bash
+      async: "{{ vacuumdb_analyze_timeout }}"  # run the command asynchronously with a maximum duration
+      poll: 0
+      register: pg_terminator_analyze
+      ignore_errors: true  # ignore errors if the task runs longer than 'vacuumdb_analyze_timeout'
+      when: pg_new_version is version('9.6', '>=')
+
+    - name: "Run vacuumdb to analyze the PostgreSQL databases"
+      ansible.builtin.command: >
+        {{ pg_new_bindir }}/vacuumdb -p {{ postgresql_port }}
+        --all --analyze-in-stages --jobs={{ vacuumdb_parallel_jobs }}
+      async: "{{ vacuumdb_analyze_timeout }}"  # run the command asynchronously with a maximum duration
+      poll: 0
+      register: vacuumdb_analyze
+      ignore_errors: true  # ignore errors if the task runs longer than 'vacuumdb_analyze_timeout'
+
+    - name: "Collecting statistics in progress. Wait for the analyze to complete."
+      ansible.builtin.async_status:
+        jid: "{{ vacuumdb_analyze.ansible_job_id }}"
+      register: vacuumdb_analyze_job_result
+      until: vacuumdb_analyze_job_result.finished
+      retries: "{{ (vacuumdb_analyze_timeout | int) // 10 }}"  # max wait time
+      delay: 10
+      ignore_errors: true  # ignore errors if the task runs longer than 'vacuumdb_analyze_timeout'
+
+    - name: "Stop pg_terminator script"
+      ansible.builtin.shell: |
+        pid=$(cat /tmp/pg_terminator.pid)
+        ps -p $pid > /dev/null 2>&1 && kill -9 $pid
+      args:
+        executable: /bin/bash
+      ignore_errors: true
+      when: pg_terminator_analyze is changed
+  when: inventory_hostname in groups['primary']
+
+...
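Because both the ANALYZE and its watchdog are allowed to time out softly, some tables may end the run without fresh statistics. A hypothetical follow-up sketch, not in this patch, that lists such tables (here against the `postgres` database as an example) so they can be analyzed manually:

```yaml
# Hypothetical check (not in this patch): report tables that still have no
# optimizer statistics after the analyze run.
- name: Report tables without optimizer statistics
  ansible.builtin.command: >-
    psql -d postgres -tAXc "select schemaname || '.' || relname
    from pg_stat_user_tables
    where last_analyze is null and last_autoanalyze is null"
  register: not_analyzed_tables
  changed_when: false
  when: inventory_hostname in groups['primary']

- name: Print tables without statistics
  ansible.builtin.debug:
    var: not_analyzed_tables.stdout_lines
  when: inventory_hostname in groups['primary']
```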
diff --git a/roles/upgrade/tasks/stop_services.yml b/roles/upgrade/tasks/stop_services.yml
new file mode 100644
index 000000000..8df9674ab
--- /dev/null
+++ b/roles/upgrade/tasks/stop_services.yml
@@ -0,0 +1,130 @@
+---
+
+- name: "Execute CHECKPOINT before stopping PostgreSQL"
+  ansible.builtin.command: psql -tAXc "CHECKPOINT"
+  async: "{{ pg_start_stop_timeout | int }}"  # run the command asynchronously
+  poll: 0
+  register: checkpoint_result
+
+- name: Wait for the CHECKPOINT to complete
+  ansible.builtin.async_status:
+    jid: "{{ checkpoint_result.ansible_job_id }}"
+  register: checkpoint_job_result
+  until: checkpoint_job_result.finished
+  retries: "{{ (pg_start_stop_timeout | int) // 10 }}"
+  delay: 10
+
+# Wait for a window without high replication lag before stopping PostgreSQL
+- name: "Wait until replication lag is less than {{ max_replication_lag_bytes | human_readable }}"
+  ansible.builtin.command: >-
+    psql -tAXc "select
+    coalesce(max(pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn)),1) as pg_lag_bytes
+    from pg_stat_replication"
+  register: pg_lag_bytes
+  until: pg_lag_bytes.stdout|int < max_replication_lag_bytes|int
+  retries: 60  # max wait time: 2 minutes
+  delay: 2
+  changed_when: false
+  failed_when: false
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_version is version('10', '>=')
+
+# Stop if replication lag is high
+- block:
+    - name: "Print replication lag"
+      ansible.builtin.debug:
+        msg: "Current replication lag: {{ pg_lag_bytes.stdout | int | human_readable }}"
+
+    # rollback
+    - name: Perform rollback
+      ansible.builtin.include_tasks: rollback.yml
+
+    - name: "Replication lag detected"
+      ansible.builtin.fail:
+        msg: "Replication lag is too high in the PostgreSQL cluster. Please try again later."
+  when:
+    - pg_lag_bytes.stdout is defined
+    - pg_lag_bytes.stdout|int >= max_replication_lag_bytes|int
+
+# for compatibility with PostgreSQL 9.x
+- name: "Wait until replication lag is less than {{ max_replication_lag_bytes | human_readable }}"
+  ansible.builtin.command: >-
+    psql -tAXc "select
+    coalesce(max(pg_xlog_location_diff(pg_current_xlog_location(),replay_location)),1) as pg_lag_bytes
+    from pg_stat_replication"
+  register: pg_lag_bytes_9x
+  until: pg_lag_bytes_9x.stdout|int < max_replication_lag_bytes|int
+  retries: 60  # max wait time: 2 minutes
+  delay: 2
+  changed_when: false
+  failed_when: false
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_version is version('10', '<')
+
+# Stop if replication lag is high (for 9.x)
+- block:
+    - name: "Print replication lag"
+      ansible.builtin.debug:
+        msg: "Current replication lag: {{ pg_lag_bytes_9x.stdout | int | human_readable }}"
+
+    # rollback
+    - name: Perform rollback
+      ansible.builtin.include_tasks: rollback.yml
+
+    - name: "Replication lag detected"
+      ansible.builtin.fail:
+        msg: "Replication lag is too high in the PostgreSQL cluster. Please try again later."
+  when:
+    - pg_lag_bytes_9x.stdout is defined
+    - pg_lag_bytes_9x.stdout|int >= max_replication_lag_bytes|int
+
+# if 'pgbouncer_install' is 'true' and 'pgbouncer_pool_pause' is 'true'
+- name: Perform PAUSE on all pgbouncer servers
+  ansible.builtin.include_tasks: pgbouncer_pause.yml
+  when:
+    - pgbouncer_install | bool
+    - pgbouncer_pool_pause | bool
+
+# Stop PostgreSQL (now that replication lag is within the allowed threshold)
+- name: Stop PostgreSQL on the Leader
+  ansible.builtin.command: >-
+    {{ pg_old_bindir }}/pg_ctl -D {{ pg_old_datadir }} stop -m fast -w -t {{ pg_start_stop_timeout }}
+  when:
+    - inventory_hostname in groups['primary']
+
+- name: Stop PostgreSQL on the Replica
+  ansible.builtin.command: >-
+    {{ pg_old_bindir }}/pg_ctl -D {{ pg_old_datadir }} stop -m fast -w -t {{ pg_start_stop_timeout }}
+  when:
+    - inventory_hostname in groups['secondary']
+
+# additional checks using pg_ctl
+- name: "Check if PostgreSQL {{ pg_old_version }} is stopped"
+  ansible.builtin.command: "{{ pg_old_bindir }}/pg_ctl status -D {{ pg_old_datadir }}"
+  register: pg_ctl_status_old_result
+  failed_when: false
+  changed_when: false
+
+- name: "Check if PostgreSQL {{ pg_new_version }} is stopped"
+  ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl status -D {{ pg_new_datadir }}"
+  register: pg_ctl_status_new_result
+  failed_when: false
+  changed_when: false
+
+- name: "Stop PostgreSQL {{ pg_old_version }}"
+  ansible.builtin.command: >
+    {{ pg_old_bindir }}/pg_ctl -D {{ pg_old_datadir }} stop -w -t {{ pg_start_stop_timeout }}
+  when:
+    - pg_ctl_status_old_result is defined
+    - pg_ctl_status_old_result.rc == 0
+
+- name: "Stop PostgreSQL {{ pg_new_version }}"
+  ansible.builtin.command: >
+    {{ pg_new_bindir }}/pg_ctl -D {{ pg_new_datadir }} stop -w -t {{ pg_start_stop_timeout }}
+  when:
+    - pg_ctl_status_new_result is defined
+    - pg_ctl_status_new_result.rc == 0
+
+...
diff --git a/roles/upgrade/tasks/update_config.yml b/roles/upgrade/tasks/update_config.yml
new file mode 100644
index 000000000..c67812fae
--- /dev/null
+++ b/roles/upgrade/tasks/update_config.yml
@@ -0,0 +1,162 @@
+---
+# Prepare the parameters for Patroni.
+# Update the directory paths to the new version of PostgreSQL
+- name: "Edit patroni.yml | update parameters: data_dir, bin_dir, config_dir"
+  ansible.builtin.replace:
+    path: "{{ patroni_config_file }}"
+    regexp: "{{ item.regexp }}"
+    replace: "{{ item.replace }}"
+  loop:
+    - { regexp: 'data_dir: {{ pg_old_datadir }}', replace: 'data_dir: {{ pg_new_datadir }}' }
+    - { regexp: 'bin_dir: {{ pg_old_bindir }}', replace: 'bin_dir: {{ pg_new_bindir }}' }
+    - { regexp: 'config_dir: {{ pg_old_confdir }}', replace: 'config_dir: {{ pg_new_confdir }}' }
+  loop_control:
+    label: '{{ item.replace }}'
+
+# To support upgrades in the Patroni Standby Cluster.
+- block:  # standby_cluster
+    - name: "Edit patroni.yml | check if the 'standby_cluster' parameter is specified"
+      ansible.builtin.command: grep standby_cluster {{ patroni_config_file }}
+      register: standby_cluster_output
+      changed_when: false
+      failed_when: false
+
+    # The standby_cluster parameter will be removed from the bootstrap.dcs section (if it exists),
+    # along with all the text between 'standby_cluster' and the next 'initdb' parameter
+    - name: "Edit patroni.yml | remove parameters: standby_cluster (if it exists)"
+      ansible.builtin.replace:
+        path: "{{ patroni_config_file }}"
+        regexp: '^\s*standby_cluster:[^\n]*\n(.|\n)*?initdb:'
+        replace: '  initdb:'
+      when: standby_cluster_output.stdout | length > 0
+  when:
+    - patroni_standby_cluster.host is defined
+    - patroni_standby_cluster.host | length > 0
+
+# Prepare the parameters for PostgreSQL (removed or renamed parameters).
+
+- block:  # replacement_sort_tuples (removed in PostgreSQL 11)
+    # check if the replacement_sort_tuples parameter is specified in the patroni.yml
+    - name: "Edit patroni.yml | check if the 'replacement_sort_tuples' parameter is specified"
+      ansible.builtin.command: grep replacement_sort_tuples {{ patroni_config_file }}
+      register: replacement_sort_tuples_output
+      changed_when: false
+      failed_when: false
+
+    # if defined, remove the replacement_sort_tuples parameter from the patroni.yml
+    - name: "Edit patroni.yml | remove parameter: 'replacement_sort_tuples'"
+      ansible.builtin.lineinfile:
+        path: "{{ patroni_config_file }}"
+        regexp: '^(\s*)replacement_sort_tuples:.*'
+        state: absent
+      when: replacement_sort_tuples_output.stdout | length > 0
+  when:
+    - pg_old_version|int <= 10 and pg_new_version|int >= 11
+
+- block:  # default_with_oids (removed in PostgreSQL 12)
+    # check if the default_with_oids parameter is specified in the patroni.yml
+    - name: "Edit patroni.yml | check if the 'default_with_oids' parameter is specified"
+      ansible.builtin.command: grep default_with_oids {{ patroni_config_file }}
+      register: default_with_oids_output
+      changed_when: false
+      failed_when: false
+
+    # if defined, remove the default_with_oids parameter from the patroni.yml
+    - name: "Edit patroni.yml | remove parameter: 'default_with_oids'"
+      ansible.builtin.lineinfile:
+        path: "{{ patroni_config_file }}"
+        regexp: '^(\s*)default_with_oids:.*'
+        state: absent
+      when: default_with_oids_output.stdout | length > 0
+  when:
+    - pg_old_version|int <= 11 and pg_new_version|int >= 12
+
+- block:  # wal_keep_segments (removed in PostgreSQL 13)
+    # check if the wal_keep_segments parameter is specified in the patroni.yml
+    - name: "Edit patroni.yml | check if the 'wal_keep_segments' parameter is specified"
+      ansible.builtin.shell: >
+        set -o pipefail;
+        grep wal_keep_segments {{ patroni_config_file }} | awk '{print $2}' | tail -n 1
+      args:
+        executable: /bin/bash
+      register: wal_keep_segments_output
+      changed_when: false
+      failed_when: false
+
+    # if defined, replace it with wal_keep_size, converting the value to MB.
+ - name: "Edit patroni.yml | replace parameter: 'wal_keep_segments' to 'wal_keep_size'" + ansible.builtin.replace: + path: "{{ patroni_config_file }}" + regexp: 'wal_keep_segments: ([0-9]+)' + replace: "wal_keep_size: {{ (wal_keep_segments_output.stdout | int * 16) | string + 'MB' }}" + when: wal_keep_segments_output.stdout|int > 0 + when: + - pg_old_version|int <= 12 and pg_new_version|int >= 13 + +- block: # operator_precedence_warning (removed in the PG 14) + # check if the operator_precedence_warning parameter is specified in the patroni.yml + - name: "Edit patroni.yml | check if the 'operator_precedence_warning' parameter is specified" + ansible.builtin.command: grep operator_precedence_warning {{ patroni_config_file }} + register: operator_precedence_warning_output + changed_when: false + failed_when: false + + # if defined, remove the operator_precedence_warning parameter from the patroni.yml + - name: "Edit patroni.yml | remove parameter: 'operator_precedence_warning'" + ansible.builtin.lineinfile: + path: "{{ patroni_config_file }}" + regexp: '^(\s*)operator_precedence_warning:.*' + state: absent + when: operator_precedence_warning_output.stdout | length > 0 + when: + - pg_old_version|int <= 13 and pg_new_version|int >= 14 + +- block: # vacuum_cleanup_index_scale_factor (removed in the PG 14) + # check if the vacuum_cleanup_index_scale_factor parameter is specified in the patroni.yml + - name: "Edit patroni.yml | check if the 'vacuum_cleanup_index_scale_factor' parameter is specified" + ansible.builtin.command: grep vacuum_cleanup_index_scale_factor {{ patroni_config_file }} + register: vacuum_cleanup_index_scale_factor_output + changed_when: false + failed_when: false + + # if defined, remove the vacuum_cleanup_index_scale_factor parameter from the patroni.yml + - name: "Edit patroni.yml | remove parameter: 'vacuum_cleanup_index_scale_factor'" + ansible.builtin.lineinfile: + path: "{{ patroni_config_file }}" + regexp: '^(\s*)vacuum_cleanup_index_scale_factor:.*' + state: absent + when: vacuum_cleanup_index_scale_factor_output.stdout | length > 0 + when: + - pg_old_version|int <= 13 and pg_new_version|int >= 14 + +- block: # stats_temp_directory (removed in the PG 15) + # check if the stats_temp_directory parameter is specified in the patroni.yml + - name: "Edit patroni.yml | check if the 'stats_temp_directory' parameter is specified" + ansible.builtin.command: grep stats_temp_directory {{ patroni_config_file }} + register: stats_temp_directory_output + changed_when: false + failed_when: false + + # if defined, remove the stats_temp_directory parameter from the patroni.yml + - name: "Edit patroni.yml | remove parameter: 'stats_temp_directory'" + ansible.builtin.lineinfile: + path: "{{ patroni_config_file }}" + regexp: '^(\s*)stats_temp_directory:.*' + state: absent + when: stats_temp_directory_output.stdout | length > 0 + when: + - pg_old_version|int <= 14 and pg_new_version|int >= 15 + +# TODO: Prepare the parameters for PostgreSQL 16 and etc. + +# Copy the pg_hba.conf file to a new PostgreSQL to save pg_hba rules. +- name: "Copy pg_hba.conf to {{ pg_new_confdir }}" + ansible.builtin.copy: + src: "{{ pg_old_confdir }}/pg_hba.conf" + dest: "{{ pg_new_confdir }}" + owner: postgres + mode: '0600' + force: true + remote_src: true + +... 
diff --git a/roles/upgrade/tasks/update_extensions.yml b/roles/upgrade/tasks/update_extensions.yml
new file mode 100644
index 000000000..fe9775dcd
--- /dev/null
+++ b/roles/upgrade/tasks/update_extensions.yml
@@ -0,0 +1,79 @@
+---
+
+- name: "Get list of installed PostgreSQL extensions (database: {{ pg_target_dbname }})"
+  ansible.builtin.command: >-
+    psql -d {{ pg_target_dbname }} -tAXc "select
+    extname from pg_catalog.pg_extension"
+  register: pg_installed_extensions
+  changed_when: false
+  when:
+    - inventory_hostname in groups['primary']
+
+- name: "Get list of old PostgreSQL extensions (database: {{ pg_target_dbname }})"
+  ansible.builtin.command: >-
+    psql -d {{ pg_target_dbname }} -tAXc "select
+    extname from pg_catalog.pg_extension e
+    join pg_catalog.pg_available_extensions ae on extname = ae.name
+    where installed_version <> default_version"
+  register: pg_old_extensions
+  changed_when: false
+  when:
+    - inventory_hostname in groups['primary']
+
+# if there are no old extensions
+- name: "The extensions are up-to-date (database: {{ pg_target_dbname }})"
+  ansible.builtin.debug:
+    msg:
+      - "The extension versions are up-to-date for the database {{ pg_target_dbname }}"
+      - "No update is required."
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_extensions.stdout_lines | length < 1
+
+# if pg_stat_kcache is not installed
+# excluding 'pg_repack' (if it exists), as it requires re-creation to be updated
+- name: "Update old PostgreSQL extensions (database: {{ pg_target_dbname }})"
+  ansible.builtin.command: >-
+    psql -d {{ pg_target_dbname }} -tAXc "ALTER EXTENSION {{ item }} UPDATE"
+  ignore_errors: true  # show the error and continue the playbook execution
+  loop: "{{ pg_old_extensions.stdout_lines | reject('match', '^pg_repack$') | list }}"
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_extensions.stdout_lines | length > 0
+    - (not 'pg_stat_kcache' in pg_installed_extensions.stdout_lines)
+
+# if pg_stat_kcache is installed
+- block:
+    # excluding 'pg_stat_statements' and 'pg_stat_kcache', because pg_stat_kcache depends on pg_stat_statements (both will be re-created)
+    - name: "Update old PostgreSQL extensions (database: {{ pg_target_dbname }})"
+      ansible.builtin.command: >-
+        psql -d {{ pg_target_dbname }} -tAXc "ALTER EXTENSION {{ item }} UPDATE"
+      ignore_errors: true  # show the error and continue the playbook execution
+      loop: "{{ pg_old_extensions.stdout_lines | reject('match', '^(pg_repack|pg_stat_statements|pg_stat_kcache)$') | list }}"
+
+    # re-create 'pg_stat_statements' and 'pg_stat_kcache' if an update is required
+    - name: "Recreate old pg_stat_statements and pg_stat_kcache extensions to update (database: {{ pg_target_dbname }})"
+      ansible.builtin.command: >-
+        psql -d {{ pg_target_dbname }} -tAXc "
+        DROP EXTENSION pg_stat_statements CASCADE;
+        CREATE EXTENSION pg_stat_statements;
+        CREATE EXTENSION pg_stat_kcache"
+      ignore_errors: true  # show the error and continue the playbook execution
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_extensions.stdout_lines | length > 0
+    - ('pg_stat_statements' in pg_old_extensions.stdout_lines or 'pg_stat_kcache' in pg_old_extensions.stdout_lines)
+    - ('pg_stat_kcache' in pg_installed_extensions.stdout_lines)
+
+# re-create the 'pg_repack' extension if it exists and an update is required
+- name: "Recreate old pg_repack extension to update (database: {{ pg_target_dbname }})"
+  ansible.builtin.command: >-
+    psql -d {{ pg_target_dbname }} -tAXc "
+    DROP EXTENSION pg_repack;
+    CREATE EXTENSION pg_repack;"
+  ignore_errors: true  # show the error and continue the playbook execution
+  when:
+    - inventory_hostname in groups['primary']
+    - (pg_old_extensions.stdout_lines | length > 0 and 'pg_repack' in pg_old_extensions.stdout_lines)
+
+...
diff --git a/roles/upgrade/tasks/upgrade_check.yml b/roles/upgrade/tasks/upgrade_check.yml
new file mode 100644
index 000000000..7b4824ad3
--- /dev/null
+++ b/roles/upgrade/tasks/upgrade_check.yml
@@ -0,0 +1,48 @@
+---
+
+- name: Get the current shared_preload_libraries settings
+  ansible.builtin.command: psql -tAXc "show shared_preload_libraries"
+  changed_when: false
+  register: pg_shared_preload_libraries_result
+  when:
+    - inventory_hostname in groups['primary']
+
+- name: 'Set the variable: pg_shared_preload_libraries_value'
+  ansible.builtin.set_fact:
+    pg_shared_preload_libraries_value: "{{ pg_shared_preload_libraries_result.stdout }}"
+  when:
+    - inventory_hostname in groups['primary']
+
+# In the --new-options argument, an inline if condition checks if 'timescaledb' is present in the pg_shared_preload_libraries_value.
+# If it is, it appends "-c timescaledb.restoring='on'" to the --new-options argument.
+- name: Verify the two clusters are compatible (pg_upgrade --check)
+  ansible.builtin.command: >
+    {{ pg_new_bindir }}/pg_upgrade
+    --username={{ pg_install_user.stdout }}
+    --old-bindir {{ pg_old_bindir }}
+    --new-bindir {{ pg_new_bindir }}
+    --old-datadir {{ pg_old_datadir }}
+    --new-datadir {{ pg_new_datadir }}
+    --old-options "-c config_file={{ pg_old_confdir }}/postgresql.conf"
+    --new-options "-c config_file={{ pg_new_confdir }}/postgresql.conf {{ shared_preload_libraries }} {{ timescaledb_restoring }}"
+    --jobs={{ ansible_processor_vcpus }}
+    --link
+    --check
+  args:
+    chdir: "{{ pg_upper_datadir }}"
+  vars:
+    shared_preload_libraries: "-c shared_preload_libraries='{{ pg_shared_preload_libraries_value }}'"
+    timescaledb_restoring: "{{ \"-c timescaledb.restoring='on'\" if 'timescaledb' in pg_shared_preload_libraries_value else '' }}"
+  failed_when: false
+  register: pg_upgrade_check_result
+  when:
+    - inventory_hostname in groups['primary']
+
+- name: Print the result of the pg_upgrade check
+  ansible.builtin.debug:
+    var: pg_upgrade_check_result.stdout_lines
+  failed_when: "'Clusters are compatible' not in pg_upgrade_check_result.stdout"
+  when:
+    - inventory_hostname in groups['primary']
+
+...
diff --git a/roles/upgrade/tasks/upgrade_primary.yml b/roles/upgrade/tasks/upgrade_primary.yml
new file mode 100644
index 000000000..37b2a59b7
--- /dev/null
+++ b/roles/upgrade/tasks/upgrade_primary.yml
@@ -0,0 +1,49 @@
+---
+# Upgrade with pg_upgrade (hard-links)
+
+# In the --new-options argument, an inline if condition checks if 'timescaledb' is present in the pg_shared_preload_libraries_value.
+# If it is, it appends "-c timescaledb.restoring='on'" to the --new-options argument.
+- name: "Upgrade the PostgreSQL to version {{ pg_new_version }} on the Primary (using pg_upgrade --link)" + ansible.builtin.command: > + {{ pg_new_bindir }}/pg_upgrade + --username={{ pg_install_user.stdout }} + --old-bindir {{ pg_old_bindir }} + --new-bindir {{ pg_new_bindir }} + --old-datadir {{ pg_old_datadir }} + --new-datadir {{ pg_new_datadir }} + --old-options "-c config_file={{ pg_old_confdir }}/postgresql.conf" + --new-options "-c config_file={{ pg_new_confdir }}/postgresql.conf {{ shared_preload_libraries }} {{ timescaledb_restoring }}" + --jobs={{ ansible_processor_vcpus }} + --link + args: + chdir: "{{ pg_upper_datadir }}" + vars: + shared_preload_libraries: "-c shared_preload_libraries='{{ pg_shared_preload_libraries_value }}'" + timescaledb_restoring: "{{ \"-c timescaledb.restoring='on'\" if 'timescaledb' in pg_shared_preload_libraries_value else '' }}" + register: pg_upgrade_result + when: + - inventory_hostname in groups['primary'] + +# If the length of the pg_upgrade_result.stdout_lines is greater than 100 lines, +# the upgrade_output variable will include the first 70 lines, an ellipsis (...), +# and the last 30 lines of the pg_upgrade_result.stdout_lines. +- name: Print the result of the pg_upgrade + ansible.builtin.debug: + msg: + - "{{ pg_upgrade_result.stdout_lines[:70] }}" + - " ... " + - "{{ pg_upgrade_result.stdout_lines[-30:] }}" + when: + - inventory_hostname in groups['primary'] + - pg_upgrade_result.stdout_lines | length > 100 + +# Otherwise, it will include all lines of the pg_upgrade_result.stdout_lines. +- name: Print the result of the pg_upgrade + ansible.builtin.debug: + msg: + - "{{ pg_upgrade_result.stdout_lines }}" + when: + - inventory_hostname in groups['primary'] + - pg_upgrade_result.stdout_lines | length <= 100 + +... diff --git a/roles/upgrade/tasks/upgrade_secondary.yml b/roles/upgrade/tasks/upgrade_secondary.yml new file mode 100644 index 000000000..8f3afcec9 --- /dev/null +++ b/roles/upgrade/tasks/upgrade_secondary.yml @@ -0,0 +1,134 @@ +--- +# Upgrade with rsync (hard-links) + +# This task performs the upgrade of PostgreSQL on the replica servers using the RSync utility. +# It follows these steps: +# 1. Retrieve the list of target secondary servers from the inventory, which are the servers where the upgrade will be performed. +# 2. Count the number of target secondary servers to determine the parallel execution limit. +# 3. Use xargs to execute the RSync command in parallel for each target secondary server. 
+
+- name: Make sure that the new data directory "{{ pg_new_datadir }}" is empty on the Replica
+  ansible.builtin.file:
+    path: "{{ pg_new_datadir }}"
+    state: "{{ item }}"
+    mode: "0700"
+    group: postgres
+    owner: postgres
+  loop:
+    - absent
+    - directory
+  when:
+    - inventory_hostname in groups['secondary']
+
+# If the source and target directories are inside versioned directories
+# (example: /pgdata/{{ pg_old_version }}/main -> /pgdata/{{ pg_new_version }}/main)
+- block:
+    - name: "Upgrade PostgreSQL on the Replica (using rsync --hard-links)"
+      vars:
+        secondary_servers: "{{ groups['secondary'] | join('\n') }}"
+        secondary_count: "{{ groups['secondary'] | length }}"
+      ansible.builtin.shell: |
+        set -o pipefail;
+        echo -e "{{ secondary_servers }}" | xargs -I {} -P "{{ secondary_count }}" -n 1 \
+        rsync -e 'ssh -o StrictHostKeyChecking=no' --archive --delete --hard-links --size-only --no-inc-recursive \
+        {{ pg_upper_datadir }}/{{ pg_old_version }} {{ pg_upper_datadir }}/{{ pg_new_version }} {}:{{ pg_upper_datadir }}
+      args:
+        executable: /bin/bash
+      async: 3600  # run the command asynchronously with a maximum duration of 1 hour
+      poll: 0
+      register: rsync_result_1
+
+    - name: Wait for the rsync to complete.
+      ansible.builtin.async_status:
+        jid: "{{ rsync_result_1.ansible_job_id }}"
+      register: rsync_1_job_result
+      until: rsync_1_job_result.finished
+      retries: 1800
+      delay: 2
+  become: true
+  become_user: postgres
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_datadir|dirname == pg_upper_datadir + '/' + pg_old_version
+    - pg_new_datadir|dirname == pg_upper_datadir + '/' + pg_new_version
+
+# If the source and target directories are non-versioned directories
+# (example: /pgdata/main -> /pgdata/main)
+- block:
+    - name: "Upgrade PostgreSQL on the Replica (using rsync --hard-links)"
+      vars:
+        secondary_servers: "{{ groups['secondary'] | join('\n') }}"
+        secondary_count: "{{ groups['secondary'] | length }}"
+      ansible.builtin.shell: |
+        set -o pipefail;
+        echo -e "{{ secondary_servers }}" | xargs -I {} -P "{{ secondary_count }}" -n 1 \
+        rsync -e 'ssh -o StrictHostKeyChecking=no' --archive --delete --hard-links --size-only --no-inc-recursive \
+        {{ pg_old_datadir }} {{ pg_new_datadir }} {}:{{ pg_upper_datadir }}
+      args:
+        executable: /bin/bash
+      async: 3600  # run the command asynchronously with a maximum duration of 1 hour
+      poll: 0
+      register: rsync_result_2
+
+    - name: Wait for the rsync to complete.
+      ansible.builtin.async_status:
+        jid: "{{ rsync_result_2.ansible_job_id }}"
+      register: rsync_2_job_result
+      until: rsync_2_job_result.finished
+      retries: 1800
+      delay: 2
+  become: true
+  become_user: postgres
+  when:
+    - inventory_hostname in groups['primary']
+    - pg_old_datadir|dirname != pg_upper_datadir + '/' + pg_old_version
+    - pg_new_datadir|dirname != pg_upper_datadir + '/' + pg_new_version
+
+# Tablespaces (if any exist)
+- block:
+    - name: "Upgrade the PostgreSQL tablespaces on the Replica (using rsync --hard-links)"
+      vars:
+        secondary_servers: "{{ groups['secondary'] | join('\n') }}"
+        secondary_count: "{{ groups['secondary'] | length }}"
+      ansible.builtin.shell: |
+        set -o pipefail;
+        for tablespace_location in {{ tablespace_location.stdout_lines | join(' ') }};
+        do
+          old_tablespace_dir_count=$(ls -d ${tablespace_location}/PG_{{ pg_old_version }}_* | wc -l)
+          new_tablespace_dir_count=$(ls -d ${tablespace_location}/PG_{{ pg_new_version }}_* | wc -l)
+
+          if [ $old_tablespace_dir_count -ne 1 ] || [ $new_tablespace_dir_count -ne 1 ]; then
+            echo "Expected exactly one matching directory for each version, \
+            but found $old_tablespace_dir_count for the old version and $new_tablespace_dir_count for the new version. \
+            Skipping rsync."
+            exit 1
+          fi
+
+          old_tablespace_dir=$(ls -d ${tablespace_location}/PG_{{ pg_old_version }}_*)
+          new_tablespace_dir=$(ls -d ${tablespace_location}/PG_{{ pg_new_version }}_*)
+
+          echo -e "{{ secondary_servers }}" | xargs -I {} -P "{{ secondary_count }}" -n 1 \
+          rsync -e 'ssh -o StrictHostKeyChecking=no' --archive --delete --hard-links --size-only --no-inc-recursive \
+          "${old_tablespace_dir}" "${new_tablespace_dir}" {}:"${tablespace_location}"
+        done
+      args:
+        executable: /bin/bash
+      async: 3600  # run the command asynchronously with a maximum duration of 1 hour
+      poll: 0
+      register: rsync_tablespace_result
+
+    - name: Wait for the tablespaces rsync to complete.
+      ansible.builtin.async_status:
+        jid: "{{ rsync_tablespace_result.ansible_job_id }}"
+      register: rsync_tablespace_job_result
+      until: rsync_tablespace_job_result.finished
+      retries: 1800
+      delay: 2
+  become: true
+  become_user: postgres
+  when:
+    - inventory_hostname in groups['primary']
+    - tablespace_location.stdout_lines is defined
+    - tablespace_location.stdout_lines | length > 0
+
+...
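The key design point in the rsync steps above is that the old and new data directories are sent in a single rsync invocation with `--hard-links`, so the hard links created by `pg_upgrade --link` are preserved on the replica and only the small set of genuinely new files is transferred. A cautious operator might preview the transfer first; this is a hypothetical pre-flight sketch, not part of this patch, reusing the role's variables and the first host of the `secondary` group:

```yaml
# Hypothetical pre-flight (not in this patch): preview what rsync would send to
# the first replica. --dry-run makes no changes; --stats summarizes the transfer.
- name: Preview the rsync transfer to the first replica (dry run)
  become: true
  become_user: postgres
  ansible.builtin.command: >-
    rsync -e 'ssh -o StrictHostKeyChecking=no' --dry-run --stats
    --archive --delete --hard-links --size-only --no-inc-recursive
    {{ pg_upper_datadir }}/{{ pg_old_version }} {{ pg_upper_datadir }}/{{ pg_new_version }}
    {{ groups['secondary'][0] }}:{{ pg_upper_datadir }}
  register: rsync_preview
  changed_when: false
  when: inventory_hostname in groups['primary']
```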
diff --git a/roles/upgrade/templates/haproxy-no-http-checks.cfg.j2 b/roles/upgrade/templates/haproxy-no-http-checks.cfg.j2 new file mode 100644 index 000000000..687b1c3b6 --- /dev/null +++ b/roles/upgrade/templates/haproxy-no-http-checks.cfg.j2 @@ -0,0 +1,77 @@ +global + maxconn {{ haproxy_maxconn.global }} + log /dev/log local0 + log /dev/log local1 notice + chroot /var/lib/haproxy + stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners + stats timeout 30s + user haproxy + group haproxy + daemon + +defaults + mode tcp + log global + retries 2 + timeout queue 5s + timeout connect 5s + timeout client {{ haproxy_timeout.client }} + timeout server {{ haproxy_timeout.server }} + timeout check 15s + +listen stats + mode http + bind {{ hostvars[inventory_hostname]['inventory_hostname'] }}:{{ haproxy_listen_port.stats }} + stats enable + stats uri / + +listen master +{% if cluster_vip is defined and cluster_vip | length > 0 %} + bind {{ cluster_vip }}:{{ haproxy_listen_port.master }} +{% else %} + bind {{ hostvars[inventory_hostname]['inventory_hostname'] }}:{{ haproxy_listen_port.master }} +{% endif %} + maxconn {{ haproxy_maxconn.master }} + option tcplog +{% for host in groups['primary'] %} +server {{ hostvars[host]['ansible_hostname'] }} {{ hostvars[host]['inventory_hostname'] }}:{{ pgbouncer_listen_port }} +{% endfor %} + +listen replicas +{% if cluster_vip is defined and cluster_vip | length > 0 %} + bind {{ cluster_vip }}:{{ haproxy_listen_port.replicas }} +{% else %} + bind {{ hostvars[inventory_hostname]['inventory_hostname'] }}:{{ haproxy_listen_port.replicas }} +{% endif %} + maxconn {{ haproxy_maxconn.replica }} + option tcplog + balance roundrobin +{% for host in groups['secondary'] %} +server {{ hostvars[host]['ansible_hostname'] }} {{ hostvars[host]['inventory_hostname'] }}:{{ pgbouncer_listen_port }} +{% endfor %} + +listen replicas_sync +{% if cluster_vip is defined and cluster_vip | length > 0 %} + bind {{ cluster_vip }}:{{ haproxy_listen_port.replicas_sync }} +{% else %} + bind {{ hostvars[inventory_hostname]['inventory_hostname'] }}:{{ haproxy_listen_port.replicas_sync }} +{% endif %} + maxconn {{ haproxy_maxconn.replica }} + option tcplog + balance roundrobin +{% for host in groups['secondary'] %} +server {{ hostvars[host]['ansible_hostname'] }} {{ hostvars[host]['inventory_hostname'] }}:{{ pgbouncer_listen_port }} +{% endfor %} + +listen replicas_async +{% if cluster_vip is defined and cluster_vip | length > 0 %} + bind {{ cluster_vip }}:{{ haproxy_listen_port.replicas_async }} +{% else %} + bind {{ hostvars[inventory_hostname]['inventory_hostname'] }}:{{ haproxy_listen_port.replicas_async }} +{% endif %} + maxconn {{ haproxy_maxconn.replica }} + option tcplog + balance roundrobin +{% for host in groups['secondary'] %} +server {{ hostvars[host]['ansible_hostname'] }} {{ hostvars[host]['inventory_hostname'] }}:{{ pgbouncer_listen_port }} +{% endfor %} diff --git a/vars/Debian.yml b/vars/Debian.yml index ec5b9d3e8..5b82b496e 100644 --- a/vars/Debian.yml +++ b/vars/Debian.yml @@ -2,8 +2,10 @@ # PostgreSQL variables postgresql_cluster_name: "main" -postgresql_data_dir: "/var/lib/postgresql/{{ postgresql_version }}/{{ postgresql_cluster_name }}" # You can specify custom data dir path -postgresql_wal_dir: "" # custom WAL dir path (symlink will be created) [optional] +# You can specify custom data dir path. 
Example: "/pgdata/{{ postgresql_version }}/main" +postgresql_data_dir: "/var/lib/postgresql/{{ postgresql_version }}/{{ postgresql_cluster_name }}" +# You can specify custom WAL dir path. Example: "/pgwal/{{ postgresql_version }}/pg_wal" +postgresql_wal_dir: "" # if defined, symlink will be created [optional] postgresql_conf_dir: "/etc/postgresql/{{ postgresql_version }}/{{ postgresql_cluster_name }}" postgresql_bin_dir: "/usr/lib/postgresql/{{ postgresql_version }}/bin" postgresql_log_dir: "/var/log/postgresql" @@ -15,8 +17,6 @@ postgresql_home_dir: "/var/lib/postgresql" postgresql_stats_temp_directory_path: "/var/lib/pgsql_stats_tmp" # or 'none' postgresql_stats_temp_directory_size: "1024m" -postgresql_version_terse: "{{ postgresql_version | replace('.', '') }}" - # Repository apt_repository_keys: - key: "https://www.postgresql.org/media/keys/ACCC4CF8.asc" # postgresql repository apt key diff --git a/vars/RedHat.yml b/vars/RedHat.yml index e1e414c69..ebe7954fe 100644 --- a/vars/RedHat.yml +++ b/vars/RedHat.yml @@ -1,8 +1,10 @@ --- # PostgreSQL variables -postgresql_data_dir: "/var/lib/pgsql/{{ postgresql_version }}/data" # You can specify custom data dir path -postgresql_wal_dir: "" # custom WAL dir path (symlink will be created) [optional] +# You can specify custom data dir path. Example: "/pgdata/{{ postgresql_version }}/data" +postgresql_data_dir: "/var/lib/pgsql/{{ postgresql_version }}/data" +# You can specify custom WAL dir path. Example: "/pgwal/{{ postgresql_version }}/pg_wal" +postgresql_wal_dir: "" # if defined, symlink will be created [optional] postgresql_conf_dir: "{{ postgresql_data_dir }}" postgresql_bin_dir: "/usr/pgsql-{{ postgresql_version }}/bin" postgresql_log_dir: "/var/log/postgresql" @@ -14,8 +16,6 @@ postgresql_home_dir: "/var/lib/pgsql" postgresql_stats_temp_directory_path: "/var/lib/pgsql_stats_tmp" # or 'none' postgresql_stats_temp_directory_size: "1024m" -postgresql_version_terse: "{{ postgresql_version | replace('.', '') }}" - # Repository yum_repository: [] # - name: "repo name" @@ -72,11 +72,11 @@ glibc_langpack: # - "glibc-langpack-de" postgresql_packages: - - postgresql{{ postgresql_version_terse }} - - postgresql{{ postgresql_version_terse }}-server - - postgresql{{ postgresql_version_terse }}-contrib - - postgresql{{ postgresql_version_terse }}-devel -# - pg_repack{{ postgresql_version_terse }} + - postgresql{{ postgresql_version | replace('.', '') }} + - postgresql{{ postgresql_version | replace('.', '') }}-server + - postgresql{{ postgresql_version | replace('.', '') }}-contrib + - postgresql{{ postgresql_version | replace('.', '') }}-devel +# - pg_repack{{ postgresql_version | replace('.', '') }} # Extra packages etcd_package_repo: "https://github.com/etcd-io/etcd/releases/download/v{{ etcd_version }}/etcd-v{{ etcd_version }}-linux-amd64.tar.gz" diff --git a/vars/main.yml b/vars/main.yml index 7a17801f4..87b899d83 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -281,6 +281,7 @@ postgresql_pg_ident: [] # the password file (~/.pgpass) postgresql_pgpass: - "localhost:{{ postgresql_port }}:*:{{ patroni_superuser_username }}:{{ patroni_superuser_password }}" + - "localhost:{{ pgbouncer_listen_port }}:*:{{ patroni_superuser_username }}:{{ patroni_superuser_password }}" - "{{ inventory_hostname }}:{{ postgresql_port }}:*:{{ patroni_superuser_username }}:{{ patroni_superuser_password }}" # - hostname:port:database:username:password diff --git a/vars/upgrade.yml b/vars/upgrade.yml new file mode 100644 index 000000000..b1ac263a5 --- /dev/null +++ 
b/vars/upgrade.yml
@@ -0,0 +1,92 @@
+# yamllint disable rule:line-length
+---
+# Variables for the pg_upgrade.yml playbook
+
+# Note:
+# There is no need to plan for additional disk space, because pg_upgrade uses hard links instead of copying files.
+# However, the pg_old_datadir and pg_new_datadir must be located within the same top-level directory (pg_upper_datadir).
+# https://www.postgresql.org/docs/current/pgupgrade.html
+
+# PostgreSQL versions
+pg_old_version: ""  # specify the current (old) version of PostgreSQL
+pg_new_version: ""  # specify the target version of PostgreSQL for the upgrade
+
+# Paths for old and new PostgreSQL versions
+# Adjust these variables if the paths differ from the default values.
+
+# Directory containing binaries for the old PostgreSQL version.
+pg_old_bindir: "{{ postgresql_bin_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_old_version) }}"
+# Data directory path for the old PostgreSQL version.
+pg_old_datadir: "{{ postgresql_data_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_old_version) }}"
+# Configuration directory path for the old PostgreSQL version.
+pg_old_confdir: "{{ postgresql_conf_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_old_version) }}"
+
+# Directory containing binaries for the new PostgreSQL version.
+pg_new_bindir: "{{ postgresql_bin_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_new_version) }}"
+# Data directory path for the new PostgreSQL version.
+pg_new_datadir: "{{ postgresql_data_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_new_version) }}"
+# Configuration directory path for the new PostgreSQL version.
+pg_new_confdir: "{{ postgresql_conf_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_new_version) }}"
+# Custom WAL directory for the new PostgreSQL version (symlink will be created) [optional].
+pg_new_wal_dir: "{{ postgresql_wal_dir | regex_replace('(/$)', '') | regex_replace(postgresql_version, pg_new_version) }}"
+
+# pg_upper_datadir: Specifies the top-level directory containing both the old and new PostgreSQL data directories.
+# The variable is derived from pg_new_datadir by removing any trailing slash and taking its grandparent directory.
+# Adjust it if the data directory location differs from the default.
+# Example: /var/lib/postgresql, /var/lib/pgsql, /pgdata
+pg_upper_datadir: "{{ pg_new_datadir | regex_replace('/$', '') | dirname | dirname }}"
+
+# List of package names for the new PostgreSQL version to be installed.
+# The list is automatically derived from the 'postgresql_packages' variable.
+pg_new_packages: "{{ postgresql_packages | regex_replace(postgresql_version, pg_new_version) }}"
+
+# Alternatively, you can explicitly specify the list of new packages to install.
+# This gives you more control and should be used if the automatic detection does not meet your needs.
+# Uncomment and modify the following lines according to your requirements. Example:
+# pg_new_packages:
+#   - postgresql-{{ pg_new_version }}
+#   - postgresql-client-{{ pg_new_version }}
+#   - postgresql-server-dev-{{ pg_new_version }}
+#   - postgresql-contrib-{{ pg_new_version }}
+#   - postgresql-{{ pg_new_version }}-repack
+
+pg_old_packages_remove: true  # remove old postgresql packages after the upgrade
+
+# Timeout (in seconds) to be used when starting/stopping PostgreSQL during the upgrade.
+pg_start_stop_timeout: 1800  # 30 minutes
+
+# Patroni configuration file path.
+patroni_config_file: /etc/patroni/patroni.yml
+
+schema_compatibility_check: true  # If 'true', a compatibility check of the database schema with the new PostgreSQL version will be performed before the upgrade.
+schema_compatibility_check_port: "{{ (postgresql_port | int) + 1 }}"  # Port used to run a temporary PostgreSQL instance for the schema compatibility check.
+schema_compatibility_check_timeout: 3600  # Maximum duration (in seconds) for the compatibility check (using pg_dumpall --schema-only).
+
+vacuumdb_parallel_jobs: "{{ ansible_processor_vcpus }}"  # use all CPU cores
+vacuumdb_analyze_timeout: 3600  # seconds. Maximum duration of the analyze command (a soft limit; exceeding it won't halt the playbook).
+update_extensions: true  # if 'true', try to update extensions automatically
+
+# Do not perform the upgrade if these thresholds are exceeded
+max_replication_lag_bytes: 10485760  # 10 MiB - maximum allowed replication lag in bytes
+max_transaction_sec: 15  # maximum allowed transaction duration in seconds
+
+# (optional) Copy any files located in the "files" directory to all servers
+# example for PostgreSQL Full-Text Search (FTS) files
+copy_files_to_all_server: []
+# - { src: "files/numbers.syn", dest: "/usr/share/postgresql/{{ pg_new_version }}/tsearch_data/numbers.syn", owner: "root", group: "root", mode: "0644" }
+# - { src: "files/part_of_speech_russian.stop", dest: "/usr/share/postgresql/{{ pg_new_version }}/tsearch_data/part_of_speech_russian.stop", owner: "root", group: "root", mode: "0644" }
+# - { src: "files/ru_ru.affix", dest: "/usr/share/postgresql/{{ pg_new_version }}/tsearch_data/ru_ru.affix", owner: "root", group: "root", mode: "0644" }
+# - { src: "files/ru_ru.dict", dest: "/usr/share/postgresql/{{ pg_new_version }}/tsearch_data/ru_ru.dict", owner: "root", group: "root", mode: "0644" }
+
+# if 'pgbouncer_install' is 'true'
+pgbouncer_pool_pause: true  # or 'false' if you don't want to pause the pgbouncer pools during the upgrade.
+# Time (in seconds) after which slow active queries are terminated instead of waiting for them to complete.
+pgbouncer_pool_pause_terminate_after: 30
+# Time (in seconds) after which the script exits with an error if it was not possible to pause all pgbouncer pools.
+pgbouncer_pool_pause_stop_after: 60
+# Before trying to pause the pool, wait for the completion of active queries that have been running longer than the specified time (in milliseconds).
+pg_slow_active_query_treshold: 1000
+# After "pgbouncer_pool_pause_terminate_after" is reached, terminate active queries running longer than the specified time (in milliseconds) before trying to pause the pool.
+pg_slow_active_query_treshold_to_terminate: 100  # (0 = terminate all active backends)
+
+...
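For reference, a hedged example of the minimal configuration this vars file expects before running the pg_upgrade.yml playbook. The version numbers below are illustrative values, not defaults:

```yaml
# Example override (illustrative): an in-place major upgrade from PostgreSQL 14 to 15.
# Typically placed in the inventory group_vars, or passed with '-e' to pg_upgrade.yml.
pg_old_version: "14"
pg_new_version: "15"
pgbouncer_pool_pause: true        # pause pools instead of dropping client connections
schema_compatibility_check: true  # dry-run the schema against the new version first
```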