From a6a81b080d980878be98e316d30efd1cd8a209e7 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Thu, 28 May 2026 15:09:58 -0400 Subject: [PATCH 01/16] tests: diagnostic on release build --- ebssurrogate/scripts/surrogate-bootstrap-nix.sh | 14 ++++++++++++++ scripts/90-cleanup.sh | 3 +++ 2 files changed, 17 insertions(+) diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index cf804bf57..a3e629874 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -364,7 +364,21 @@ function clean_system { # Copy cleanup scripts cp -v /tmp/ansible-playbook/scripts/90-cleanup.sh /mnt/tmp chmod +x /mnt/tmp/90-cleanup.sh + set +e chroot /mnt /tmp/90-cleanup.sh + cleanup_rc=$? + set -e + echo "==============================================" + echo "[diagnostic] 90-cleanup.sh exit code: ${cleanup_rc}" + echo "[diagnostic] Last 300 lines of /mnt/tmp/90-cleanup.log:" + echo "==============================================" + tail -n 300 /mnt/tmp/90-cleanup.log 2>/dev/null || echo "[diagnostic] no log file present" + echo "==============================================" + echo "[diagnostic] end of 90-cleanup.log tail" + echo "==============================================" + if [ "${cleanup_rc}" -ne 0 ]; then + exit "${cleanup_rc}" + fi # Cleanup logs rm -rf /mnt/var/log/* diff --git a/scripts/90-cleanup.sh b/scripts/90-cleanup.sh index ecb63a8d6..45b37f505 100644 --- a/scripts/90-cleanup.sh +++ b/scripts/90-cleanup.sh @@ -5,6 +5,9 @@ # This code is licensed under Apache 2.0 license (see LICENSE.md for details) set -o errexit +set -x +exec > >(tee -a /tmp/90-cleanup.log) 2>&1 +trap 'echo "[90-cleanup] EXIT $? 
at line $LINENO: $BASH_COMMAND" >&2' ERR # Ensure /tmp exists and has the proper permissions before # checking for security updates From 13fcd9adffd956b44ff1f583de2e66d1a0dc5d14 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Thu, 28 May 2026 15:13:17 -0400 Subject: [PATCH 02/16] chore: suffix to test release --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 241fb8710..335f56107 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.088-orioledb" - postgres17: "17.6.1.131" - postgres15: "15.14.1.131" + postgresorioledb-17: "17.6.0.086-orioledb-sam-1" + postgres17: "17.6.1.129-sam-1" + postgres15: "15.14.1.129-sam-1" # Non Postgres Extensions pgbouncer_release: 1.25.1 From 94d288ad4266262ba66d8063cae1c2e71c040f97 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 06:54:48 -0400 Subject: [PATCH 03/16] test: add more diagnostics --- .../scripts/surrogate-bootstrap-nix.sh | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index a3e629874..54bbedbf7 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -364,6 +364,29 @@ function clean_system { # Copy cleanup scripts cp -v /tmp/ansible-playbook/scripts/90-cleanup.sh /mnt/tmp chmod +x /mnt/tmp/90-cleanup.sh + + # [diagnostic] pre-chroot isolation test for `chmod 1777 /tmp` + # The full 90-cleanup.sh wraps stdout/stderr through `tee` via process + # substitution, which may itself be hiding or perturbing the failure. + # Run the suspect command in a plain chroot bash with no wrapping so we + # can tell whether chmod itself is failing or our instrumentation is. 
+ echo "==[diag pre-chroot]== inspect /tmp and run chmod 1777 /tmp directly" + set +e + chroot /mnt /bin/bash -c ' + set -x + uname -a || true + type chmod || true + ls -lad /tmp || true + stat -f -c "fs=%T mountpoint=%n" /tmp 2>/dev/null || stat -f /tmp 2>/dev/null || true + mount | grep " on /tmp" || true + /bin/chmod 1777 /tmp + echo "chmod_rc=$?" + ls -lad /tmp || true + ' + pre_rc=$? + set -e + echo "==[diag pre-chroot]== returned ${pre_rc}" + set +e chroot /mnt /tmp/90-cleanup.sh cleanup_rc=$? From 108dabe98953e475840fe5c030d377265b8bc5dd Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 07:57:53 -0400 Subject: [PATCH 04/16] test: 5 tests to log failures --- .../scripts/surrogate-bootstrap-nix.sh | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index 54bbedbf7..418150377 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -365,6 +365,54 @@ function clean_system { cp -v /tmp/ansible-playbook/scripts/90-cleanup.sh /mnt/tmp chmod +x /mnt/tmp/90-cleanup.sh + # [diagnostic micro-tests] isolate which dimension of the failing chroot + # call is actually broken. Each tests one variable; output is also captured + # to host-side files so we can `cat` them after, even if the SSH output + # stream is severing. + + echo "==[diag A]== /mnt rootfs inspection" + ls -la /mnt/bin/bash /mnt/usr/bin/chmod /mnt/bin/chmod /mnt/usr/bin/uname 2>&1 || true + mount | grep -E "/mnt(/|$)" || true + echo "==[diag A]== done" + + echo "==[diag B]== chroot /bin/echo (most minimal possible)" + set +e + chroot /mnt /bin/echo "hello-from-echo-no-bash" 2>&1 + echo "==[diag B]== rc=$?" 
+ set -e + + echo "==[diag C]== chroot bash via script file (mirrors line 302 form)" + cat > /mnt/tmp/diag-c.sh <<'SCRIPT' +#!/bin/bash +echo "diag-c-line-1" +echo "diag-c-line-2" +SCRIPT + chmod +x /mnt/tmp/diag-c.sh + set +e + chroot /mnt /tmp/diag-c.sh 2>&1 + echo "==[diag C]== rc=$?" + set -e + + echo "==[diag D]== chroot bash -c with a SINGLE-line arg" + set +e + chroot /mnt /bin/bash -c 'echo single-line-arg-works; exit 0' 2>&1 + echo "==[diag D]== rc=$?" + set -e + + echo "==[diag E]== chroot bash -c with multi-line arg, redirected to host-side file" + set +e + chroot /mnt /bin/bash -c ' +echo multi-line-start +uname -a +echo multi-line-end +' > /mnt/tmp/diag-e.log 2>&1 + e_rc=$? + echo "==[diag E]== rc=${e_rc}" + echo "==[diag E]== /mnt/tmp/diag-e.log contents:" + cat /mnt/tmp/diag-e.log 2>&1 || echo "no log" + echo "==[diag E]== end of log" + set -e + # [diagnostic] pre-chroot isolation test for `chmod 1777 /tmp` # The full 90-cleanup.sh wraps stdout/stderr through `tee` via process # substitution, which may itself be hiding or perturbing the failure. 
From 043b3945c8edb17033cd571230ce9fd43b05313b Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 07:58:23 -0400 Subject: [PATCH 05/16] chore: bump suffix --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 335f56107..3db211de4 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-1" - postgres17: "17.6.1.129-sam-1" - postgres15: "15.14.1.129-sam-1" + postgresorioledb-17: "17.6.0.086-orioledb-sam-2" + postgres17: "17.6.1.129-sam-2" + postgres15: "15.14.1.129-sam-2" # Non Postgres Extensions pgbouncer_release: 1.25.1 From c27b28b1d70b00bb8754c5909d075e282384576b Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 11:05:20 -0400 Subject: [PATCH 06/16] test: new trap exit checks, rm old diag checks --- .../scripts/surrogate-bootstrap-nix.sh | 99 ++++++------------- 1 file changed, 29 insertions(+), 70 deletions(-) diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index 418150377..3ecd3de6c 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -10,6 +10,35 @@ set -o errexit set -o pipefail set -o xtrace +# [diagnostic exit-trap] capture state at exit regardless of where bash dies. +# Fires on success and failure. No behavior change on success. +# Goal: identify which command exits 123 by capturing resource state, +# kernel dmesg (OOM-kill / ENOSPC), and the tail of ansible.log + 90-cleanup.log +# at the exact moment of exit. +dump_diag_on_exit() { + rc=$? 
+ set +e +o pipefail +x + echo "====================================================================" + echo "[exit-trap] bash exiting with code $rc" + echo "====================================================================" + echo "[exit-trap] --- free -h ---" + free -h 2>&1 || true + echo "[exit-trap] --- df -h ---" + df -h 2>&1 || true + echo "[exit-trap] --- df -i (inodes) ---" + df -i 2>&1 || true + echo "[exit-trap] --- dmesg tail (kernel OOM-kill, ENOSPC, panics) ---" + dmesg 2>&1 | tail -n 100 || true + echo "[exit-trap] --- /tmp/ansible.log tail (last 200 lines) ---" + tail -n 200 /tmp/ansible.log 2>&1 || echo "(no /tmp/ansible.log)" + echo "[exit-trap] --- /mnt/tmp/90-cleanup.log tail (last 200 lines) ---" + tail -n 200 /mnt/tmp/90-cleanup.log 2>&1 || echo "(no /mnt/tmp/90-cleanup.log)" + echo "====================================================================" + echo "[exit-trap] end. final code: $rc" + echo "====================================================================" +} +trap dump_diag_on_exit EXIT + if [ $(dpkg --print-architecture) = "amd64" ]; then ARCH="amd64"; @@ -365,76 +394,6 @@ function clean_system { cp -v /tmp/ansible-playbook/scripts/90-cleanup.sh /mnt/tmp chmod +x /mnt/tmp/90-cleanup.sh - # [diagnostic micro-tests] isolate which dimension of the failing chroot - # call is actually broken. Each tests one variable; output is also captured - # to host-side files so we can `cat` them after, even if the SSH output - # stream is severing. - - echo "==[diag A]== /mnt rootfs inspection" - ls -la /mnt/bin/bash /mnt/usr/bin/chmod /mnt/bin/chmod /mnt/usr/bin/uname 2>&1 || true - mount | grep -E "/mnt(/|$)" || true - echo "==[diag A]== done" - - echo "==[diag B]== chroot /bin/echo (most minimal possible)" - set +e - chroot /mnt /bin/echo "hello-from-echo-no-bash" 2>&1 - echo "==[diag B]== rc=$?" 
- set -e - - echo "==[diag C]== chroot bash via script file (mirrors line 302 form)" - cat > /mnt/tmp/diag-c.sh <<'SCRIPT' -#!/bin/bash -echo "diag-c-line-1" -echo "diag-c-line-2" -SCRIPT - chmod +x /mnt/tmp/diag-c.sh - set +e - chroot /mnt /tmp/diag-c.sh 2>&1 - echo "==[diag C]== rc=$?" - set -e - - echo "==[diag D]== chroot bash -c with a SINGLE-line arg" - set +e - chroot /mnt /bin/bash -c 'echo single-line-arg-works; exit 0' 2>&1 - echo "==[diag D]== rc=$?" - set -e - - echo "==[diag E]== chroot bash -c with multi-line arg, redirected to host-side file" - set +e - chroot /mnt /bin/bash -c ' -echo multi-line-start -uname -a -echo multi-line-end -' > /mnt/tmp/diag-e.log 2>&1 - e_rc=$? - echo "==[diag E]== rc=${e_rc}" - echo "==[diag E]== /mnt/tmp/diag-e.log contents:" - cat /mnt/tmp/diag-e.log 2>&1 || echo "no log" - echo "==[diag E]== end of log" - set -e - - # [diagnostic] pre-chroot isolation test for `chmod 1777 /tmp` - # The full 90-cleanup.sh wraps stdout/stderr through `tee` via process - # substitution, which may itself be hiding or perturbing the failure. - # Run the suspect command in a plain chroot bash with no wrapping so we - # can tell whether chmod itself is failing or our instrumentation is. - echo "==[diag pre-chroot]== inspect /tmp and run chmod 1777 /tmp directly" - set +e - chroot /mnt /bin/bash -c ' - set -x - uname -a || true - type chmod || true - ls -lad /tmp || true - stat -f -c "fs=%T mountpoint=%n" /tmp 2>/dev/null || stat -f /tmp 2>/dev/null || true - mount | grep " on /tmp" || true - /bin/chmod 1777 /tmp - echo "chmod_rc=$?" - ls -lad /tmp || true - ' - pre_rc=$? - set -e - echo "==[diag pre-chroot]== returned ${pre_rc}" - set +e chroot /mnt /tmp/90-cleanup.sh cleanup_rc=$? 
From a25ff61d4bca1678f0a6e901d6f288a0c1b4edfd Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 13:41:48 -0400 Subject: [PATCH 07/16] chore: bump suffix --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 3db211de4..37f5795b2 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-2" - postgres17: "17.6.1.129-sam-2" - postgres15: "15.14.1.129-sam-2" + postgresorioledb-17: "17.6.0.086-orioledb-sam-3" + postgres17: "17.6.1.129-sam-3" + postgres15: "15.14.1.129-sam-3" # Non Postgres Extensions pgbouncer_release: 1.25.1 From 21886c306cf9924a5a2483172208baa7f33892da Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 15:19:10 -0400 Subject: [PATCH 08/16] test: more diagnostic steps for amd64 build --- .../scripts/surrogate-bootstrap-nix.sh | 75 +++++++++++++++++-- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index 3ecd3de6c..ed3410de7 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -18,6 +18,10 @@ set -o xtrace dump_diag_on_exit() { rc=$? 
set +e +o pipefail +x + if [ "${ARCH:-}" != "amd64" ]; then + return 0 + fi + echo "====================================================================" echo "[exit-trap] bash exiting with code $rc" echo "====================================================================" @@ -27,12 +31,10 @@ dump_diag_on_exit() { df -h 2>&1 || true echo "[exit-trap] --- df -i (inodes) ---" df -i 2>&1 || true - echo "[exit-trap] --- dmesg tail (kernel OOM-kill, ENOSPC, panics) ---" - dmesg 2>&1 | tail -n 100 || true - echo "[exit-trap] --- /tmp/ansible.log tail (last 200 lines) ---" - tail -n 200 /tmp/ansible.log 2>&1 || echo "(no /tmp/ansible.log)" - echo "[exit-trap] --- /mnt/tmp/90-cleanup.log tail (last 200 lines) ---" - tail -n 200 /mnt/tmp/90-cleanup.log 2>&1 || echo "(no /mnt/tmp/90-cleanup.log)" + echo "[exit-trap] --- top memory processes (command names only) ---" + ps -eo pid,ppid,comm,%mem,rss --sort=-rss | head -20 || true + echo "[exit-trap] --- sanitized kernel signal counts ---" + dmesg 2>/dev/null | grep -Eic 'out of memory|oom|killed process|kernel panic|panic|ext4|i/o error|no space left' || true echo "====================================================================" echo "[exit-trap] end. final code: $rc" echo "====================================================================" @@ -46,6 +48,58 @@ else ARCH="arm64"; fi +function enable_amd64_build_diagnostics { + if [ "${ARCH}" != "amd64" ]; then + return 0 + fi + + echo "==[amd64-diagnostic]== enabling build-time swap" + if ! 
swapon --show=NAME --noheadings | grep -qx "/mnt/tmp/build-swapfile"; then + fallocate -l 16G /mnt/tmp/build-swapfile + chmod 600 /mnt/tmp/build-swapfile + mkswap /mnt/tmp/build-swapfile + swapon /mnt/tmp/build-swapfile + fi + swapon --show + free -h + + echo "==[amd64-diagnostic]== disabling OOM panic for diagnostic run" + sysctl -w vm.panic_on_oom=0 || true + sysctl -w kernel.panic=0 || true +} + +function start_amd64_watchdog { + if [ "${ARCH}" != "amd64" ]; then + return 0 + fi + + ( + set +e +o pipefail +x + while true; do + echo "==[amd64-watchdog $(date -Is)]==" + free -h || true + swapon --show || true + df -h / /mnt /mnt/tmp /mnt/data 2>&1 || true + df -i / /mnt /mnt/tmp /mnt/data 2>&1 || true + ps -eo pid,ppid,comm,%mem,rss --sort=-rss | head -20 || true + sleep 15 + done + ) & + WATCHDOG_PID=$! +} + +function stop_amd64_watchdog { + if [ -n "${WATCHDOG_PID:-}" ]; then + kill "${WATCHDOG_PID}" 2>/dev/null || true + fi +} + +function amd64_phase { + if [ "${ARCH}" = "amd64" ]; then + echo "==[phase] $1 $(date -Is)==" + fi +} + # Mirror fallback function for resilient apt-get update function apt_update_with_fallback { local sources_file="/etc/apt/sources.list" @@ -486,9 +540,18 @@ create_swapfile format_build_partition #pull_docker setup_chroot_environment +enable_amd64_build_diagnostics +start_amd64_watchdog #download_ccache +amd64_phase "before execute_playbook" execute_playbook +amd64_phase "after execute_playbook" +amd64_phase "before update_systemd_services" update_systemd_services +amd64_phase "after update_systemd_services" #upload_ccache +amd64_phase "before clean_system" clean_system +amd64_phase "after clean_system" +stop_amd64_watchdog umount_reset_mappings From cc81193a6ddf25e3d506e84ad38c257574001440 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 15:21:45 -0400 Subject: [PATCH 09/16] chore: suffix bump testing --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml 
b/ansible/vars.yml index 37f5795b2..52ce3b19d 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-3" - postgres17: "17.6.1.129-sam-3" - postgres15: "15.14.1.129-sam-3" + postgresorioledb-17: "17.6.0.086-orioledb-sam-4" + postgres17: "17.6.1.129-sam-4" + postgres15: "15.14.1.129-sam-4" # Non Postgres Extensions pgbouncer_release: 1.25.1 From e00fc44477372d551f224c79c38f831f0d0224ab Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 16:01:18 -0400 Subject: [PATCH 10/16] test: refactor diag --- ansible/vars.yml | 6 +++--- ebssurrogate/scripts/surrogate-bootstrap-nix.sh | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 52ce3b19d..c5f8af431 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-4" - postgres17: "17.6.1.129-sam-4" - postgres15: "15.14.1.129-sam-4" + postgresorioledb-17: "17.6.0.086-orioledb-sam-5" + postgres17: "17.6.1.129-sam-5" + postgres15: "15.14.1.129-sam-5" # Non Postgres Extensions pgbouncer_release: 1.25.1 diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index ed3410de7..e176efd45 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -55,10 +55,14 @@ function enable_amd64_build_diagnostics { echo "==[amd64-diagnostic]== enabling build-time swap" if ! 
swapon --show=NAME --noheadings | grep -qx "/mnt/tmp/build-swapfile"; then - fallocate -l 16G /mnt/tmp/build-swapfile - chmod 600 /mnt/tmp/build-swapfile - mkswap /mnt/tmp/build-swapfile - swapon /mnt/tmp/build-swapfile + if fallocate -l 4G /mnt/tmp/build-swapfile; then + chmod 600 /mnt/tmp/build-swapfile + mkswap /mnt/tmp/build-swapfile + swapon /mnt/tmp/build-swapfile + else + echo "==[amd64-diagnostic]== unable to allocate build swap; continuing without it" + rm -f /mnt/tmp/build-swapfile + fi fi swapon --show free -h From 824605f101f9e571c6427aadbcba1bb9e3e6c547 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 16:16:47 -0400 Subject: [PATCH 11/16] test: moved enable_amd64_build_diagnostics to after execute_playbook --- ansible/vars.yml | 6 +++--- ebssurrogate/scripts/surrogate-bootstrap-nix.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index c5f8af431..375eb2e8b 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-5" - postgres17: "17.6.1.129-sam-5" - postgres15: "15.14.1.129-sam-5" + postgresorioledb-17: "17.6.0.086-orioledb-sam-6" + postgres17: "17.6.1.129-sam-6" + postgres15: "15.14.1.129-sam-6" # Non Postgres Extensions pgbouncer_release: 1.25.1 diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index e176efd45..e4d95037c 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -544,12 +544,12 @@ create_swapfile format_build_partition #pull_docker setup_chroot_environment -enable_amd64_build_diagnostics start_amd64_watchdog #download_ccache amd64_phase "before execute_playbook" execute_playbook amd64_phase "after execute_playbook" +enable_amd64_build_diagnostics amd64_phase "before update_systemd_services" 
update_systemd_services amd64_phase "after update_systemd_services" From 3cf40dc36a48a1ff4f98317ae07d480d585e3931 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 17:00:30 -0400 Subject: [PATCH 12/16] test: change only the two live panic sysctl tasks from reload: true to reload: false --- ansible/tasks/setup-system.yml | 4 ++-- ansible/vars.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/tasks/setup-system.yml b/ansible/tasks/setup-system.yml index 2feddaaf2..1c2749c50 100644 --- a/ansible/tasks/setup-system.yml +++ b/ansible/tasks/setup-system.yml @@ -183,14 +183,14 @@ - name: Set vm.panic_on_oom=1 ansible.builtin.sysctl: name: 'vm.panic_on_oom' - reload: true + reload: false state: 'present' value: '1' - name: Set kernel.panic=10 ansible.builtin.sysctl: name: 'kernel.panic' - reload: true + reload: false state: 'present' value: '10' diff --git a/ansible/vars.yml b/ansible/vars.yml index 375eb2e8b..571172337 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-6" - postgres17: "17.6.1.129-sam-6" - postgres15: "15.14.1.129-sam-6" + postgresorioledb-17: "17.6.0.086-orioledb-sam-7" + postgres17: "17.6.1.129-sam-7" + postgres15: "15.14.1.129-sam-7" # Non Postgres Extensions pgbouncer_release: 1.25.1 From fe584420761fbb0a6ebf56fa5261eed94fbc59d9 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Fri, 29 May 2026 17:27:39 -0400 Subject: [PATCH 13/16] chore: suffix bump --- amazon-amd64-nix.pkr.hcl | 2 +- ansible/vars.yml | 6 +++--- ebssurrogate/scripts/surrogate-bootstrap-nix.sh | 14 ++++++-------- scripts/90-cleanup.sh | 1 - 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/amazon-amd64-nix.pkr.hcl b/amazon-amd64-nix.pkr.hcl index 9ec3ed95c..443387d6d 100644 --- a/amazon-amd64-nix.pkr.hcl +++ b/amazon-amd64-nix.pkr.hcl @@ -168,7 +168,7 @@ source "amazon-ebssurrogate" 
"source" { } communicator = "ssh" - ssh_pty = true + ssh_pty = false ssh_username = "ubuntu" ssh_timeout = "5m" diff --git a/ansible/vars.yml b/ansible/vars.yml index 571172337..382249625 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-7" - postgres17: "17.6.1.129-sam-7" - postgres15: "15.14.1.129-sam-7" + postgresorioledb-17: "17.6.0.086-orioledb-sam-8" + postgres17: "17.6.1.129-sam-8" + postgres15: "15.14.1.129-sam-8" # Non Postgres Extensions pgbouncer_release: 1.25.1 diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index e4d95037c..f7a3a5a41 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -8,7 +8,6 @@ set -o errexit set -o pipefail -set -o xtrace # [diagnostic exit-trap] capture state at exit regardless of where bash dies. # Fires on success and failure. No behavior change on success. @@ -80,13 +79,12 @@ function start_amd64_watchdog { ( set +e +o pipefail +x while true; do - echo "==[amd64-watchdog $(date -Is)]==" - free -h || true - swapon --show || true - df -h / /mnt /mnt/tmp /mnt/data 2>&1 || true - df -i / /mnt /mnt/tmp /mnt/data 2>&1 || true - ps -eo pid,ppid,comm,%mem,rss --sort=-rss | head -20 || true - sleep 15 + mem_available_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo unknown) + root_used=$(df -P / 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) + mnt_used=$(df -P /mnt 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) + tmp_used=$(df -P /mnt/tmp 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) + echo "==[amd64-watchdog $(date -Is)] mem_available_kb=${mem_available_kb} root=${root_used} mnt=${mnt_used} tmp=${tmp_used}==" + sleep 60 done ) & WATCHDOG_PID=$! 
diff --git a/scripts/90-cleanup.sh b/scripts/90-cleanup.sh index 45b37f505..a6a06fc2b 100644 --- a/scripts/90-cleanup.sh +++ b/scripts/90-cleanup.sh @@ -5,7 +5,6 @@ # This code is licensed under Apache 2.0 license (see LICENSE.md for details) set -o errexit -set -x exec > >(tee -a /tmp/90-cleanup.log) 2>&1 trap 'echo "[90-cleanup] EXIT $? at line $LINENO: $BASH_COMMAND" >&2' ERR From f2e24fbaad499de73daf48c8764a0a1c09ab8ee0 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Sat, 30 May 2026 06:25:57 -0400 Subject: [PATCH 14/16] chore: bump suffix --- ansible/vars.yml | 6 +++--- ebssurrogate/scripts/surrogate-bootstrap-nix.sh | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 382249625..f5dc422d4 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-8" - postgres17: "17.6.1.129-sam-8" - postgres15: "15.14.1.129-sam-8" + postgresorioledb-17: "17.6.0.086-orioledb-sam-9" + postgres17: "17.6.1.129-sam-9" + postgres15: "15.14.1.129-sam-9" # Non Postgres Extensions pgbouncer_release: 1.25.1 diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index f7a3a5a41..41fadad1c 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -71,6 +71,16 @@ function enable_amd64_build_diagnostics { sysctl -w kernel.panic=0 || true } +function disable_amd64_build_diagnostics { + if [ "${ARCH}" != "amd64" ]; then + return 0 + fi + + echo "==[amd64-diagnostic]== disabling build-time swap" + swapoff /mnt/tmp/build-swapfile 2>/dev/null || true + rm -f /mnt/tmp/build-swapfile +} + function start_amd64_watchdog { if [ "${ARCH}" != "amd64" ]; then return 0 @@ -552,6 +562,7 @@ amd64_phase "before update_systemd_services" update_systemd_services amd64_phase "after 
update_systemd_services" #upload_ccache +disable_amd64_build_diagnostics amd64_phase "before clean_system" clean_system amd64_phase "after clean_system" From 9a8f8e9e824d78e6f81cc54b84a651846a2b6496 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Sat, 30 May 2026 07:18:41 -0400 Subject: [PATCH 15/16] fix: resolve release issues on x86_64 --- ansible/tasks/setup-system.yml | 4 +- ansible/vars.yml | 6 +- .../scripts/surrogate-bootstrap-nix.sh | 122 +----------------- scripts/90-cleanup.sh | 2 - 4 files changed, 6 insertions(+), 128 deletions(-) diff --git a/ansible/tasks/setup-system.yml b/ansible/tasks/setup-system.yml index 1c2749c50..2feddaaf2 100644 --- a/ansible/tasks/setup-system.yml +++ b/ansible/tasks/setup-system.yml @@ -183,14 +183,14 @@ - name: Set vm.panic_on_oom=1 ansible.builtin.sysctl: name: 'vm.panic_on_oom' - reload: false + reload: true state: 'present' value: '1' - name: Set kernel.panic=10 ansible.builtin.sysctl: name: 'kernel.panic' - reload: false + reload: true state: 'present' value: '10' diff --git a/ansible/vars.yml b/ansible/vars.yml index f5dc422d4..f4cb06ec6 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.086-orioledb-sam-9" - postgres17: "17.6.1.129-sam-9" - postgres15: "15.14.1.129-sam-9" + postgresorioledb-17: "17.6.0.088-orioledb-sam-10" + postgres17: "17.6.1.131-sam-10" + postgres15: "15.14.1.131-sam-10" # Non Postgres Extensions pgbouncer_release: 1.25.1 diff --git a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh index 41fadad1c..cf804bf57 100755 --- a/ebssurrogate/scripts/surrogate-bootstrap-nix.sh +++ b/ebssurrogate/scripts/surrogate-bootstrap-nix.sh @@ -8,37 +8,7 @@ set -o errexit set -o pipefail - -# [diagnostic exit-trap] capture state at exit regardless of where bash dies. -# Fires on success and failure. No behavior change on success. 
-# Goal: identify which command exits 123 by capturing resource state, -# kernel dmesg (OOM-kill / ENOSPC), and the tail of ansible.log + 90-cleanup.log -# at the exact moment of exit. -dump_diag_on_exit() { - rc=$? - set +e +o pipefail +x - if [ "${ARCH:-}" != "amd64" ]; then - return 0 - fi - - echo "====================================================================" - echo "[exit-trap] bash exiting with code $rc" - echo "====================================================================" - echo "[exit-trap] --- free -h ---" - free -h 2>&1 || true - echo "[exit-trap] --- df -h ---" - df -h 2>&1 || true - echo "[exit-trap] --- df -i (inodes) ---" - df -i 2>&1 || true - echo "[exit-trap] --- top memory processes (command names only) ---" - ps -eo pid,ppid,comm,%mem,rss --sort=-rss | head -20 || true - echo "[exit-trap] --- sanitized kernel signal counts ---" - dmesg 2>/dev/null | grep -Eic 'out of memory|oom|killed process|kernel panic|panic|ext4|i/o error|no space left' || true - echo "====================================================================" - echo "[exit-trap] end. final code: $rc" - echo "====================================================================" -} -trap dump_diag_on_exit EXIT +set -o xtrace if [ $(dpkg --print-architecture) = "amd64" ]; then @@ -47,71 +17,6 @@ else ARCH="arm64"; fi -function enable_amd64_build_diagnostics { - if [ "${ARCH}" != "amd64" ]; then - return 0 - fi - - echo "==[amd64-diagnostic]== enabling build-time swap" - if ! 
swapon --show=NAME --noheadings | grep -qx "/mnt/tmp/build-swapfile"; then - if fallocate -l 4G /mnt/tmp/build-swapfile; then - chmod 600 /mnt/tmp/build-swapfile - mkswap /mnt/tmp/build-swapfile - swapon /mnt/tmp/build-swapfile - else - echo "==[amd64-diagnostic]== unable to allocate build swap; continuing without it" - rm -f /mnt/tmp/build-swapfile - fi - fi - swapon --show - free -h - - echo "==[amd64-diagnostic]== disabling OOM panic for diagnostic run" - sysctl -w vm.panic_on_oom=0 || true - sysctl -w kernel.panic=0 || true -} - -function disable_amd64_build_diagnostics { - if [ "${ARCH}" != "amd64" ]; then - return 0 - fi - - echo "==[amd64-diagnostic]== disabling build-time swap" - swapoff /mnt/tmp/build-swapfile 2>/dev/null || true - rm -f /mnt/tmp/build-swapfile -} - -function start_amd64_watchdog { - if [ "${ARCH}" != "amd64" ]; then - return 0 - fi - - ( - set +e +o pipefail +x - while true; do - mem_available_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo unknown) - root_used=$(df -P / 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) - mnt_used=$(df -P /mnt 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) - tmp_used=$(df -P /mnt/tmp 2>/dev/null | awk 'NR==2 {print $5}' || echo unknown) - echo "==[amd64-watchdog $(date -Is)] mem_available_kb=${mem_available_kb} root=${root_used} mnt=${mnt_used} tmp=${tmp_used}==" - sleep 60 - done - ) & - WATCHDOG_PID=$! 
-} - -function stop_amd64_watchdog { - if [ -n "${WATCHDOG_PID:-}" ]; then - kill "${WATCHDOG_PID}" 2>/dev/null || true - fi -} - -function amd64_phase { - if [ "${ARCH}" = "amd64" ]; then - echo "==[phase] $1 $(date -Is)==" - fi -} - # Mirror fallback function for resilient apt-get update function apt_update_with_fallback { local sources_file="/etc/apt/sources.list" @@ -459,22 +364,7 @@ function clean_system { # Copy cleanup scripts cp -v /tmp/ansible-playbook/scripts/90-cleanup.sh /mnt/tmp chmod +x /mnt/tmp/90-cleanup.sh - - set +e chroot /mnt /tmp/90-cleanup.sh - cleanup_rc=$? - set -e - echo "==============================================" - echo "[diagnostic] 90-cleanup.sh exit code: ${cleanup_rc}" - echo "[diagnostic] Last 300 lines of /mnt/tmp/90-cleanup.log:" - echo "==============================================" - tail -n 300 /mnt/tmp/90-cleanup.log 2>/dev/null || echo "[diagnostic] no log file present" - echo "==============================================" - echo "[diagnostic] end of 90-cleanup.log tail" - echo "==============================================" - if [ "${cleanup_rc}" -ne 0 ]; then - exit "${cleanup_rc}" - fi # Cleanup logs rm -rf /mnt/var/log/* @@ -552,19 +442,9 @@ create_swapfile format_build_partition #pull_docker setup_chroot_environment -start_amd64_watchdog #download_ccache -amd64_phase "before execute_playbook" execute_playbook -amd64_phase "after execute_playbook" -enable_amd64_build_diagnostics -amd64_phase "before update_systemd_services" update_systemd_services -amd64_phase "after update_systemd_services" #upload_ccache -disable_amd64_build_diagnostics -amd64_phase "before clean_system" clean_system -amd64_phase "after clean_system" -stop_amd64_watchdog umount_reset_mappings diff --git a/scripts/90-cleanup.sh b/scripts/90-cleanup.sh index a6a06fc2b..ecb63a8d6 100644 --- a/scripts/90-cleanup.sh +++ b/scripts/90-cleanup.sh @@ -5,8 +5,6 @@ # This code is licensed under Apache 2.0 license (see LICENSE.md for details) set -o errexit 
-exec > >(tee -a /tmp/90-cleanup.log) 2>&1 -trap 'echo "[90-cleanup] EXIT $? at line $LINENO: $BASH_COMMAND" >&2' ERR # Ensure /tmp exists and has the proper permissions before # checking for security updates From fc88b918752eb34cb7ef89262d3621d3ed9f23aa Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Sat, 30 May 2026 08:14:42 -0400 Subject: [PATCH 16/16] chore: bump to release --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index f4cb06ec6..aca4f67ac 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.088-orioledb-sam-10" - postgres17: "17.6.1.131-sam-10" - postgres15: "15.14.1.131-sam-10" + postgresorioledb-17: "17.6.0.089-orioledb" + postgres17: "17.6.1.132" + postgres15: "15.14.1.132" # Non Postgres Extensions pgbouncer_release: 1.25.1