From f31449fff2d61d7022c033640cdbb70da57edf44 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 9 Dec 2025 14:05:44 -0500 Subject: [PATCH 1/6] fix: apt install with mirrors --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 173 +++++++++++++++---- 1 file changed, 137 insertions(+), 36 deletions(-) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index 7e7991bce..68b5c09b7 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -21,40 +21,56 @@ else ARCH="arm64"; fi -# Mirror fallback function for resilient apt-get update -function apt_update_with_fallback { +# Get current mirror from sources.list +function get_current_mirror { + grep -oP 'http://[^/]+(?=/ubuntu-ports/)' /etc/apt/sources.list | head -1 || echo "" +} + +# Switch to a different mirror +function switch_mirror { + local new_mirror="$1" local sources_file="/etc/apt/sources.list" - local max_attempts=2 - local attempt=1 - # Detect the current region from sources.list (it's already been substituted) - # Extract the region from existing sources.list entries + echo "Switching to mirror: ${new_mirror}" + sed -i "s|http://[^/]*/ubuntu-ports/|http://${new_mirror}/ubuntu-ports/|g" "${sources_file}" + + # Show what we're using + echo "Current sources.list configuration:" + grep -E '^deb ' "${sources_file}" | head -3 +} + +# Get list of mirrors to try +function get_mirror_list { + local sources_file="/etc/apt/sources.list" local current_region=$(grep -oP '(?<=http://)[^.]+(?=\.clouds\.ports\.ubuntu\.com)' "${sources_file}" | head -1 || echo "") - # Define mirror tiers (in priority order) - local -a mirror_tiers=( - "${current_region}.clouds.ports.ubuntu.com" # Tier 1: Regional CDN (as set in sources.list) - "ports.ubuntu.com" # Tier 2: Global pool - ) + local -a mirrors=() - # If we couldn't detect current region, skip tier 1 - if [ -z "${current_region}" ]; then - echo "Warning: Could not determine region from sources.list, skipping regional CDN" - mirror_tiers=("${mirror_tiers[@]:1}") # Remove first element + # Add regional CDN if detected + if [ -n "${current_region}" ]; then + mirrors+=("${current_region}.clouds.ports.ubuntu.com") fi - for mirror in "${mirror_tiers[@]}"; do + # Add global fallback + mirrors+=("ports.ubuntu.com") + + echo "${mirrors[@]}" +} + +# Mirror fallback function for resilient apt-get update +function apt_update_with_fallback { + local sources_file="/etc/apt/sources.list" + local -a mirror_list=($(get_mirror_list)) + local attempt=1 + local max_attempts=${#mirror_list[@]} + + for mirror in "${mirror_list[@]}"; do echo "=========================================" echo "Attempting apt-get update with mirror: ${mirror}" echo "Attempt ${attempt} of ${max_attempts}" echo "=========================================" - # Update sources.list to use current mirror - sed -i "s|http://[^/]*/ubuntu-ports/|http://${mirror}/ubuntu-ports/|g" "${sources_file}" - - # Show what we're using - echo "Current sources.list configuration:" - grep -E '^deb ' "${sources_file}" | head -3 + switch_mirror "${mirror}" # Attempt update with timeout (5 minutes) if timeout 300 apt-get $APT_OPTIONS update 2>&1; then @@ -90,6 +106,73 @@ function apt_update_with_fallback { return 1 } +# Wrapper for apt-get install with mirror fallback on 404 errors +function apt_install_with_fallback { + local -a mirror_list=($(get_mirror_list)) + local attempt=1 + local max_attempts=${#mirror_list[@]} + local original_mirror=$(get_current_mirror) + + for mirror in "${mirror_list[@]}"; do + echo "=========================================" + echo "Attempting apt-get install with mirror: ${mirror}" + echo "Attempt ${attempt} of ${max_attempts}" + echo "=========================================" + + switch_mirror "${mirror}" + + # Re-run apt-get update to get package lists from new mirror + if ! timeout 300 apt-get $APT_OPTIONS update 2>&1; then + echo "Warning: apt-get update failed for mirror ${mirror}, trying next..." + attempt=$((attempt + 1)) + continue + fi + + # Attempt install with timeout (15 minutes for large packages) + # Capture output to detect 404 errors + local output + local exit_code + output=$(timeout 900 apt-get "$@" 2>&1) && exit_code=0 || exit_code=$? + echo "${output}" + + if [ ${exit_code} -eq 0 ]; then + echo "=========================================" + echo "✓ Successfully installed packages using mirror: ${mirror}" + echo "=========================================" + return 0 + fi + + # Check if failure was due to 404/mirror issues + if echo "${output}" | grep -qE '(404\s+Not Found|Failed to fetch|Hash Sum mismatch|Size mismatch)'; then + echo "=========================================" + echo "✗ Mirror issue detected (404/hash/size mismatch), trying next mirror..." + echo "=========================================" + + # Clean apt cache to force re-download + apt-get clean + + if [ ${attempt} -lt ${max_attempts} ]; then + local sleep_time=$((attempt * 5)) + echo "Waiting ${sleep_time} seconds before trying next mirror..." + sleep ${sleep_time} + fi + else + # Non-mirror related failure, don't retry + echo "=========================================" + echo "✗ Install failed with non-mirror error, not retrying" + echo "=========================================" + return ${exit_code} + fi + + attempt=$((attempt + 1)) + done + + echo "=========================================" + echo "ERROR: All mirror tiers failed for apt-get install after ${max_attempts} attempts" + echo "=========================================" + return 1 +} + function update_install_packages { @@ -107,16 +190,19 @@ function update_install_packages { if [ "${ARCH}" = "amd64" ]; then echo 'grub-pc grub-pc/install_devices_empty select true' | debconf-set-selections echo 'grub-pc grub-pc/install_devices select' | debconf-set-selections - # Install various packages needed for a booting system - apt-get install -y \ - linux-aws \ - grub-pc \ - e2fsprogs + # Install various packages needed for a booting system (with mirror fallback) + if ! apt_install_with_fallback install -y linux-aws grub-pc e2fsprogs; then + echo "FATAL: Failed to install boot packages" + exit 1 + fi else - apt-get install -y e2fsprogs + if ! apt_install_with_fallback install -y e2fsprogs; then + echo "FATAL: Failed to install e2fsprogs" + exit 1 + fi fi - # Install standard packages - apt-get install -y \ + # Install standard packages (with mirror fallback) + if ! apt_install_with_fallback install -y \ sudo \ wget \ cloud-init \ @@ -125,7 +211,10 @@ function update_install_packages { ec2-instance-connect \ hibagent \ ncurses-term \ - ssh-import-id \ + ssh-import-id; then + echo "FATAL: Failed to install standard packages" + exit 1 + fi # apt upgrade apt-get upgrade -y @@ -136,7 +225,7 @@ function update_install_packages { echo "FATAL: Failed to update package lists after adding universe repository" exit 1 fi - apt-get install -y --no-install-recommends \ + if ! apt_install_with_fallback install -y --no-install-recommends \ openssh-server \ git \ ufw \ @@ -146,10 +235,16 @@ function update_install_packages { locales \ at \ less \ - python3-systemd + python3-systemd; then + echo "FATAL: Failed to install universe packages" + exit 1 + fi if [ "${ARCH}" = "arm64" ]; then - apt-get $APT_OPTIONS --yes install linux-aws initramfs-tools dosfstools + if ! apt_install_with_fallback $APT_OPTIONS --yes install linux-aws initramfs-tools dosfstools; then + echo "FATAL: Failed to install arm64 boot packages" + exit 1 + fi fi } @@ -199,7 +294,10 @@ function install_packages_for_build { } function setup_apparmor { - apt-get install -y apparmor apparmor-utils auditd + if ! apt_install_with_fallback install -y apparmor apparmor-utils auditd; then + echo "FATAL: Failed to install apparmor packages" + exit 1 + fi # Copy apparmor profiles cp -rv /tmp/apparmor_profiles/* /etc/apparmor.d/ @@ -218,7 +316,10 @@ EOF # Install GRUB function install_configure_grub { if [ "${ARCH}" = "arm64" ]; then - apt-get $APT_OPTIONS --yes install cloud-guest-utils fdisk grub-efi-arm64 efibootmgr + if ! apt_install_with_fallback $APT_OPTIONS --yes install cloud-guest-utils fdisk grub-efi-arm64 efibootmgr; then + echo "FATAL: Failed to install grub packages for arm64" + exit 1 + fi setup_grub_conf_arm64 rm -rf /etc/grub.d/30_os-prober sleep 1 From 7787297c948accef10eeb462d30756c30a2bc671 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 9 Dec 2025 18:37:29 -0500 Subject: [PATCH 2/6] fix: longer timeout and error handling --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 66 ++++++++++++++++++-- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index 68b5c09b7..5e60deeda 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -128,11 +128,11 @@ function apt_install_with_fallback { continue fi - # Attempt install with timeout (15 minutes for large packages) + # Attempt install with timeout (30 minutes for large packages) # Capture output to detect 404 errors local output local exit_code - output=$(timeout 900 apt-get "$@" 2>&1) && exit_code=0 || exit_code=$? + output=$(timeout 1800 apt-get "$@" 2>&1) && exit_code=0 || exit_code=$? echo "${output}" if [ ${exit_code} -eq 0 ]; then @@ -142,6 +142,62 @@ function apt_install_with_fallback { return 0 fi + # Handle timeout (exit code 124) - check if packages were actually installed + if [ ${exit_code} -eq 124 ]; then + echo "=========================================" + echo "⚠ Timeout occurred (exit code 124), verifying if packages were installed..." + echo "=========================================" + + # Extract package names from arguments (skip flags like -y, --no-install-recommends, install) + local -a packages_to_check=() + local skip_next=false + for arg in "$@"; do + if [ "${skip_next}" = true ]; then + skip_next=false + continue + fi + case "${arg}" in + install|-y|--yes|--no-install-recommends|--no-install-suggests) + continue + ;; + -o*) + continue + ;; + -*) + # Skip flags and their potential arguments + continue + ;; + *) + packages_to_check+=("${arg}") + ;; + esac + done + + # Verify each package is installed + local all_installed=true + for pkg in "${packages_to_check[@]}"; do + if dpkg -l "${pkg}" 2>/dev/null | grep -q "^ii"; then + echo "✓ Package '${pkg}' is installed" + else + echo "✗ Package '${pkg}' is NOT installed" + all_installed=false + fi + done + + if [ "${all_installed}" = true ]; then + echo "=========================================" + echo "✓ All packages verified as installed despite timeout" + echo " (Timeout likely occurred during post-install triggers)" + echo "=========================================" + return 0 + else + echo "=========================================" + echo "✗ Some packages missing after timeout, will retry..." + echo "=========================================" + # Fall through to retry logic + fi + fi + # Check if failure was due to 404/mirror issues if echo "${output}" | grep -qE '(404\s+Not Found|Failed to fetch|Hash Sum mismatch|Size mismatch)'; then echo "=========================================" @@ -156,10 +212,10 @@ function apt_install_with_fallback { echo "Waiting ${sleep_time} seconds before trying next mirror..." sleep ${sleep_time} fi - else - # Non-mirror related failure, don't retry + elif [ ${exit_code} -ne 124 ]; then + # Non-mirror, non-timeout related failure, don't retry echo "=========================================" - echo "✗ Install failed with non-mirror error, not retrying" + echo "✗ Install failed with non-mirror error (exit code: ${exit_code}), not retrying" echo "=========================================" return ${exit_code} fi From 0e70635c9e5a5c4b5ed9ce09fa55cbdcc0f357f6 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 9 Dec 2025 19:16:50 -0500 Subject: [PATCH 3/6] fix: country specific mirror --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index 5e60deeda..f1e2579fa 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -46,12 +46,20 @@ function get_mirror_list { local -a mirrors=() - # Add regional CDN if detected + # Priority order: + # 1. Country-specific mirror (most reliable) + # 2. Regional CDN (can be inconsistent) + # 3. Global fallback + + # Singapore country mirror for ap-southeast-1 + if [ "${current_region}" = "ap-southeast-1" ]; then + mirrors+=("sg.ports.ubuntu.com") + fi + if [ -n "${current_region}" ]; then mirrors+=("${current_region}.clouds.ports.ubuntu.com") fi - # Add global fallback mirrors+=("ports.ubuntu.com") echo "${mirrors[@]}" From 6a17f8ea75209412555084936e517220ae430834 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 9 Dec 2025 19:48:47 -0500 Subject: [PATCH 4/6] fix: move to stage 2 --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 6 +++--- scripts/nix-provision.sh | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index f1e2579fa..5af260895 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -266,14 +266,14 @@ function update_install_packages { fi fi # Install standard packages (with mirror fallback) + # Note: ec2-hibinit-agent, ec2-instance-connect, hibagent moved to stage 2 + # because their post-install scripts try to access EC2 metadata service + # which doesn't work in a chroot and causes long hangs if ! apt_install_with_fallback install -y \ sudo \ wget \ cloud-init \ acpid \ - ec2-hibinit-agent \ - ec2-instance-connect \ - hibagent \ ncurses-term \ ssh-import-id; then echo "FATAL: Failed to install standard packages" diff --git a/scripts/nix-provision.sh b/scripts/nix-provision.sh index 9fbd37153..38a740ed4 100644 --- a/scripts/nix-provision.sh +++ b/scripts/nix-provision.sh @@ -9,6 +9,11 @@ function install_packages { # Setup Ansible on host VM sudo apt-get update && sudo apt-get install -y software-properties-common + # Install EC2-specific packages that were deferred from stage 1 + # These packages have post-install scripts that need EC2 metadata service access + # which only works on a real running EC2 instance (not in chroot) + sudo apt-get install -y ec2-hibinit-agent ec2-instance-connect hibagent + # Manually add GPG key with explicit keyserver sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 93C4A3FD7BB9C367 From 310377d684681b1210a9650d0de007a597c7f634 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 9 Dec 2025 20:07:31 -0500 Subject: [PATCH 5/6] fix: don't run services in chroot when installing package --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index 5af260895..2d38acd3a 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -14,6 +14,14 @@ export APT_OPTIONS="-oAPT::Install-Recommends=false \ -oAPT::Install-Suggests=false \ -oAcquire::Languages=none" +# Prevent services from starting during package installation in chroot +# This avoids hangs from cloud-init, dbus, etc. trying to start services +cat > /usr/sbin/policy-rc.d <<'EOF' +#!/bin/sh +exit 101 +EOF +chmod +x /usr/sbin/policy-rc.d + if [ $(dpkg --print-architecture) = "amd64" ]; then ARCH="amd64"; @@ -457,6 +465,11 @@ function cleanup_cache { apt-get clean } +# Remove policy-rc.d so services start normally on boot +function enable_services { + rm -f /usr/sbin/policy-rc.d +} + update_install_packages setup_locale setup_postgesql_env @@ -471,3 +484,4 @@ disable_sshd_passwd_auth disable_fsck #setup_ccache cleanup_cache +enable_services From e7bd06868f70f5fec4b1a5451246fd3b021d7949 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Wed, 10 Dec 2025 10:59:27 -0500 Subject: [PATCH 6/6] fix: simplify apt_install_with_fallback --- ebssurrogate/scripts/chroot-bootstrap-nix.sh | 93 +++----------------- 1 file changed, 13 insertions(+), 80 deletions(-) diff --git a/ebssurrogate/scripts/chroot-bootstrap-nix.sh b/ebssurrogate/scripts/chroot-bootstrap-nix.sh index 2d38acd3a..bb9da1b87 100755 --- a/ebssurrogate/scripts/chroot-bootstrap-nix.sh +++ b/ebssurrogate/scripts/chroot-bootstrap-nix.sh @@ -144,12 +144,9 @@ function apt_install_with_fallback { continue fi - # Attempt install with timeout (30 minutes for large packages) - # Capture output to detect 404 errors - local output - local exit_code - output=$(timeout 1800 apt-get "$@" 2>&1) && exit_code=0 || exit_code=$? - echo "${output}" + # Run apt-get install directly (no output capture to avoid buffering/timeout issues) + local exit_code=0 + apt-get "$@" || exit_code=$? if [ ${exit_code} -eq 0 ]; then echo "=========================================" @@ -158,82 +155,18 @@ function apt_install_with_fallback { return 0 fi - # Handle timeout (exit code 124) - check if packages were actually installed - if [ ${exit_code} -eq 124 ]; then - echo "=========================================" - echo "⚠ Timeout occurred (exit code 124), verifying if packages were installed..." - echo "=========================================" - - # Extract package names from arguments (skip flags like -y, --no-install-recommends, install) - local -a packages_to_check=() - local skip_next=false - for arg in "$@"; do - if [ "${skip_next}" = true ]; then - skip_next=false - continue - fi - case "${arg}" in - install|-y|--yes|--no-install-recommends|--no-install-suggests) - continue - ;; - -o*) - continue - ;; - -*) - # Skip flags and their potential arguments - continue - ;; - *) - packages_to_check+=("${arg}") - ;; - esac - done - - # Verify each package is installed - local all_installed=true - for pkg in "${packages_to_check[@]}"; do - if dpkg -l "${pkg}" 2>/dev/null | grep -q "^ii"; then - echo "✓ Package '${pkg}' is installed" - else - echo "✗ Package '${pkg}' is NOT installed" - all_installed=false - fi - done - - if [ "${all_installed}" = true ]; then - echo "=========================================" - echo "✓ All packages verified as installed despite timeout" - echo " (Timeout likely occurred during post-install triggers)" - echo "=========================================" - return 0 - else - echo "=========================================" - echo "✗ Some packages missing after timeout, will retry..." - echo "=========================================" - # Fall through to retry logic - fi - fi - - # Check if failure was due to 404/mirror issues - if echo "${output}" | grep -qE '(404\s+Not Found|Failed to fetch|Hash Sum mismatch|Size mismatch)'; then - echo "=========================================" - echo "✗ Mirror issue detected (404/hash/size mismatch), trying next mirror..." - echo "=========================================" + # On failure, check if it's a mirror issue worth retrying + echo "=========================================" + echo "✗ apt-get failed with exit code: ${exit_code}" + echo "=========================================" - # Clean apt cache to force re-download - apt-get clean + # Clean apt cache before potential retry + apt-get clean - if [ ${attempt} -lt ${max_attempts} ]; then - local sleep_time=$((attempt * 5)) - echo "Waiting ${sleep_time} seconds before trying next mirror..." - sleep ${sleep_time} - fi - elif [ ${exit_code} -ne 124 ]; then - # Non-mirror, non-timeout related failure, don't retry - echo "=========================================" - echo "✗ Install failed with non-mirror error (exit code: ${exit_code}), not retrying" - echo "=========================================" - return ${exit_code} + if [ ${attempt} -lt ${max_attempts} ]; then + local sleep_time=$((attempt * 5)) + echo "Waiting ${sleep_time} seconds before trying next mirror..." + sleep ${sleep_time} fi attempt=$((attempt + 1))