From 93ba584091907e1c889c5f09d64e4771cfe5fc5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Roche?= Date: Fri, 15 Aug 2025 13:03:33 +0200 Subject: [PATCH 1/3] fix(ci): replace EC2 Instance Connect with cloud-init SSH key injection Tests are often failing due to the SSH access to the instance. EC2 Instance Connect push the temporary SSH key which is then available only for 60 seconds. Recently, errors often occur when the SSH key is sent to the instance, resulting in a timeout. We replace runtime SSH key injection via EC2 Instance Connect API with cloud-init configuration to add the SSH public key during instance initialization. Note that we are still using EC2 Instance Connect to create the SSH key pair, but we are not using it to push the key to the instance. --- testinfra/test_ami_nix.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/testinfra/test_ami_nix.py b/testinfra/test_ami_nix.py index 96ce12dab..4e234752f 100644 --- a/testinfra/test_ami_nix.py +++ b/testinfra/test_ami_nix.py @@ -9,7 +9,6 @@ from ec2instanceconnectcli.EC2InstanceConnectLogger import EC2InstanceConnectLogger from ec2instanceconnectcli.EC2InstanceConnectKey import EC2InstanceConnectKey from time import sleep -import subprocess import paramiko # if GITHUB_RUN_ID is not set, use a default value that includes the user and hostname @@ -233,6 +232,10 @@ def host(): def gzip_then_base64_encode(s: str) -> str: return base64.b64encode(gzip.compress(s.encode())).decode() + # Create temporary SSH key pair + ec2logger = EC2InstanceConnectLogger(debug=False) + temp_key = EC2InstanceConnectKey(ec2logger.get_logger()) + instance = list( ec2.create_instances( BlockDeviceMappings=[ @@ -279,6 +282,10 @@ def gzip_then_base64_encode(s: str) -> str: - 'bash init.sh "staging"' - 'touch /var/lib/init-complete' - 'rm -rf /tmp/*' +users: + - name: ubuntu + ssh_authorized_keys: + - {temp_key.get_pub_key()} """, TagSpecifications=[ { @@ -297,16 +304,6 @@ def gzip_then_base64_encode(s: str) -> str: # Increase wait time before starting health checks sleep(30) # Wait for 30 seconds to allow services to start - ec2logger = EC2InstanceConnectLogger(debug=False) - temp_key = EC2InstanceConnectKey(ec2logger.get_logger()) - ec2ic = boto3.client("ec2-instance-connect", region_name="ap-southeast-1") - response = ec2ic.send_ssh_public_key( - InstanceId=instance.id, - InstanceOSUser="ubuntu", - SSHPublicKey=temp_key.get_pub_key(), - ) - assert response["Success"] - # Wait for instance to have public IP while not instance.public_ip_address: logger.warning("waiting for ip to be available") From 3882f3e93dd78b14643e37a77b6ac324ed37940c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Roche?= Date: Fri, 15 Aug 2025 15:51:30 +0200 Subject: [PATCH 2/3] fix(ci): terminate only the ec2 instance of the matrix job For the moment, the first matrix job that finishes will terminate all the ec2 instances running in the current workflow run. This is not what we want. This change only terminates the instance that is running the matrix job. --- .github/workflows/ami-release-nix-single.yml | 12 +++++++----- .github/workflows/ami-release-nix.yml | 12 +++++++----- .github/workflows/qemu-image-build.yml | 8 +++++--- .github/workflows/testinfra-ami-build.yml | 12 +++++++----- testinfra/test_ami_nix.py | 4 ++-- 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ami-release-nix-single.yml b/.github/workflows/ami-release-nix-single.yml index dd579df30..6209c5692 100644 --- a/.github/workflows/ami-release-nix-single.yml +++ b/.github/workflows/ami-release-nix-single.yml @@ -49,7 +49,9 @@ jobs: trusted-public-keys = nix-postgres-artifacts:dGZlQOvKcNEjvT7QEAJbcV6b6uk7VF/hWMjhYleiaLI=% cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= - name: Set PostgreSQL version environment variable - run: echo "POSTGRES_MAJOR_VERSION=${{ github.event.inputs.postgres_version }}" >> $GITHUB_ENV + run: | + echo "POSTGRES_MAJOR_VERSION=${{ github.event.inputs.postgres_version }}" >> $GITHUB_ENV + echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV - name: Generate common-nix.vars.pkr.hcl run: | @@ -65,7 +67,7 @@ jobs: run: | GIT_SHA=${{ steps.get_sha.outputs.sha }} nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl - name: Build AMI stage 2 env: @@ -74,7 +76,7 @@ jobs: GIT_SHA=${{ steps.get_sha.outputs.sha }} nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl POSTGRES_MAJOR_VERSION=${{ env.POSTGRES_MAJOR_VERSION }} - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl - name: Grab release version id: process_release_version @@ -153,10 +155,10 @@ jobs: - name: Cleanup resources after build if: ${{ always() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids - name: Cleanup resources on build cancellation if: ${{ cancelled() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids diff --git a/.github/workflows/ami-release-nix.yml b/.github/workflows/ami-release-nix.yml index df0bf096e..405deeeb4 100644 --- a/.github/workflows/ami-release-nix.yml +++ b/.github/workflows/ami-release-nix.yml @@ -77,7 +77,9 @@ jobs: fi - name: Set PostgreSQL version environment variable - run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + run: | + echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV - name: Generate common-nix.vars.pkr.hcl run: | @@ -94,7 +96,7 @@ jobs: GIT_SHA=${{github.sha}} nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl # why is postgresql_major defined here instead of where the _three_ other postgresql_* variables are defined? - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl - name: Build AMI stage 2 env: @@ -103,7 +105,7 @@ jobs: GIT_SHA=${{github.sha}} nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl POSTGRES_MAJOR_VERSION=${{ env.POSTGRES_MAJOR_VERSION }} - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl - name: Grab release version id: process_release_version @@ -182,9 +184,9 @@ jobs: - name: Cleanup resources after build if: ${{ always() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids - name: Cleanup resources on build cancellation if: ${{ cancelled() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids diff --git a/.github/workflows/qemu-image-build.yml b/.github/workflows/qemu-image-build.yml index fad63a510..1be4caa15 100644 --- a/.github/workflows/qemu-image-build.yml +++ b/.github/workflows/qemu-image-build.yml @@ -65,7 +65,9 @@ jobs: sudo chmod 666 /dev/kvm - name: Set PostgreSQL version environment variable - run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + run: | + echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV - name: Generate common-nix.vars.pkr.hcl run: | @@ -155,9 +157,9 @@ jobs: - name: Cleanup resources after build if: ${{ always() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids - name: Cleanup resources on build cancellation if: ${{ cancelled() }} run: | - aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids + aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids diff --git a/.github/workflows/testinfra-ami-build.yml b/.github/workflows/testinfra-ami-build.yml index 892b54b74..a468b6860 100644 --- a/.github/workflows/testinfra-ami-build.yml +++ b/.github/workflows/testinfra-ami-build.yml @@ -96,7 +96,9 @@ jobs: run: echo "random_string=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT - name: Set PostgreSQL version environment variable - run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + run: | + echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV + echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV - name: Generate common-nix.vars.pkr.hcl run: | @@ -110,13 +112,13 @@ jobs: run: | GIT_SHA=${{github.sha}} nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl - name: Build AMI stage 2 run: | GIT_SHA=${{github.sha}} nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl - nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "git_sha=${GITHUB_SHA}" stage2-nix-psql.pkr.hcl + nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "git_sha=${GITHUB_SHA}" stage2-nix-psql.pkr.hcl - name: Run tests timeout-minutes: 10 @@ -130,12 +132,12 @@ jobs: - name: Cleanup resources on build cancellation if: ${{ cancelled() }} run: | - aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids + aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids - name: Cleanup resources after build if: ${{ always() }} run: | - aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:testinfra-run-id,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids || true + aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:testinfra-run-id,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids || true - name: Cleanup AMIs if: always() diff --git a/testinfra/test_ami_nix.py b/testinfra/test_ami_nix.py index 4e234752f..21db574a6 100644 --- a/testinfra/test_ami_nix.py +++ b/testinfra/test_ami_nix.py @@ -11,9 +11,9 @@ from time import sleep import paramiko -# if GITHUB_RUN_ID is not set, use a default value that includes the user and hostname +# if EXECUTION_ID is not set, use a default value that includes the user and hostname RUN_ID = os.environ.get( - "GITHUB_RUN_ID", + "EXECUTION_ID", "unknown-ci-run-" + os.environ.get("USER", "unknown-user") + "@" From 3f97cbecceafdddc943c32ce1a34c9b92109cdcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Roche?= Date: Mon, 18 Aug 2025 15:00:37 +0200 Subject: [PATCH 3/3] fix(ci): init.sh completion check should not be blocking Add optional timeout parameter to run_ssh_command() to check init completion status with a 5-second timeout. --- testinfra/test_ami_nix.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testinfra/test_ami_nix.py b/testinfra/test_ami_nix.py index 21db574a6..19219f74a 100644 --- a/testinfra/test_ami_nix.py +++ b/testinfra/test_ami_nix.py @@ -205,9 +205,9 @@ def get_ssh_connection(instance_ip, ssh_identity_file, max_retries=10): sleep(5) -def run_ssh_command(ssh, command): +def run_ssh_command(ssh, command, timeout=None): """Run a command over the established SSH connection.""" - stdin, stdout, stderr = ssh.exec_command(command) + stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout) exit_code = stdout.channel.recv_exit_status() return { 'succeeded': exit_code == 0, @@ -330,7 +330,7 @@ def gzip_then_base64_encode(s: str) -> str: attempt = 0 while attempt < max_attempts: try: - result = run_ssh_command(ssh, "test -f /var/lib/init-complete") + result = run_ssh_command(ssh, "test -f /var/lib/init-complete", timeout=5) if result['succeeded']: logger.info("init.sh has completed") break