From 271286cc8ae68298755bf08a68d1af02dd016603 Mon Sep 17 00:00:00 2001 From: Gerhard Lazu Date: Mon, 31 Jul 2023 08:29:59 +0100 Subject: [PATCH] Make our ship_it.yml GHA workflow resilient (#476) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As you already know, we use Dagger for CI/CD. By default, this runs on Fly.io (via Docker). In some cases, this can fail. The last failure was when DNS resolution stopped working after the Docker instance was auto-upgraded from apps v1 -> v2 (a.k.a. Fly.io machines), e.g. https://github.com/thechangelog/changelog.com/actions/runs/5673476702/attempts/1 As a temporary fix, we had to delete some secrets and re-run the job. The job ran on GHA free runners & failed for genuine reasons 6 mins later: https://github.com/thechangelog/changelog.com/actions/runs/5673476702/job/15395264391 While running on the free GHA runners can be 3x-8x slower, it's a good fall-back. You heard us mention on multiple occasions: "always have redundancies in place". Since we already have multiple CI runtimes in place (Fly.io, K8s), let's make our GHA workflow resilient by: - Run on our preferred back-end by default (Dagger on Fly.io) - ✅ If it succeeds, we are done - ❌ If it fails, fall back to running on the free GitHub runners - In forks, use free GitHub runners by default (we cannot share `secrets`) While this means that a workflow which fails for genuine reasons will fail twice for us (1. Dagger on Fly.io, 2. Dagger on GitHub), it seems like a better place to improve from. This change goes one step further. We are using a third back-end: Dagger on K8s. This uses a self-hosted GitHub runner on K8s which is already integrated with Dagger. For now, we are using it just to see how the CI part compares to our primary setup (Dagger on Fly.io). We are not using Dagger on K8s to deploy the app. Let's see how this setup behaves over a few weeks/months before we consider taking it further. 
As part of this, we also improved how we check for Fly.io connectivity. Things that could be improved in follow-ups: - the workflow should succeed if the `dagger-on-github-fallback` job succeeds - currently it fails if `dagger-on-fly-docker` fails - add `dagger-on-k8s` job as secondary fallback - GitHub Actions is currently missing https://github.com/actions/runner/issues/1665 - maybe leverage a Dagger cache that works in forks too 😉 - Run Dagger Engine as a Fly Machine (no more Docker) - https://github.com/thechangelog/changelog.com/pull/471 Signed-off-by: Gerhard Lazu --- .github/workflows/dagger_on_fly_docker.yml | 69 ++++++++++++++++++ .github/workflows/dagger_on_github.yml | 38 ++++++++++ .github/workflows/dagger_on_k8s.yml | 32 +++++++++ .github/workflows/ship_it.yml | 81 ++++++++-------------- 4 files changed, 169 insertions(+), 51 deletions(-) create mode 100644 .github/workflows/dagger_on_fly_docker.yml create mode 100644 .github/workflows/dagger_on_github.yml create mode 100644 .github/workflows/dagger_on_k8s.yml diff --git a/.github/workflows/dagger_on_fly_docker.yml b/.github/workflows/dagger_on_fly_docker.yml new file mode 100644 index 0000000000..4dfb5be001 --- /dev/null +++ b/.github/workflows/dagger_on_fly_docker.yml @@ -0,0 +1,69 @@ +name: "Dagger on Fly.io Docker" + +on: + workflow_call: + secrets: + FLY_WIREGUARD: + required: true + +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: "Checkout code..." + uses: actions/checkout@v3 + + # ⚠️ FLY_WIREGUARD is configured via `fly wireguard create ...` - see 2022.fly/docker/README.md + - name: "Set up WireGuard for Fly.io Docker Engine..." + run: | + echo "🔒 Install WireGuard & friends..." + sudo DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends wireguard-tools openresolv + echo "🔐 Configure WireGuard tunnel..." + printf "${{ secrets.FLY_WIREGUARD }}" | sudo tee /etc/wireguard/fly.conf > /dev/null + sudo wg-quick up fly + echo "🩻 Check IPv6 routes..." 
+ sudo ip -6 route list + echo "🩻 Check DNS resolution..." + sudo resolvconf -v + + - name: "Check remote Docker Engine..." + env: + DOCKER_ENGINE_HOST: ${{ vars.DOCKER_ENGINE_HOST }} + run: | + echo "🤨 Can we resolve ${DOCKER_ENGINE_HOST:?must be set} IPv6?" + dig +short "$DOCKER_ENGINE_HOST" AAAA + echo "🤨 Can we ping $DOCKER_ENGINE_HOST IPv6?" + ping6 -c 3 "$(dig +short "$DOCKER_ENGINE_HOST" AAAA)" + echo "🤨 Can we ping $DOCKER_ENGINE_HOST FQDN?" + ping6 -c 3 "$DOCKER_ENGINE_HOST" + echo "🤨 Can we connect to Docker running on $DOCKER_ENGINE_HOST?" + nc -vz6 "$DOCKER_ENGINE_HOST" 2375 + + - uses: actions/setup-go@v4 + with: + go-version: "1.20" + cache-dependency-path: "magefiles/go.sum" + + - name: "Build, test, publish & deploy..." + id: cicd + env: + DOCKER_HOST: "${{ vars.DOCKER_ENGINE_HOST_FQDN }}" + IMAGE_OWNER: "${{ vars.IMAGE_OWNER }}" + GHCR_USERNAME: "${{ github.actor }}" + GHCR_PASSWORD: "${{ secrets.GHCR_PASSWORD }}" + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" + run: | + cd magefiles + go run main.go -w ../ ci cd + + - name: "Announce deploy in #dev Slack..." + if: ${{ github.repository == 'thechangelog/changelog.com' && github.ref_name == 'master' }} + uses: rtCamp/action-slack-notify@v2 + env: + MSG_MINIMAL: "commit,actions url" + SLACK_CHANNEL: dev + SLACK_USERNAME: "GitHub Actions" + SLACK_FOOTER: "Just got shipped to https://changelog.com" + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} diff --git a/.github/workflows/dagger_on_github.yml b/.github/workflows/dagger_on_github.yml new file mode 100644 index 0000000000..90d2fdcb63 --- /dev/null +++ b/.github/workflows/dagger_on_github.yml @@ -0,0 +1,38 @@ +name: "Dagger on GitHub" + +on: + workflow_call: + +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: "Checkout code..." 
+ uses: actions/checkout@v3 + + - uses: actions/setup-go@v4 + with: + go-version: "1.20" + cache-dependency-path: "magefiles/go.sum" + + - name: "Build, test, publish & deploy..." + env: + IMAGE_OWNER: "${{ vars.IMAGE_OWNER }}" + GHCR_USERNAME: "${{ github.actor }}" + GHCR_PASSWORD: "${{ secrets.GHCR_PASSWORD }}" + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" + run: | + cd magefiles + go run main.go -w ../ ci cd + + - name: "Announce deploy in #dev Slack..." + if: ${{ github.repository == 'thechangelog/changelog.com' && github.ref_name == 'master' }} + uses: rtCamp/action-slack-notify@v2 + env: + MSG_MINIMAL: "commit,actions url" + SLACK_CHANNEL: dev + SLACK_USERNAME: "GitHub Actions" + SLACK_FOOTER: "Just got shipped to https://changelog.com" + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} diff --git a/.github/workflows/dagger_on_k8s.yml b/.github/workflows/dagger_on_k8s.yml new file mode 100644 index 0000000000..196ffae0e0 --- /dev/null +++ b/.github/workflows/dagger_on_k8s.yml @@ -0,0 +1,32 @@ +name: "Dagger on K8s" + +on: + workflow_call: + +jobs: + run: + runs-on: self-hosted + continue-on-error: true + steps: + - name: "Checkout code..." + uses: actions/checkout@v3 + + - uses: actions/setup-go@v4 + with: + go-version: "1.20" + cache-dependency-path: "magefiles/go.sum" + + - name: "Build, test, publish & deploy..." + env: + IMAGE_OWNER: "${{ vars.IMAGE_OWNER }}" + GHCR_USERNAME: "${{ github.actor }}" + GHCR_PASSWORD: "${{ secrets.GHCR_PASSWORD }}" + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" + run: | + cd magefiles + go run main.go -w ../ ci + + # TODO: run this in Dagger + # - name: "Announce deploy in #dev Slack..." 
diff --git a/.github/workflows/ship_it.yml b/.github/workflows/ship_it.yml index cb5e63bc03..037e1fc9ae 100644 --- a/.github/workflows/ship_it.yml +++ b/.github/workflows/ship_it.yml @@ -1,5 +1,10 @@ name: "Ship It!" +concurrency: + # There should only be one running job per repository / branch combo. + # We do not want multiple deploys running in parallel. + group: ${{ github.repository }}-${{ github.ref_name }} + on: push: branches: @@ -9,58 +14,32 @@ on: pull_request: workflow_dispatch: +# All jobs have the same outcome. We define multiple for resiliency reasons. jobs: - cicd: - runs-on: ubuntu-latest - steps: - - name: "Checkout code..." - uses: actions/checkout@v3 - - # ⚠️ FLY_WIREGUARD is configured via `fly wireguard create ...` - see 2022.fly/docker/README.md - - name: "Set up WireGuard for Fly.io Docker Engine..." - env: - FLY_WIREGUARD: ${{ secrets.FLY_WIREGUARD }} - if: "${{ env.FLY_WIREGUARD != '' }}" - run: | - sudo DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends wireguard-tools openresolv - printf "${{ secrets.FLY_WIREGUARD }}" | sudo tee /etc/wireguard/fly.conf - sudo wg-quick up fly + # In thechangelog/changelog.com repository (a.k.a. 
upstream), + # this is the preferred default: + dagger-on-fly-docker: + if: ${{ contains(vars.RUNS_ON, 'fly') }} + uses: ./.github/workflows/dagger_on_fly_docker.yml + secrets: inherit - # ⚠️ IPv6 is configured via `fly ips private` - see 2022.fly/docker/README.md - - name: "Check Fly.io Docker Engine" - env: - DOCKER_ENGINE_HOST: ${{ secrets.DOCKER_ENGINE_HOST }} - if: "${{ env.DOCKER_ENGINE_HOST != '' }}" - run: | - ping6 -c 5 "$DOCKER_ENGINE_HOST" - nc -vz6 "$DOCKER_ENGINE_HOST" 2375 + # When our Fly.io setup misbehaves, we want a fallback: + dagger-on-github-fallback: + needs: dagger-on-fly-docker + if: ${{ failure() }} + uses: ./.github/workflows/dagger_on_github.yml + secrets: inherit - - uses: actions/setup-go@v4 - with: - go-version: "1.20" - - name: "Build, test, publish & deploy..." - env: - DOCKER_HOST: "${{ secrets.DOCKER_ENGINE_HOST_FQDN }}" - IMAGE_OWNER: "${{ secrets.IMAGE_OWNER }}" - GHCR_USERNAME: "${{ github.actor }}" - GHCR_PASSWORD: "${{ secrets.GHCR_PASSWORD }}" - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" - AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" - run: | - cd magefiles - go run main.go -w ../ ci cd + # As forks will not have access to our Fly.io, + # we fall back to GitHub default: + dagger-on-github: + if: ${{ !contains(vars.RUNS_ON, 'fly') }} + uses: ./.github/workflows/dagger_on_github.yml + secrets: inherit - notify: - if: ${{ github.repository == 'thechangelog/changelog.com' && github.ref_name == 'master' }} - needs: cicd - runs-on: ubuntu-latest - steps: - - name: "Notify Slack about deploy..." 
- uses: rtCamp/action-slack-notify@v2 - env: - MSG_MINIMAL: "commit,actions url" - SLACK_CHANNEL: dev - SLACK_USERNAME: "GitHub Actions" - SLACK_FOOTER: "Just got shipped to https://changelog.com" - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + # This is an experimental job which only runs the CI part of our pipeline. 
+ # In other words, this does not run CD, it does not deploy our app. + dagger-on-k8s: + if: ${{ contains(vars.RUNS_ON, 'k8s') }} + uses: ./.github/workflows/dagger_on_k8s.yml + secrets: inherit