Skip to content

Commit

Permalink
Split gpu tests in head and non-head versions (horovod#3155)
Browse files (browse the repository at this point in the history)
Signed-off-by: Enrico Minack <github@enrico.minack.dev>
Signed-off-by: weihanmines <weihan13@amd.com>
  • Loading branch information
EnricoMi authored and weihanmines committed Dec 10, 2021
1 parent d7acb6c commit 111bcfe
Show file tree
Hide file tree
Showing 7 changed files with 320 additions and 217 deletions.
6 changes: 4 additions & 2 deletions .buildkite/gen-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ baseline="test-cpu-gloo-py3_8-tf2_6_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspar
# skip tests when there are no code changes
dir="$(dirname "$0")"
code_files=$(python "$dir/get_changed_code_files.py" || echo failure)
tests=$(if [[ "${PIPELINE_MODE:-}" == *"FULL"* ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}" ]] || [[ -n "$code_files" ]] ); then
tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}" ]] || [[ -n "$code_files" ]] ); then
# we vary the baseline along the Python dimension and PySpark together
# run_gloo_integration expects these to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf2_6_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8 "
Expand Down Expand Up @@ -59,7 +59,9 @@ tests=$(if [[ "${PIPELINE_MODE:-}" == *"FULL"* ]] && ( [[ "${BUILDKITE_BRANCH:-}
# and one final test with mixed cpu+gpu
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 "
fi | if [[ "${PIPELINE_MODE:-}" == "GPU FULL" ]]; then sed -E "s/[^ ]*-cpu-[^ ]*//g"; else cat; fi)
fi | if [[ "${PIPELINE_MODE:-}" == "GPU"* ]]; then sed -E "s/[^ ]*-cpu-[^ ]*//g"; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU HEADS" ]]; then sed -E "s/ /\n/g" | grep -e "-tfhead-keras_none-torchhead-mxnethead-" | paste -s -d " "; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU NON HEADS" ]]; then sed -E "s/[^ ]*-tfhead-keras_none-torchhead-mxnethead-[^ ]*//g"; else cat; fi)
read -r -a tests <<< "$tests"
Expand Down
17 changes: 9 additions & 8 deletions .github/gen-workflow-ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,11 +462,11 @@ def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3
'\n'.join([f' ${{{{ steps.test-{attempt}.outputs.artifacts-path }}}}'
for attempt in range(1, attempts+1)]))

def trigger_buildkite_job(id: str, needs: List[str]) -> str:
def trigger_buildkite_job(id: str, name: str, needs: List[str], mode: str) -> str:
if 'init-workflow' not in needs:
needs.insert(0, 'init-workflow')
return (f' {id}:\n'
f' name: "Build and Test GPU (on Builtkite)"\n'
f' name: "{name}"\n'
f' needs: [{", ".join(needs)}]\n'
f' runs-on: ubuntu-latest\n'
f' if: >\n'
Expand All @@ -486,7 +486,7 @@ def trigger_buildkite_job(id: str, needs: List[str]) -> str:
f' BRANCH: "${{{{ github.event.pull_request.head.ref }}}}"\n'
f' MESSAGE: "GPU Tests triggered by GitHub"\n'
f' BUILDKITE_API_ACCESS_TOKEN: ${{{{ secrets.BUILDKITE_TOKEN }}}}\n'
f' BUILD_ENV_VARS: "{{\\"PIPELINE_MODE\\": \\"GPU FULL\\"}}"\n'
f' BUILD_ENV_VARS: "{{\\"PIPELINE_MODE\\": \\"{mode}\\"}}"\n'
f'\n'
f' - name: Download Buildkite Artifacts\n'
f' id: download\n'
Expand All @@ -497,14 +497,14 @@ def trigger_buildkite_job(id: str, needs: List[str]) -> str:
f' buildkite_build_url: ${{{{ steps.build.outputs.url }}}}\n'
f' ignore_build_states: blocked,canceled,skipped,not_run\n'
f' ignore_job_states: timed_out\n'
f' output_path: artifacts/Unit Test Results - GPUs on Buildkite\n'
f' output_path: artifacts/Unit Test Results - {mode} on Builtkite\n'
f'\n'
f' - name: Upload Test Results\n'
f' uses: actions/upload-artifact@v2\n'
f' if: always()\n'
f' with:\n'
f' name: Unit Test Results - GPUs on Builtkite\n'
f' path: artifacts/Unit Test Results - GPUs on Buildkite/**/*.xml\n'
f' name: Unit Test Results - {mode} on Builtkite\n'
f' path: artifacts/Unit Test Results - {mode} on Builtkite/**/*.xml\n' +
f'\n'
f' - name: Check Buildkite job state\n'
f' if: >\n'
Expand Down Expand Up @@ -779,8 +779,9 @@ def sync_files(needs: List[str]) -> str:
build_and_test_images(id='build-and-test', name='Build and Test', needs=['init-workflow'], images=release_images, parallel_images='-cpu-', tests_per_image=tests_per_image, tests=tests),
build_and_test_images(id='build-and-test-heads', name='Build and Test heads', needs=['build-and-test'], images=allhead_images, parallel_images='', tests_per_image=tests_per_image, tests=tests),
build_and_test_macos(id='build-and-test-macos', name='Build and Test macOS', needs=['build-and-test']),
trigger_buildkite_job(id='buildkite', needs=['build-and-test']),
publish_unit_test_results(id='publish-test-results', needs=['build-and-test', 'build-and-test-heads', 'build-and-test-macos', 'buildkite']),
trigger_buildkite_job(id='buildkite', name='Build and Test GPU (on Builtkite)', needs=['build-and-test'], mode='GPU NON HEADS'),
trigger_buildkite_job(id='buildkite-heads', name='Build and Test GPU heads (on Builtkite)', needs=['buildkite'], mode='GPU HEADS'),
publish_unit_test_results(id='publish-test-results', needs=['build-and-test', 'build-and-test-heads', 'build-and-test-macos', 'buildkite', 'buildkite-heads']),
publish_docker_images(needs=['build-and-test', 'buildkite'], images=['horovod', 'horovod-cpu', 'horovod-ray']),
sync_files(needs=['init-workflow'])
)
Expand Down
56 changes: 51 additions & 5 deletions .github/workflows/ci-fork.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
BRANCH: "${{ github.event.workflow_run.head_repository.owner.login }}:${{ github.event.workflow_run.head_branch }}"
MESSAGE: "${{ github.event.workflow_run.message }}"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU FULL\"}"
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU NON HEADS\"}"

- name: Download Buildkite Artifacts
id: download
Expand All @@ -61,14 +61,60 @@ jobs:
buildkite_build_url: ${{ steps.buildkite.outputs.url }}
ignore_build_states: blocked,canceled,skipped,not_run
ignore_job_states: timed_out
output_path: artifacts/Unit Test Results - GPUs on Buildkite
output_path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite

- name: Upload Test Results
uses: actions/upload-artifact@v2
if: always()
with:
name: Unit Test Results - GPUs on Builtkite
path: artifacts/Unit Test Results - GPUs on Buildkite/**/*.xml
name: Unit Test Results - GPU NON HEADS on Builtkite
path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite/**/*.xml

- name: Check Buildkite job state
if: >
always() &&
steps.download.conclusion == 'success' &&
steps.download.outputs.build-state != 'passed'
run: |
echo "::warning::Buildkite pipeline did not pass: ${{ steps.buildkite.outputs.url }}"
exit 1
buildkite-heads:
name: "Build and Test GPU heads (on Builtkite)"
needs: [buildkite]
runs-on: ubuntu-latest
# only run if CI workflow's build-and-test job succeeded and CI workflow ran on a fork
if: needs.ci-workflow.outputs.build-and-test == 'success'

steps:
- name: Trigger Buildkite Pipeline
id: buildkite
uses: EnricoMi/trigger-pipeline-action@master
env:
PIPELINE: "horovod/horovod"
COMMIT: "${{ github.event.workflow_run.head_sha }}"
BRANCH: "${{ github.event.workflow_run.head_repository.owner.login }}:${{ github.event.workflow_run.head_branch }}"
MESSAGE: "${{ github.event.workflow_run.message }}"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU HEADS\"}"

- name: Download Buildkite Artifacts
id: download
uses: docker://ghcr.io/enricomi/download-buildkite-artifact-action:v1
with:
github_token: ${{ github.token }}
buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
buildkite_build_url: ${{ steps.buildkite.outputs.url }}
ignore_build_states: blocked,canceled,skipped,not_run
ignore_job_states: timed_out
output_path: artifacts/Unit Test Results - GPU HEADS on Builtkite

- name: Upload Test Results
uses: actions/upload-artifact@v2
if: always()
with:
name: Unit Test Results - GPU HEADS on Builtkite
path: artifacts/Unit Test Results - GPU HEADS on Builtkite/**/*.xml

- name: Check Buildkite job state
if: >
Expand All @@ -81,7 +127,7 @@ jobs:
publish-test-results:
name: "Publish Unit Tests Results"
needs: [buildkite]
needs: [buildkite, buildkite-heads]
runs-on: ubuntu-latest
# only run if CI workflow ran on a fork
if: >
Expand Down
58 changes: 53 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3429,7 +3429,7 @@ jobs:
BRANCH: "${{ github.event.pull_request.head.ref }}"
MESSAGE: "GPU Tests triggered by GitHub"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU FULL\"}"
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU NON HEADS\"}"

- name: Download Buildkite Artifacts
id: download
Expand All @@ -3440,14 +3440,62 @@ jobs:
buildkite_build_url: ${{ steps.build.outputs.url }}
ignore_build_states: blocked,canceled,skipped,not_run
ignore_job_states: timed_out
output_path: artifacts/Unit Test Results - GPUs on Buildkite
output_path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite

- name: Upload Test Results
uses: actions/upload-artifact@v2
if: always()
with:
name: Unit Test Results - GPUs on Builtkite
path: artifacts/Unit Test Results - GPUs on Buildkite/**/*.xml
name: Unit Test Results - GPU NON HEADS on Builtkite
path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite/**/*.xml

- name: Check Buildkite job state
if: >
always() &&
steps.download.conclusion == 'success' &&
steps.download.outputs.build-state != 'passed'
run: |
echo "::warning::Buildkite pipeline did not pass: ${{ steps.build.outputs.url }}"
exit 1
buildkite-heads:
name: "Build and Test GPU heads (on Builtkite)"
needs: [init-workflow, buildkite]
runs-on: ubuntu-latest
if: >
github.repository == 'horovod/horovod' &&
needs.init-workflow.outputs.run_at_all == 'true' &&
needs.init-workflow.outputs.run_builds_and_tests == 'true' &&
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
steps:
- name: Trigger Buildkite Pipeline
id: build
uses: EnricoMi/trigger-pipeline-action@master
env:
PIPELINE: "horovod/horovod"
BRANCH: "${{ github.event.pull_request.head.ref }}"
MESSAGE: "GPU Tests triggered by GitHub"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU HEADS\"}"

- name: Download Buildkite Artifacts
id: download
uses: docker://ghcr.io/enricomi/download-buildkite-artifact-action:v1
with:
github_token: ${{ github.token }}
buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
buildkite_build_url: ${{ steps.build.outputs.url }}
ignore_build_states: blocked,canceled,skipped,not_run
ignore_job_states: timed_out
output_path: artifacts/Unit Test Results - GPU HEADS on Builtkite

- name: Upload Test Results
uses: actions/upload-artifact@v2
if: always()
with:
name: Unit Test Results - GPU HEADS on Builtkite
path: artifacts/Unit Test Results - GPU HEADS on Builtkite/**/*.xml

- name: Check Buildkite job state
if: >
Expand All @@ -3460,7 +3508,7 @@ jobs:
publish-test-results:
name: "Publish Unit Tests Results"
needs: [build-and-test, build-and-test-heads, build-and-test-macos, buildkite]
needs: [build-and-test, build-and-test-heads, build-and-test-macos, buildkite, buildkite-heads]
runs-on: ubuntu-latest
# only run this job when the workflow is in success or failure state,
# not when it is in cancelled or skipped state
Expand Down
Loading

0 comments on commit 111bcfe

Please sign in to comment.