From e6de94398a5e38a717dd216b6cd4ec135a4c9d1d Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Tue, 9 Sep 2025 00:00:53 -0700
Subject: [PATCH 1/5] Add DCP to CI

Signed-off-by: Ming Yang
---
 .buildkite/test-pipeline.yaml              | 17 ++++++++++++++++-
 tests/distributed/test_context_parallel.py | 11 +++++++----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b0d4c4456d33..b2c2eb8de7e9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -851,7 +851,6 @@ steps:
   commands:
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 
 - label: LoRA TP Test (Distributed) # 17 min
   timeout_in_minutes: 30
@@ -925,9 +924,25 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
+##### H200 test #####
 - label: Qwen MoE EP Test # optional
   gpu: h200
   optional: true
   num_gpus: 2
   commands:
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+
+- label: Hopper Decode Context Parallelism Test # optional
+  gpu: h200
+  optional: true
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+
+- label: Blackwell Decode Context Parallelism Test # optional
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 23be703a3068..11685bc90c41 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -71,12 +71,13 @@ def detailed(
         parallel_setups = []
         for eager_mode_val in [False]:
             for pp_multiplier in [1]:
-                for dcp_multiplier in [2, 4]:
+                for dcp_multiplier in [0.5, 1]:
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
                             ParallelSetup(tp_size=tp_base,
                                           pp_size=pp_multiplier * pp_base,
-                                          dcp_size=dcp_multiplier * dcp_base,
+                                          dcp_size=int(dcp_multiplier *
+                                                       tp_base),
                                           eager_mode=eager_mode_val,
                                           chunked_prefill=chunked_prefill_val))
         return CPTestSettings(
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
 
 CP_TEXT_GENERATION_MODELS = {
     # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat":
+    [CPTestSettings.detailed(),
+     CPTestSettings.detailed(tp_base=2)],
 }
 
 CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ def _compare_cp_with_tp(
      "runner", "test_options"),
     [
         params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id)
+        for setting in settings for params in setting.iter_params(model_id)
         if model_id in CP_TEST_MODELS
     ],
 )

From e1a4326750b439f35d5fdd8f7007733d489bad69 Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Tue, 9 Sep 2025 00:03:48 -0700
Subject: [PATCH 2/5] add comment

Signed-off-by: Ming Yang
---
 .buildkite/test-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b2c2eb8de7e9..c2e9b1e55f45 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -940,6 +940,7 @@ steps:
   commands:
   - pytest -v -s tests/distributed/test_context_parallel.py
 
+##### B200 test #####
 - label: Blackwell Decode Context Parallelism Test # optional
   gpu: b200
   optional: true

From 07a0b956fa1bc8ddf66e0271c579e621f188aeff Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Tue, 9 Sep 2025 11:11:11 -0700
Subject: [PATCH 3/5] address comment: combine CI tests

Signed-off-by: Ming Yang
---
 .buildkite/test-pipeline.yaml | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c2e9b1e55f45..8b23c3481a37 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -925,23 +925,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Qwen MoE EP Test # optional
+- label: Distrubted Tests (H200) # optional
   gpu: h200
   optional: true
   num_gpus: 2
   commands:
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
-
-
-- label: Hopper Decode Context Parallelism Test # optional
-  gpu: h200
-  optional: true
-  num_gpus: 2
-  commands:
   - pytest -v -s tests/distributed/test_context_parallel.py
 
 ##### B200 test #####
-- label: Blackwell Decode Context Parallelism Test # optional
+- label: Distributed Tests (B200) # optional
   gpu: b200
   optional: true
   num_gpus: 2

From 0d62d75202254c09e0408707d2979ee848d0738a Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Fri, 12 Sep 2025 20:19:41 -0700
Subject: [PATCH 4/5] add working_dir; fix relative path

Signed-off-by: Ming Yang
---
 .buildkite/test-pipeline.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f3f776973a03..32ad01aa150d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -975,15 +975,17 @@ steps:
 - label: Distrubted Tests (H200) # optional
   gpu: h200
   optional: true
+  working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/distributed/test_context_parallel.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
   gpu: b200
   optional: true
+  working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
   - pytest -v -s tests/distributed/test_context_parallel.py

From 8375e797c21c29016b018a09ea713a6b5495dd29 Mon Sep 17 00:00:00 2001
From: Ming Yang
Date: Fri, 12 Sep 2025 23:35:17 -0700
Subject: [PATCH 5/5] run dcp test before moe (for testing purpose)

Signed-off-by: Ming Yang
---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 32ad01aa150d..0d944e911be1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -978,8 +978,8 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
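
For reference, a sketch of how the H200 and B200 sections of .buildkite/test-pipeline.yaml should read once the whole series is applied. It is assembled from the hunks above; surrounding steps, exact line positions, and the rest of the pipeline are omitted, and label text is kept exactly as committed.

# Assembled from PATCH 1/5 through PATCH 5/5; neighbouring steps omitted.
##### H200 test #####
- label: Distrubted Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  # DCP test runs first (PATCH 5/5 reordered it ahead of the MoE EP command)
  - pytest -v -s tests/distributed/test_context_parallel.py
  # relative path works because working_dir is /vllm-workspace/ (PATCH 4/5)
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

##### B200 test #####
- label: Distributed Tests (B200) # optional
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py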