diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b0f5fe418dcf..0d944e911be1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -898,7 +898,6 @@ steps:
   commands:
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 
 - label: LoRA TP Test (Distributed) # 17 min
   timeout_in_minutes: 30
@@ -972,9 +971,21 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/distributed/test_context_parallel.py
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 23be703a3068..11685bc90c41 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -71,12 +71,13 @@ def detailed(
         parallel_setups = []
         for eager_mode_val in [False]:
             for pp_multiplier in [1]:
-                for dcp_multiplier in [2, 4]:
+                for dcp_multiplier in [0.5, 1]:
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
                             ParallelSetup(tp_size=tp_base,
                                           pp_size=pp_multiplier * pp_base,
-                                          dcp_size=dcp_multiplier * dcp_base,
+                                          dcp_size=int(dcp_multiplier *
+                                                       tp_base),
                                           eager_mode=eager_mode_val,
                                           chunked_prefill=chunked_prefill_val))
         return CPTestSettings(
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
 
 CP_TEXT_GENERATION_MODELS = {
     # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat":
+    [CPTestSettings.detailed(),
+     CPTestSettings.detailed(tp_base=2)],
 }
 
 CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ def _compare_cp_with_tp(
      "runner", "test_options"),
     [
         params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id)
+        for setting in settings for params in setting.iter_params(model_id)
         if model_id in CP_TEST_MODELS
     ],
 )