diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 375645fde747..ead61035231d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1303,11 +1303,11 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/tests/conftest.py b/tests/conftest.py
index 163593eb3f14..11c573befb2d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1424,3 +1424,32 @@ def disable_deepgemm_ue8m0(monkeypatch):
     # Clear cache so the next time it is used it is processed with the
     # default VLLM_USE_DEEP_GEMM_E8M0 setting.
     is_deep_gemm_e8m0_used.cache_clear()
+
+
+@pytest.fixture(autouse=True)
+def clean_gpu_memory_between_tests():
+    if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
+        yield
+        return
+
+    # Wait for GPU memory to be cleared before starting the test
+    import gc
+
+    from tests.utils import wait_for_gpu_memory_to_clear
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus > 0:
+        try:
+            wait_for_gpu_memory_to_clear(
+                devices=list(range(num_gpus)),
+                threshold_ratio=0.1,
+            )
+        except ValueError as e:
+            logger.info("Failed to clean GPU memory: %s", e)
+
+    yield
+
+    # Clean up GPU memory after the test
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()