Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1303,11 +1303,11 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
Expand Down
29 changes: 29 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1424,3 +1424,32 @@ def disable_deepgemm_ue8m0(monkeypatch):
# Clear cache so the next time it is used it is processed with the
# default VLLM_USE_DEEP_GEMM_E8M0 setting.
is_deep_gemm_e8m0_used.cache_clear()


@pytest.fixture(autouse=True)
def clean_gpu_memory_between_tests():
    """Optionally wait for and release GPU memory around every test.

    The fixture is a no-op unless the ``VLLM_TEST_CLEAN_GPU_MEMORY``
    environment variable is set to ``"1"``. When enabled it:

    * before the test, calls ``wait_for_gpu_memory_to_clear`` on every
      CUDA device reported by ``torch.cuda.device_count()`` with
      ``threshold_ratio=0.1``; a ``ValueError`` from that call is logged
      and the test still runs;
    * after the test, runs the garbage collector and then empties the
      CUDA caching allocator.
    """
    if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
        yield
        return

    # These imports sit after the early return, so they only execute when
    # cleanup has been enabled.
    import gc

    from tests.utils import wait_for_gpu_memory_to_clear

    # Wait for GPU memory to be cleared before starting the test
    num_gpus = torch.cuda.device_count()
    if num_gpus > 0:
        try:
            wait_for_gpu_memory_to_clear(
                devices=list(range(num_gpus)),
                threshold_ratio=0.1,
            )
        except ValueError as e:
            # Best effort: log and run the test anyway.
            # NOTE(review): presumably raised when memory does not drop
            # below the threshold in time -- confirm against tests.utils.
            logger.info("Failed to clean GPU memory: %s", e)

    yield

    # Clean up GPU memory after the test. Collect garbage first so that
    # tensors kept alive only by reference cycles are destroyed and their
    # blocks become unoccupied; empty_cache() can only release unoccupied
    # cached memory.
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()