3 changes: 2 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -296,6 +296,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
@@ -317,7 +318,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
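The split above relies on the `cpu_test` pytest marker to partition `v1/core` between the GPU job (`-m 'not cpu_test'`) and the CPU-only job (`-m 'cpu_test'`). A minimal sketch of the marker wiring this assumes — the registration hook, file name, and test body below are illustrative, not taken from this PR:

```python
# tests/conftest.py (illustrative): register the marker so runs with
# --strict-markers do not reject it.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "cpu_test: test that can run without a GPU"
    )


# tests/v1/core/test_example.py (illustrative): tests carrying the marker are
# collected by `-m 'cpu_test'`; everything else lands in the GPU job via
# `-m 'not cpu_test'`.
import pytest


@pytest.mark.cpu_test
def test_block_hash_is_stable():
    assert hash(("block", 0)) == hash(("block", 0))
```

Splitting on a marker rather than on file paths keeps the two jobs in sync automatically as new tests are added under `v1/core`.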
9 changes: 2 additions & 7 deletions tests/basic_correctness/test_basic_correctness.py
@@ -13,7 +13,7 @@
import torch

from vllm import LLM
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from vllm.v1.engine.llm_engine import LLMEngine

from ..conftest import HfRunner, VllmRunner
from ..models.utils import check_outputs_equal
@@ -211,16 +211,11 @@ def test_models_distributed(


def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
from vllm.envs import VLLM_USE_V1

if not VLLM_USE_V1:
pytest.skip("Skipping V0 test, dump input not supported")

# Needed to mock an error in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
if isinstance(vllm_model.llm.llm_engine, LLMEngine):
v1_test_failed_model_execution(vllm_model)


105 changes: 48 additions & 57 deletions tests/basic_correctness/test_cumem.py
@@ -117,68 +117,59 @@ def model(x):

@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model, use_v1",
"model",
[
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
"meta-llama/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
("facebook/opt-125m", True),
"facebook/opt-125m",
],
)
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
assert use_v1
m.setenv("VLLM_USE_V1", "1")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)

# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)

free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)

# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be released from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
if use_v1:
assert used_bytes < 7 * GiB_bytes
else:
assert used_bytes < 2 * GiB_bytes

llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text

llm.sleep(level=1)
llm.wake_up(tags=["weights"])

free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline

# should just reallocate memory for weights (1B model, ~2GiB weights)
if use_v1:
assert used_bytes < 10 * GiB_bytes
else:
assert used_bytes < 6 * GiB_bytes

# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)

# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
def test_end_to_end(model: str):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)

# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)

free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)

# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be released from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
assert used_bytes < 7 * GiB_bytes

llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text

llm.sleep(level=1)
llm.wake_up(tags=["weights"])

free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline

# should just reallocate memory for weights (1B model, ~2GiB weights)
assert used_bytes < 10 * GiB_bytes

# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)

# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text


@create_new_process_for_each_test()
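For reference, a condensed sketch of the sleep/wake round trip the rewritten `test_end_to_end` exercises, using the same public calls that appear in the test above (memory-usage assertions omitted; model name illustrative):

```python
from vllm import LLM, SamplingParams

llm = LLM("facebook/opt-125m", enable_sleep_mode=True)
params = SamplingParams(temperature=0, max_tokens=10)
baseline = llm.generate("How are you?", params)

llm.sleep(level=1)               # release GPU memory while keeping the engine alive
llm.wake_up(tags=["weights"])    # bring back only the model weights first
llm.wake_up(tags=["kv_cache"])   # then re-allocate the KV cache
after = llm.generate("How are you?", params)

# with greedy sampling the output should be unchanged across sleep/wake
assert baseline[0].outputs[0].text == after[0].outputs[0].text
```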
2 changes: 0 additions & 2 deletions tests/compile/piecewise/test_full_cudagraph.py
@@ -66,7 +66,6 @@ def llm_pair(request):
pytest.skip("Only Blackwell GPUs support Cutlass MLA")

env_vars = {
"VLLM_USE_V1": "1",
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
3 changes: 0 additions & 3 deletions tests/compile/piecewise/test_simple.py
@@ -18,7 +18,6 @@
VllmConfig,
set_current_vllm_config,
)
from vllm.envs import VLLM_USE_V1
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer

@@ -127,7 +126,6 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
_run_simple_model(
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
def test_simple_inductor_graph_partition(splitting_ops):
assert VLLM_USE_V1
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

8 changes: 1 addition & 7 deletions tests/compile/test_async_tp.py
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
"pass_config": {"enable_async_tp": async_tp_enabled},
}

async_tp_env = tp_env = {
"VLLM_USE_V1": "1",
}

async_tp_args = [
*common_args,
"--tensor-parallel-size",
@@ -410,6 +406,4 @@
"mp",
]

compare_two_settings(
model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
)
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
12 changes: 1 addition & 11 deletions tests/compile/test_config.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.utils import _is_torch_equal_or_newer
@@ -16,15 +15,10 @@ def test_version():
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


def test_use_cudagraphs_dynamic(monkeypatch):
assert vllm.envs.VLLM_USE_V1
def test_use_cudagraphs_dynamic():
vllm_config = VllmConfig()
assert vllm_config.compilation_config.use_cudagraph

monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_config = VllmConfig()
assert not vllm_config.compilation_config.use_cudagraph


def test_custom_op():
# proper syntax
@@ -41,8 +35,6 @@ def test_custom_op():
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
assert vllm.envs.VLLM_USE_V1

# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1

# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

3 changes: 0 additions & 3 deletions tests/compile/test_fusion_attn.py
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
model_class: type[AttentionQuantPatternModel],
backend: _Backend,
use_inductor_graph_partition: bool,
monkeypatch,
dist_init,
caplog_vllm,
):
@@ -312,8 +311,6 @@
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

monkeypatch.setenv("VLLM_USE_V1", "1")

device = torch.device("cuda:0")
torch.manual_seed(42)

5 changes: 1 addition & 4 deletions tests/config/test_mp_reducer.py
@@ -8,16 +8,13 @@
from vllm.v1.engine.async_llm import AsyncLLM


def test_mp_reducer(monkeypatch):
def test_mp_reducer():
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
"""

# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch.setenv("VLLM_USE_V1", "1")

# Ensure transformers_modules is not in sys.modules
if "transformers_modules" in sys.modules:
del sys.modules["transformers_modules"]
6 changes: 2 additions & 4 deletions tests/detokenizer/test_stop_strings.py
@@ -5,7 +5,7 @@

import pytest

from vllm import LLM, SamplingParams, envs
from vllm import LLM, SamplingParams

MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200
@@ -111,9 +111,7 @@ def _stop_token_id(llm):

@pytest.mark.skip_global_cleanup
def test_stop_strings():
# If V0, must set enforce_eager=False since we use
# async output processing below.
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
llm = LLM(MODEL, enforce_eager=True)

_stop_basic(llm)
_stop_multi_tokens(llm)
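The helpers called by `test_stop_strings` (`_stop_basic`, `_stop_multi_tokens`, and so on) exercise stop-string handling through public `SamplingParams` fields. An illustrative sketch of that usage — the prompt and stop string below are made up, not taken from the test file:

```python
from vllm import LLM, SamplingParams

llm = LLM("meta-llama/llama-2-7b-hf", enforce_eager=True)

params = SamplingParams(
    temperature=0,
    max_tokens=200,
    stop=["."],                        # stop at the first period
    include_stop_str_in_output=False,  # and strip it from the returned text
)
out = llm.generate("The capital of France is", params)
assert not out[0].outputs[0].text.endswith(".")
```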