diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d11a43377548..ef717f8d1f59 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -296,6 +296,7 @@ steps: - tests/v1 commands: # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload - pytest -v -s v1/sample @@ -317,7 +318,7 @@ steps: no_gpu: true commands: # split the test to avoid interference - - pytest -v -s v1/core + - pytest -v -s -m 'cpu_test' v1/core - pytest -v -s v1/structured_output - pytest -v -s v1/test_serial_utils.py - pytest -v -s -m 'cpu_test' v1/kv_connector/unit diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index d63c82102b6b..9b9d8cfea7fa 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -13,7 +13,7 @@ import torch from vllm import LLM -from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 +from vllm.v1.engine.llm_engine import LLMEngine from ..conftest import HfRunner, VllmRunner from ..models.utils import check_outputs_equal @@ -211,16 +211,11 @@ def test_models_distributed( def test_failed_model_execution(vllm_runner, monkeypatch) -> None: - from vllm.envs import VLLM_USE_V1 - - if not VLLM_USE_V1: - pytest.skip("Skipping V0 test, dump input not supported") - # Needed to mock an error in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model: - if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngine): v1_test_failed_model_execution(vllm_model) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index b7cd98e27403..f1b0f7b2de89 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -117,68 +117,59 @@ def model(x): @create_new_process_for_each_test() @pytest.mark.parametrize( - "model, use_v1", + "model", [ # sleep mode with safetensors - ("meta-llama/Llama-3.2-1B", True), + "meta-llama/Llama-3.2-1B", # sleep mode with pytorch checkpoint - ("facebook/opt-125m", True), + "facebook/opt-125m", ], ) -def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): - with monkeypatch.context() as m: - assert use_v1 - m.setenv("VLLM_USE_V1", "1") - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = LLM(model, enable_sleep_mode=True) - prompt = "How are you?" - sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) - - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) - - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) - - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. 
- if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes - - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - llm.sleep(level=1) - llm.wake_up(tags=["weights"]) - - free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline - - # should just reallocate memory for weights (1B model, ~2GiB weights) - if use_v1: - assert used_bytes < 10 * GiB_bytes - else: - assert used_bytes < 6 * GiB_bytes - - # now allocate kv cache memory - llm.wake_up(tags=["kv_cache"]) - output3 = llm.generate(prompt, sampling_params) - - # cmp output - assert output[0].outputs[0].text == output3[0].outputs[0].text +def test_end_to_end(model: str): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other processes are running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be released from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. + assert used_bytes < 7 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text + + llm.sleep(level=1) + llm.wake_up(tags=["weights"]) + + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + + # should just reallocate memory for weights (1B model, ~2GiB weights) + assert used_bytes < 10 * GiB_bytes + + # now allocate kv cache memory + llm.wake_up(tags=["kv_cache"]) + output3 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output3[0].outputs[0].text @create_new_process_for_each_test() diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 927c838ae74e..84194f3ed01e 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -66,7 +66,6 @@ def llm_pair(request): pytest.skip("Only Blackwell GPUs support Cutlass MLA") env_vars = { - "VLLM_USE_V1": "1", # Force native sampler to avoid potential nondeterminism in FlashInfer # when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0", @@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend(): with ( temporary_environ( { - "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION", # Flex_Attention is not supported with full cuda graph } diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 920cd5a06c26..8241d248fa53 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -18,7 +18,6 @@ VllmConfig, set_current_vllm_config, ) -from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import is_torch_equal_or_newer @@ -127,7 +126,6 @@ def _run_simple_model( @pytest.mark.parametrize("use_inductor", [True, False]) @torch.inference_mode() def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 _run_simple_model( splitting_ops=["silly.attention"], use_inductor_graph_partition=False, @@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor): @torch.inference_mode() @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) def test_simple_inductor_graph_partition(splitting_ops): - assert VLLM_USE_V1 if not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 03cd510eb5d0..88ad4f81df50 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -388,10 +388,6 @@ def test_async_tp_pass_correctness( "pass_config": {"enable_async_tp": async_tp_enabled}, } - async_tp_env = tp_env = { - "VLLM_USE_V1": "1", - } - async_tp_args = [ *common_args, "--tensor-parallel-size", @@ -410,6 +406,4 @@ def test_async_tp_pass_correctness( "mp", ] - compare_two_settings( - model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate" - ) + compare_two_settings(model_id, async_tp_args, tp_args, method="generate") diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index d055a41af4c4..0da7f58a2f5f 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import vllm from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.utils import _is_torch_equal_or_newer @@ -16,15 +15,10 @@ def test_version(): assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev") -def test_use_cudagraphs_dynamic(monkeypatch): - assert vllm.envs.VLLM_USE_V1 +def test_use_cudagraphs_dynamic(): vllm_config = VllmConfig() assert vllm_config.compilation_config.use_cudagraph - monkeypatch.setenv("VLLM_USE_V1", "0") - vllm_config = VllmConfig() - assert not vllm_config.compilation_config.use_cudagraph - def test_custom_op(): # proper syntax @@ -41,8 +35,6 @@ def test_custom_op(): # may be influenced by other tests. 
@pytest.mark.parametrize("val", ["1"]) def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): - assert vllm.envs.VLLM_USE_V1 - # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val) @@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): @pytest.mark.forked @pytest.mark.parametrize("enabled", [True, False]) def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): - assert vllm.envs.VLLM_USE_V1 - # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 54d3d4ed0295..0f2e3bffbd31 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -303,7 +303,6 @@ def test_attention_quant_pattern( model_class: type[AttentionQuantPatternModel], backend: _Backend, use_inductor_graph_partition: bool, - monkeypatch, dist_init, caplog_vllm, ): @@ -312,8 +311,6 @@ def test_attention_quant_pattern( if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") - monkeypatch.setenv("VLLM_USE_V1", "1") - device = torch.device("cuda:0") torch.manual_seed(42) diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index 9c03f26c504e..56dc542f1c76 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -8,16 +8,13 @@ from vllm.v1.engine.async_llm import AsyncLLM -def test_mp_reducer(monkeypatch): +def test_mp_reducer(): """ Test that _reduce_config reducer is registered when AsyncLLM is instantiated without transformers_modules. This is a regression test for https://github.com/vllm-project/vllm/pull/18640. """ - # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value - monkeypatch.setenv("VLLM_USE_V1", "1") - # Ensure transformers_modules is not in sys.modules if "transformers_modules" in sys.modules: del sys.modules["transformers_modules"] diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index 70cc7e31b8ad..d59b394393e3 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -5,7 +5,7 @@ import pytest -from vllm import LLM, SamplingParams, envs +from vllm import LLM, SamplingParams MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -111,9 +111,7 @@ def _stop_token_id(llm): @pytest.mark.skip_global_cleanup def test_stop_strings(): - # If V0, must set enforce_eager=False since we use - # async output processing below. - llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=True) _stop_basic(llm) _stop_multi_tokens(llm) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index c8b6dc9781df..53fc9957b910 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple): @dataclass class CPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. 
distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: CPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})" - ) - @staticmethod def detailed( *, @@ -87,7 +73,6 @@ def detailed( return CPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp"], - vllm_major_versions=["1"], runner=runner, test_options=CPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -98,14 +83,11 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip( - self.distributed_backends, self.vllm_major_versions - ): + for backend in self.distributed_backends: yield ( model_id, parallel_setup, backend, - vllm_major_version, self.runner, opts, ) @@ -115,7 +97,6 @@ def _compare_cp_with_tp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: CPTestOptions, num_gpus_available: int, @@ -191,10 +172,6 @@ def _compare_cp_with_tp( if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) - cp_env = tp_env = { - "VLLM_USE_V1": vllm_major_version, # Note(hc): DCP only support V1 engine only - } - cp_args = [ *common_args, "--tensor-parallel-size", @@ -217,24 +194,13 @@ def _compare_cp_with_tp( distributed_backend, ] - try: - compare_two_settings( - model_id, - cp_args, - tp_args, - cp_env, - tp_env, - method=method, - max_wait_seconds=720, - ) - except Exception: - testing_ray_compiled_graph = cp_env is not None - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings( + model_id, + cp_args, + tp_args, + method=method, + max_wait_seconds=720, + ) CP_TEXT_GENERATION_MODELS = { @@ -257,7 +223,6 @@ def _compare_cp_with_tp( "model_id", "parallel_setup", "distributed_backend", - "vllm_major_version", "runner", "test_options", ), @@ -274,7 +239,6 @@ def test_cp_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: CPTestOptions, num_gpus_available, @@ -283,7 +247,6 @@ def test_cp_generation( model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7d55c40754b4..43f0c9dd1a85 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -307,7 +307,6 @@ def _compare_tp( if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests pp_env = { - "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -316,15 +315,11 @@ def _compare_tp( # terminate because of a Ray Compiled Graph issue. 
common_args.append("--disable-frontend-multiprocessing") elif distributed_backend == "mp": - pp_env = { - "VLLM_USE_V1": "1", - } + pp_env = None else: pp_env = None - tp_env = { - "VLLM_USE_V1": "1", - } + tp_env = None pp_args = [ *common_args, diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 82eaed66717c..1defd9690241 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple): @dataclass class SPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: SPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})" - ) - @staticmethod def detailed( *, @@ -85,7 +71,6 @@ def detailed( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -117,7 +102,6 @@ def fast( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -147,7 +131,6 @@ def fp8_quant( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -158,14 +141,11 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip( - self.distributed_backends, self.vllm_major_versions - ): + for backend in self.distributed_backends: yield ( model_id, parallel_setup, backend, - vllm_major_version, self.runner, opts, ) @@ -175,7 +155,6 @@ def _compare_sp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: SPTestOptions, num_gpus_available: int, @@ -265,10 +244,6 @@ def _compare_sp( }, } - tp_sp_env = tp_env = { - "VLLM_USE_V1": vllm_major_version, - } - tp_sp_args = [ *common_args, "--tensor-parallel-size", @@ -281,9 +256,6 @@ def _compare_sp( json.dumps(compilation_config), ] - tp_env = { - "VLLM_USE_V1": vllm_major_version, - } tp_args = [ *common_args, "--tensor-parallel-size", @@ -292,18 +264,7 @@ def _compare_sp( "mp", ] - try: - compare_two_settings( - model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method - ) - except Exception: - testing_ray_compiled_graph = tp_sp_env is not None - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings(model_id, tp_sp_args, tp_args, method=method) SP_TEXT_GENERATION_MODELS = { @@ -325,7 +286,6 @@ def _compare_sp( "model_id", "parallel_setup", 
"distributed_backend", - "vllm_major_version", "runner", "test_options", ), @@ -341,7 +301,6 @@ def test_tp_sp_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: SPTestOptions, num_gpus_available, @@ -350,7 +309,6 @@ def test_tp_sp_generation( model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index e2d107b60586..af607720c8b0 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -61,50 +61,34 @@ def run_test(model_name, more_args=None): TPU_TP_TEST_STR = "" # "tensor_parallel_size=4" -@pytest.mark.skipif( - not current_platform.is_cuda() and not current_platform.is_tpu(), - reason="V1 is currently only supported on CUDA and TPU", -) @pytest.mark.parametrize("model", MODEL_NAMES) -def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): +def test_lm_eval_accuracy_v1_engine(model): """Run with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + more_args = None + if current_platform.is_tpu(): + # Limit compilation time for TPU V1 - more_args = None - if current_platform.is_tpu(): - # Limit compilation time for TPU V1 + more_args = "max_model_len=2048,max_num_seqs=64" - more_args = "max_model_len=2048,max_num_seqs=64" + # Add TP test (if provided) + if TPU_TP_TEST_STR: + more_args += ",{}".format(TPU_TP_TEST_STR) - # Add TP test (if provided) - if TPU_TP_TEST_STR: - more_args += ",{}".format(TPU_TP_TEST_STR) + run_test(model, more_args) - run_test(model, more_args) - -@pytest.mark.skipif( - not current_platform.is_cuda() and not current_platform.is_tpu(), - reason="V1 is currently only supported on CUDA and TPU", -) @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES) -def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( - model, monkeypatch: pytest.MonkeyPatch -): +def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model): """Run with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - more_args = None - if current_platform.is_tpu(): - # Limit compilation time for TPU V1 - more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" + more_args = None + if current_platform.is_tpu(): + # Limit compilation time for TPU V1 + more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" - # Add TP test (if provided) - if TPU_TP_TEST_STR: - more_args += ",{}".format(TPU_TP_TEST_STR) + # Add TP test (if provided) + if TPU_TP_TEST_STR: + more_args += ",{}".format(TPU_TP_TEST_STR) - run_test(model, more_args) + run_test(model, more_args) diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 919b7793628e..5b23b4239027 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -10,7 +10,6 @@ """ import lm_eval -import pytest from vllm.platforms import current_platform @@ -67,21 +66,13 @@ def run_test(more_args): ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" -@pytest.mark.skipif( - not current_platform.is_cuda() - and not current_platform.is_tpu() - and not current_platform.is_xpu(), - reason="V1 currently only supported on CUDA, XPU and TPU", -) -def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): +def test_lm_eval_accuracy_v1_engine(): """Run 
with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - more_args = [] + more_args = [] - # Limit compilation time for V1 - if current_platform.is_tpu(): - more_args = ["--max-num-seqs", "64"] + # Limit compilation time for V1 + if current_platform.is_tpu(): + more_args = ["--max-num-seqs", "64"] - run_test(more_args) + run_test(more_args) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index ed0b284bda62..d110234d60ac 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -21,18 +21,7 @@ @pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module") -def server(monkeypatch_module, zephyr_lora_files): # noqa: F811 - monkeypatch_module.setenv("VLLM_USE_V1", "1") - +def server(zephyr_lora_files): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 379e7d36d9e1..674e14e4f5c1 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -37,21 +37,8 @@ ] -@pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - @pytest.fixture(scope="module", params=[True]) -def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): - use_v1 = request.param - assert use_v1 - monkeypatch_module.setenv("VLLM_USE_V1", "1") - +def server_with_lora_modules_json(request, zephyr_lora_files): # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 711505c74bca..6b00dde494d1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,24 +22,6 @@ PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True]) -def use_v1(request): - # Module-scoped variant of run_with_both_engines - # - # Use this fixture to run a test with both v0 and v1, and - # also to conditionalize the test logic e.g. - # - # def test_metrics_exist(use_v1, server, client): - # ... 
- # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS - # for metric in expected: - # assert metric in response.text - # - # @skip_v1 wouldn't work here because this is a module-level - # fixture - per-function decorators would have no effect - yield request.param - - @pytest.fixture(scope="module") def default_server_args(): return [ @@ -63,13 +45,11 @@ def default_server_args(): f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", ], ) -def server(use_v1, default_server_args, request): +def server(default_server_args, request): if request.param: default_server_args.append(request.param) - env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0") - with RemoteOpenAIServer( - MODEL_NAME, default_server_args, env_dict=env_dict - ) as remote_server: + + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server @@ -129,7 +109,8 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. @@ -145,7 +126,7 @@ async def test_metrics_counts( # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): - if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or ( + if (metric_family not in EXPECTED_METRICS_V1) or ( not server.show_hidden_metrics and metric_family in HIDDEN_DEPRECATED_METRICS ): @@ -183,62 +164,6 @@ async def test_metrics_counts( assert found_metric, f"Did not find {metric_family} in prom endpoint" -EXPECTED_METRICS = [ - "vllm:num_requests_running", - "vllm:num_requests_waiting", - "vllm:gpu_cache_usage_perc", - "vllm:time_to_first_token_seconds_sum", - "vllm:time_to_first_token_seconds_bucket", - "vllm:time_to_first_token_seconds_count", - "vllm:time_per_output_token_seconds_sum", - "vllm:time_per_output_token_seconds_bucket", - "vllm:time_per_output_token_seconds_count", - "vllm:e2e_request_latency_seconds_sum", - "vllm:e2e_request_latency_seconds_bucket", - "vllm:e2e_request_latency_seconds_count", - "vllm:request_queue_time_seconds_sum", - "vllm:request_queue_time_seconds_bucket", - "vllm:request_queue_time_seconds_count", - "vllm:request_inference_time_seconds_sum", - "vllm:request_inference_time_seconds_bucket", - "vllm:request_inference_time_seconds_count", - "vllm:request_prefill_time_seconds_sum", - "vllm:request_prefill_time_seconds_bucket", - "vllm:request_prefill_time_seconds_count", - "vllm:request_decode_time_seconds_sum", - "vllm:request_decode_time_seconds_bucket", - "vllm:request_decode_time_seconds_count", - "vllm:request_prompt_tokens_sum", - "vllm:request_prompt_tokens_bucket", - "vllm:request_prompt_tokens_count", - "vllm:request_generation_tokens_sum", - "vllm:request_generation_tokens_bucket", - "vllm:request_generation_tokens_count", - "vllm:request_params_n_sum", - "vllm:request_params_n_bucket", - "vllm:request_params_n_count", - "vllm:request_params_max_tokens_sum", - "vllm:request_params_max_tokens_bucket", - "vllm:request_params_max_tokens_count", - "vllm:iteration_tokens_total", - "vllm:num_preemptions_total", - "vllm:prompt_tokens_total", - "vllm:generation_tokens_total", - "vllm:request_success_total", - "vllm:cache_config_info", - # labels in cache_config_info - "block_size", - "cache_dtype", - "cpu_offload_gb", - "enable_prefix_caching", - "gpu_memory_utilization", - "num_cpu_blocks", - "num_gpu_blocks", - 
"num_gpu_blocks_override", - "sliding_window", - "swap_space_bytes", -] - EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", @@ -304,17 +229,21 @@ async def test_metrics_counts( @pytest.mark.asyncio async def test_metrics_exist( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): # sending a request triggers the metrics to be logged. await client.completions.create( - model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0, ) response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS: + for metric in EXPECTED_METRICS_V1: if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics: continue assert metric in response.text @@ -322,10 +251,11 @@ async def test_metrics_exist( @pytest.mark.asyncio async def test_abort_metrics_reset( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api( - server, use_v1 + server ) # Expect no running requests or kvcache usage @@ -351,7 +281,7 @@ async def test_abort_metrics_reset( # Check that we have running requests running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api( - server, use_v1 + server ) # Expect running requests and kvcache usage @@ -371,7 +301,7 @@ async def test_abort_metrics_reset( # Verify running and waiting requests counts and KV cache usage are zero running_requests_after, waiting_requests_after, kv_cache_usage_after = ( - _get_running_metrics_from_api(server, use_v1) + _get_running_metrics_from_api(server) ) assert running_requests_after == 0, ( @@ -385,7 +315,7 @@ async def test_abort_metrics_reset( ) -def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): +def _get_running_metrics_from_api(server: RemoteOpenAIServer): """Return (running_count, waiting_count, kv_cache_usage)""" response = requests.get(server.url_for("metrics")) @@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): # Verify running and waiting requests counts and KV cache usage are zero running_requests, waiting_requests, kv_cache_usage = None, None, None - kv_cache_usage_metric = ( - "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc" - ) + kv_cache_usage_metric = "vllm:kv_cache_usage_perc" for family in text_string_to_metric_families(response.text): if family.name == "vllm:num_requests_running": @@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): return running_requests, waiting_requests, kv_cache_usage -def test_metrics_exist_run_batch(use_v1: bool): +def test_metrics_exist_run_batch(): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" @@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool): "--port", port, ], - env={"VLLM_USE_V1": "1"}, ) def is_server_up(url): diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 81e2b52dfa71..3d0885414b24 100644 --- 
a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -15,11 +15,6 @@ from ...utils import RemoteOpenAIServer -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - - @pytest.mark.asyncio async def test_empty_prompt(): model_name = "gpt2" diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 6b99ba7af50e..5c607f921536 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -80,7 +80,6 @@ def test_env( ): """Test attention backend selection with valid device-backend pairs.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, name) m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") @@ -212,30 +211,21 @@ def test_env( @pytest.mark.parametrize("device", ["cpu", "cuda"]) -def test_fp32_fallback( - device: str, - monkeypatch: pytest.MonkeyPatch, -): +def test_fp32_fallback(device: str): """Test attention backend selection with fp32.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "TORCH_SDPA" - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "TORCH_SDPA" - - elif device == "cuda": - with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "FLEX_ATTENTION" + elif device == "cuda": + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" - # TODO: When testing for v1, pipe in `use_v1` as an argument to - # get_attn_backend - pytest.skip( "Skipping as current backend selector does not " "handle fallbacks when a backend is set via env var." 
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch): monkeypatch.context() as m, patch("vllm.attention.selector.current_platform", CudaPlatform()), ): - m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) # Should raise ValueError for invalid backend diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index 87002c72f6e1..ae33f422d373 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with flex attention with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") set_seed(seed) @@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with default backend with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") set_seed(seed) with vllm_runner( model_name, @@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with flex attention with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") with vllm_runner( model_name, @@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): flex_outputs = llm_flex.embed(prompts) # Run with default backend - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - with vllm_runner( + with ( + monkeypatch.context() as m, + vllm_runner( model_name, runner="pooling", dtype=torch.bfloat16, tensor_parallel_size=1, max_model_len=100, enforce_eager=True, - ) as llm_default: - default_outputs = llm_default.embed(prompts) + ) as llm_default, + ): + default_outputs = llm_default.embed(prompts) check_embeddings_close( embeddings_0_lst=flex_outputs, diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py index 2f9b09f4026c..fd3386ff67df 100644 --- a/tests/models/multimodal/generation/test_maverick.py +++ b/tests/models/multimodal/generation/test_maverick.py @@ -613,7 +613,6 @@ def test_dummy_maverick( profile: bool = False, ) -> None: # Disable multiprocessing allows us to access model executor from LLM engine - monkeypatch.setenv("VLLM_USE_V1", "1") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") model_path = create_reduced_maverick_model( diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index 90cb461a6caf..0389e28746cb 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -8,7 +8,6 @@ from vllm.config import VllmConfig else: VllmConfig = None -from vllm import envs class DummyPlatform(Platform): @@ -19,10 +18,7 @@ class DummyPlatform(Platform): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - if envs.VLLM_USE_V1: - compilation_config = vllm_config.compilation_config - # Activate custom ops for v1. 
- compilation_config.custom_ops = ["all"] + vllm_config.compilation_config.custom_ops = ["all"] def get_attn_backend_cls( self, diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 1c37d6a39261..45902cc874c3 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -16,7 +16,6 @@ def schedule(self): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") # Explicitly turn off engine multiprocessing so # that the scheduler runs in this process m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42aebcd52414..fa0ca48f9bd9 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -8,18 +8,11 @@ from typing import Optional -import pytest from transformers import AutoTokenizer from vllm import LLM, SamplingParams -@pytest.fixture(autouse=True) -def v1(monkeypatch): - """Only run on vLLM v1.""" - monkeypatch.setenv("VLLM_USE_V1", "1") - - def _generate( llm: LLM, prompt: str, diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 5999c9cf1e0e..9780092b25e6 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -17,17 +17,6 @@ # 100 training iterations with a training batch size of 100. -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch: pytest.MonkeyPatch): - """ - Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1 - for all tests in this file - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - yield - - def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM( model="Qwen/Qwen2.5-3B-Instruct", diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index f30a6628b1bf..fcc0b6a5f7de 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -305,7 +305,6 @@ class BackendConfig: "CutlassMLA": BackendConfig( name="CutlassMLA", env_vars={ - "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed }, diff --git a/tests/v1/core/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py index 328f2640f218..e6d37b1d63c8 100644 --- a/tests/v1/core/test_kv_sharing.py +++ b/tests/v1/core/test_kv_sharing.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest import torch from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups +pytestmark = pytest.mark.cpu_test + def new_kv_cache_spec(): return FullAttentionSpec(16, 1, 1, torch.float32, False) diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 6983c3b92f6b..90f8757ae493 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest from vllm import LLM -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "meta-llama/Llama-3.2-1B" PROMPT = "Hello my name is Robert and I" diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py 
b/tests/v1/cudagraph/test_cudagraph_mode.py index 77d5c5d87fc1..8c8148ae2094 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ): pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") - env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + env_vars = backend_configs[backend_name].env_vars with temporary_environ(env_vars), ExitStack() as stack: if not supported: @@ -117,7 +117,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte def test_cudagraph_compilation_combo(combo_case): backend_name, cudagraph_mode, compilation_level, supported = combo_case - env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + env_vars = backend_configs[backend_name].env_vars with temporary_environ(env_vars), ExitStack() as stack: if not supported: diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 5f26c2f1c651..0fcb97fe6305 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ) with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index c9018ee177e8..71b0e86c75c1 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -32,7 +32,7 @@ class TestConfig: @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) def test_sliding_window_retrieval( - monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager + model, batch_size, seed, disable_hybrid_kv_cache_manager ): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then @@ -40,39 +40,34 @@ def test_sliding_window_retrieval( If we tell it upfront which we are going to be looking for, then it answers correctly (mostly). 
""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - test_config = model_config[model] - - llm = LLM( - model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager - ) - sampling_params = SamplingParams(temperature=0.0, max_tokens=100) - - prompts, answer, indices = prep_prompts( - batch_size, ln_range=test_config.ln_range - ) - - check_length(prompts, llm, test_config.sliding_window) - - # Fresh generation - responses = llm.generate(prompts, sampling_params) - check_answers( - indices, - answer, - [response.outputs[0].text for response in responses], - accept_rate=1.0, - ) - - # Re-generate with the same prompts to test prefix caching - responses = llm.generate(prompts, sampling_params) - check_answers( - indices, - answer, - [response.outputs[0].text for response in responses], - accept_rate=1.0, - ) + test_config = model_config[model] + + llm = LLM( + model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager + ) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range) + + check_length(prompts, llm, test_config.sliding_window) + + # Fresh generation + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, + ) + + # Re-generate with the same prompts to test prefix caching + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, + ) def check_length(prompts: list[str], llm: LLM, sliding_window: int): diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index b9052d8a58b8..89e5f26ac627 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill( ) with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Make scheduling deterministic for reproducibility m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/v1/e2e/test_min_tokens.py b/tests/v1/e2e/test_min_tokens.py index f15982b7e5f3..e00a3d58debe 100644 --- a/tests/v1/e2e/test_min_tokens.py +++ b/tests/v1/e2e/test_min_tokens.py @@ -13,7 +13,6 @@ 5) Multiple stop conditions """ -import os from typing import Optional, Union import pytest @@ -161,9 +160,6 @@ def __str__(self): @pytest.fixture(scope="module") def llm_v1(): """Create V1 LLM instance for testing""" - # Ensure V1 engine is used - os.environ["VLLM_USE_V1"] = "1" - llm = LLM( model=TEST_MODEL, tensor_parallel_size=1, @@ -503,6 +499,6 @@ def test_min_tokens_validation(): Usage: cd vllm/ - VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v + python -m pytest tests/v1/e2e/test_min_tokens.py -v """ pytest.main([__file__, "-v"]) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 9ed9cd7950a9..fbbbd0389c26 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -301,7 +301,6 @@ def test_mtp_correctness( model_setup: (method, model_name, tp_size) """ with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_MLA_DISABLE", "1") method, model_name, tp_size = model_setup diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 3e30d28111c8..444d771a18d6 100644 --- a/tests/v1/engine/test_async_llm.py +++ 
b/tests/v1/engine/test_async_llm.py @@ -95,17 +95,11 @@ async def generate( ) @pytest.mark.asyncio async def test_load( - monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType, ): - # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 - # so that in the future when we switch, we don't have to change all the - # tests. - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -149,14 +143,11 @@ async def test_load( ) @pytest.mark.asyncio async def test_abort( - monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType, ): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -222,13 +213,8 @@ async def test_abort( "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) @pytest.mark.asyncio -async def test_multi_abort( - monkeypatch: pytest.MonkeyPatch, - output_kind: RequestOutputKind, -): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - +async def test_multi_abort(output_kind: RequestOutputKind): + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -304,14 +290,11 @@ async def test_multi_abort( ) @pytest.mark.asyncio async def test_finished_flag( - monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType, ): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -341,12 +324,10 @@ async def test_finished_flag( ) @pytest.mark.asyncio async def test_mid_stream_cancellation( - monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType + engine_args: AsyncEngineArgs, prompt: PromptType ): """Test that requests can be cancelled mid-stream.""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch): be added to the default loggers. 
""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args( TEXT_ENGINE_ARGS, @@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch): @pytest.mark.asyncio(scope="module") -async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - +async def test_dp_rank_argument(): + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): @pytest.mark.asyncio -async def test_check_health(monkeypatch: pytest.MonkeyPatch): +async def test_check_health(): """Test that check_health returns normally for healthy engine and raises EngineDeadError when the engine is dead. """ @@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): from vllm.v1.engine.exceptions import EngineDeadError - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) @pytest.mark.asyncio -async def test_abort_final_output( - monkeypatch: pytest.MonkeyPatch, - output_kind: RequestOutputKind, -): +async def test_abort_final_output(output_kind: RequestOutputKind): """Test that abort() returns a final output with correct information.""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index f6b10fa67b3b..943402e429b6 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -5,18 +5,11 @@ import pytest -from vllm import envs from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser -if not envs.VLLM_USE_V1: - pytest.skip( - "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", - allow_module_level=True, - ) - def test_prefix_caching_from_cli(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 28d7854ab5d2..997b2b74bb6b 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest: @create_new_process_for_each_test() -def test_engine_core(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) - """Test basic request lifecycle.""" - - # First request. 
- engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 1 - - # Second request. - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 1 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - # Add two requests in a row. - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 2 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 4 - - # Loop through until they are all done. - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass +def test_engine_core(): + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) + """Test basic request lifecycle.""" + + # First request. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + # Second request. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 1 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Add two requests in a row. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 4 + + # Loop through until they are all done. + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + """Test abort cycle.""" + + # Basic abort. 
+ req = make_request() + request_id = req.request_id + + engine_core.add_request(*engine_core.preprocess_add_request(req)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + assert engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + assert engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + engine_core.abort_requests([request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + assert not engine_core.scheduler.has_unfinished_requests() + assert engine_core.scheduler.has_finished_requests() + + _ = engine_core.step() + assert not engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + # Add, step, abort 1 of the 3. + req0 = make_request() + req1 = make_request() + req2 = make_request() + + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + engine_core.add_request(*engine_core.preprocess_add_request(req2)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 3 + + # Abort just one. + engine_core.abort_requests([req1.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Abort the other requests at the same time. + engine_core.abort_requests([req2.request_id, req0.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + # Sending duplicate requests with same request_id + req0 = make_request() + req1 = make_request() + req0.request_id = req1.request_id = "test" + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - """Test abort cycle.""" - # Basic abort. - req = make_request() - request_id = req.request_id +@create_new_process_for_each_test() +def test_engine_core_advanced_sampling(): + """ + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and + presence_penalty, are set. 
+ """ + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) + """Test basic request lifecycle.""" + # First request. + request: EngineCoreRequest = make_request() + request.sampling_params = SamplingParams( + min_tokens=4, + presence_penalty=1.0, + frequency_penalty=1.0, + repetition_penalty=0.1, + stop_token_ids=[1001, 1002], + ) + engine_core.add_request(*engine_core.preprocess_add_request(request)) - engine_core.add_request(*engine_core.preprocess_add_request(req)) + def _check_engine_state(): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 - assert engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 1 - assert engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - engine_core.abort_requests([request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - assert not engine_core.scheduler.has_unfinished_requests() - assert engine_core.scheduler.has_finished_requests() - - _ = engine_core.step() - assert not engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - # Add, step, abort 1 of the 3. - req0 = make_request() - req1 = make_request() - req2 = make_request() - - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - engine_core.add_request(*engine_core.preprocess_add_request(req1)) - assert len(engine_core.scheduler.waiting) == 2 - assert len(engine_core.scheduler.running) == 0 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - engine_core.add_request(*engine_core.preprocess_add_request(req2)) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 3 - - # Abort just one. - engine_core.abort_requests([req1.request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - # Abort the other requests at the same time. - engine_core.abort_requests([req2.request_id, req0.request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - - # Sending duplicate requests with same request_id - req0 = make_request() - req1 = make_request() - req0.request_id = req1.request_id = "test" - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass - - engine_core.add_request(*engine_core.preprocess_add_request(req1)) + # Loop through until they are all done. 
while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass - assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 + _check_engine_state() -@create_new_process_for_each_test() -def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): - """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, min_tokens, and - presence_penalty, are set. - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) - """Test basic request lifecycle.""" - # First request. - request: EngineCoreRequest = make_request() - request.sampling_params = SamplingParams( - min_tokens=4, - presence_penalty=1.0, - frequency_penalty=1.0, - repetition_penalty=0.1, - stop_token_ids=[1001, 1002], - ) - engine_core.add_request(*engine_core.preprocess_add_request(request)) - - def _check_engine_state(): - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - - _check_engine_state() - - # Second request. - request2 = make_request() - request2.sampling_params = SamplingParams( - top_p=0.99, - top_k=50, - ) - engine_core.add_request(*engine_core.preprocess_add_request(request2)) - _check_engine_state() + # Second request. + request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(*engine_core.preprocess_add_request(request2)) + _check_engine_state() @create_new_process_for_each_test() -def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_concurrent_batches(): """ Test that the engine can handle multiple concurrent batches. """ @@ -272,173 +268,163 @@ def shutdown(self): if hasattr(self, "thread_pool"): self.thread_pool.shutdown(wait=False) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - engine_args = EngineArgs( - model=MODEL_NAME, - # To test concurrent batches. - max_num_seqs=2, - # Avoid all requests being scheduled once. - enable_prefix_caching=False, - max_num_batched_tokens=10, - # Reduce startup time. - enforce_eager=True, + engine_args = EngineArgs( + model=MODEL_NAME, + # To test concurrent batches. + max_num_seqs=2, + # Avoid all requests being scheduled once. + enable_prefix_caching=False, + max_num_batched_tokens=10, + # Reduce startup time. + enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor ) - vllm_config = engine_args.create_engine_config() - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor - ) - assert engine_core.batch_queue is not None - - # Add two requests in a row. Each request have 12 prompt tokens. 
- req0 = make_request_with_max_tokens("0", 5) - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - req1 = make_request_with_max_tokens("1", 5) - engine_core.add_request(*engine_core.preprocess_add_request(req1)) - - # Schedule Batch 1: (10, req0) - assert engine_core.step_with_batch_queue()[0] is None - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 10 - # num_computed_tokens should have been updated immediately. - assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10 - - # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue()[0] == {} - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 2 - assert scheduler_output.num_scheduled_tokens["1"] == 8 - # num_computed_tokens should have been updated immediately. - assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 - assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 - - assert engine_core.scheduler.get_num_unfinished_requests() == 2 - - # Finish Batch 1 and schedule Batch 3: (4, req1). - # Note that req0 cannot be scheduled - # because it is in the decoding stage now. - engine_core.step_with_batch_queue() - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["1"] == 4 - - # Finish Batch 2. Get first token of req0. - # Schedule Batch 4: (1, req0). - output = engine_core.step_with_batch_queue()[0].get(0) + assert engine_core.batch_queue is not None + + # Add two requests in a row. Each request have 12 prompt tokens. + req0 = make_request_with_max_tokens("0", 5) + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + req1 = make_request_with_max_tokens("1", 5) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + + # Schedule Batch 1: (10, req0) + assert engine_core.step_with_batch_queue()[0] is None + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 10 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10 + + # Schedule Batch 2: (2, req0), (8, req1) + assert engine_core.step_with_batch_queue()[0] == {} + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 2 + assert scheduler_output.num_scheduled_tokens["1"] == 8 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 + assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 + + assert engine_core.scheduler.get_num_unfinished_requests() == 2 + + # Finish Batch 1 and schedule Batch 3: (4, req1). + # Note that req0 cannot be scheduled + # because it is in the decoding stage now. + engine_core.step_with_batch_queue() + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["1"] == 4 + + # Finish Batch 2. Get first token of req0. + # Schedule Batch 4: (1, req0). 
+ output = engine_core.step_with_batch_queue()[0].get(0) + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 1 + + # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). + output = engine_core.step_with_batch_queue()[0].get(0) + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["1"] == 1 + + # Loop until req0 is finished. + req_id = 0 + expected_num_tokens = [ + engine_core.scheduler.requests["0"].num_tokens + 1, + engine_core.scheduler.requests["1"].num_tokens + 1, + ] + while engine_core.scheduler.get_num_unfinished_requests() == 2: + output = engine_core.step_with_batch_queue()[0] + # Every step consumes an output. assert output is not None - assert len(output.outputs) == 1 - assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 1 - - # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). - output = engine_core.step_with_batch_queue()[0].get(0) - assert output is not None - assert len(output.outputs) == 1 - assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["1"] == 1 - - # Loop until req0 is finished. - req_id = 0 - expected_num_tokens = [ - engine_core.scheduler.requests["0"].num_tokens + 1, - engine_core.scheduler.requests["1"].num_tokens + 1, - ] - while engine_core.scheduler.get_num_unfinished_requests() == 2: - output = engine_core.step_with_batch_queue()[0] - # Every step consumes an output. - assert output is not None - assert len(output[0].outputs) == 1 - if req_id in engine_core.scheduler.requests: - assert ( - engine_core.scheduler.requests[req_id].num_tokens - == expected_num_tokens[req_id] - ) - expected_num_tokens[req_id] += 1 - req_id = (req_id + 1) % 2 + assert len(output[0].outputs) == 1 + if req_id in engine_core.scheduler.requests: + assert ( + engine_core.scheduler.requests[req_id].num_tokens + == expected_num_tokens[req_id] + ) + expected_num_tokens[req_id] += 1 + req_id = (req_id + 1) % 2 @multi_gpu_test(num_gpus=2) -def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_tp(): """ Test engine can initialize worker in tp properly """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs( - model=MODEL_NAME, - tensor_parallel_size=2, - # Reduce startup time. - enforce_eager=True, - ) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) + """Setup the EngineCore.""" + engine_args = EngineArgs( + model=MODEL_NAME, + tensor_parallel_size=2, + # Reduce startup time. 
+ enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) - def get_worker_cache_config_field(worker, key: str): - return getattr(worker.cache_config, key) + def get_worker_cache_config_field(worker, key: str): + return getattr(worker.cache_config, key) - num_gpu_blocks = engine_core.collective_rpc( - get_worker_cache_config_field, args=("num_gpu_blocks",) - ) - num_cpu_blocks = engine_core.collective_rpc( - get_worker_cache_config_field, args=("num_cpu_blocks",) - ) - assert all(x is not None for x in num_gpu_blocks) - assert all(x is not None for x in num_cpu_blocks) + num_gpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_gpu_blocks",) + ) + num_cpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_cpu_blocks",) + ) + assert all(x is not None for x in num_gpu_blocks) + assert all(x is not None for x in num_cpu_blocks) @create_new_process_for_each_test() -def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_invalid_request_id_type(): """Test that engine raises TypeError for non-string request_id.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) - # Test with UUID object (common mistake) - uuid_request = make_request() - uuid_request.request_id = uuid.uuid4() # UUID object instead of string + # Test with UUID object (common mistake) + uuid_request = make_request() + uuid_request.request_id = uuid.uuid4() # UUID object instead of string - with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): - engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): + engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) - # Test with integer - int_request = make_request() - int_request.request_id = 12345 + # Test with integer + int_request = make_request() + int_request.request_id = 12345 - with pytest.raises(TypeError, match="request_id must be a string, got.*int"): - engine_core.add_request(*engine_core.preprocess_add_request(int_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*int"): + engine_core.add_request(*engine_core.preprocess_add_request(int_request)) - # Test with None - none_request = make_request() - none_request.request_id = None + # Test with None + none_request = make_request() + none_request.request_id = None - with pytest.raises( - TypeError, match="request_id must be a string, got.*NoneType" - ): - 
engine_core.add_request(*engine_core.preprocess_add_request(none_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"): + engine_core.add_request(*engine_core.preprocess_add_request(none_request)) - # Verify engine is still functional after errors - valid_request = make_request() - engine_core.add_request(*engine_core.preprocess_add_request(valid_request)) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 + # Verify engine is still functional after errors + valid_request = make_request() + engine_core.add_request(*engine_core.preprocess_add_request(valid_request)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 90284fc54d06..bc04d1f93f95 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -130,8 +130,6 @@ def test_engine_core_client( monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch core engine utility function to test. m.setattr(EngineCore, "echo", echo, raising=False) @@ -218,8 +216,6 @@ def test_engine_core_client( @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch core engine utility function to test. m.setattr(EngineCore, "echo", echo, raising=False) @@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. 
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures( indirect=["publisher_config"], ) def test_kv_cache_events( - monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool, publisher_config, ): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - block_size = 16 - num_blocks = 2 - - engine_args = EngineArgs( - model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - block_size=block_size, - ) - engine_args.kv_events_config = publisher_config + block_size = 16 + num_blocks = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config - vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) - executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=False, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) - endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - subscriber = MockSubscriber( - endpoint, topic=publisher_config.topic, decode_type=KVEventBatch + executor_class = Executor.get_class(vllm_config) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, ) + endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + subscriber = MockSubscriber( + endpoint, topic=publisher_config.topic, decode_type=KVEventBatch + ) - try: - custom_tokens = list(range(num_blocks * block_size)) - sampling_params = SamplingParams(max_tokens=1) - request = make_request(sampling_params, custom_tokens) - client.add_request(request) + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + request = make_request(sampling_params, custom_tokens) + client.add_request(request) - outputs: dict[str, list] = {request.request_id: []} - loop_until_done(client, outputs) + outputs: dict[str, list] = {request.request_id: []} + loop_until_done(client, outputs) - result = subscriber.receive_one(timeout=1000) - assert result is not None, "No message received" + result = subscriber.receive_one(timeout=1000) + assert result is not None, "No message received" - seq, received = result + seq, received = result - assert seq == 0, "Sequence number mismatch" - assert len(received.events) == 1, ( - "We should have exactly one BlockStored event" - ) - event = received.events[0] - assert isinstance(event, BlockStored), "We should have a BlockStored event" - assert len(event.block_hashes) == num_blocks, ( - "We should have a BlockStored event with 2 block_hashes" - ) - assert event.block_size == block_size, ( - "Block size should be the same as the block size" - ) - assert event.parent_block_hash is None, "Parent block hash should be None" - assert event.lora_id is None, "Lora id should be None" - assert len(event.token_ids) == num_blocks * block_size, ( - "Token ids should be the same as the custom tokens" - ) - assert event.token_ids == custom_tokens, ( - "Token ids should be the same as the custom tokens" - ) - finally: - client.shutdown() - subscriber.close() + assert seq == 0, "Sequence number 
mismatch" + assert len(received.events) == 1, "We should have exactly one BlockStored event" + event = received.events[0] + assert isinstance(event, BlockStored), "We should have a BlockStored event" + assert len(event.block_hashes) == num_blocks, ( + "We should have a BlockStored event with 2 block_hashes" + ) + assert event.block_size == block_size, ( + "Block size should be the same as the block size" + ) + assert event.parent_block_hash is None, "Parent block hash should be None" + assert event.lora_id is None, "Lora id should be None" + assert len(event.token_ids) == num_blocks * block_size, ( + "Token ids should be the same as the custom tokens" + ) + assert event.token_ids == custom_tokens, ( + "Token ids should be the same as the custom tokens" + ) + finally: + client.shutdown() + subscriber.close() @pytest.mark.asyncio @@ -672,101 +657,96 @@ def test_kv_cache_events( ) @multi_gpu_test(num_gpus=4) async def test_kv_cache_events_dp( - monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool, publisher_config, ): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - block_size = 16 - num_blocks = 2 - dp_size = 2 - tp_size = 2 - - engine_args = EngineArgs( - model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - data_parallel_size=dp_size, - tensor_parallel_size=tp_size, - block_size=block_size, - ) - engine_args.kv_events_config = publisher_config + block_size = 16 + num_blocks = 2 + dp_size = 2 + tp_size = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config - vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) - executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=True, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) - await asyncio.sleep(1) + executor_class = Executor.get_class(vllm_config) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) + await asyncio.sleep(1) - # Build endpoints for all DP ranks - base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - endpoints = [] - for i in range(dp_size): - offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) - endpoints.append(offset_endpoint) + # Build endpoints for all DP ranks + base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + endpoints = [] + for i in range(dp_size): + offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) + endpoints.append(offset_endpoint) - subscriber = MockSubscriber( - endpoints, topic=publisher_config.topic, decode_type=KVEventBatch - ) + subscriber = MockSubscriber( + endpoints, topic=publisher_config.topic, decode_type=KVEventBatch + ) - try: - custom_tokens = list(range(num_blocks * block_size)) - sampling_params = SamplingParams(max_tokens=1) - all_request_ids = [] + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + all_request_ids = [] - # Create and add 25 requests - # NOTE: attempts to force routing to 
both dp groups but can be flaky - for i in range(25): - await asyncio.sleep(0.01) - request = make_request(sampling_params, custom_tokens) - await client.add_request_async(request) - all_request_ids.append(request.request_id) + # Create and add 25 requests + # NOTE: attempts to force routing to both dp groups but can be flaky + for i in range(25): + await asyncio.sleep(0.01) + request = make_request(sampling_params, custom_tokens) + await client.add_request_async(request) + all_request_ids.append(request.request_id) - await asyncio.sleep(0.1) + await asyncio.sleep(0.1) - # Initialize outputs dict for all requests - outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} + # Initialize outputs dict for all requests + outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} - print("processing requests...") - await asyncio.wait_for( - loop_until_fully_done_async(client, outputs), timeout=20.0 - ) + print("processing requests...") + await asyncio.wait_for( + loop_until_fully_done_async(client, outputs), timeout=20.0 + ) - # Receive from subscriber until no more messages - print("collecting results...") - results = [] - while True: - result = subscriber.receive_one(timeout=1) - print(result) - if result is None: - break - results.append(result) - - # Collect all events and data_parallel_ranks from all results - all_dp_ranks = [received.data_parallel_rank for (_, received) in results] - unique_dps = set(all_dp_ranks) - assert len(unique_dps) == 2, ( - f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" - ) + # Receive from subscriber until no more messages + print("collecting results...") + results = [] + while True: + result = subscriber.receive_one(timeout=1) + print(result) + if result is None: + break + results.append(result) + + # Collect all events and data_parallel_ranks from all results + all_dp_ranks = [received.data_parallel_rank for (_, received) in results] + unique_dps = set(all_dp_ranks) + assert len(unique_dps) == 2, ( + f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" + ) - finally: - client.shutdown() - subscriber.close() + finally: + client.shutdown() + subscriber.close() @pytest.mark.timeout(20) def test_startup_failure(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m, pytest.raises(Exception) as e_info: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch to extract core process pid while it's starting. 
core_proc_pid = [None] cepm_ctor = CoreEngineProcManager.__init__ @@ -841,7 +821,6 @@ def create_mock_executor(vllm_config): mock_executor_class.side_effect = create_mock_executor with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices from vllm.v1.engine.utils import EngineZmqAddresses diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index a19ba562136f..3f6f2211556f 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -21,12 +21,10 @@ def _vllm_model( apc: bool, vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, *, skip_tokenizer_init: bool = False, ): """Set up VllmRunner instance.""" - monkeypatch.setenv("VLLM_USE_V1", "1") return vllm_runner( MODEL, dtype=DTYPE, @@ -45,16 +43,16 @@ def _vllm_model( # Prefix caching params=[False, True], ) -def vllm_model(vllm_runner, request, monkeypatch): +def vllm_model(vllm_runner, request): """VllmRunner test fixture parameterized by APC True/False.""" - with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: + with _vllm_model(request.param, vllm_runner) as vllm_model: yield vllm_model @pytest.fixture(scope="function") -def vllm_model_apc(vllm_runner, monkeypatch): +def vllm_model_apc(vllm_runner): """VllmRunner test fixture with APC.""" - with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: + with _vllm_model(True, vllm_runner) as vllm_model: yield vllm_model @@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch): # Prefix caching params=[False, True], ) -def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): +def vllm_model_skip_tokenizer_init(vllm_runner, request): """VllmRunner test fixture with APC.""" with _vllm_model( request.param, vllm_runner, - monkeypatch, skip_tokenizer_init=True, ) as vllm_model: yield vllm_model @@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ) -def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): +def test_engine_metrics(vllm_runner, example_prompts): max_tokens = 100 # Use spec decoding to test num_accepted_tokens_per_pos speculative_config = { @@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): "prompt_lookup_min": 3, "num_speculative_tokens": 5, } - monkeypatch.setenv("VLLM_USE_V1", "1") + with vllm_runner( MODEL, speculative_config=speculative_config, @@ -216,8 +213,7 @@ def find_metric(name) -> list[Metric]: @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) -def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_V1", "1") +def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization # of tokenizer and detokenizer. The generated output is expected to contain # token ids. 
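
The hunk above ends at a test whose comment explains that skip_tokenizer_init bypasses tokenizer and detokenizer setup, so generation must work purely on token ids. A minimal sketch of how that flag is typically exercised through the LLM entry point; the prompt token ids and max_tokens value below are illustrative placeholders and not taken from the test:

    from vllm import LLM, SamplingParams

    # With skip_tokenizer_init=True there is no tokenizer, so the prompt is passed
    # as token ids and detokenization must be disabled in the sampling params.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", skip_tokenizer_init=True)
    params = SamplingParams(max_tokens=8, detokenize=False)

    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, params)
    # The completion carries token ids only; no detokenized text is produced.
    assert outputs[0].outputs[0].token_ids
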
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d4c33f6cbbe2..16cdc19037ba 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -103,7 +103,6 @@ def test_guided_decoding_deprecated(): PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, ) def test_structured_output( - monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any], sample_sql_ebnf: str, @@ -115,8 +114,6 @@ def test_structured_output( model_name: str, speculative_config: dict[str, Any], ): - monkeypatch.setenv("VLLM_USE_V1", "1") - if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -620,15 +617,12 @@ def test_structured_output( ], ) def test_structured_output_with_reasoning_matrices( - monkeypatch: pytest.MonkeyPatch, backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, speculative_config: dict[str, Any] | None, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( - monkeypatch: pytest.MonkeyPatch, unsupported_json_schema: dict[str, Any], model_name: str, tokenizer_mode: str, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM( model=model_name, max_model_len=1024, @@ -739,9 +730,7 @@ def test_structured_output_auto_mode( @pytest.mark.skip_global_cleanup -def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - +def test_guidance_no_additional_properties(): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, @@ -801,12 +790,9 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) def test_structured_output_batched_with_non_structured_outputs_requests( - monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], backend: str, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Don't use eager execution on TPUs because we want to test for no # recompilation at runtime enforce_eager = bool(not current_platform.is_tpu()) diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh index ea125f99fc42..fa1738bb3194 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ -53,7 +53,6 @@ cleanup() { launch_baseline() { BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ @@ -73,7 +72,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ PJRT_DEVICE=TPU \ @@ -93,7 +91,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ 
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh index 8ba653770c4f..3d63822371be 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh @@ -55,7 +55,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ PJRT_DEVICE=TPU \ @@ -75,7 +74,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index c844330bb466..2cb5e6733b79 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest import ray @@ -10,15 +9,6 @@ from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger - -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch): - """ - The change relies on V1 APIs, so set VLLM_USE_V1=1. - """ - monkeypatch.setenv("VLLM_USE_V1", "1") - - MODELS = [ "distilbert/distilgpt2", ] @@ -39,10 +29,6 @@ def test_engine_log_metrics_ray( @ray.remote(num_gpus=1) class EngineTestActor: async def run(self): - # Set environment variable inside the Ray actor since environment - # variables from pytest fixtures don't propagate to Ray actors - os.environ["VLLM_USE_V1"] = "1" - engine_args = AsyncEngineArgs( model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True ) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index f83bc90778b0..bda430a080f6 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs( batch_logprobs_composition: BatchLogprobsComposition, temperature: float, example_prompts: list[str], - monkeypatch: pytest.MonkeyPatch, ) -> None: """Test V1 Engine logprobs & prompt logprobs @@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching - if do_apc and ( - temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT - ): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts - - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts + + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list + ) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams( max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984, ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, max_tokens=max_tokens, + do_apc=do_apc, ) - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list - ) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams( - max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984, - ) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, - max_tokens=max_tokens, - do_apc=do_apc, - ) - - -def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): +def test_max_logprobs(): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs` APC should not matter as this test checks basic request validation. 
""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - runner = VllmRunner( - "facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.15, - max_model_len=256, - ) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.15, + max_model_len=256, + ) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): +def test_none_logprobs(vllm_model, example_prompts): """Engine should return `logprobs` and `prompt_logprobs` as `None` Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - max_tokens = 5 + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams( - max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0, - ) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params_logprobs_none, - ) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.llm.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): +def test_zero_logprobs(vllm_model, example_prompts): """Engine should return sampled token and prompt token logprobs Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - max_tokens = 5 + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams( - max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0 - ) - results_logprobs_zero = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero - ) + sampling_params_logprobs_zero = SamplingParams( + max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0 + ) + results_logprobs_zero = vllm_model.llm.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero + ) - for i in range(len(results_logprobs_zero)): - # Check 
that there is one sample logprob dict for each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) - - -def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) + + +def test_all_logprobs(example_prompts): """Engine should return all vocabulary logprobs and prompt logprobs Args: example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner( - "facebook/opt-125m", - max_logprobs=-1, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.15, - max_model_len=256, - ) + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=-1, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.15, + max_model_len=256, + ) - sampling_params_logprobs_all = SamplingParams( - max_tokens=5, logprobs=-1, prompt_logprobs=-1 - ) - results_logprobs_all = runner.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_all - ) - vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() + sampling_params_logprobs_all = SamplingParams( + max_tokens=5, logprobs=-1, prompt_logprobs=-1 + ) + results_logprobs_all = runner.llm.generate( + example_prompts, sampling_params=sampling_params_logprobs_all + ) + vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() - for i in range(len(results_logprobs_all)): - logprobs = results_logprobs_all[i].outputs[0].logprobs - prompt_logprobs = results_logprobs_all[i].prompt_logprobs - assert logprobs is not None - for logprob in logprobs: - assert len(logprob) == vocab_size - assert prompt_logprobs is not None - assert prompt_logprobs[0] is None - for prompt_logprob in prompt_logprobs[1:]: - assert len(prompt_logprob) == vocab_size + for i in range(len(results_logprobs_all)): + logprobs = results_logprobs_all[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_all[i].prompt_logprobs + assert logprobs is not None + for logprob in logprobs: + assert len(logprob) == vocab_size + assert prompt_logprobs is not None + assert prompt_logprobs[0] is None + for prompt_logprob in prompt_logprobs[1:]: + assert len(prompt_logprob) == vocab_size @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) -def test_logprobs_mode(logprobs_mode: 
LogprobsMode, monkeypatch: pytest.MonkeyPatch): +def test_logprobs_mode(logprobs_mode: LogprobsMode): """Test with LLM engine with different logprobs_mode. For logprobs, we should have non-positive values. For logits, we should expect at least one positive values. """ from vllm import LLM - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - llm = LLM( - "facebook/opt-125m", - max_logprobs=5, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.05, - max_model_len=16, - logprobs_mode=logprobs_mode, - ) - vllm_sampling_params = SamplingParams(logprobs=1) - results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params) - - total_token_with_logprobs = 0 - positive_values = 0 - for output in results[0].outputs: - for logprobs in output.logprobs: - for token_id in logprobs: - logprob = logprobs[token_id] - if logprobs_mode in ("raw_logprobs", "processed_logprobs"): - assert logprob.logprob <= 0 - if logprob.logprob > 0: - positive_values = positive_values + 1 - total_token_with_logprobs = total_token_with_logprobs + 1 - assert total_token_with_logprobs >= len(results[0].outputs) - if logprobs_mode in ("raw_logits", "processed_logits"): - assert positive_values > 0 - del llm + llm = LLM( + "facebook/opt-125m", + max_logprobs=5, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.05, + max_model_len=16, + logprobs_mode=logprobs_mode, + ) + vllm_sampling_params = SamplingParams(logprobs=1) + results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params) + + total_token_with_logprobs = 0 + positive_values = 0 + for output in results[0].outputs: + for logprobs in output.logprobs: + for token_id in logprobs: + logprob = logprobs[token_id] + if logprobs_mode in ("raw_logprobs", "processed_logprobs"): + assert logprob.logprob <= 0 + if logprob.logprob > 0: + positive_values = positive_values + 1 + total_token_with_logprobs = total_token_with_logprobs + 1 + assert total_token_with_logprobs >= len(results[0].outputs) + if logprobs_mode in ("raw_logits", "processed_logits"): + assert positive_values > 0 + del llm diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index 24f9397cc4c6..bdde28fe0342 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest from vllm import LLM, SamplingParams -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "meta-llama/Llama-3.2-1B" PROMPT = "Hello my name is Robert and I" @@ -173,14 +169,6 @@ def test_allowed_token_ids(llm): _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(llm): - """Check that we reject requests with priority.""" - - # Reject all allowed token ids - with pytest.raises(ValueError): - _ = llm.generate(PROMPT, priority=[1]) - - def test_seed(llm): """Check that seed impacts randomness.""" diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 647887812f8a..bc779f6bd9c4 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -38,7 +38,6 @@ def test_eagle_max_len( monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str ): with monkeypatch.context() as m: - 
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 1518987ded04..f3495b00d3d4 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -42,7 +42,6 @@ @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS) def test_basic( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, tensor_parallel_size: int, @@ -55,23 +54,20 @@ def test_basic( ) example_prompts = [prompt] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + with vllm_runner( + model, + # Note: max_num_batched_tokens == 1024 is needed here to + # actually test chunked prompt + max_num_batched_tokens=1024, + max_model_len=8192, + gpu_memory_utilization=0.7, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + output = vllm_outputs[0][1] - with vllm_runner( - model, - # Note: max_num_batched_tokens == 1024 is needed here to - # actually test chunked prompt - max_num_batched_tokens=1024, - max_model_len=8192, - gpu_memory_utilization=0.7, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - - assert "1024" in output or "0, 1" in output + assert "1024" in output or "0, 1" in output @pytest.mark.skip(reason="Temporarily disabled due to timeout") @@ -82,7 +78,6 @@ def test_basic( @pytest.mark.parametrize("max_num_seqs", [16]) def test_phi3( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, max_tokens: int, max_num_seqs: int, ) -> None: @@ -99,18 +94,15 @@ def test_phi3( # test head dim = 96 model = "microsoft/Phi-3-mini-128k-instruct" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) - # vllm_outputs is a list of tuples whose first element is the token id - # and the second element is the output (including the prompt). - for output, answer in zip(vllm_outputs, answers): - generated_text = output[1] - assert answer in generated_text + with vllm_runner( + model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + # vllm_outputs is a list of tuples whose first element is the token id + # and the second element is the output (including the prompt). 
+ for output, answer in zip(vllm_outputs, answers): + generated_text = output[1] + assert answer in generated_text TP_SIZE_8 = 8 @@ -123,7 +115,6 @@ def test_phi3( ) def test_gemma3_27b_with_text_input_and_tp( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, ) -> None: model = "google/gemma-3-27b-it" max_tokens = 16 @@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp( " but in rising every time we fall.", ] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, - max_num_batched_tokens=256, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) - # vllm_outputs is a list of tuples whose first element is the token id - # and the second element is the output (including the prompt). - for output, answer in zip(vllm_outputs, answers): - generated_text = output[1] - assert answer in generated_text + with vllm_runner( + model, + max_num_batched_tokens=256, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + # vllm_outputs is a list of tuples whose first element is the token id + # and the second element is the output (including the prompt). + for output, answer in zip(vllm_outputs, answers): + generated_text = output[1] + assert answer in generated_text @pytest.mark.skipif( @@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp( ) def test_w8a8_quantization( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, ) -> None: model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" max_tokens = 5 @@ -176,18 +163,15 @@ def test_w8a8_quantization( ) example_prompts = [prompt] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, - max_num_batched_tokens=64, - max_model_len=4096, - gpu_memory_utilization=0.7, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - - assert "1024" in output or "0, 1" in output + with vllm_runner( + model, + max_num_batched_tokens=64, + max_model_len=4096, + gpu_memory_utilization=0.7, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + output = vllm_outputs[0][1] + + assert "1024" in output or "0, 1" in output diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py index e8cc396f970e..b7b6835c40cc 100644 --- a/tests/v1/tpu/test_perf.py +++ b/tests/v1/tpu/test_perf.py @@ -86,7 +86,6 @@ class TestParams: @pytest.mark.parametrize("params", TEST_PARAMS) def test_perf( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, params: TestParams, ) -> None: tokenizer = get_tokenizer( @@ -107,48 +106,45 @@ def test_perf( ) ) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams( + max_tokens=params.decode_len, temperature=1.0, min_p=0.0 + ) - sampling_params = SamplingParams( - max_tokens=params.decode_len, temperature=1.0, min_p=0.0 + with vllm_runner( + params.model, + max_num_batched_tokens=MAX_MODEL_LEN, + max_model_len=MAX_MODEL_LEN, + max_num_seqs=MAX_NUM_SEQS, + gpu_memory_utilization=GPU_UTIL, + enforce_eager=False, + tensor_parallel_size=1, + ) as vllm_model: + print(" -- Warmup / Compile") + for i in 
range(NUM_WARMUPS): + _ = vllm_model.generate(prompts, sampling_params) + + print(" -- Benchmarking... ") + times = [] + for i in range(NUM_RUNS): + start_time = time.time() + _ = vllm_model.generate(prompts, sampling_params) + times.append(time.time() - start_time) + + avg_time = sum(times) / len(times) + + print(" -- avg_time = {}".format(avg_time)) + print( + " -- expected_avg_time = {} with err_tol = {}".format( + params.expected_avg_time, params.err_tol + ) ) - - with vllm_runner( - params.model, - max_num_batched_tokens=MAX_MODEL_LEN, - max_model_len=MAX_MODEL_LEN, - max_num_seqs=MAX_NUM_SEQS, - gpu_memory_utilization=GPU_UTIL, - enforce_eager=False, - tensor_parallel_size=1, - ) as vllm_model: - print(" -- Warmup / Compile") - for i in range(NUM_WARMUPS): - _ = vllm_model.generate(prompts, sampling_params) - - print(" -- Benchmarking... ") - times = [] - for i in range(NUM_RUNS): - start_time = time.time() - _ = vllm_model.generate(prompts, sampling_params) - times.append(time.time() - start_time) - - avg_time = sum(times) / len(times) - - print(" -- avg_time = {}".format(avg_time)) + diff = avg_time - params.expected_avg_time + ok = diff < params.err_tol + if diff < -params.err_tol: print( - " -- expected_avg_time = {} with err_tol = {}".format( - params.expected_avg_time, params.err_tol - ) + " !! WARNING !! Performance has improved by {}, " + "it may be necessary to fine-tune the " + "expected_avg_time = {}".format(-diff, params.expected_avg_time) ) - diff = avg_time - params.expected_avg_time - ok = diff < params.err_tol - if diff < -params.err_tol: - print( - " !! WARNING !! Performance has improved by {}, " - "it may be necessary to fine-tune the " - "expected_avg_time = {}".format(-diff, params.expected_avg_time) - ) - - assert ok, " !! ERROR !! Regression detected" + + assert ok, " !! ERROR !! Regression detected" diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py index e7767aceec55..505da4163143 100644 --- a/tests/v1/tracing/test_tracing.py +++ b/tests/v1/tracing/test_tracing.py @@ -82,7 +82,7 @@ def test_traces( ): with monkeypatch.context() as m: m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams( temperature=0.01, top_p=0.1, diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index f48b354e8a7d..299567427027 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -77,7 +77,13 @@ def warming_up_model(self) -> None: logger.info("Warming up model for the compilation...") # Only generate graph for the generic shape with _set_global_compilation_settings(self.vllm_config): - self._dummy_run(max(16, self.max_num_reqs)) + self._dummy_run( + min( + max(16, self.max_num_reqs), + self.scheduler_config.max_num_batched_tokens, + ) + ) + logger.info("Warming up done.") def _init_device_properties(self) -> None:
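
For reference, a minimal sketch (not part of the patch) of the warmup-size clamp added to cpu_model_runner.py above: the standalone helper and its name are illustrative only, while the 16-token floor and the two config values mirror what the hunk references.

# Illustrative sketch, assuming the same semantics as the patched call site:
# warm up with at least 16 requests so the generic compiled shape is still
# exercised on tiny configs, but never exceed the scheduler's token budget.
def warmup_num_tokens(max_num_reqs: int, max_num_batched_tokens: int) -> int:
    return min(max(16, max_num_reqs), max_num_batched_tokens)

# Example behavior under the assumed inputs:
assert warmup_num_tokens(max_num_reqs=4, max_num_batched_tokens=8192) == 16
assert warmup_num_tokens(max_num_reqs=256, max_num_batched_tokens=64) == 64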