diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d11a43377548..ef717f8d1f59 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -296,6 +296,7 @@ steps: - tests/v1 commands: # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload - pytest -v -s v1/sample @@ -317,7 +318,7 @@ steps: no_gpu: true commands: # split the test to avoid interference - - pytest -v -s v1/core + - pytest -v -s -m 'cpu_test' v1/core - pytest -v -s v1/structured_output - pytest -v -s v1/test_serial_utils.py - pytest -v -s -m 'cpu_test' v1/kv_connector/unit diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index d63c82102b6b..9b9d8cfea7fa 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -13,7 +13,7 @@ import torch from vllm import LLM -from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 +from vllm.v1.engine.llm_engine import LLMEngine from ..conftest import HfRunner, VllmRunner from ..models.utils import check_outputs_equal @@ -211,16 +211,11 @@ def test_models_distributed( def test_failed_model_execution(vllm_runner, monkeypatch) -> None: - from vllm.envs import VLLM_USE_V1 - - if not VLLM_USE_V1: - pytest.skip("Skipping V0 test, dump input not supported") - # Needed to mock an error in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model: - if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngine): v1_test_failed_model_execution(vllm_model) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index b7cd98e27403..f1b0f7b2de89 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -117,68 +117,59 @@ def model(x): @create_new_process_for_each_test() @pytest.mark.parametrize( - "model, use_v1", + "model", [ # sleep mode with safetensors - ("meta-llama/Llama-3.2-1B", True), + "meta-llama/Llama-3.2-1B", # sleep mode with pytorch checkpoint - ("facebook/opt-125m", True), + "facebook/opt-125m", ], ) -def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): - with monkeypatch.context() as m: - assert use_v1 - m.setenv("VLLM_USE_V1", "1") - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = LLM(model, enable_sleep_mode=True) - prompt = "How are you?" - sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) - - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) - - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) - - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. 
- if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes - - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - llm.sleep(level=1) - llm.wake_up(tags=["weights"]) - - free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline - - # should just reallocate memory for weights (1B model, ~2GiB weights) - if use_v1: - assert used_bytes < 10 * GiB_bytes - else: - assert used_bytes < 6 * GiB_bytes - - # now allocate kv cache memory - llm.wake_up(tags=["kv_cache"]) - output3 = llm.generate(prompt, sampling_params) - - # cmp output - assert output[0].outputs[0].text == output3[0].outputs[0].text +def test_end_to_end(model: str): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other processes are running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be released from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. + assert used_bytes < 7 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text + + llm.sleep(level=1) + llm.wake_up(tags=["weights"]) + + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + + # should just reallocate memory for weights (1B model, ~2GiB weights) + assert used_bytes < 10 * GiB_bytes + + # now allocate kv cache memory + llm.wake_up(tags=["kv_cache"]) + output3 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output3[0].outputs[0].text @create_new_process_for_each_test() diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 927c838ae74e..84194f3ed01e 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -66,7 +66,6 @@ def llm_pair(request): pytest.skip("Only Blackwell GPUs support Cutlass MLA") env_vars = { - "VLLM_USE_V1": "1", # Force native sampler to avoid potential nondeterminism in FlashInfer # when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0", @@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend(): with ( temporary_environ( { - "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION", # Flex_Attention is not supported with full cuda graph } diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 920cd5a06c26..8241d248fa53 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -18,7 +18,6 @@ VllmConfig, set_current_vllm_config, ) -from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import is_torch_equal_or_newer @@ -127,7 +126,6 @@ def _run_simple_model( @pytest.mark.parametrize("use_inductor", [True, False]) @torch.inference_mode() def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 _run_simple_model( splitting_ops=["silly.attention"], use_inductor_graph_partition=False, @@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor): @torch.inference_mode() @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) def test_simple_inductor_graph_partition(splitting_ops): - assert VLLM_USE_V1 if not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 03cd510eb5d0..88ad4f81df50 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -388,10 +388,6 @@ def test_async_tp_pass_correctness( "pass_config": {"enable_async_tp": async_tp_enabled}, } - async_tp_env = tp_env = { - "VLLM_USE_V1": "1", - } - async_tp_args = [ *common_args, "--tensor-parallel-size", @@ -410,6 +406,4 @@ def test_async_tp_pass_correctness( "mp", ] - compare_two_settings( - model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate" - ) + compare_two_settings(model_id, async_tp_args, tp_args, method="generate") diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index d055a41af4c4..0da7f58a2f5f 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import vllm from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.utils import _is_torch_equal_or_newer @@ -16,15 +15,10 @@ def test_version(): assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev") -def test_use_cudagraphs_dynamic(monkeypatch): - assert vllm.envs.VLLM_USE_V1 +def test_use_cudagraphs_dynamic(): vllm_config = VllmConfig() assert vllm_config.compilation_config.use_cudagraph - monkeypatch.setenv("VLLM_USE_V1", "0") - vllm_config = VllmConfig() - assert not vllm_config.compilation_config.use_cudagraph - def test_custom_op(): # proper syntax @@ -41,8 +35,6 @@ def test_custom_op(): # may be influenced by other tests. 
@pytest.mark.parametrize("val", ["1"]) def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): - assert vllm.envs.VLLM_USE_V1 - # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val) @@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): @pytest.mark.forked @pytest.mark.parametrize("enabled", [True, False]) def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): - assert vllm.envs.VLLM_USE_V1 - # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 54d3d4ed0295..0f2e3bffbd31 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -303,7 +303,6 @@ def test_attention_quant_pattern( model_class: type[AttentionQuantPatternModel], backend: _Backend, use_inductor_graph_partition: bool, - monkeypatch, dist_init, caplog_vllm, ): @@ -312,8 +311,6 @@ def test_attention_quant_pattern( if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") - monkeypatch.setenv("VLLM_USE_V1", "1") - device = torch.device("cuda:0") torch.manual_seed(42) diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index 9c03f26c504e..56dc542f1c76 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -8,16 +8,13 @@ from vllm.v1.engine.async_llm import AsyncLLM -def test_mp_reducer(monkeypatch): +def test_mp_reducer(): """ Test that _reduce_config reducer is registered when AsyncLLM is instantiated without transformers_modules. This is a regression test for https://github.com/vllm-project/vllm/pull/18640. """ - # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value - monkeypatch.setenv("VLLM_USE_V1", "1") - # Ensure transformers_modules is not in sys.modules if "transformers_modules" in sys.modules: del sys.modules["transformers_modules"] diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index 70cc7e31b8ad..d59b394393e3 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -5,7 +5,7 @@ import pytest -from vllm import LLM, SamplingParams, envs +from vllm import LLM, SamplingParams MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -111,9 +111,7 @@ def _stop_token_id(llm): @pytest.mark.skip_global_cleanup def test_stop_strings(): - # If V0, must set enforce_eager=False since we use - # async output processing below. - llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=True) _stop_basic(llm) _stop_multi_tokens(llm) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index c8b6dc9781df..53fc9957b910 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple): @dataclass class CPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. 
distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: CPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})" - ) - @staticmethod def detailed( *, @@ -87,7 +73,6 @@ def detailed( return CPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp"], - vllm_major_versions=["1"], runner=runner, test_options=CPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -98,14 +83,11 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip( - self.distributed_backends, self.vllm_major_versions - ): + for backend in self.distributed_backends: yield ( model_id, parallel_setup, backend, - vllm_major_version, self.runner, opts, ) @@ -115,7 +97,6 @@ def _compare_cp_with_tp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: CPTestOptions, num_gpus_available: int, @@ -191,10 +172,6 @@ def _compare_cp_with_tp( if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) - cp_env = tp_env = { - "VLLM_USE_V1": vllm_major_version, # Note(hc): DCP only support V1 engine only - } - cp_args = [ *common_args, "--tensor-parallel-size", @@ -217,24 +194,13 @@ def _compare_cp_with_tp( distributed_backend, ] - try: - compare_two_settings( - model_id, - cp_args, - tp_args, - cp_env, - tp_env, - method=method, - max_wait_seconds=720, - ) - except Exception: - testing_ray_compiled_graph = cp_env is not None - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings( + model_id, + cp_args, + tp_args, + method=method, + max_wait_seconds=720, + ) CP_TEXT_GENERATION_MODELS = { @@ -257,7 +223,6 @@ def _compare_cp_with_tp( "model_id", "parallel_setup", "distributed_backend", - "vllm_major_version", "runner", "test_options", ), @@ -274,7 +239,6 @@ def test_cp_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: CPTestOptions, num_gpus_available, @@ -283,7 +247,6 @@ def test_cp_generation( model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7d55c40754b4..43f0c9dd1a85 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -307,7 +307,6 @@ def _compare_tp( if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests pp_env = { - "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -316,15 +315,11 @@ def _compare_tp( # terminate because of a Ray Compiled Graph issue. 
common_args.append("--disable-frontend-multiprocessing") elif distributed_backend == "mp": - pp_env = { - "VLLM_USE_V1": "1", - } + pp_env = None else: pp_env = None - tp_env = { - "VLLM_USE_V1": "1", - } + tp_env = None pp_args = [ *common_args, diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 82eaed66717c..1defd9690241 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple): @dataclass class SPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: SPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})" - ) - @staticmethod def detailed( *, @@ -85,7 +71,6 @@ def detailed( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -117,7 +102,6 @@ def fast( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -147,7 +131,6 @@ def fp8_quant( return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], - vllm_major_versions=["1", "1"], runner=runner, test_options=SPTestOptions( multi_node_only=multi_node_only, load_format=load_format @@ -158,14 +141,11 @@ def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip( - self.distributed_backends, self.vllm_major_versions - ): + for backend in self.distributed_backends: yield ( model_id, parallel_setup, backend, - vllm_major_version, self.runner, opts, ) @@ -175,7 +155,6 @@ def _compare_sp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: SPTestOptions, num_gpus_available: int, @@ -265,10 +244,6 @@ def _compare_sp( }, } - tp_sp_env = tp_env = { - "VLLM_USE_V1": vllm_major_version, - } - tp_sp_args = [ *common_args, "--tensor-parallel-size", @@ -281,9 +256,6 @@ def _compare_sp( json.dumps(compilation_config), ] - tp_env = { - "VLLM_USE_V1": vllm_major_version, - } tp_args = [ *common_args, "--tensor-parallel-size", @@ -292,18 +264,7 @@ def _compare_sp( "mp", ] - try: - compare_two_settings( - model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method - ) - except Exception: - testing_ray_compiled_graph = tp_sp_env is not None - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings(model_id, tp_sp_args, tp_args, method=method) SP_TEXT_GENERATION_MODELS = { @@ -325,7 +286,6 @@ def _compare_sp( "model_id", "parallel_setup", 
"distributed_backend", - "vllm_major_version", "runner", "test_options", ), @@ -341,7 +301,6 @@ def test_tp_sp_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: SPTestOptions, num_gpus_available, @@ -350,7 +309,6 @@ def test_tp_sp_generation( model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index e2d107b60586..af607720c8b0 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -61,50 +61,34 @@ def run_test(model_name, more_args=None): TPU_TP_TEST_STR = "" # "tensor_parallel_size=4" -@pytest.mark.skipif( - not current_platform.is_cuda() and not current_platform.is_tpu(), - reason="V1 is currently only supported on CUDA and TPU", -) @pytest.mark.parametrize("model", MODEL_NAMES) -def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): +def test_lm_eval_accuracy_v1_engine(model): """Run with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + more_args = None + if current_platform.is_tpu(): + # Limit compilation time for TPU V1 - more_args = None - if current_platform.is_tpu(): - # Limit compilation time for TPU V1 + more_args = "max_model_len=2048,max_num_seqs=64" - more_args = "max_model_len=2048,max_num_seqs=64" + # Add TP test (if provided) + if TPU_TP_TEST_STR: + more_args += ",{}".format(TPU_TP_TEST_STR) - # Add TP test (if provided) - if TPU_TP_TEST_STR: - more_args += ",{}".format(TPU_TP_TEST_STR) + run_test(model, more_args) - run_test(model, more_args) - -@pytest.mark.skipif( - not current_platform.is_cuda() and not current_platform.is_tpu(), - reason="V1 is currently only supported on CUDA and TPU", -) @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES) -def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( - model, monkeypatch: pytest.MonkeyPatch -): +def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model): """Run with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - more_args = None - if current_platform.is_tpu(): - # Limit compilation time for TPU V1 - more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" + more_args = None + if current_platform.is_tpu(): + # Limit compilation time for TPU V1 + more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" - # Add TP test (if provided) - if TPU_TP_TEST_STR: - more_args += ",{}".format(TPU_TP_TEST_STR) + # Add TP test (if provided) + if TPU_TP_TEST_STR: + more_args += ",{}".format(TPU_TP_TEST_STR) - run_test(model, more_args) + run_test(model, more_args) diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 919b7793628e..5b23b4239027 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -10,7 +10,6 @@ """ import lm_eval -import pytest from vllm.platforms import current_platform @@ -67,21 +66,13 @@ def run_test(more_args): ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" -@pytest.mark.skipif( - not current_platform.is_cuda() - and not current_platform.is_tpu() - and not current_platform.is_xpu(), - reason="V1 currently only supported on CUDA, XPU and TPU", -) -def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): +def test_lm_eval_accuracy_v1_engine(): """Run 
with the V1 Engine.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - more_args = [] + more_args = [] - # Limit compilation time for V1 - if current_platform.is_tpu(): - more_args = ["--max-num-seqs", "64"] + # Limit compilation time for V1 + if current_platform.is_tpu(): + more_args = ["--max-num-seqs", "64"] - run_test(more_args) + run_test(more_args) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index ed0b284bda62..d110234d60ac 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -21,18 +21,7 @@ @pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module") -def server(monkeypatch_module, zephyr_lora_files): # noqa: F811 - monkeypatch_module.setenv("VLLM_USE_V1", "1") - +def server(zephyr_lora_files): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 379e7d36d9e1..674e14e4f5c1 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -37,21 +37,8 @@ ] -@pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - @pytest.fixture(scope="module", params=[True]) -def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): - use_v1 = request.param - assert use_v1 - monkeypatch_module.setenv("VLLM_USE_V1", "1") - +def server_with_lora_modules_json(request, zephyr_lora_files): # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 711505c74bca..6b00dde494d1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,24 +22,6 @@ PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True]) -def use_v1(request): - # Module-scoped variant of run_with_both_engines - # - # Use this fixture to run a test with both v0 and v1, and - # also to conditionalize the test logic e.g. - # - # def test_metrics_exist(use_v1, server, client): - # ... 
- # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS - # for metric in expected: - # assert metric in response.text - # - # @skip_v1 wouldn't work here because this is a module-level - # fixture - per-function decorators would have no effect - yield request.param - - @pytest.fixture(scope="module") def default_server_args(): return [ @@ -63,13 +45,11 @@ def default_server_args(): f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", ], ) -def server(use_v1, default_server_args, request): +def server(default_server_args, request): if request.param: default_server_args.append(request.param) - env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0") - with RemoteOpenAIServer( - MODEL_NAME, default_server_args, env_dict=env_dict - ) as remote_server: + + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server @@ -129,7 +109,8 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. @@ -145,7 +126,7 @@ async def test_metrics_counts( # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): - if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or ( + if (metric_family not in EXPECTED_METRICS_V1) or ( not server.show_hidden_metrics and metric_family in HIDDEN_DEPRECATED_METRICS ): @@ -183,62 +164,6 @@ async def test_metrics_counts( assert found_metric, f"Did not find {metric_family} in prom endpoint" -EXPECTED_METRICS = [ - "vllm:num_requests_running", - "vllm:num_requests_waiting", - "vllm:gpu_cache_usage_perc", - "vllm:time_to_first_token_seconds_sum", - "vllm:time_to_first_token_seconds_bucket", - "vllm:time_to_first_token_seconds_count", - "vllm:time_per_output_token_seconds_sum", - "vllm:time_per_output_token_seconds_bucket", - "vllm:time_per_output_token_seconds_count", - "vllm:e2e_request_latency_seconds_sum", - "vllm:e2e_request_latency_seconds_bucket", - "vllm:e2e_request_latency_seconds_count", - "vllm:request_queue_time_seconds_sum", - "vllm:request_queue_time_seconds_bucket", - "vllm:request_queue_time_seconds_count", - "vllm:request_inference_time_seconds_sum", - "vllm:request_inference_time_seconds_bucket", - "vllm:request_inference_time_seconds_count", - "vllm:request_prefill_time_seconds_sum", - "vllm:request_prefill_time_seconds_bucket", - "vllm:request_prefill_time_seconds_count", - "vllm:request_decode_time_seconds_sum", - "vllm:request_decode_time_seconds_bucket", - "vllm:request_decode_time_seconds_count", - "vllm:request_prompt_tokens_sum", - "vllm:request_prompt_tokens_bucket", - "vllm:request_prompt_tokens_count", - "vllm:request_generation_tokens_sum", - "vllm:request_generation_tokens_bucket", - "vllm:request_generation_tokens_count", - "vllm:request_params_n_sum", - "vllm:request_params_n_bucket", - "vllm:request_params_n_count", - "vllm:request_params_max_tokens_sum", - "vllm:request_params_max_tokens_bucket", - "vllm:request_params_max_tokens_count", - "vllm:iteration_tokens_total", - "vllm:num_preemptions_total", - "vllm:prompt_tokens_total", - "vllm:generation_tokens_total", - "vllm:request_success_total", - "vllm:cache_config_info", - # labels in cache_config_info - "block_size", - "cache_dtype", - "cpu_offload_gb", - "enable_prefix_caching", - "gpu_memory_utilization", - "num_cpu_blocks", - "num_gpu_blocks", - 
"num_gpu_blocks_override", - "sliding_window", - "swap_space_bytes", -] - EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", @@ -304,17 +229,21 @@ async def test_metrics_counts( @pytest.mark.asyncio async def test_metrics_exist( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): # sending a request triggers the metrics to be logged. await client.completions.create( - model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0, ) response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS: + for metric in EXPECTED_METRICS_V1: if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics: continue assert metric in response.text @@ -322,10 +251,11 @@ async def test_metrics_exist( @pytest.mark.asyncio async def test_abort_metrics_reset( - server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool + server: RemoteOpenAIServer, + client: openai.AsyncClient, ): running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api( - server, use_v1 + server ) # Expect no running requests or kvcache usage @@ -351,7 +281,7 @@ async def test_abort_metrics_reset( # Check that we have running requests running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api( - server, use_v1 + server ) # Expect running requests and kvcache usage @@ -371,7 +301,7 @@ async def test_abort_metrics_reset( # Verify running and waiting requests counts and KV cache usage are zero running_requests_after, waiting_requests_after, kv_cache_usage_after = ( - _get_running_metrics_from_api(server, use_v1) + _get_running_metrics_from_api(server) ) assert running_requests_after == 0, ( @@ -385,7 +315,7 @@ async def test_abort_metrics_reset( ) -def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): +def _get_running_metrics_from_api(server: RemoteOpenAIServer): """Return (running_count, waiting_count, kv_cache_usage)""" response = requests.get(server.url_for("metrics")) @@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): # Verify running and waiting requests counts and KV cache usage are zero running_requests, waiting_requests, kv_cache_usage = None, None, None - kv_cache_usage_metric = ( - "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc" - ) + kv_cache_usage_metric = "vllm:kv_cache_usage_perc" for family in text_string_to_metric_families(response.text): if family.name == "vllm:num_requests_running": @@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): return running_requests, waiting_requests, kv_cache_usage -def test_metrics_exist_run_batch(use_v1: bool): +def test_metrics_exist_run_batch(): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" @@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool): "--port", port, ], - env={"VLLM_USE_V1": "1"}, ) def is_server_up(url): diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 81e2b52dfa71..3d0885414b24 100644 --- 
a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -15,11 +15,6 @@ from ...utils import RemoteOpenAIServer -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - - @pytest.mark.asyncio async def test_empty_prompt(): model_name = "gpt2" diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 6b99ba7af50e..5c607f921536 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -80,7 +80,6 @@ def test_env( ): """Test attention backend selection with valid device-backend pairs.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, name) m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") @@ -212,30 +211,21 @@ def test_env( @pytest.mark.parametrize("device", ["cpu", "cuda"]) -def test_fp32_fallback( - device: str, - monkeypatch: pytest.MonkeyPatch, -): +def test_fp32_fallback(device: str): """Test attention backend selection with fp32.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "TORCH_SDPA" - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "TORCH_SDPA" - - elif device == "cuda": - with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "FLEX_ATTENTION" + elif device == "cuda": + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" - # TODO: When testing for v1, pipe in `use_v1` as an argument to - # get_attn_backend - pytest.skip( "Skipping as current backend selector does not " "handle fallbacks when a backend is set via env var." 
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch): monkeypatch.context() as m, patch("vllm.attention.selector.current_platform", CudaPlatform()), ): - m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) # Should raise ValueError for invalid backend diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index 87002c72f6e1..ae33f422d373 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with flex attention with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") set_seed(seed) @@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with default backend with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") set_seed(seed) with vllm_runner( model_name, @@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): # Run with flex attention with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") with vllm_runner( model_name, @@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): flex_outputs = llm_flex.embed(prompts) # Run with default backend - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - with vllm_runner( + with ( + monkeypatch.context() as m, + vllm_runner( model_name, runner="pooling", dtype=torch.bfloat16, tensor_parallel_size=1, max_model_len=100, enforce_eager=True, - ) as llm_default: - default_outputs = llm_default.embed(prompts) + ) as llm_default, + ): + default_outputs = llm_default.embed(prompts) check_embeddings_close( embeddings_0_lst=flex_outputs, diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py index 2f9b09f4026c..fd3386ff67df 100644 --- a/tests/models/multimodal/generation/test_maverick.py +++ b/tests/models/multimodal/generation/test_maverick.py @@ -613,7 +613,6 @@ def test_dummy_maverick( profile: bool = False, ) -> None: # Disable multiprocessing allows us to access model executor from LLM engine - monkeypatch.setenv("VLLM_USE_V1", "1") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") model_path = create_reduced_maverick_model( diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index 90cb461a6caf..0389e28746cb 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -8,7 +8,6 @@ from vllm.config import VllmConfig else: VllmConfig = None -from vllm import envs class DummyPlatform(Platform): @@ -19,10 +18,7 @@ class DummyPlatform(Platform): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - if envs.VLLM_USE_V1: - compilation_config = vllm_config.compilation_config - # Activate custom ops for v1. 
- compilation_config.custom_ops = ["all"] + vllm_config.compilation_config.custom_ops = ["all"] def get_attn_backend_cls( self, diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 1c37d6a39261..45902cc874c3 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -16,7 +16,6 @@ def schedule(self): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") # Explicitly turn off engine multiprocessing so # that the scheduler runs in this process m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42aebcd52414..fa0ca48f9bd9 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -8,18 +8,11 @@ from typing import Optional -import pytest from transformers import AutoTokenizer from vllm import LLM, SamplingParams -@pytest.fixture(autouse=True) -def v1(monkeypatch): - """Only run on vLLM v1.""" - monkeypatch.setenv("VLLM_USE_V1", "1") - - def _generate( llm: LLM, prompt: str, diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 5999c9cf1e0e..9780092b25e6 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -17,17 +17,6 @@ # 100 training iterations with a training batch size of 100. -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch: pytest.MonkeyPatch): - """ - Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1 - for all tests in this file - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - yield - - def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM( model="Qwen/Qwen2.5-3B-Instruct", diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index f30a6628b1bf..fcc0b6a5f7de 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -305,7 +305,6 @@ class BackendConfig: "CutlassMLA": BackendConfig( name="CutlassMLA", env_vars={ - "VLLM_USE_V1": "1", "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed }, diff --git a/tests/v1/core/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py index 328f2640f218..e6d37b1d63c8 100644 --- a/tests/v1/core/test_kv_sharing.py +++ b/tests/v1/core/test_kv_sharing.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest import torch from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups +pytestmark = pytest.mark.cpu_test + def new_kv_cache_spec(): return FullAttentionSpec(16, 1, 1, torch.float32, False) diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 6983c3b92f6b..90f8757ae493 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest from vllm import LLM -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "meta-llama/Llama-3.2-1B" PROMPT = "Hello my name is Robert and I" diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py 
b/tests/v1/cudagraph/test_cudagraph_mode.py index 77d5c5d87fc1..8c8148ae2094 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ): pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") - env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + env_vars = backend_configs[backend_name].env_vars with temporary_environ(env_vars), ExitStack() as stack: if not supported: @@ -117,7 +117,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte def test_cudagraph_compilation_combo(combo_case): backend_name, cudagraph_mode, compilation_level, supported = combo_case - env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars} + env_vars = backend_configs[backend_name].env_vars with temporary_environ(env_vars), ExitStack() as stack: if not supported: diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 5f26c2f1c651..0fcb97fe6305 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ) with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index c9018ee177e8..71b0e86c75c1 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -32,7 +32,7 @@ class TestConfig: @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) def test_sliding_window_retrieval( - monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager + model, batch_size, seed, disable_hybrid_kv_cache_manager ): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then @@ -40,39 +40,34 @@ def test_sliding_window_retrieval( If we tell it upfront which we are going to be looking for, then it answers correctly (mostly). 
""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - test_config = model_config[model] - - llm = LLM( - model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager - ) - sampling_params = SamplingParams(temperature=0.0, max_tokens=100) - - prompts, answer, indices = prep_prompts( - batch_size, ln_range=test_config.ln_range - ) - - check_length(prompts, llm, test_config.sliding_window) - - # Fresh generation - responses = llm.generate(prompts, sampling_params) - check_answers( - indices, - answer, - [response.outputs[0].text for response in responses], - accept_rate=1.0, - ) - - # Re-generate with the same prompts to test prefix caching - responses = llm.generate(prompts, sampling_params) - check_answers( - indices, - answer, - [response.outputs[0].text for response in responses], - accept_rate=1.0, - ) + test_config = model_config[model] + + llm = LLM( + model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager + ) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range) + + check_length(prompts, llm, test_config.sliding_window) + + # Fresh generation + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, + ) + + # Re-generate with the same prompts to test prefix caching + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, + ) def check_length(prompts: list[str], llm: LLM, sliding_window: int): diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index b9052d8a58b8..89e5f26ac627 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill( ) with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Make scheduling deterministic for reproducibility m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") diff --git a/tests/v1/e2e/test_min_tokens.py b/tests/v1/e2e/test_min_tokens.py index f15982b7e5f3..e00a3d58debe 100644 --- a/tests/v1/e2e/test_min_tokens.py +++ b/tests/v1/e2e/test_min_tokens.py @@ -13,7 +13,6 @@ 5) Multiple stop conditions """ -import os from typing import Optional, Union import pytest @@ -161,9 +160,6 @@ def __str__(self): @pytest.fixture(scope="module") def llm_v1(): """Create V1 LLM instance for testing""" - # Ensure V1 engine is used - os.environ["VLLM_USE_V1"] = "1" - llm = LLM( model=TEST_MODEL, tensor_parallel_size=1, @@ -503,6 +499,6 @@ def test_min_tokens_validation(): Usage: cd vllm/ - VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v + python -m pytest tests/v1/e2e/test_min_tokens.py -v """ pytest.main([__file__, "-v"]) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 9ed9cd7950a9..fbbbd0389c26 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -301,7 +301,6 @@ def test_mtp_correctness( model_setup: (method, model_name, tp_size) """ with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_MLA_DISABLE", "1") method, model_name, tp_size = model_setup diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 3e30d28111c8..444d771a18d6 100644 --- a/tests/v1/engine/test_async_llm.py +++ 
b/tests/v1/engine/test_async_llm.py @@ -95,17 +95,11 @@ async def generate( ) @pytest.mark.asyncio async def test_load( - monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType, ): - # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 - # so that in the future when we switch, we don't have to change all the - # tests. - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -149,14 +143,11 @@ async def test_load( ) @pytest.mark.asyncio async def test_abort( - monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType, ): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -222,13 +213,8 @@ async def test_abort( "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) @pytest.mark.asyncio -async def test_multi_abort( - monkeypatch: pytest.MonkeyPatch, - output_kind: RequestOutputKind, -): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - +async def test_multi_abort(output_kind: RequestOutputKind): + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -304,14 +290,11 @@ async def test_multi_abort( ) @pytest.mark.asyncio async def test_finished_flag( - monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType, ): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -341,12 +324,10 @@ async def test_finished_flag( ) @pytest.mark.asyncio async def test_mid_stream_cancellation( - monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType + engine_args: AsyncEngineArgs, prompt: PromptType ): """Test that requests can be cancelled mid-stream.""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch): be added to the default loggers. 
""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args( TEXT_ENGINE_ARGS, @@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch): @pytest.mark.asyncio(scope="module") -async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - +async def test_dp_rank_argument(): + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): @pytest.mark.asyncio -async def test_check_health(monkeypatch: pytest.MonkeyPatch): +async def test_check_health(): """Test that check_health returns normally for healthy engine and raises EngineDeadError when the engine is dead. """ @@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): from vllm.v1.engine.exceptions import EngineDeadError - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) @@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch): "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) @pytest.mark.asyncio -async def test_abort_final_output( - monkeypatch: pytest.MonkeyPatch, - output_kind: RequestOutputKind, -): +async def test_abort_final_output(output_kind: RequestOutputKind): """Test that abort() returns a final output with correct information.""" - with monkeypatch.context() as m, ExitStack() as after: - m.setenv("VLLM_USE_V1", "1") - + with ExitStack() as after: with set_default_torch_num_threads(1): engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index f6b10fa67b3b..943402e429b6 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -5,18 +5,11 @@ import pytest -from vllm import envs from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser -if not envs.VLLM_USE_V1: - pytest.skip( - "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", - allow_module_level=True, - ) - def test_prefix_caching_from_cli(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 28d7854ab5d2..997b2b74bb6b 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest: @create_new_process_for_each_test() -def test_engine_core(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) - """Test basic request lifecycle.""" - - # First request. 
- engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 1 - - # Second request. - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 1 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - # Add two requests in a row. - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - engine_core.add_request(*engine_core.preprocess_add_request(make_request())) - assert len(engine_core.scheduler.waiting) == 2 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 4 - - # Loop through until they are all done. - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass +def test_engine_core(): + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) + """Test basic request lifecycle.""" + + # First request. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + # Second request. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 1 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Add two requests in a row. + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + engine_core.add_request(*engine_core.preprocess_add_request(make_request())) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 4 + + # Loop through until they are all done. + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + """Test abort cycle.""" + + # Basic abort. 
+ req = make_request() + request_id = req.request_id + + engine_core.add_request(*engine_core.preprocess_add_request(req)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + assert engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + assert engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + engine_core.abort_requests([request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + assert not engine_core.scheduler.has_unfinished_requests() + assert engine_core.scheduler.has_finished_requests() + + _ = engine_core.step() + assert not engine_core.scheduler.has_unfinished_requests() + assert not engine_core.scheduler.has_finished_requests() + + # Add, step, abort 1 of the 3. + req0 = make_request() + req1 = make_request() + req2 = make_request() + + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + engine_core.add_request(*engine_core.preprocess_add_request(req2)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 3 + + # Abort just one. + engine_core.abort_requests([req1.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Abort the other requests at the same time. + engine_core.abort_requests([req2.request_id, req0.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + # Sending duplicate requests with same request_id + req0 = make_request() + req1 = make_request() + req0.request_id = req1.request_id = "test" + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + while (outs := engine_core.step()[0].get(0)) and outs.outputs: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - """Test abort cycle.""" - # Basic abort. - req = make_request() - request_id = req.request_id +@create_new_process_for_each_test() +def test_engine_core_advanced_sampling(): + """ + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and + presence_penalty, are set. 
+ """ + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) + """Test basic request lifecycle.""" + # First request. + request: EngineCoreRequest = make_request() + request.sampling_params = SamplingParams( + min_tokens=4, + presence_penalty=1.0, + frequency_penalty=1.0, + repetition_penalty=0.1, + stop_token_ids=[1001, 1002], + ) + engine_core.add_request(*engine_core.preprocess_add_request(request)) - engine_core.add_request(*engine_core.preprocess_add_request(req)) + def _check_engine_state(): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 - assert engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 1 - assert engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - engine_core.abort_requests([request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - assert not engine_core.scheduler.has_unfinished_requests() - assert engine_core.scheduler.has_finished_requests() - - _ = engine_core.step() - assert not engine_core.scheduler.has_unfinished_requests() - assert not engine_core.scheduler.has_finished_requests() - - # Add, step, abort 1 of the 3. - req0 = make_request() - req1 = make_request() - req2 = make_request() - - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - engine_core.add_request(*engine_core.preprocess_add_request(req1)) - assert len(engine_core.scheduler.waiting) == 2 - assert len(engine_core.scheduler.running) == 0 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - engine_core.add_request(*engine_core.preprocess_add_request(req2)) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 3 - - # Abort just one. - engine_core.abort_requests([req1.request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - _ = engine_core.step() - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 2 - - # Abort the other requests at the same time. - engine_core.abort_requests([req2.request_id, req0.request_id]) - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - - # Sending duplicate requests with same request_id - req0 = make_request() - req1 = make_request() - req0.request_id = req1.request_id = "test" - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass - - engine_core.add_request(*engine_core.preprocess_add_request(req1)) + # Loop through until they are all done. 
while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass - assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 + _check_engine_state() -@create_new_process_for_each_test() -def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): - """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, min_tokens, and - presence_penalty, are set. - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) - """Test basic request lifecycle.""" - # First request. - request: EngineCoreRequest = make_request() - request.sampling_params = SamplingParams( - min_tokens=4, - presence_penalty=1.0, - frequency_penalty=1.0, - repetition_penalty=0.1, - stop_token_ids=[1001, 1002], - ) - engine_core.add_request(*engine_core.preprocess_add_request(request)) - - def _check_engine_state(): - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while (outs := engine_core.step()[0].get(0)) and outs.outputs: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 - - _check_engine_state() - - # Second request. - request2 = make_request() - request2.sampling_params = SamplingParams( - top_p=0.99, - top_k=50, - ) - engine_core.add_request(*engine_core.preprocess_add_request(request2)) - _check_engine_state() + # Second request. + request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(*engine_core.preprocess_add_request(request2)) + _check_engine_state() @create_new_process_for_each_test() -def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_concurrent_batches(): """ Test that the engine can handle multiple concurrent batches. """ @@ -272,173 +268,163 @@ def shutdown(self): if hasattr(self, "thread_pool"): self.thread_pool.shutdown(wait=False) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - engine_args = EngineArgs( - model=MODEL_NAME, - # To test concurrent batches. - max_num_seqs=2, - # Avoid all requests being scheduled once. - enable_prefix_caching=False, - max_num_batched_tokens=10, - # Reduce startup time. - enforce_eager=True, + engine_args = EngineArgs( + model=MODEL_NAME, + # To test concurrent batches. + max_num_seqs=2, + # Avoid all requests being scheduled once. + enable_prefix_caching=False, + max_num_batched_tokens=10, + # Reduce startup time. + enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor ) - vllm_config = engine_args.create_engine_config() - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor - ) - assert engine_core.batch_queue is not None - - # Add two requests in a row. Each request have 12 prompt tokens. 
- req0 = make_request_with_max_tokens("0", 5) - engine_core.add_request(*engine_core.preprocess_add_request(req0)) - req1 = make_request_with_max_tokens("1", 5) - engine_core.add_request(*engine_core.preprocess_add_request(req1)) - - # Schedule Batch 1: (10, req0) - assert engine_core.step_with_batch_queue()[0] is None - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 10 - # num_computed_tokens should have been updated immediately. - assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10 - - # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue()[0] == {} - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 2 - assert scheduler_output.num_scheduled_tokens["1"] == 8 - # num_computed_tokens should have been updated immediately. - assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 - assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 - - assert engine_core.scheduler.get_num_unfinished_requests() == 2 - - # Finish Batch 1 and schedule Batch 3: (4, req1). - # Note that req0 cannot be scheduled - # because it is in the decoding stage now. - engine_core.step_with_batch_queue() - assert len(engine_core.batch_queue) == 1 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["1"] == 4 - - # Finish Batch 2. Get first token of req0. - # Schedule Batch 4: (1, req0). - output = engine_core.step_with_batch_queue()[0].get(0) + assert engine_core.batch_queue is not None + + # Add two requests in a row. Each request have 12 prompt tokens. + req0 = make_request_with_max_tokens("0", 5) + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + req1 = make_request_with_max_tokens("1", 5) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) + + # Schedule Batch 1: (10, req0) + assert engine_core.step_with_batch_queue()[0] is None + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 10 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10 + + # Schedule Batch 2: (2, req0), (8, req1) + assert engine_core.step_with_batch_queue()[0] == {} + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 2 + assert scheduler_output.num_scheduled_tokens["1"] == 8 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 + assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 + + assert engine_core.scheduler.get_num_unfinished_requests() == 2 + + # Finish Batch 1 and schedule Batch 3: (4, req1). + # Note that req0 cannot be scheduled + # because it is in the decoding stage now. + engine_core.step_with_batch_queue() + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["1"] == 4 + + # Finish Batch 2. Get first token of req0. + # Schedule Batch 4: (1, req0). 
+ output = engine_core.step_with_batch_queue()[0].get(0) + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["0"] == 1 + + # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). + output = engine_core.step_with_batch_queue()[0].get(0) + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 + scheduler_output = engine_core.batch_queue[-1][1] + assert scheduler_output.num_scheduled_tokens["1"] == 1 + + # Loop until req0 is finished. + req_id = 0 + expected_num_tokens = [ + engine_core.scheduler.requests["0"].num_tokens + 1, + engine_core.scheduler.requests["1"].num_tokens + 1, + ] + while engine_core.scheduler.get_num_unfinished_requests() == 2: + output = engine_core.step_with_batch_queue()[0] + # Every step consumes an output. assert output is not None - assert len(output.outputs) == 1 - assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["0"] == 1 - - # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). - output = engine_core.step_with_batch_queue()[0].get(0) - assert output is not None - assert len(output.outputs) == 1 - assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 - scheduler_output = engine_core.batch_queue[-1][1] - assert scheduler_output.num_scheduled_tokens["1"] == 1 - - # Loop until req0 is finished. - req_id = 0 - expected_num_tokens = [ - engine_core.scheduler.requests["0"].num_tokens + 1, - engine_core.scheduler.requests["1"].num_tokens + 1, - ] - while engine_core.scheduler.get_num_unfinished_requests() == 2: - output = engine_core.step_with_batch_queue()[0] - # Every step consumes an output. - assert output is not None - assert len(output[0].outputs) == 1 - if req_id in engine_core.scheduler.requests: - assert ( - engine_core.scheduler.requests[req_id].num_tokens - == expected_num_tokens[req_id] - ) - expected_num_tokens[req_id] += 1 - req_id = (req_id + 1) % 2 + assert len(output[0].outputs) == 1 + if req_id in engine_core.scheduler.requests: + assert ( + engine_core.scheduler.requests[req_id].num_tokens + == expected_num_tokens[req_id] + ) + expected_num_tokens[req_id] += 1 + req_id = (req_id + 1) % 2 @multi_gpu_test(num_gpus=2) -def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_tp(): """ Test engine can initialize worker in tp properly """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - """Setup the EngineCore.""" - engine_args = EngineArgs( - model=MODEL_NAME, - tensor_parallel_size=2, - # Reduce startup time. - enforce_eager=True, - ) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) + """Setup the EngineCore.""" + engine_args = EngineArgs( + model=MODEL_NAME, + tensor_parallel_size=2, + # Reduce startup time. 
+ enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) - def get_worker_cache_config_field(worker, key: str): - return getattr(worker.cache_config, key) + def get_worker_cache_config_field(worker, key: str): + return getattr(worker.cache_config, key) - num_gpu_blocks = engine_core.collective_rpc( - get_worker_cache_config_field, args=("num_gpu_blocks",) - ) - num_cpu_blocks = engine_core.collective_rpc( - get_worker_cache_config_field, args=("num_cpu_blocks",) - ) - assert all(x is not None for x in num_gpu_blocks) - assert all(x is not None for x in num_cpu_blocks) + num_gpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_gpu_blocks",) + ) + num_cpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_cpu_blocks",) + ) + assert all(x is not None for x in num_gpu_blocks) + assert all(x is not None for x in num_cpu_blocks) @create_new_process_for_each_test() -def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): +def test_engine_core_invalid_request_id_type(): """Test that engine raises TypeError for non-string request_id.""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) - engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() - executor_class = Executor.get_class(vllm_config) - - with set_default_torch_num_threads(1): - engine_core = EngineCore( - vllm_config=vllm_config, executor_class=executor_class, log_stats=True - ) + with set_default_torch_num_threads(1): + engine_core = EngineCore( + vllm_config=vllm_config, executor_class=executor_class, log_stats=True + ) - # Test with UUID object (common mistake) - uuid_request = make_request() - uuid_request.request_id = uuid.uuid4() # UUID object instead of string + # Test with UUID object (common mistake) + uuid_request = make_request() + uuid_request.request_id = uuid.uuid4() # UUID object instead of string - with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): - engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): + engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) - # Test with integer - int_request = make_request() - int_request.request_id = 12345 + # Test with integer + int_request = make_request() + int_request.request_id = 12345 - with pytest.raises(TypeError, match="request_id must be a string, got.*int"): - engine_core.add_request(*engine_core.preprocess_add_request(int_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*int"): + engine_core.add_request(*engine_core.preprocess_add_request(int_request)) - # Test with None - none_request = make_request() - none_request.request_id = None + # Test with None + none_request = make_request() + none_request.request_id = None - with pytest.raises( - TypeError, match="request_id must be a string, got.*NoneType" - ): - 
engine_core.add_request(*engine_core.preprocess_add_request(none_request)) + with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"): + engine_core.add_request(*engine_core.preprocess_add_request(none_request)) - # Verify engine is still functional after errors - valid_request = make_request() - engine_core.add_request(*engine_core.preprocess_add_request(valid_request)) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 + # Verify engine is still functional after errors + valid_request = make_request() + engine_core.add_request(*engine_core.preprocess_add_request(valid_request)) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 90284fc54d06..bc04d1f93f95 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -130,8 +130,6 @@ def test_engine_core_client( monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch core engine utility function to test. m.setattr(EngineCore, "echo", echo, raising=False) @@ -218,8 +216,6 @@ def test_engine_core_client( @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch core engine utility function to test. m.setattr(EngineCore, "echo", echo, raising=False) @@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures( monkeypatch: pytest.MonkeyPatch, ): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Must set insecure serialization to allow returning custom types. 
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures( indirect=["publisher_config"], ) def test_kv_cache_events( - monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool, publisher_config, ): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - block_size = 16 - num_blocks = 2 - - engine_args = EngineArgs( - model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - block_size=block_size, - ) - engine_args.kv_events_config = publisher_config + block_size = 16 + num_blocks = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config - vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) - executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=False, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) - endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - subscriber = MockSubscriber( - endpoint, topic=publisher_config.topic, decode_type=KVEventBatch + executor_class = Executor.get_class(vllm_config) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, ) + endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + subscriber = MockSubscriber( + endpoint, topic=publisher_config.topic, decode_type=KVEventBatch + ) - try: - custom_tokens = list(range(num_blocks * block_size)) - sampling_params = SamplingParams(max_tokens=1) - request = make_request(sampling_params, custom_tokens) - client.add_request(request) + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + request = make_request(sampling_params, custom_tokens) + client.add_request(request) - outputs: dict[str, list] = {request.request_id: []} - loop_until_done(client, outputs) + outputs: dict[str, list] = {request.request_id: []} + loop_until_done(client, outputs) - result = subscriber.receive_one(timeout=1000) - assert result is not None, "No message received" + result = subscriber.receive_one(timeout=1000) + assert result is not None, "No message received" - seq, received = result + seq, received = result - assert seq == 0, "Sequence number mismatch" - assert len(received.events) == 1, ( - "We should have exactly one BlockStored event" - ) - event = received.events[0] - assert isinstance(event, BlockStored), "We should have a BlockStored event" - assert len(event.block_hashes) == num_blocks, ( - "We should have a BlockStored event with 2 block_hashes" - ) - assert event.block_size == block_size, ( - "Block size should be the same as the block size" - ) - assert event.parent_block_hash is None, "Parent block hash should be None" - assert event.lora_id is None, "Lora id should be None" - assert len(event.token_ids) == num_blocks * block_size, ( - "Token ids should be the same as the custom tokens" - ) - assert event.token_ids == custom_tokens, ( - "Token ids should be the same as the custom tokens" - ) - finally: - client.shutdown() - subscriber.close() + assert seq == 0, "Sequence number 
mismatch" + assert len(received.events) == 1, "We should have exactly one BlockStored event" + event = received.events[0] + assert isinstance(event, BlockStored), "We should have a BlockStored event" + assert len(event.block_hashes) == num_blocks, ( + "We should have a BlockStored event with 2 block_hashes" + ) + assert event.block_size == block_size, ( + "Block size should be the same as the block size" + ) + assert event.parent_block_hash is None, "Parent block hash should be None" + assert event.lora_id is None, "Lora id should be None" + assert len(event.token_ids) == num_blocks * block_size, ( + "Token ids should be the same as the custom tokens" + ) + assert event.token_ids == custom_tokens, ( + "Token ids should be the same as the custom tokens" + ) + finally: + client.shutdown() + subscriber.close() @pytest.mark.asyncio @@ -672,101 +657,96 @@ def test_kv_cache_events( ) @multi_gpu_test(num_gpus=4) async def test_kv_cache_events_dp( - monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool, publisher_config, ): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - block_size = 16 - num_blocks = 2 - dp_size = 2 - tp_size = 2 - - engine_args = EngineArgs( - model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - data_parallel_size=dp_size, - tensor_parallel_size=tp_size, - block_size=block_size, - ) - engine_args.kv_events_config = publisher_config + block_size = 16 + num_blocks = 2 + dp_size = 2 + tp_size = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config - vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) - executor_class = Executor.get_class(vllm_config) - with set_default_torch_num_threads(1): - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=True, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) - await asyncio.sleep(1) + executor_class = Executor.get_class(vllm_config) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) + await asyncio.sleep(1) - # Build endpoints for all DP ranks - base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - endpoints = [] - for i in range(dp_size): - offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) - endpoints.append(offset_endpoint) + # Build endpoints for all DP ranks + base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + endpoints = [] + for i in range(dp_size): + offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i) + endpoints.append(offset_endpoint) - subscriber = MockSubscriber( - endpoints, topic=publisher_config.topic, decode_type=KVEventBatch - ) + subscriber = MockSubscriber( + endpoints, topic=publisher_config.topic, decode_type=KVEventBatch + ) - try: - custom_tokens = list(range(num_blocks * block_size)) - sampling_params = SamplingParams(max_tokens=1) - all_request_ids = [] + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + all_request_ids = [] - # Create and add 25 requests - # NOTE: attempts to force routing to 
both dp groups but can be flaky - for i in range(25): - await asyncio.sleep(0.01) - request = make_request(sampling_params, custom_tokens) - await client.add_request_async(request) - all_request_ids.append(request.request_id) + # Create and add 25 requests + # NOTE: attempts to force routing to both dp groups but can be flaky + for i in range(25): + await asyncio.sleep(0.01) + request = make_request(sampling_params, custom_tokens) + await client.add_request_async(request) + all_request_ids.append(request.request_id) - await asyncio.sleep(0.1) + await asyncio.sleep(0.1) - # Initialize outputs dict for all requests - outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} + # Initialize outputs dict for all requests + outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids} - print("processing requests...") - await asyncio.wait_for( - loop_until_fully_done_async(client, outputs), timeout=20.0 - ) + print("processing requests...") + await asyncio.wait_for( + loop_until_fully_done_async(client, outputs), timeout=20.0 + ) - # Receive from subscriber until no more messages - print("collecting results...") - results = [] - while True: - result = subscriber.receive_one(timeout=1) - print(result) - if result is None: - break - results.append(result) - - # Collect all events and data_parallel_ranks from all results - all_dp_ranks = [received.data_parallel_rank for (_, received) in results] - unique_dps = set(all_dp_ranks) - assert len(unique_dps) == 2, ( - f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" - ) + # Receive from subscriber until no more messages + print("collecting results...") + results = [] + while True: + result = subscriber.receive_one(timeout=1) + print(result) + if result is None: + break + results.append(result) + + # Collect all events and data_parallel_ranks from all results + all_dp_ranks = [received.data_parallel_rank for (_, received) in results] + unique_dps = set(all_dp_ranks) + assert len(unique_dps) == 2, ( + f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" + ) - finally: - client.shutdown() - subscriber.close() + finally: + client.shutdown() + subscriber.close() @pytest.mark.timeout(20) def test_startup_failure(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m, pytest.raises(Exception) as e_info: - m.setenv("VLLM_USE_V1", "1") - # Monkey-patch to extract core process pid while it's starting. 
core_proc_pid = [None] cepm_ctor = CoreEngineProcManager.__init__ @@ -841,7 +821,6 @@ def create_mock_executor(vllm_config): mock_executor_class.side_effect = create_mock_executor with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices from vllm.v1.engine.utils import EngineZmqAddresses diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index a19ba562136f..3f6f2211556f 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -21,12 +21,10 @@ def _vllm_model( apc: bool, vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, *, skip_tokenizer_init: bool = False, ): """Set up VllmRunner instance.""" - monkeypatch.setenv("VLLM_USE_V1", "1") return vllm_runner( MODEL, dtype=DTYPE, @@ -45,16 +43,16 @@ def _vllm_model( # Prefix caching params=[False, True], ) -def vllm_model(vllm_runner, request, monkeypatch): +def vllm_model(vllm_runner, request): """VllmRunner test fixture parameterized by APC True/False.""" - with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: + with _vllm_model(request.param, vllm_runner) as vllm_model: yield vllm_model @pytest.fixture(scope="function") -def vllm_model_apc(vllm_runner, monkeypatch): +def vllm_model_apc(vllm_runner): """VllmRunner test fixture with APC.""" - with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: + with _vllm_model(True, vllm_runner) as vllm_model: yield vllm_model @@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch): # Prefix caching params=[False, True], ) -def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): +def vllm_model_skip_tokenizer_init(vllm_runner, request): """VllmRunner test fixture with APC.""" with _vllm_model( request.param, vllm_runner, - monkeypatch, skip_tokenizer_init=True, ) as vllm_model: yield vllm_model @@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ) -def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): +def test_engine_metrics(vllm_runner, example_prompts): max_tokens = 100 # Use spec decoding to test num_accepted_tokens_per_pos speculative_config = { @@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): "prompt_lookup_min": 3, "num_speculative_tokens": 5, } - monkeypatch.setenv("VLLM_USE_V1", "1") + with vllm_runner( MODEL, speculative_config=speculative_config, @@ -216,8 +213,7 @@ def find_metric(name) -> list[Metric]: @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) -def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_V1", "1") +def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization # of tokenizer and detokenizer. The generated output is expected to contain # token ids. 
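
The hunk above ends at a test whose comment explains that skip_tokenizer_init bypasses tokenizer and detokenizer setup, so generation must work purely on token ids. A minimal sketch of how that flag is typically exercised through the LLM entry point; the prompt token ids and max_tokens value below are illustrative placeholders and not taken from the test:

    from vllm import LLM, SamplingParams

    # With skip_tokenizer_init=True there is no tokenizer, so the prompt is passed
    # as token ids and detokenization must be disabled in the sampling params.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", skip_tokenizer_init=True)
    params = SamplingParams(max_tokens=8, detokenize=False)

    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, params)
    # The completion carries token ids only; no detokenized text is produced.
    assert outputs[0].outputs[0].token_ids
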
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d4c33f6cbbe2..16cdc19037ba 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -103,7 +103,6 @@ def test_guided_decoding_deprecated(): PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, ) def test_structured_output( - monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any], sample_sql_ebnf: str, @@ -115,8 +114,6 @@ def test_structured_output( model_name: str, speculative_config: dict[str, Any], ): - monkeypatch.setenv("VLLM_USE_V1", "1") - if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -620,15 +617,12 @@ def test_structured_output( ], ) def test_structured_output_with_reasoning_matrices( - monkeypatch: pytest.MonkeyPatch, backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, speculative_config: dict[str, Any] | None, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( - monkeypatch: pytest.MonkeyPatch, unsupported_json_schema: dict[str, Any], model_name: str, tokenizer_mode: str, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM( model=model_name, max_model_len=1024, @@ -739,9 +730,7 @@ def test_structured_output_auto_mode( @pytest.mark.skip_global_cleanup -def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - +def test_guidance_no_additional_properties(): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, @@ -801,12 +790,9 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) def test_structured_output_batched_with_non_structured_outputs_requests( - monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], backend: str, ): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Don't use eager execution on TPUs because we want to test for no # recompilation at runtime enforce_eager = bool(not current_platform.is_tpu()) diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh index ea125f99fc42..fa1738bb3194 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ -53,7 +53,6 @@ cleanup() { launch_baseline() { BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ @@ -73,7 +72,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ PJRT_DEVICE=TPU \ @@ -93,7 +91,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ 
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh index 8ba653770c4f..3d63822371be 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh @@ -55,7 +55,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ PJRT_DEVICE=TPU \ @@ -75,7 +74,6 @@ launch_pd() { UCX_TLS=tcp \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_LOGGING_LEVEL=DEBUG \ - VLLM_USE_V1=1 \ PJRT_DEVICE=TPU \ VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index c844330bb466..2cb5e6733b79 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest import ray @@ -10,15 +9,6 @@ from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger - -@pytest.fixture(scope="function", autouse=True) -def use_v1_only(monkeypatch): - """ - The change relies on V1 APIs, so set VLLM_USE_V1=1. - """ - monkeypatch.setenv("VLLM_USE_V1", "1") - - MODELS = [ "distilbert/distilgpt2", ] @@ -39,10 +29,6 @@ def test_engine_log_metrics_ray( @ray.remote(num_gpus=1) class EngineTestActor: async def run(self): - # Set environment variable inside the Ray actor since environment - # variables from pytest fixtures don't propagate to Ray actors - os.environ["VLLM_USE_V1"] = "1" - engine_args = AsyncEngineArgs( model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True ) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index f83bc90778b0..bda430a080f6 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs( batch_logprobs_composition: BatchLogprobsComposition, temperature: float, example_prompts: list[str], - monkeypatch: pytest.MonkeyPatch, ) -> None: """Test V1 Engine logprobs & prompt logprobs @@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching - if do_apc and ( - temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT - ): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts - - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts + + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list + ) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams( max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984, ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, max_tokens=max_tokens, + do_apc=do_apc, ) - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list - ) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams( - max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984, - ) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, - max_tokens=max_tokens, - do_apc=do_apc, - ) - - -def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): +def test_max_logprobs(): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs` APC should not matter as this test checks basic request validation. 
""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - runner = VllmRunner( - "facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.15, - max_model_len=256, - ) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.15, + max_model_len=256, + ) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): +def test_none_logprobs(vllm_model, example_prompts): """Engine should return `logprobs` and `prompt_logprobs` as `None` Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - max_tokens = 5 + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams( - max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0, - ) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params_logprobs_none, - ) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.llm.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): +def test_zero_logprobs(vllm_model, example_prompts): """Engine should return sampled token and prompt token logprobs Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - max_tokens = 5 + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams( - max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0 - ) - results_logprobs_zero = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero - ) + sampling_params_logprobs_zero = SamplingParams( + max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0 + ) + results_logprobs_zero = vllm_model.llm.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero + ) - for i in range(len(results_logprobs_zero)): - # Check 
that there is one sample logprob dict for each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) - - -def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) + + +def test_all_logprobs(example_prompts): """Engine should return all vocabulary logprobs and prompt logprobs Args: example_prompts: list of example prompts (test fixture) """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner( - "facebook/opt-125m", - max_logprobs=-1, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.15, - max_model_len=256, - ) + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=-1, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.15, + max_model_len=256, + ) - sampling_params_logprobs_all = SamplingParams( - max_tokens=5, logprobs=-1, prompt_logprobs=-1 - ) - results_logprobs_all = runner.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_all - ) - vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() + sampling_params_logprobs_all = SamplingParams( + max_tokens=5, logprobs=-1, prompt_logprobs=-1 + ) + results_logprobs_all = runner.llm.generate( + example_prompts, sampling_params=sampling_params_logprobs_all + ) + vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() - for i in range(len(results_logprobs_all)): - logprobs = results_logprobs_all[i].outputs[0].logprobs - prompt_logprobs = results_logprobs_all[i].prompt_logprobs - assert logprobs is not None - for logprob in logprobs: - assert len(logprob) == vocab_size - assert prompt_logprobs is not None - assert prompt_logprobs[0] is None - for prompt_logprob in prompt_logprobs[1:]: - assert len(prompt_logprob) == vocab_size + for i in range(len(results_logprobs_all)): + logprobs = results_logprobs_all[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_all[i].prompt_logprobs + assert logprobs is not None + for logprob in logprobs: + assert len(logprob) == vocab_size + assert prompt_logprobs is not None + assert prompt_logprobs[0] is None + for prompt_logprob in prompt_logprobs[1:]: + assert len(prompt_logprob) == vocab_size @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) -def test_logprobs_mode(logprobs_mode: 
LogprobsMode, monkeypatch: pytest.MonkeyPatch): +def test_logprobs_mode(logprobs_mode: LogprobsMode): """Test with LLM engine with different logprobs_mode. For logprobs, we should have non-positive values. For logits, we should expect at least one positive values. """ from vllm import LLM - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - llm = LLM( - "facebook/opt-125m", - max_logprobs=5, - enable_prefix_caching=False, - # 2 other llms alive during whole session - gpu_memory_utilization=0.05, - max_model_len=16, - logprobs_mode=logprobs_mode, - ) - vllm_sampling_params = SamplingParams(logprobs=1) - results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params) - - total_token_with_logprobs = 0 - positive_values = 0 - for output in results[0].outputs: - for logprobs in output.logprobs: - for token_id in logprobs: - logprob = logprobs[token_id] - if logprobs_mode in ("raw_logprobs", "processed_logprobs"): - assert logprob.logprob <= 0 - if logprob.logprob > 0: - positive_values = positive_values + 1 - total_token_with_logprobs = total_token_with_logprobs + 1 - assert total_token_with_logprobs >= len(results[0].outputs) - if logprobs_mode in ("raw_logits", "processed_logits"): - assert positive_values > 0 - del llm + llm = LLM( + "facebook/opt-125m", + max_logprobs=5, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.05, + max_model_len=16, + logprobs_mode=logprobs_mode, + ) + vllm_sampling_params = SamplingParams(logprobs=1) + results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params) + + total_token_with_logprobs = 0 + positive_values = 0 + for output in results[0].outputs: + for logprobs in output.logprobs: + for token_id in logprobs: + logprob = logprobs[token_id] + if logprobs_mode in ("raw_logprobs", "processed_logprobs"): + assert logprob.logprob <= 0 + if logprob.logprob > 0: + positive_values = positive_values + 1 + total_token_with_logprobs = total_token_with_logprobs + 1 + assert total_token_with_logprobs >= len(results[0].outputs) + if logprobs_mode in ("raw_logits", "processed_logits"): + assert positive_values > 0 + del llm diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index 24f9397cc4c6..bdde28fe0342 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest from vllm import LLM, SamplingParams -if os.getenv("VLLM_USE_V1", "0") != "1": - pytest.skip("Test package requires V1", allow_module_level=True) - MODEL = "meta-llama/Llama-3.2-1B" PROMPT = "Hello my name is Robert and I" @@ -173,14 +169,6 @@ def test_allowed_token_ids(llm): _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(llm): - """Check that we reject requests with priority.""" - - # Reject all allowed token ids - with pytest.raises(ValueError): - _ = llm.generate(PROMPT, priority=[1]) - - def test_seed(llm): """Check that seed impacts randomness.""" diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 647887812f8a..bc779f6bd9c4 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -38,7 +38,6 @@ def test_eagle_max_len( monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str ): with monkeypatch.context() as m: - 
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 1518987ded04..f3495b00d3d4 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -42,7 +42,6 @@ @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS) def test_basic( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, tensor_parallel_size: int, @@ -55,23 +54,20 @@ def test_basic( ) example_prompts = [prompt] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + with vllm_runner( + model, + # Note: max_num_batched_tokens == 1024 is needed here to + # actually test chunked prompt + max_num_batched_tokens=1024, + max_model_len=8192, + gpu_memory_utilization=0.7, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + output = vllm_outputs[0][1] - with vllm_runner( - model, - # Note: max_num_batched_tokens == 1024 is needed here to - # actually test chunked prompt - max_num_batched_tokens=1024, - max_model_len=8192, - gpu_memory_utilization=0.7, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - - assert "1024" in output or "0, 1" in output + assert "1024" in output or "0, 1" in output @pytest.mark.skip(reason="Temporarily disabled due to timeout") @@ -82,7 +78,6 @@ def test_basic( @pytest.mark.parametrize("max_num_seqs", [16]) def test_phi3( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, max_tokens: int, max_num_seqs: int, ) -> None: @@ -99,18 +94,15 @@ def test_phi3( # test head dim = 96 model = "microsoft/Phi-3-mini-128k-instruct" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) - # vllm_outputs is a list of tuples whose first element is the token id - # and the second element is the output (including the prompt). - for output, answer in zip(vllm_outputs, answers): - generated_text = output[1] - assert answer in generated_text + with vllm_runner( + model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + # vllm_outputs is a list of tuples whose first element is the token id + # and the second element is the output (including the prompt). 
+ for output, answer in zip(vllm_outputs, answers): + generated_text = output[1] + assert answer in generated_text TP_SIZE_8 = 8 @@ -123,7 +115,6 @@ def test_phi3( ) def test_gemma3_27b_with_text_input_and_tp( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, ) -> None: model = "google/gemma-3-27b-it" max_tokens = 16 @@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp( " but in rising every time we fall.", ] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, - max_num_batched_tokens=256, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) - # vllm_outputs is a list of tuples whose first element is the token id - # and the second element is the output (including the prompt). - for output, answer in zip(vllm_outputs, answers): - generated_text = output[1] - assert answer in generated_text + with vllm_runner( + model, + max_num_batched_tokens=256, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + # vllm_outputs is a list of tuples whose first element is the token id + # and the second element is the output (including the prompt). + for output, answer in zip(vllm_outputs, answers): + generated_text = output[1] + assert answer in generated_text @pytest.mark.skipif( @@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp( ) def test_w8a8_quantization( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, ) -> None: model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" max_tokens = 5 @@ -176,18 +163,15 @@ def test_w8a8_quantization( ) example_prompts = [prompt] - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - with vllm_runner( - model, - max_num_batched_tokens=64, - max_model_len=4096, - gpu_memory_utilization=0.7, - max_num_seqs=max_num_seqs, - tensor_parallel_size=tensor_parallel_size, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - - assert "1024" in output or "0, 1" in output + with vllm_runner( + model, + max_num_batched_tokens=64, + max_model_len=4096, + gpu_memory_utilization=0.7, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tensor_parallel_size, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + output = vllm_outputs[0][1] + + assert "1024" in output or "0, 1" in output diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py index e8cc396f970e..b7b6835c40cc 100644 --- a/tests/v1/tpu/test_perf.py +++ b/tests/v1/tpu/test_perf.py @@ -86,7 +86,6 @@ class TestParams: @pytest.mark.parametrize("params", TEST_PARAMS) def test_perf( vllm_runner: type[VllmRunner], - monkeypatch: pytest.MonkeyPatch, params: TestParams, ) -> None: tokenizer = get_tokenizer( @@ -107,48 +106,45 @@ def test_perf( ) ) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams( + max_tokens=params.decode_len, temperature=1.0, min_p=0.0 + ) - sampling_params = SamplingParams( - max_tokens=params.decode_len, temperature=1.0, min_p=0.0 + with vllm_runner( + params.model, + max_num_batched_tokens=MAX_MODEL_LEN, + max_model_len=MAX_MODEL_LEN, + max_num_seqs=MAX_NUM_SEQS, + gpu_memory_utilization=GPU_UTIL, + enforce_eager=False, + tensor_parallel_size=1, + ) as vllm_model: + print(" -- Warmup / Compile") + for i in 
range(NUM_WARMUPS): + _ = vllm_model.generate(prompts, sampling_params) + + print(" -- Benchmarking... ") + times = [] + for i in range(NUM_RUNS): + start_time = time.time() + _ = vllm_model.generate(prompts, sampling_params) + times.append(time.time() - start_time) + + avg_time = sum(times) / len(times) + + print(" -- avg_time = {}".format(avg_time)) + print( + " -- expected_avg_time = {} with err_tol = {}".format( + params.expected_avg_time, params.err_tol + ) ) - - with vllm_runner( - params.model, - max_num_batched_tokens=MAX_MODEL_LEN, - max_model_len=MAX_MODEL_LEN, - max_num_seqs=MAX_NUM_SEQS, - gpu_memory_utilization=GPU_UTIL, - enforce_eager=False, - tensor_parallel_size=1, - ) as vllm_model: - print(" -- Warmup / Compile") - for i in range(NUM_WARMUPS): - _ = vllm_model.generate(prompts, sampling_params) - - print(" -- Benchmarking... ") - times = [] - for i in range(NUM_RUNS): - start_time = time.time() - _ = vllm_model.generate(prompts, sampling_params) - times.append(time.time() - start_time) - - avg_time = sum(times) / len(times) - - print(" -- avg_time = {}".format(avg_time)) + diff = avg_time - params.expected_avg_time + ok = diff < params.err_tol + if diff < -params.err_tol: print( - " -- expected_avg_time = {} with err_tol = {}".format( - params.expected_avg_time, params.err_tol - ) + " !! WARNING !! Performance has improved by {}, " + "it may be necessary to fine-tune the " + "expected_avg_time = {}".format(-diff, params.expected_avg_time) ) - diff = avg_time - params.expected_avg_time - ok = diff < params.err_tol - if diff < -params.err_tol: - print( - " !! WARNING !! Performance has improved by {}, " - "it may be necessary to fine-tune the " - "expected_avg_time = {}".format(-diff, params.expected_avg_time) - ) - - assert ok, " !! ERROR !! Regression detected" + + assert ok, " !! ERROR !! Regression detected" diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py index e7767aceec55..505da4163143 100644 --- a/tests/v1/tracing/test_tracing.py +++ b/tests/v1/tracing/test_tracing.py @@ -82,7 +82,7 @@ def test_traces( ): with monkeypatch.context() as m: m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams( temperature=0.01, top_p=0.1, diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index f48b354e8a7d..299567427027 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -77,7 +77,13 @@ def warming_up_model(self) -> None: logger.info("Warming up model for the compilation...") # Only generate graph for the generic shape with _set_global_compilation_settings(self.vllm_config): - self._dummy_run(max(16, self.max_num_reqs)) + self._dummy_run( + min( + max(16, self.max_num_reqs), + self.scheduler_config.max_num_batched_tokens, + ) + ) + logger.info("Warming up done.") def _init_device_properties(self) -> None:
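
For reference, a minimal sketch (not part of the patch) of the warmup-size clamp added to cpu_model_runner.py above: the standalone helper and its name are illustrative only, while the 16-token floor and the two config values mirror what the hunk references.

# Illustrative sketch, assuming the same semantics as the patched call site:
# warm up with at least 16 requests so the generic compiled shape is still
# exercised on tiny configs, but never exceed the scheduler's token budget.
def warmup_num_tokens(max_num_reqs: int, max_num_batched_tokens: int) -> int:
    return min(max(16, max_num_reqs), max_num_batched_tokens)

# Example behavior under the assumed inputs:
assert warmup_num_tokens(max_num_reqs=4, max_num_batched_tokens=8192) == 16
assert warmup_num_tokens(max_num_reqs=256, max_num_batched_tokens=64) == 64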