@@ -10,7 +10,7 @@
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.executor.abstract import UniProcExecutor
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 MODEL_REF = "facebook/opt-125m"
 
1 change: 0 additions & 1 deletion tools/pre_commit/check_pickle_imports.py
@@ -36,7 +36,6 @@
     'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py',
     'benchmarks/cutlass_benchmarks/sparse_benchmarks.py',
     # cloudpickle
-    'vllm/worker/worker_base.py',
     'vllm/executor/mp_distributed_executor.py',
     'vllm/executor/ray_distributed_executor.py',
     'vllm/entrypoints/llm.py',
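Note: with every call site in this PR now importing from `vllm.v1.worker.worker_base`, the `vllm/worker/worker_base.py` entry in the cloudpickle allowlist above is stale, which is presumably why the hunk drops it outright rather than moving it to the v1 path.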
10 changes: 5 additions & 5 deletions vllm/executor/executor_base.py
@@ -19,7 +19,7 @@
 from vllm.tasks import SupportedTask
 from vllm.utils import make_async
 from vllm.v1.outputs import PoolerOutput, SamplerOutput
-from vllm.worker.worker_base import WorkerBase
+from vllm.v1.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
 
@@ -30,7 +30,7 @@ class ExecutorBase(ABC):
     """Base class for all executors.
 
     An executor is responsible for executing the model on one device,
-    or it can be a distributed executor 
+    or it can be a distributed executor
     that can execute the model on multiple devices.
     """
 
@@ -83,7 +83,7 @@ def collective_rpc(self,
 
         Returns:
             A list containing the results from each worker.
-        
+
         Note:
             It is recommended to use this API to only pass control messages,
             and set up data-plane communication to pass data.
@@ -100,7 +100,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
 
         Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
         `num_gpu_blocks` are blocks that are "active" on the device and can be
-        appended to. 
+        appended to.
         `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
         appended to.
         """
@@ -327,7 +327,7 @@ def _run_workers(
         run only in the remote TP workers, not the driver worker.
         It will also be run asynchronously and return a list of futures
         rather than blocking on the results.
-        
+
         # TODO: simplify and merge with collective_rpc
         """
         raise NotImplementedError
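For orientation, a minimal sketch of the `collective_rpc` contract described in the docstring above, assuming an already-constructed `ExecutorBase` subclass instance named `executor`; the worker method name `echo_rank` is hypothetical and not part of this PR:

    # Hedged sketch: broadcast a small control message to every worker and
    # collect one result per worker, as the docstring recommends.
    results = executor.collective_rpc("echo_rank")
    assert len(results) == executor.parallel_config.world_size
    # Large tensors should travel over a separate data-plane channel, not RPC.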
2 changes: 1 addition & 1 deletion vllm/executor/ray_utils.py
@@ -16,7 +16,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
10 changes: 5 additions & 5 deletions vllm/executor/uniproc_executor.py
@@ -19,7 +19,7 @@
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import AsyncModelRunnerOutput
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
@@ -160,10 +160,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         Determine the number of available KV blocks.
         Add an additional all_reduce to get the min across all ranks.
-        Note that even if we have the same `gpu_memory_utilization` and 
-        `swap_space`, the available memory in every rank might still 
-        differ because NCCL can take different amounts of memory in 
-        different ranks. Therefore, it is necessary to test if all ranks 
+        Note that even if we have the same `gpu_memory_utilization` and
+        `swap_space`, the available memory in every rank might still
+        differ because NCCL can take different amounts of memory in
+        different ranks. Therefore, it is necessary to test if all ranks
         agree on the same KV cache configuration.
         """
         a, b = super().determine_num_available_blocks()
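The cross-rank agreement step the docstring describes can be pictured with a small sketch. This assumes `torch.distributed` is already initialized; the actual code may route through vLLM's own distributed wrappers rather than calling it directly:

    import torch
    import torch.distributed as dist

    def min_blocks_across_ranks(num_gpu_blocks: int,
                                num_cpu_blocks: int) -> tuple[int, int]:
        # Every rank proposes its own block counts; an all_reduce with MIN
        # leaves each rank holding the smallest proposal, so all ranks end
        # up agreeing on one KV cache configuration.
        t = torch.tensor([num_gpu_blocks, num_cpu_blocks], dtype=torch.int64)
        dist.all_reduce(t, op=dist.ReduceOp.MIN)
        return int(t[0]), int(t[1])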
12 changes: 1 addition & 11 deletions vllm/platforms/cuda.py
@@ -110,17 +110,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         model_config = vllm_config.model_config
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not envs.VLLM_USE_V1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if envs.VLLM_USE_V1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
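The net effect of the cuda.py change is that worker selection no longer branches on `VLLM_USE_V1` or on the presence of a speculative config. A hedged, standalone restatement of the new resolution rule (`resolve_worker_cls` is an illustrative helper, not a function in the PR):

    def resolve_worker_cls(worker_cls: str) -> str:
        # After this PR, "auto" on CUDA always resolves to the v1 GPU worker;
        # an explicitly configured worker class is left untouched.
        if worker_cls == "auto":
            return "vllm.v1.worker.gpu_worker.Worker"
        return worker_cls

The rocm.py hunk below applies the identical simplification.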
12 changes: 1 addition & 11 deletions vllm/platforms/rocm.py
@@ -327,17 +327,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             cache_config.block_size = 16
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not use_v1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if use_v1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
         # Aiter rms norm perform best when CUDA Graph capture is enabled.
         if (use_v1 and use_aiter_rms_norm and not is_eager_execution
                 and "-rms_norm" not in compilation_config.custom_ops):
4 changes: 2 additions & 2 deletions vllm/v1/executor/multiproc_executor.py
@@ -41,7 +41,7 @@
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
                              ModelRunnerOutput)
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
@@ -702,7 +702,7 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
 
 def set_multiprocessing_worker_envs():
     """ Set up environment variables that should be used when there are workers
-    in a multiprocessing environment. This should be called by the parent 
+    in a multiprocessing environment. This should be called by the parent
     process before worker processes are created"""
 
     _maybe_force_spawn()