@@ -10,7 +10,7 @@
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.executor.abstract import UniProcExecutor
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 MODEL_REF = "facebook/opt-125m"
 
1 change: 0 additions & 1 deletion tools/pre_commit/check_pickle_imports.py
@@ -36,7 +36,6 @@
     'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py',
     'benchmarks/cutlass_benchmarks/sparse_benchmarks.py',
     # cloudpickle
-    'vllm/worker/worker_base.py',
     'vllm/executor/mp_distributed_executor.py',
     'vllm/executor/ray_distributed_executor.py',
     'vllm/entrypoints/llm.py',
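Note: with every call site in this PR now importing from `vllm.v1.worker.worker_base`, the `vllm/worker/worker_base.py` entry in the cloudpickle allowlist above is stale, which is presumably why the hunk drops it outright rather than moving it to the v1 path.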
10 changes: 5 additions & 5 deletions vllm/executor/executor_base.py
@@ -19,7 +19,7 @@
 from vllm.tasks import SupportedTask
 from vllm.utils import make_async
 from vllm.v1.outputs import PoolerOutput, SamplerOutput
-from vllm.worker.worker_base import WorkerBase
+from vllm.v1.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
 
@@ -30,7 +30,7 @@ class ExecutorBase(ABC):
     """Base class for all executors.
 
     An executor is responsible for executing the model on one device,
-    or it can be a distributed executor 
+    or it can be a distributed executor
     that can execute the model on multiple devices.
     """
 
@@ -83,7 +83,7 @@ def collective_rpc(self,
 
         Returns:
             A list containing the results from each worker.
-        
+
         Note:
             It is recommended to use this API to only pass control messages,
             and set up data-plane communication to pass data.
@@ -100,7 +100,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
 
         Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
         `num_gpu_blocks` are blocks that are "active" on the device and can be
-        appended to. 
+        appended to.
         `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
         appended to.
         """
@@ -327,7 +327,7 @@ def _run_workers(
         run only in the remote TP workers, not the driver worker.
         It will also be run asynchronously and return a list of futures
         rather than blocking on the results.
-        
+
         # TODO: simplify and merge with collective_rpc
         """
         raise NotImplementedError
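For orientation, a minimal sketch of the `collective_rpc` contract described in the docstring above, assuming an already-constructed `ExecutorBase` subclass instance named `executor`; the worker method name `echo_rank` is hypothetical and not part of this PR:

    # Hedged sketch: broadcast a small control message to every worker and
    # collect one result per worker, as the docstring recommends.
    results = executor.collective_rpc("echo_rank")
    assert len(results) == executor.parallel_config.world_size
    # Large tensors should travel over a separate data-plane channel, not RPC.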
2 changes: 1 addition & 1 deletion vllm/executor/ray_utils.py
@@ -16,7 +16,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
10 changes: 5 additions & 5 deletions vllm/executor/uniproc_executor.py
@@ -19,7 +19,7 @@
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import AsyncModelRunnerOutput
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
@@ -160,10 +160,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         Determine the number of available KV blocks.
         Add an additional all_reduce to get the min across all ranks.
-        Note that even if we have the same `gpu_memory_utilization` and 
-        `swap_space`, the available memory in every rank might still 
-        differ because NCCL can take different amounts of memory in 
-        different ranks. Therefore, it is necessary to test if all ranks 
+        Note that even if we have the same `gpu_memory_utilization` and
+        `swap_space`, the available memory in every rank might still
+        differ because NCCL can take different amounts of memory in
+        different ranks. Therefore, it is necessary to test if all ranks
         agree on the same KV cache configuration.
         """
         a, b = super().determine_num_available_blocks()
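The cross-rank agreement step the docstring describes can be pictured with a small sketch. This assumes `torch.distributed` is already initialized; the actual code may route through vLLM's own distributed wrappers rather than calling it directly:

    import torch
    import torch.distributed as dist

    def min_blocks_across_ranks(num_gpu_blocks: int,
                                num_cpu_blocks: int) -> tuple[int, int]:
        # Every rank proposes its own block counts; an all_reduce with MIN
        # leaves each rank holding the smallest proposal, so all ranks end
        # up agreeing on one KV cache configuration.
        t = torch.tensor([num_gpu_blocks, num_cpu_blocks], dtype=torch.int64)
        dist.all_reduce(t, op=dist.ReduceOp.MIN)
        return int(t[0]), int(t[1])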
12 changes: 1 addition & 11 deletions vllm/platforms/cuda.py
@@ -110,17 +110,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         model_config = vllm_config.model_config
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not envs.VLLM_USE_V1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if envs.VLLM_USE_V1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
 
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
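The net effect of the cuda.py change is that worker selection no longer branches on `VLLM_USE_V1` or on the presence of a speculative config. A hedged, standalone restatement of the new resolution rule (`resolve_worker_cls` is an illustrative helper, not a function in the PR):

    def resolve_worker_cls(worker_cls: str) -> str:
        # After this PR, "auto" on CUDA always resolves to the v1 GPU worker;
        # an explicitly configured worker class is left untouched.
        if worker_cls == "auto":
            return "vllm.v1.worker.gpu_worker.Worker"
        return worker_cls

The rocm.py hunk below applies the identical simplification.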
12 changes: 1 addition & 11 deletions vllm/platforms/rocm.py
@@ -327,17 +327,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             cache_config.block_size = 16
 
         if parallel_config.worker_cls == "auto":
-            if vllm_config.speculative_config:
-                if not use_v1:
-                    raise NotImplementedError(
-                        "Speculative decoding is not supported on vLLM V0.")
-                parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
-            else:
-                if use_v1:
-                    parallel_config.worker_cls = \
-                        "vllm.v1.worker.gpu_worker.Worker"
-                else:
-                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
         # Aiter rms norm perform best when CUDA Graph capture is enabled.
         if (use_v1 and use_aiter_rms_norm and not is_eager_execution
                 and "-rms_norm" not in compilation_config.custom_ops):
4 changes: 2 additions & 2 deletions vllm/v1/executor/multiproc_executor.py
@@ -41,7 +41,7 @@
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
                              ModelRunnerOutput)
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.v1.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
@@ -702,7 +702,7 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
 
 def set_multiprocessing_worker_envs():
     """ Set up environment variables that should be used when there are workers
-    in a multiprocessing environment. This should be called by the parent 
+    in a multiprocessing environment. This should be called by the parent
     process before worker processes are created"""
 
     _maybe_force_spawn()