Merged
44 commits
e92a8cf  [V0 Deprecation] Remove AsyncLLMEngine (WoosukKwon, Sep 17, 2025)
9a54905  Merge branch 'main' into woosuk/remove-async-llm-engine (WoosukKwon, Sep 17, 2025)
6c89e62  fix assert false (WoosukKwon, Sep 17, 2025)
63f124d  merge (WoosukKwon, Sep 17, 2025)
f63f899  [V0 Deprecation] Remove LLMEngine (WoosukKwon, Sep 17, 2025)
9186476  merge (WoosukKwon, Sep 17, 2025)
65b3990  merge (WoosukKwon, Sep 17, 2025)
51a326d  fix (WoosukKwon, Sep 17, 2025)
8c2eb56  revert (WoosukKwon, Sep 17, 2025)
7a92f17  fix test_chat (WoosukKwon, Sep 17, 2025)
d80a455  fix pp test (WoosukKwon, Sep 17, 2025)
9bb81fe  fix (WoosukKwon, Sep 17, 2025)
c855f92  rm more tests (WoosukKwon, Sep 18, 2025)
c12bc3e  fix (WoosukKwon, Sep 18, 2025)
3d7c361  fix (WoosukKwon, Sep 18, 2025)
c17fb8f  [V0 Deprecation] Remove more V0 tests (WoosukKwon, Sep 18, 2025)
9011ad2  minor (WoosukKwon, Sep 18, 2025)
2d60e15  fix (WoosukKwon, Sep 18, 2025)
7e3535c  Merge branch 'main' into woosuk/rm-more-v0-tests (WoosukKwon, Sep 18, 2025)
4e42d0c  Merge branch 'main' into woosuk/rm-more-v0-tests (WoosukKwon, Sep 18, 2025)
9df17d4  Merge branch 'woosuk/rm-more-v0-tests' into woosuk/remove-async-llm-e… (WoosukKwon, Sep 18, 2025)
4de8eda  update (WoosukKwon, Sep 18, 2025)
679bf7b  Merge branch 'main' into woosuk/remove-async-llm-engine (WoosukKwon, Sep 18, 2025)
d2cd2a4  merge (WoosukKwon, Sep 18, 2025)
c846648  merge (WoosukKwon, Sep 18, 2025)
4d356ef  rm v0 tests (WoosukKwon, Sep 18, 2025)
91dd5db  Merge branch 'main' into woosuk/remove-llm-engine (WoosukKwon, Sep 19, 2025)
277ef29  rm (WoosukKwon, Sep 19, 2025)
c05504e  minor (WoosukKwon, Sep 19, 2025)
5fe855d  merge (WoosukKwon, Sep 20, 2025)
60b94e6  Remove codeowners (WoosukKwon, Sep 20, 2025)
35c121f  fix (WoosukKwon, Sep 20, 2025)
becf74c  fix (WoosukKwon, Sep 20, 2025)
2104774  fix (WoosukKwon, Sep 20, 2025)
f60bf6c  Remove v0 output processor (WoosukKwon, Sep 20, 2025)
88d5f07  Remove V0 core (WoosukKwon, Sep 20, 2025)
a17686b  fix (WoosukKwon, Sep 20, 2025)
4603a7c  Merge branch 'woosuk/remove-llm-engine' into woosuk/rm-v0-core (WoosukKwon, Sep 20, 2025)
f2b7215  rm fp8 kv cache fallback & profiling (WoosukKwon, Sep 21, 2025)
dddbd5e  skip (WoosukKwon, Sep 21, 2025)
751107b  Merge branch 'woosuk/remove-llm-engine' into woosuk/rm-v0-core (WoosukKwon, Sep 21, 2025)
7f9a153  rm misc (WoosukKwon, Sep 21, 2025)
d9ff402  merge (WoosukKwon, Sep 21, 2025)
3dfacf9  minor (WoosukKwon, Sep 21, 2025)
3 changes: 0 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -148,7 +148,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -867,8 +866,6 @@ steps:
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
2 changes: 0 additions & 2 deletions .github/CODEOWNERS
@@ -4,10 +4,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -70,7 +70,6 @@ line-length = 80
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
@@ -117,7 +116,6 @@ files = [
"vllm/*.py",
"vllm/assets",
"vllm/entrypoints",
"vllm/core",
"vllm/inputs",
"vllm/logging_utils",
"vllm/multimodal",
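Every attention backend touched below gets the same two edits: the TYPE_CHECKING-guarded import of ModelInputForGPUBuilder (which lived in the now-removed V0 file vllm/worker/model_runner.py) is deleted, and the annotations that referenced it are dropped from the metadata builders' __init__ and _add_seq_group signatures. A condensed sketch of the resulting shape, using a hypothetical ExampleMetadataBuilder stand-in rather than any real vLLM class:

# Sketch only: ExampleMetadataBuilder is a stand-in, not a real vLLM class;
# bodies are stubbed to show the signatures after the change.

class ExampleMetadataBuilder:

    # Previously annotated as input_builder: "ModelInputForGPUBuilder"
    # behind a TYPE_CHECKING import; with model_runner.py removed, the
    # parameter is left untyped.
    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner

    # Likewise, inter_data was previously annotated as
    # "ModelInputForGPUBuilder.InterDataForSeqGroup".
    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
                       prefix_cache_hit: bool):
        ...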
12 changes: 4 additions & 8 deletions vllm/attention/backends/differential_flash_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

import torch
from einops import rearrange
@@ -34,9 +34,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -329,7 +326,7 @@ def decode_metadata(
class DifferentialFlashAttentionMetadataBuilder(
AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -350,9 +347,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
10 changes: 3 additions & 7 deletions vllm/attention/backends/dual_chunk_flash_attn.py
@@ -4,7 +4,7 @@
"""
import math
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

import torch
import torch.distributed
@@ -22,9 +22,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache, sparse_attn_func)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -224,9 +221,8 @@ def prepare(self):
super().prepare()
self.orig_seq_lens: List[int] = []

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
super()._add_seq_group(inter_data, chunked_prefill_enabled,
prefix_cache_hit)
for prompt_len, seq_len in zip(inter_data.prompt_lens,
12 changes: 4 additions & 8 deletions vllm/attention/backends/flash_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
from typing import Dict, List, Optional, Tuple, Type

import torch

@@ -31,9 +31,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -312,7 +309,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
class FlashAttentionMetadataBuilder(
AttentionMetadataBuilder[FlashAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -332,9 +329,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
13 changes: 4 additions & 9 deletions vllm/attention/backends/mla/common.py
@@ -193,8 +193,7 @@
from contextlib import contextmanager
from dataclasses import dataclass
from itertools import accumulate
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
Type, TypeVar)
from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar

import torch

@@ -233,9 +232,6 @@
except ImportError:
flash_attn_varlen_func = None

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

is_hip = current_platform.is_rocm()


@@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
"""
BLOCK_TABLE_EXTENDER: list[list[int]] = []

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -668,9 +664,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
11 changes: 3 additions & 8 deletions vllm/attention/backends/placeholder_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
from typing import Dict, List, Optional, Tuple, Type

import torch

@@ -13,9 +13,6 @@
AttentionMetadataBuilder)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.multimodal import MultiModalPlaceholderMap

if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder)
from vllm.utils import async_tensor_h2d

# Placeholder attention backend for models like Mamba and pooling models that
@@ -204,7 +201,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
class PlaceholderAttentionMetadataBuilder(
AttentionMetadataBuilder[PlaceholderAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):

self.input_builder = input_builder
self.runner = input_builder.runner
@@ -220,9 +217,7 @@ def prepare(self):
self.num_prefill_tokens = 0
self.num_decode_tokens = 0

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
"""
7 changes: 2 additions & 5 deletions vllm/attention/backends/rocm_aiter_mla.py
@@ -3,7 +3,7 @@

from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional, Type, Union
from typing import Optional, Type, Union

import torch

@@ -19,9 +19,6 @@
from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd,
get_aiter_mla_metadata)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder


def is_aiter_mla_enabled() -> bool:
return envs.VLLM_ROCM_USE_AITER \
@@ -110,7 +107,7 @@ def decode_metadata(self):
class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
BLOCK_TABLE_EXTENDER: list[list[int]] = [[]]

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
super().__init__(input_builder)
assert self.block_size == 1, "AITER MLA requires only block size 1."

9 changes: 2 additions & 7 deletions vllm/attention/backends/utils.py
@@ -35,9 +35,6 @@
# if we have at least this many elements. Could be tuned further.
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder


def is_block_tables_empty(block_tables: Union[None, Dict]):
"""
@@ -129,7 +126,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

_metadata_cls: Type[TAttentionMetadata]

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner

@@ -149,9 +146,7 @@ def prepare(self):
self.num_prefill_tokens = 0
self.num_decode_tokens = 0

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
is_prompt = inter_data.is_prompt
block_tables = inter_data.block_tables

Empty file removed vllm/core/__init__.py
Empty file removed vllm/core/block/__init__.py