Merged
44 commits
e92a8cf  [V0 Deprecation] Remove AsyncLLMEngine (WoosukKwon, Sep 17, 2025)
9a54905  Merge branch 'main' into woosuk/remove-async-llm-engine (WoosukKwon, Sep 17, 2025)
6c89e62  fix assert false (WoosukKwon, Sep 17, 2025)
63f124d  merge (WoosukKwon, Sep 17, 2025)
f63f899  [V0 Deprecation] Remove LLMEngine (WoosukKwon, Sep 17, 2025)
9186476  merge (WoosukKwon, Sep 17, 2025)
65b3990  merge (WoosukKwon, Sep 17, 2025)
51a326d  fix (WoosukKwon, Sep 17, 2025)
8c2eb56  revert (WoosukKwon, Sep 17, 2025)
7a92f17  fix test_chat (WoosukKwon, Sep 17, 2025)
d80a455  fix pp test (WoosukKwon, Sep 17, 2025)
9bb81fe  fix (WoosukKwon, Sep 17, 2025)
c855f92  rm more tests (WoosukKwon, Sep 18, 2025)
c12bc3e  fix (WoosukKwon, Sep 18, 2025)
3d7c361  fix (WoosukKwon, Sep 18, 2025)
c17fb8f  [V0 Deprecation] Remove more V0 tests (WoosukKwon, Sep 18, 2025)
9011ad2  minor (WoosukKwon, Sep 18, 2025)
2d60e15  fix (WoosukKwon, Sep 18, 2025)
7e3535c  Merge branch 'main' into woosuk/rm-more-v0-tests (WoosukKwon, Sep 18, 2025)
4e42d0c  Merge branch 'main' into woosuk/rm-more-v0-tests (WoosukKwon, Sep 18, 2025)
9df17d4  Merge branch 'woosuk/rm-more-v0-tests' into woosuk/remove-async-llm-e… (WoosukKwon, Sep 18, 2025)
4de8eda  update (WoosukKwon, Sep 18, 2025)
679bf7b  Merge branch 'main' into woosuk/remove-async-llm-engine (WoosukKwon, Sep 18, 2025)
d2cd2a4  merge (WoosukKwon, Sep 18, 2025)
c846648  merge (WoosukKwon, Sep 18, 2025)
4d356ef  rm v0 tests (WoosukKwon, Sep 18, 2025)
91dd5db  Merge branch 'main' into woosuk/remove-llm-engine (WoosukKwon, Sep 19, 2025)
277ef29  rm (WoosukKwon, Sep 19, 2025)
c05504e  minor (WoosukKwon, Sep 19, 2025)
5fe855d  merge (WoosukKwon, Sep 20, 2025)
60b94e6  Remove codeowners (WoosukKwon, Sep 20, 2025)
35c121f  fix (WoosukKwon, Sep 20, 2025)
becf74c  fix (WoosukKwon, Sep 20, 2025)
2104774  fix (WoosukKwon, Sep 20, 2025)
f60bf6c  Remove v0 output processor (WoosukKwon, Sep 20, 2025)
88d5f07  Remove V0 core (WoosukKwon, Sep 20, 2025)
a17686b  fix (WoosukKwon, Sep 20, 2025)
4603a7c  Merge branch 'woosuk/remove-llm-engine' into woosuk/rm-v0-core (WoosukKwon, Sep 20, 2025)
f2b7215  rm fp8 kv cache fallback & profiling (WoosukKwon, Sep 21, 2025)
dddbd5e  skip (WoosukKwon, Sep 21, 2025)
751107b  Merge branch 'woosuk/remove-llm-engine' into woosuk/rm-v0-core (WoosukKwon, Sep 21, 2025)
7f9a153  rm misc (WoosukKwon, Sep 21, 2025)
d9ff402  merge (WoosukKwon, Sep 21, 2025)
3dfacf9  minor (WoosukKwon, Sep 21, 2025)
3 changes: 0 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -148,7 +148,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -867,8 +866,6 @@ steps:
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
2 changes: 0 additions & 2 deletions .github/CODEOWNERS
@@ -4,10 +4,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -70,7 +70,6 @@ line-length = 80
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
@@ -117,7 +116,6 @@ files = [
"vllm/*.py",
"vllm/assets",
"vllm/entrypoints",
"vllm/core",
"vllm/inputs",
"vllm/logging_utils",
"vllm/multimodal",
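Every attention backend touched below gets the same two edits: the TYPE_CHECKING-guarded import of ModelInputForGPUBuilder (which lived in the now-removed V0 file vllm/worker/model_runner.py) is deleted, and the annotations that referenced it are dropped from the metadata builders' __init__ and _add_seq_group signatures. A condensed sketch of the resulting shape, using a hypothetical ExampleMetadataBuilder stand-in rather than any real vLLM class:

# Sketch only: ExampleMetadataBuilder is a stand-in, not a real vLLM class;
# bodies are stubbed to show the signatures after the change.

class ExampleMetadataBuilder:

    # Previously annotated as input_builder: "ModelInputForGPUBuilder"
    # behind a TYPE_CHECKING import; with model_runner.py removed, the
    # parameter is left untyped.
    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner

    # Likewise, inter_data was previously annotated as
    # "ModelInputForGPUBuilder.InterDataForSeqGroup".
    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
                       prefix_cache_hit: bool):
        ...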
12 changes: 4 additions & 8 deletions vllm/attention/backends/differential_flash_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

import torch
from einops import rearrange
@@ -34,9 +34,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -329,7 +326,7 @@ def decode_metadata(
class DifferentialFlashAttentionMetadataBuilder(
AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -350,9 +347,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
10 changes: 3 additions & 7 deletions vllm/attention/backends/dual_chunk_flash_attn.py
@@ -4,7 +4,7 @@
"""
import math
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

import torch
import torch.distributed
@@ -22,9 +22,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache, sparse_attn_func)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -224,9 +221,8 @@ def prepare(self):
super().prepare()
self.orig_seq_lens: List[int] = []

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
super()._add_seq_group(inter_data, chunked_prefill_enabled,
prefix_cache_hit)
for prompt_len, seq_len in zip(inter_data.prompt_lens,
12 changes: 4 additions & 8 deletions vllm/attention/backends/flash_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
from typing import Dict, List, Optional, Tuple, Type

import torch

@@ -31,9 +31,6 @@
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
flash_attn_with_kvcache)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

logger = init_logger(__name__)


@@ -312,7 +309,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
class FlashAttentionMetadataBuilder(
AttentionMetadataBuilder[FlashAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -332,9 +329,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
13 changes: 4 additions & 9 deletions vllm/attention/backends/mla/common.py
@@ -193,8 +193,7 @@
from contextlib import contextmanager
from dataclasses import dataclass
from itertools import accumulate
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
Type, TypeVar)
from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar

import torch

@@ -233,9 +232,6 @@
except ImportError:
flash_attn_varlen_func = None

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder

is_hip = current_platform.is_rocm()


@@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
"""
BLOCK_TABLE_EXTENDER: list[list[int]] = []

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner
self.sliding_window = input_builder.sliding_window
@@ -668,9 +664,8 @@ def prepare(self):
self.num_decode_tokens = 0
self.has_prefix_cache_hit = False

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
prefix_cache_hit: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
2. block table.
11 changes: 3 additions & 8 deletions vllm/attention/backends/placeholder_attn.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from dataclasses import dataclass
from itertools import accumulate
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
from typing import Dict, List, Optional, Tuple, Type

import torch

@@ -13,9 +13,6 @@
AttentionMetadataBuilder)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.multimodal import MultiModalPlaceholderMap

if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder)
from vllm.utils import async_tensor_h2d

# Placeholder attention backend for models like Mamba and pooling models that
@@ -204,7 +201,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
class PlaceholderAttentionMetadataBuilder(
AttentionMetadataBuilder[PlaceholderAttentionMetadata]):

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):

self.input_builder = input_builder
self.runner = input_builder.runner
@@ -220,9 +217,7 @@ def prepare(self):
self.num_prefill_tokens = 0
self.num_decode_tokens = 0

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
"""Add a sequence group to the metadata. Specifically update/append
1. context length.
"""
7 changes: 2 additions & 5 deletions vllm/attention/backends/rocm_aiter_mla.py
@@ -3,7 +3,7 @@

from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional, Type, Union
from typing import Optional, Type, Union

import torch

@@ -19,9 +19,6 @@
from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd,
get_aiter_mla_metadata)

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder


def is_aiter_mla_enabled() -> bool:
return envs.VLLM_ROCM_USE_AITER \
@@ -110,7 +107,7 @@ def decode_metadata(self):
class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
BLOCK_TABLE_EXTENDER: list[list[int]] = [[]]

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
super().__init__(input_builder)
assert self.block_size == 1, "AITER MLA requires only block size 1."

9 changes: 2 additions & 7 deletions vllm/attention/backends/utils.py
@@ -35,9 +35,6 @@
# if we have at least this many elements. Could be tuned further.
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256

if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder


def is_block_tables_empty(block_tables: Union[None, Dict]):
"""
@@ -129,7 +126,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

_metadata_cls: Type[TAttentionMetadata]

def __init__(self, input_builder: "ModelInputForGPUBuilder"):
def __init__(self, input_builder):
self.input_builder = input_builder
self.runner = input_builder.runner

@@ -149,9 +146,7 @@ def prepare(self):
self.num_prefill_tokens = 0
self.num_decode_tokens = 0

def _add_seq_group(
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
chunked_prefill_enabled: bool):
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
is_prompt = inter_data.is_prompt
block_tables = inter_data.block_tables

Empty file removed vllm/core/__init__.py
Empty file removed vllm/core/block/__init__.py