From 43927e7104959065a08b8932f74ec3e2c04be623 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 09:58:14 +0000 Subject: [PATCH 1/8] Init Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 16 +++++----- tests/lora/test_lora_checkpoints.py | 21 +++++--------- tests/lora/test_lora_huggingface.py | 10 ++----- tests/lora/test_lora_manager.py | 9 ++---- vllm/lora/models.py | 29 ++++++------------- vllm/lora/worker_manager.py | 6 +--- vllm/model_executor/models/apertus.py | 1 - vllm/model_executor/models/bamba.py | 1 - vllm/model_executor/models/exaone.py | 1 - vllm/model_executor/models/exaone4.py | 1 - vllm/model_executor/models/falcon_h1.py | 1 - vllm/model_executor/models/granite.py | 1 - vllm/model_executor/models/granitemoe.py | 1 - .../model_executor/models/granitemoehybrid.py | 1 - .../model_executor/models/granitemoeshared.py | 1 - vllm/model_executor/models/interfaces.py | 6 +--- vllm/model_executor/models/jamba.py | 1 - vllm/model_executor/models/lfm2.py | 1 - vllm/model_executor/models/lfm2_moe.py | 1 - vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/minicpm.py | 1 - vllm/model_executor/models/minicpm_eagle.py | 1 - vllm/model_executor/models/minicpmv.py | 1 - vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/nemotron.py | 1 - vllm/model_executor/models/nemotron_h.py | 1 - vllm/model_executor/models/nemotron_nas.py | 1 - vllm/model_executor/models/phimoe.py | 1 - vllm/model_executor/models/solar.py | 1 - .../models/transformers/base.py | 1 - vllm/v1/worker/lora_model_runner_mixin.py | 1 - 31 files changed, 31 insertions(+), 90 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 9d38ec542279..aa8c36c9a599 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -161,14 +161,8 @@ def llama_2_7b_base_huggingface_id(): @pytest.fixture(scope="session") -def sql_lora_huggingface_id(): - # huggingface repo id is used to test lora runtime downloading. - return "yard1/llama-2-7b-sql-lora-test" - - -@pytest.fixture(scope="session") -def sql_lora_files(sql_lora_huggingface_id): - return snapshot_download(repo_id=sql_lora_huggingface_id) +def sql_lora_files(): + return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") @pytest.fixture(scope="session") @@ -255,6 +249,12 @@ def qwen3_lora_files(): return snapshot_download(repo_id="charent/self_cognition_Alice") +@pytest.fixture(scope="session") +def qwen3_lora_huggingface_id(): + # huggingface repo id is used to test lora runtime downloading. + return "charent/self_cognition_Alice" + + @pytest.fixture(scope="session") def llama32_lora_files(): return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 2219d470e91a..60dd8f43109d 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -26,8 +26,7 @@ def test_load_checkpoints( chatglm3_lora_files, ): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping - embedding_modules = BaiChuanBaseForCausalLM.embedding_modules - embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: @@ -46,8 +45,7 @@ def test_load_checkpoints( peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, + model_vocab_size=64000, ) elif lora_name == "baichuan7B-zero": # Test that the target_modules contain prefix @@ -62,8 +60,7 @@ def test_load_checkpoints( peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, + model_vocab_size=64000, ) elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, @@ -77,8 +74,7 @@ def test_load_checkpoints( peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, + model_vocab_size=64000, ) else: # For the baichuan7B model, load chatglm3-6b's LoRA, @@ -94,15 +90,13 @@ def test_load_checkpoints( peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, + model_vocab_size=64000, ) def test_lora_weights_mapping(baichuan_lora_files): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping - embedding_modules = BaiChuanBaseForCausalLM.embedding_modules - embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: @@ -127,8 +121,7 @@ def test_lora_weights_mapping(baichuan_lora_files): peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, + model_vocab_size=64000, weights_mapper=hf_to_vllm_mapper, ) for name in lora_model.loras: diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 7d20faef541a..7ffffa59d1da 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -6,10 +6,10 @@ from vllm.lora.models import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path -from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM # Provide absolute path and huggingface lora ids -lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"] +lora_fixture_name = ["qwen3_lora_files", "qwen3_lora_huggingface_id"] LLAMA_LORA_MODULES = [ "qkv_proj", "o_proj", @@ -23,9 +23,7 @@ @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name) def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_name = request.getfixturevalue(lora_fixture_name) - packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping - embedding_modules = LlamaForCausalLM.embedding_modules - embed_padding_modules = LlamaForCausalLM.embedding_padding_modules + packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping expected_lora_modules: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: @@ -43,8 +41,6 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): peft_helper=peft_helper, lora_model_id=1, device="cpu", - embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules, ) # Assertions to ensure the model is loaded correctly diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 24d4dfca46d6..63b366e7b9c3 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -34,7 +34,6 @@ "lm_head": "output_embeddings", } -EMBEDDING_PADDING_MODULES = ["lm_head"] DEVICES = ( [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -57,8 +56,6 @@ def test_from_lora_tensors(sql_lora_files, device): tensors, peft_helper=peft_helper, device=device, - embedding_modules=EMBEDDING_MODULES, - embedding_padding_modules=EMBEDDING_PADDING_MODULES, ) for module_name, lora in lora_model.loras.items(): assert lora.module_name == module_name @@ -430,7 +427,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 worker_adapter_manager = LRUCacheWorkerLoRAManager( - vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES + vllm_config, device, EMBEDDING_MODULES ) worker_adapter_manager.max_num_seqs = 4 @@ -533,9 +530,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 - worker_adapter_manager = WorkerLoRAManager( - vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES - ) + worker_adapter_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES) worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size worker_adapter_manager.create_lora_manager(dummy_model_gate_up) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 636f062feb7b..ce3e12fbccde 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -112,9 +112,7 @@ def from_lora_tensors( peft_helper: PEFTHelper, device: str = "cuda", dtype: torch.dtype | None = None, - target_embedding_padding: int | None = None, - embedding_modules: dict[str, str] | None = None, - embedding_padding_modules: list[str] | None = None, + model_vocab_size: int | None = None, weights_mapper: WeightsMapper | None = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" @@ -132,22 +130,17 @@ def from_lora_tensors( ) if is_lora_a: + if "lora_embedding_A" in tensor_name and model_vocab_size is not None: + assert model_vocab_size == tensor.shape[1], ( + f"The embedding LoRA size({tensor.shape[1]}) must be consistent" + f" with the base model's vocabulary size({model_vocab_size})." + ) loras[module_name].lora_a = tensor.to(device=device, dtype=dtype) if pin_memory: loras[module_name].lora_a = loras[module_name].lora_a.pin_memory() else: loras[module_name].lora_b = tensor.to(device=device, dtype=dtype) - assert embedding_padding_modules is not None - if ( - any(name in module_name for name in embedding_padding_modules) - and target_embedding_padding is not None - ): - lora_b = loras[module_name].lora_b - assert target_embedding_padding >= lora_b.shape[0] - addition = target_embedding_padding - lora_b.shape[0] - loras[module_name].lora_b = torch.nn.functional.pad( - lora_b, (0, 0, 0, addition) - ) + if pin_memory: loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() @@ -166,9 +159,7 @@ def from_local_checkpoint( lora_model_id: int | None = None, device: str = "cuda", dtype: torch.dtype | None = None, - target_embedding_padding: int | None = None, - embedding_modules: dict[str, str] | None = None, - embedding_padding_modules: list[str] | None = None, + model_vocab_size: int | None = None, weights_mapper: WeightsMapper | None = None, tensorizer_config_dict: dict | None = None, ) -> "LoRAModel": @@ -292,9 +283,7 @@ def check_unexpected_modules(modules: dict): peft_helper=peft_helper, device=device, dtype=dtype, - target_embedding_padding=target_embedding_padding, - embedding_modules=embedding_modules, - embedding_padding_modules=embedding_padding_modules, + model_vocab_size=model_vocab_size, weights_mapper=weights_mapper, ) diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4cc201a6414f..8125428d5b20 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -34,12 +34,10 @@ def __init__( vllm_config: VllmConfig, device: torch.device, embedding_modules: dict[str, str], - embedding_padding_modules: list[str], lora_model_cls: type[LoRAModel] = LoRAModel, ): self._lora_model_cls = lora_model_cls self.embedding_modules = embedding_modules - self.embedding_padding_modules = embedding_padding_modules self._cached_dummy_lora: None | Literal[False] | LoRAModel = False self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs self.max_num_batched_tokens = ( @@ -121,9 +119,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size, - embedding_modules=self.embedding_modules, - embedding_padding_modules=self.embedding_padding_modules, + model_vocab_size=self.vocab_size, tensorizer_config_dict=lora_request.tensorizer_config_dict, weights_mapper=hf_to_vllm_mapper, ) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index f38b09bf5506..4a69787af55e 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -482,7 +482,6 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__( self, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 4422bb5da98f..1d6493b18c34 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -419,7 +419,6 @@ class BambaForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 99002baa8752..acf651ed2498 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -457,7 +457,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "wte": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 9d2c67d6c4f8..cb710a7ec5cf 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -450,7 +450,6 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 9433f0d1b4a4..83ceb9303cfb 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -510,7 +510,6 @@ class FalconH1ForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index eac9ef9478a6..76519c4660f1 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -400,7 +400,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 02c6c5862141..b038400a1262 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -497,7 +497,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 9d5eeef198a6..1d9c2f5df4a5 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -601,7 +601,6 @@ class GraniteMoeHybridForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index fd346db7e35a..8ad5a7105bb5 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -263,7 +263,6 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6f6ce32538b7..3e31fea8f754 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -336,10 +336,8 @@ class SupportsLoRA(Protocol): There is no need to redefine this flag if this class is in the MRO of your model class. """ - # The `embedding_module` and `embedding_padding_modules` - # are empty by default. + # The `embedding_module` is empty by default. embedding_modules: ClassVar[dict[str, str]] = {} - embedding_padding_modules: ClassVar[list[str]] = [] packed_modules_mapping: dict[str, list[str]] = {} @@ -351,7 +349,6 @@ class _SupportsLoRAType(Protocol): packed_modules_mapping: dict[str, list[str]] embedding_modules: dict[str, str] - embedding_padding_modules: list[str] @overload @@ -371,7 +368,6 @@ def supports_lora( lora_attrs = ( "packed_modules_mapping", "embedding_modules", - "embedding_padding_modules", ) missing_attrs = tuple(attr for attr in lora_attrs if not hasattr(model, attr)) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 3a2c98c73dab..b2ad12be1e35 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -480,7 +480,6 @@ class JambaForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 69615f8b6a09..a4a994f97a2f 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -422,7 +422,6 @@ class Lfm2ForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index aaeb2cc38999..c8669de72dd0 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -602,7 +602,6 @@ class Lfm2MoeForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6dfbde7a17f5..8f5a967cd422 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -528,7 +528,6 @@ class LlamaForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] # Mistral/Llama models can also be loaded with --load-format mistral # from consolidated.safetensors checkpoints diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 67911ba8c1c8..67c462f4b25c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -568,7 +568,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index e6bccfcac4f1..9f3587a6d2fa 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -305,7 +305,6 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2ac97764dd34..6d0ebf5c9825 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1741,5 +1741,4 @@ def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): # so update values before init is called cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) cls.embedding_modules.update(instance_cls.embedding_modules) - cls.embedding_padding_modules += instance_cls.embedding_padding_modules return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e21656dbd635..50ec57e7a805 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -496,7 +496,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 93ad2064a2fc..ffba6c9dfe73 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -439,7 +439,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8675eff59222..baeb901bbb05 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -713,7 +713,6 @@ class NemotronHForCausalLM( "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] @classmethod def get_mamba_state_dtype_from_config( diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 34ea2945b711..9d968dee8711 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -387,7 +387,6 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] # Mistral/Llama models can also be loaded with --load-format mistral # from consolidated.safetensors checkpoints diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index a5a669139b2f..49530776f890 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -617,7 +617,6 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index c576154b1ecf..7bef56110cab 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -426,7 +426,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "embed_tokens": "input_embeddings", "lm_head": "output_embeddings", } - embedding_padding_modules = ["lm_head"] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index b33ce35427f5..f3ebc6da8e30 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -93,7 +93,6 @@ def vllm_flash_attention_forward( class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): - embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 37abe5649460..a67246146005 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -43,7 +43,6 @@ def load_lora_model( vllm_config, device, model.embedding_modules, - model.embedding_padding_modules, ) return self.lora_manager.create_lora_manager(model) From d8292ae3d713b1a6c4d8839a4e1f51e25a1caafb Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 10:04:57 +0000 Subject: [PATCH 2/8] Fix lora example Signed-off-by: Jee Jee Li --- examples/offline_inference/multilora_inference.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 6c23cf342e06..5e5da2c0144c 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -46,7 +46,6 @@ def create_test_prompts( logprobs=1, prompt_logprobs=1, max_tokens=128, - stop_token_ids=[32003], ), LoRARequest("sql-lora", 1, lora_path), ), @@ -57,7 +56,6 @@ def create_test_prompts( logprobs=1, prompt_logprobs=1, max_tokens=128, - stop_token_ids=[32003], ), LoRARequest("sql-lora2", 2, lora_path), ), @@ -98,7 +96,7 @@ def initialize_engine() -> LLMEngine: # use the same rank, it is recommended to set this as low as possible. # max_cpu_loras: controls the size of the CPU LoRA cache. engine_args = EngineArgs( - model="meta-llama/Llama-2-7b-hf", + model="meta-llama/Llama-3.2-3B-Instruct", enable_lora=True, max_loras=1, max_lora_rank=8, @@ -111,7 +109,7 @@ def initialize_engine() -> LLMEngine: def main(): """Main function that sets up and runs the prompt processing.""" engine = initialize_engine() - lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") test_prompts = create_test_prompts(lora_path) process_requests(engine, test_prompts) From 5621c3819765e66d1ff4b829e05f78094e56d949 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 15:51:25 +0000 Subject: [PATCH 3/8] Move forward Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 17 +++-------------- tests/lora/test_lora_manager.py | 8 ++++---- tests/lora/test_peft_helper.py | 12 ++++++------ vllm/lora/models.py | 1 - 4 files changed, 13 insertions(+), 25 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index aa8c36c9a599..abd92d9c910d 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -154,17 +154,6 @@ def dummy_model_gate_up() -> nn.Module: return model -@pytest.fixture(scope="session") -def llama_2_7b_base_huggingface_id(): - # used as a base model for testing with sql lora adapter - return "meta-llama/Llama-2-7b-hf" - - -@pytest.fixture(scope="session") -def sql_lora_files(): - return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test @@ -250,14 +239,14 @@ def qwen3_lora_files(): @pytest.fixture(scope="session") -def qwen3_lora_huggingface_id(): +def llama32_lora_huggingface_id(): # huggingface repo id is used to test lora runtime downloading. - return "charent/self_cognition_Alice" + return "jeeejeee/llama32-3b-text2sql-spider" @pytest.fixture(scope="session") def llama32_lora_files(): - return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") + return snapshot_download(repo_id="llama32_lora_huggingface_id") @pytest.fixture diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 63b366e7b9c3..081f14d6fabf 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -45,11 +45,11 @@ @pytest.mark.parametrize("device", DEVICES) -def test_from_lora_tensors(sql_lora_files, device): - tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors")) +def test_from_lora_tensors(qwen3_lora_files, device): + tensors = load_file(os.path.join(qwen3_lora_files, "adapter_model.safetensors")) peft_helper = PEFTHelper.from_local_dir( - sql_lora_files, max_position_embeddings=4096 + qwen3_lora_files, max_position_embeddings=4096 ) lora_model = LoRAModel.from_lora_tensors( 1, @@ -60,7 +60,7 @@ def test_from_lora_tensors(sql_lora_files, device): for module_name, lora in lora_model.loras.items(): assert lora.module_name == module_name assert lora.rank == 8 - assert lora.lora_alpha == 16 + assert lora.lora_alpha == 32 assert lora.lora_a is not None assert lora.lora_b is not None assert lora.lora_a.device == torch.device(device) diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index 9c55c623d444..c8a632d71478 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -25,14 +25,14 @@ ] -def test_peft_helper_pass(sql_lora_files, tmp_path): +def test_peft_helper_pass(qwen3_lora_files, tmp_path): peft_helper = PEFTHelper.from_local_dir( - sql_lora_files, max_position_embeddings=4096 + qwen3_lora_files, max_position_embeddings=4096 ) lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) peft_helper.validate_legal(lora_config) assert peft_helper.r == 8 - assert peft_helper.lora_alpha == 16 + assert peft_helper.lora_alpha == 32 assert peft_helper.target_modules == [ "q_proj", "v_proj", @@ -49,7 +49,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path): # test RSLoRA rslora_config = dict(use_rslora=True) test_dir = tmp_path / "test_rslora" - shutil.copytree(sql_lora_files, test_dir) + shutil.copytree(qwen3_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" @@ -70,14 +70,14 @@ def test_peft_helper_pass(sql_lora_files, tmp_path): @pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) def test_peft_helper_error( - sql_lora_files, + qwen3_lora_files, tmp_path, test_name: str, config_change: dict, expected_error: str, ): test_dir = tmp_path / test_name - shutil.copytree(sql_lora_files, test_dir) + shutil.copytree(qwen3_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" diff --git a/vllm/lora/models.py b/vllm/lora/models.py index eac7dbf70f62..d9f2dd774add 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -103,7 +103,6 @@ def get_lora(self, module_name: str) -> LoRALayerWeights | None: def check_lora_name(self, lora_name: str) -> bool: return lora_name in self.loras - # (yard1): TODO see if we can derive target_embedding_padding automatically @classmethod def from_lora_tensors( cls, From 6bc4b34dfe58c9b907e782d9dab895a4bc2652d4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 16:33:50 +0000 Subject: [PATCH 4/8] Move forward Signed-off-by: Jee Jee Li --- tests/entrypoints/conftest.py | 4 +- tests/entrypoints/openai/test_basic.py | 2 +- tests/entrypoints/openai/test_chat.py | 20 ++++----- .../entrypoints/openai/test_chunked_prompt.py | 3 +- .../entrypoints/openai/test_lora_adapters.py | 43 +++++++++---------- tests/entrypoints/openai/test_models.py | 12 +++--- tests/entrypoints/openai/test_orca_metrics.py | 2 +- .../openai/test_return_tokens_as_ids.py | 6 +-- tests/entrypoints/openai/test_tokenization.py | 2 +- tests/entrypoints/openai/test_uds.py | 2 +- tests/entrypoints/sagemaker/conftest.py | 2 +- tests/lora/test_lora_huggingface.py | 2 +- .../test_filesystem_resolver.py | 10 ++--- 13 files changed, 54 insertions(+), 56 deletions(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index a52e1cb7df33..1ce83ec55606 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -188,11 +188,11 @@ def sample_sql_statements(): @pytest.fixture(scope="session") -def zephyr_lora_files(): +def qwen3_lora_files(): """Download zephyr LoRA files once per test session.""" from huggingface_hub import snapshot_download - return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora") + return snapshot_download(repo_id="charent/self_cognition_Alice") @pytest.fixture(scope="session") diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index e63a6f10cbc7..3d581a300b6a 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -16,7 +16,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d25958f602b3..83e44884c812 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,11 +16,11 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") -def server(zephyr_lora_files): # noqa: F811 +def server(qwen3_lora_files): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -31,7 +31,7 @@ def server(zephyr_lora_files): # noqa: F811 # lora config below "--enable-lora", "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", + f"qwen3-lora={qwen3_lora_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -54,7 +54,7 @@ async def client(server): @pytest.mark.parametrize( # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -78,7 +78,7 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -104,7 +104,7 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -130,7 +130,7 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -239,7 +239,7 @@ async def test_more_than_one_prompt_logprobs_chat( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -284,7 +284,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", - [MODEL_NAME, "zephyr-lora"], + [MODEL_NAME, "qwen3-lora"], ) async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -330,7 +330,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], + ["Qwen/Qwen3-0.6B", "qwen3-lora"], ) async def test_chat_completion_stream_options( client: openai.AsyncOpenAI, model_name: str diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 608e509e59e8..f5c412107775 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") @@ -20,7 +20,6 @@ def server(): "--max-model-len", "8192", "--enforce-eager", - # lora config below "--max-num-seqs", "128", "--enable-chunked-prefill", diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index c74f805961bc..22461f470db0 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -13,9 +13,8 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here +MODEL_NAME = "Qwen/Qwen3-0.6B" + BADREQUEST_CASES = [ ( @@ -33,11 +32,11 @@ @pytest.fixture(scope="module", params=[True]) -def server_with_lora_modules_json(request, zephyr_lora_files): +def server_with_lora_modules_json(request, qwen3_lora_files): # Define the json format LoRA module configurations lora_module_1 = { - "name": "zephyr-lora", - "path": zephyr_lora_files, + "name": "qwen3-lora", + "path": qwen3_lora_files, "base_model_name": MODEL_NAME, } @@ -74,7 +73,7 @@ async def client(server_with_lora_modules_json): @pytest.mark.asyncio -async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files): +async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files): models = await client.models.list() models = models.data served_model = models[0] @@ -82,17 +81,17 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files assert served_model.id == MODEL_NAME assert served_model.root == MODEL_NAME assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models) + assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models) assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" + assert lora_models[0].id == "qwen3-lora" @pytest.mark.asyncio -async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files): +async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files): response = await client.post( "load_lora_adapter", cast_to=str, - body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files}, + body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files}, ) # Ensure adapter loads before querying /models assert "success" in response @@ -100,9 +99,9 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_file models = await client.models.list() models = models.data dynamic_lora_model = models[-1] - assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.root == qwen3_lora_files assert dynamic_lora_model.parent == MODEL_NAME - assert dynamic_lora_model.id == "zephyr-lora-3" + assert dynamic_lora_model.id == "qwen3-lora-3" @pytest.mark.asyncio @@ -134,7 +133,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path): async def test_dynamic_lora_badrequests( client: openai.AsyncOpenAI, tmp_path, - zephyr_lora_files, + qwen3_lora_files, test_name: str, config_change: dict, expected_error: str, @@ -143,7 +142,7 @@ async def test_dynamic_lora_badrequests( test_dir = tmp_path / test_name # Copy adapter files - shutil.copytree(zephyr_lora_files, test_dir) + shutil.copytree(qwen3_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" @@ -167,7 +166,7 @@ async def test_dynamic_lora_badrequests( @pytest.mark.asyncio async def test_multiple_lora_adapters( - client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files + client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files ): """Validate that many loras can be dynamically registered and inferenced with concurrently""" @@ -178,7 +177,7 @@ async def load_and_run_adapter(adapter_name: str): await client.post( "load_lora_adapter", cast_to=str, - body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)}, + body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)}, ) for _ in range(3): await client.completions.create( @@ -199,7 +198,7 @@ async def load_and_run_adapter(adapter_name: str): @pytest.mark.asyncio async def test_loading_invalid_adapters_does_not_break_others( - client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files + client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files ): invalid_files = tmp_path / "invalid_files" invalid_files.mkdir() @@ -215,7 +214,7 @@ async def run_good_requests(client): while not stop_good_requests_event.is_set(): try: batch = await client.completions.create( - model="zephyr-lora", + model="qwen3-lora", prompt=["Hello there", "Foo bar bazz buzz"], max_tokens=5, ) @@ -254,7 +253,7 @@ async def run_good_requests(client): await client.post( "load_lora_adapter", cast_to=str, - body={"lora_name": "valid", "lora_path": zephyr_lora_files}, + body={"lora_name": "valid", "lora_path": qwen3_lora_files}, ) await client.completions.create( model="valid", @@ -267,7 +266,7 @@ async def run_good_requests(client): async def test_beam_search_with_lora_adapters( client: openai.AsyncOpenAI, tmp_path, - zephyr_lora_files, + qwen3_lora_files, ): """Validate that async beam search can be used with lora.""" @@ -275,7 +274,7 @@ async def load_and_run_adapter(adapter_name: str): await client.post( "load_lora_adapter", cast_to=str, - body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)}, + body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)}, ) for _ in range(3): await client.completions.create( diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 7d2968d96506..e5af11edf7fa 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -8,13 +8,13 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here @pytest.fixture(scope="module") -def server(zephyr_lora_files): +def server(qwen3_lora_files): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -25,7 +25,7 @@ def server(zephyr_lora_files): # lora config below "--enable-lora", "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", + f"qwen3-lora={qwen3_lora_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -45,12 +45,12 @@ async def client(server): @pytest.mark.asyncio -async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files): +async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files): models = await client.models.list() models = models.data served_model = models[0] lora_models = models[1:] assert served_model.id == MODEL_NAME assert served_model.root == MODEL_NAME - assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" + assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models) + assert lora_models[0].id == "qwen3-lora" diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py index 1ed44a33bf81..f58bbbcb7929 100644 --- a/tests/entrypoints/openai/test_orca_metrics.py +++ b/tests/entrypoints/openai/test_orca_metrics.py @@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index adbcc1f2430c..cedf6ce16060 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -11,11 +11,11 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files): +def default_server_args(qwen3_lora_files): return [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -28,7 +28,7 @@ def default_server_args(zephyr_lora_files): # lora config "--enable-lora", "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", + f"qwen3-lora={qwen3_lora_files}", "--max-lora-rank", "64", "--max-cpu-loras", diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 7fd32e1c7be1..d23628671a87 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -10,7 +10,7 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_uds.py b/tests/entrypoints/openai/test_uds.py index 5c39869a794f..c79a4870dea3 100644 --- a/tests/entrypoints/openai/test_uds.py +++ b/tests/entrypoints/openai/test_uds.py @@ -10,7 +10,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py index 4c859c2527d2..8b2a7668a150 100644 --- a/tests/entrypoints/sagemaker/conftest.py +++ b/tests/entrypoints/sagemaker/conftest.py @@ -9,7 +9,7 @@ from ...utils import RemoteOpenAIServer # Model name constants used across tests -MODEL_NAME_ZEPHYR = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME_ZEPHYR = "Qwen/Qwen3-0.6B" MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct" LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese" diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 86f4f3692a97..3348d2f8ce65 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -9,7 +9,7 @@ from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM # Provide absolute path and huggingface lora ids -lora_fixture_name = ["qwen3_lora_files", "qwen3_lora_huggingface_id"] +lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"] LLAMA_LORA_MODULES = [ "qkv_proj", "o_proj", diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py index cd98efdd1390..4615ed966e1a 100644 --- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py +++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py @@ -8,8 +8,8 @@ from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver -MODEL_NAME = "mistralai/Mistral-7B-v0.1" -LORA_NAME = "typeof/zephyr-7b-beta-lora" +MODEL_NAME = "Qwen/Qwen3-0.6B" +LORA_NAME = "charent/self_cognition_Alic" PA_NAME = "swapnilbp/llama_tweet_ptune" @@ -21,7 +21,7 @@ def adapter_cache(request, tmpdir_factory): @pytest.fixture(scope="module") -def zephyr_lora_files(): +def qwen3_lora_files(): return snapshot_download(repo_id=LORA_NAME) @@ -31,9 +31,9 @@ def pa_files(): @pytest.mark.asyncio -async def test_filesystem_resolver(adapter_cache, zephyr_lora_files): +async def test_filesystem_resolver(adapter_cache, qwen3_lora_files): model_files = adapter_cache / LORA_NAME - shutil.copytree(zephyr_lora_files, model_files) + shutil.copytree(qwen3_lora_files, model_files) fs_resolver = FilesystemResolver(adapter_cache) assert fs_resolver is not None From 6be19a8275f31018df0a14dc9bdcd65338d1489e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 28 Nov 2025 01:55:54 +0000 Subject: [PATCH 5/8] FIX Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 4 ++-- tests/lora/test_peft_helper.py | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index abd92d9c910d..be3ddf693383 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -245,8 +245,8 @@ def llama32_lora_huggingface_id(): @pytest.fixture(scope="session") -def llama32_lora_files(): - return snapshot_download(repo_id="llama32_lora_huggingface_id") +def llama32_lora_files(llama32_lora_huggingface_id): + return snapshot_download(repo_id=llama32_lora_huggingface_id) @pytest.fixture diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index c8a632d71478..e3035b00e9e0 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -25,31 +25,33 @@ ] -def test_peft_helper_pass(qwen3_lora_files, tmp_path): +def test_peft_helper_pass(llama32_lora_files, tmp_path): peft_helper = PEFTHelper.from_local_dir( - qwen3_lora_files, max_position_embeddings=4096 + llama32_lora_files, max_position_embeddings=4096 ) lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) peft_helper.validate_legal(lora_config) assert peft_helper.r == 8 assert peft_helper.lora_alpha == 32 - assert peft_helper.target_modules == [ - "q_proj", - "v_proj", - "k_proj", - "o_proj", - "gate_proj", - "up_proj", + target_modules = sorted(peft_helper.target_modules) + + assert target_modules == [ "down_proj", "embed_tokens", + "gate_proj", + "k_proj", "lm_head", + "o_proj", + "q_proj", + "up_proj", + "v_proj", ] assert peft_helper.vllm_max_position_embeddings == 4096 # test RSLoRA rslora_config = dict(use_rslora=True) test_dir = tmp_path / "test_rslora" - shutil.copytree(qwen3_lora_files, test_dir) + shutil.copytree(llama32_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" @@ -70,14 +72,14 @@ def test_peft_helper_pass(qwen3_lora_files, tmp_path): @pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) def test_peft_helper_error( - qwen3_lora_files, + llama32_lora_files, tmp_path, test_name: str, config_change: dict, expected_error: str, ): test_dir = tmp_path / test_name - shutil.copytree(qwen3_lora_files, test_dir) + shutil.copytree(llama32_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" From a7a372f48db006e8ffee925703925b00866ec7ed Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 28 Nov 2025 16:35:14 +0000 Subject: [PATCH 6/8] Fix test Signed-off-by: Jee Jee Li --- tests/entrypoints/openai/test_chat.py | 28 ++++++++++++------- tests/lora/test_olmoe_tp.py | 22 +++++++++++---- .../test_filesystem_resolver.py | 2 +- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 83e44884c812..b2909f21e4dd 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,11 +16,19 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @pytest.fixture(scope="module") -def server(qwen3_lora_files): # noqa: F811 +def zephyr_lora_files(): + """Download zephyr LoRA files once per test session.""" + from huggingface_hub import snapshot_download + + return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora") + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -31,7 +39,7 @@ def server(qwen3_lora_files): # noqa: F811 # lora config below "--enable-lora", "--lora-modules", - f"qwen3-lora={qwen3_lora_files}", + f"zephyr-lora={zephyr_lora_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -54,7 +62,7 @@ async def client(server): @pytest.mark.parametrize( # first test base model, then test loras "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -78,7 +86,7 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -104,7 +112,7 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -130,7 +138,7 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -239,7 +247,7 @@ async def test_more_than_one_prompt_logprobs_chat( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -284,7 +292,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", - [MODEL_NAME, "qwen3-lora"], + [MODEL_NAME, "zephyr-lora"], ) async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): messages = [ @@ -330,7 +338,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - ["Qwen/Qwen3-0.6B", "qwen3-lora"], + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) async def test_chat_completion_stream_options( client: openai.AsyncOpenAI, model_name: str diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index e3c9816625ba..e10419d244c3 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -40,7 +40,10 @@ def generate_and_test( - llm: vllm.LLM, lora_path: str, lora_id: list[int | None] | int | None + llm: vllm.LLM, + lora_path: str, + lora_id: list[int | None] | int | None, + compare_lower: bool = False, ) -> None: prompts = [ PROMPT_TEMPLATE.format(context="How many candidates are there?"), @@ -74,12 +77,18 @@ def generate_and_test( for i in range(len(EXPECTED_LORA_OUTPUT)): req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id + generated_text = generated_texts[i] expected_output = ( EXPECTED_LORA_OUTPUT[i] if req_lora_id is not None else EXPECTED_BASE_MODEL_OUTPUT[i] ) - assert generated_texts[i].startswith(expected_output) + + if compare_lower: + generated_text = generated_text.lower() + expected_output = expected_output.lower() + + assert generated_text.startswith(expected_output) def test_olmoe_lora(olmoe_lora_files): @@ -146,6 +155,9 @@ def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras): tensor_parallel_size=4, fully_sharded_loras=fully_sharded_loras, ) - - generate_and_test(llm, olmoe_lora_files, lora_id=1) - generate_and_test(llm, olmoe_lora_files, lora_id=2) + generate_and_test( + llm, olmoe_lora_files, lora_id=1, compare_lower=fully_sharded_loras + ) + generate_and_test( + llm, olmoe_lora_files, lora_id=2, compare_lower=fully_sharded_loras + ) diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py index 4615ed966e1a..d4adf6f84cf0 100644 --- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py +++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py @@ -9,7 +9,7 @@ from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver MODEL_NAME = "Qwen/Qwen3-0.6B" -LORA_NAME = "charent/self_cognition_Alic" +LORA_NAME = "charent/self_cognition_Alice" PA_NAME = "swapnilbp/llama_tweet_ptune" From 828ee299180d472e0bd2e903354354bbb13b443e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 29 Nov 2025 00:47:48 +0000 Subject: [PATCH 7/8] Fix test Signed-off-by: Jee Jee Li --- tests/entrypoints/conftest.py | 2 +- tests/entrypoints/openai/test_orca_metrics.py | 3 ++- tests/entrypoints/sagemaker/conftest.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 1ce83ec55606..9ab50c44aa4a 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -189,7 +189,7 @@ def sample_sql_statements(): @pytest.fixture(scope="session") def qwen3_lora_files(): - """Download zephyr LoRA files once per test session.""" + """Download Qwen3 LoRA files once per test session.""" from huggingface_hub import snapshot_download return snapshot_download(repo_id="charent/self_cognition_Alice") diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py index f58bbbcb7929..1ce043df0cd8 100644 --- a/tests/entrypoints/openai/test_orca_metrics.py +++ b/tests/entrypoints/openai/test_orca_metrics.py @@ -110,8 +110,9 @@ async def test_single_completion(client: openai.AsyncOpenAI): choice = completion.choices[0] assert len(choice.text) >= 5 assert choice.finish_reason == "length" + # When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374] assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11 + completion_tokens=5, prompt_tokens=5, total_tokens=10 ) # test using token IDs diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py index 8b2a7668a150..ad219eec18b7 100644 --- a/tests/entrypoints/sagemaker/conftest.py +++ b/tests/entrypoints/sagemaker/conftest.py @@ -9,7 +9,6 @@ from ...utils import RemoteOpenAIServer # Model name constants used across tests -MODEL_NAME_ZEPHYR = "Qwen/Qwen3-0.6B" MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct" LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese" From e190b62922d3bea9eb82c7d9951154a1eb919797 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 29 Nov 2025 01:58:53 +0000 Subject: [PATCH 8/8] Modify assert Signed-off-by: Jee Jee Li --- vllm/lora/models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d9f2dd774add..f568b8b9ba59 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -129,8 +129,12 @@ def from_lora_tensors( ) if is_lora_a: - if "lora_embedding_A" in tensor_name and model_vocab_size is not None: - assert model_vocab_size == tensor.shape[1], ( + if ( + "lora_embedding_A" in tensor_name + and model_vocab_size is not None + and model_vocab_size != tensor.shape[1] + ): + raise RuntimeError( f"The embedding LoRA size({tensor.shape[1]}) must be consistent" f" with the base model's vocabulary size({model_vocab_size})." )