From 43927e7104959065a08b8932f74ec3e2c04be623 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 27 Nov 2025 09:58:14 +0000
Subject: [PATCH 1/8] Drop embedding_padding_modules; validate LoRA embedding vocab size

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/conftest.py                        | 16 +++++-----
 tests/lora/test_lora_checkpoints.py           | 21 +++++---------
 tests/lora/test_lora_huggingface.py           | 10 ++-----
 tests/lora/test_lora_manager.py               |  9 ++----
 vllm/lora/models.py                           | 29 ++++++-------------
 vllm/lora/worker_manager.py                   |  6 +---
 vllm/model_executor/models/apertus.py         |  1 -
 vllm/model_executor/models/bamba.py           |  1 -
 vllm/model_executor/models/exaone.py          |  1 -
 vllm/model_executor/models/exaone4.py         |  1 -
 vllm/model_executor/models/falcon_h1.py       |  1 -
 vllm/model_executor/models/granite.py         |  1 -
 vllm/model_executor/models/granitemoe.py      |  1 -
 .../model_executor/models/granitemoehybrid.py |  1 -
 .../model_executor/models/granitemoeshared.py |  1 -
 vllm/model_executor/models/interfaces.py      |  6 +---
 vllm/model_executor/models/jamba.py           |  1 -
 vllm/model_executor/models/lfm2.py            |  1 -
 vllm/model_executor/models/lfm2_moe.py        |  1 -
 vllm/model_executor/models/llama.py           |  1 -
 vllm/model_executor/models/minicpm.py         |  1 -
 vllm/model_executor/models/minicpm_eagle.py   |  1 -
 vllm/model_executor/models/minicpmv.py        |  1 -
 vllm/model_executor/models/mixtral.py         |  1 -
 vllm/model_executor/models/nemotron.py        |  1 -
 vllm/model_executor/models/nemotron_h.py      |  1 -
 vllm/model_executor/models/nemotron_nas.py    |  1 -
 vllm/model_executor/models/phimoe.py          |  1 -
 vllm/model_executor/models/solar.py           |  1 -
 .../models/transformers/base.py               |  1 -
 vllm/v1/worker/lora_model_runner_mixin.py     |  1 -
 31 files changed, 31 insertions(+), 90 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 9d38ec542279..aa8c36c9a599 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -161,14 +161,8 @@ def llama_2_7b_base_huggingface_id():
 
 
 @pytest.fixture(scope="session")
-def sql_lora_huggingface_id():
-    # huggingface repo id is used to test lora runtime downloading.
-    return "yard1/llama-2-7b-sql-lora-test"
-
-
-@pytest.fixture(scope="session")
-def sql_lora_files(sql_lora_huggingface_id):
-    return snapshot_download(repo_id=sql_lora_huggingface_id)
+def sql_lora_files():
+    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
 
 
 @pytest.fixture(scope="session")
@@ -255,6 +249,12 @@ def qwen3_lora_files():
     return snapshot_download(repo_id="charent/self_cognition_Alice")
 
 
+@pytest.fixture(scope="session")
+def qwen3_lora_huggingface_id():
+    # huggingface repo id is used to test lora runtime downloading.
+    return "charent/self_cognition_Alice"
+
+
 @pytest.fixture(scope="session")
 def llama32_lora_files():
     return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 2219d470e91a..60dd8f43109d 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -26,8 +26,7 @@ def test_load_checkpoints(
     chatglm3_lora_files,
 ):
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
-    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
-    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
+
     expected_lora_modules: list[str] = []
     for module in BAICHUAN_LORA_MODULES:
         if module in packed_modules_mapping:
@@ -46,8 +45,7 @@ def test_load_checkpoints(
             peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
-            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules,
+            model_vocab_size=64000,
         )
     elif lora_name == "baichuan7B-zero":
         # Test that the target_modules contain prefix
@@ -62,8 +60,7 @@ def test_load_checkpoints(
             peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
-            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules,
+            model_vocab_size=64000,
         )
     elif lora_name == "baichuan7B-zero-regex":
         # Test that the `target_modules` in the form of regular expressions,
@@ -77,8 +74,7 @@ def test_load_checkpoints(
             peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
-            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules,
+            model_vocab_size=64000,
         )
     else:
         # For the baichuan7B model, load chatglm3-6b's LoRA,
@@ -94,15 +90,13 @@ def test_load_checkpoints(
                 peft_helper=peft_helper,
                 lora_model_id=1,
                 device="cpu",
-                embedding_modules=embedding_modules,
-                embedding_padding_modules=embed_padding_modules,
+                model_vocab_size=64000,
             )
 
 
 def test_lora_weights_mapping(baichuan_lora_files):
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
-    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
-    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
+
     expected_lora_modules: list[str] = []
     for module in BAICHUAN_LORA_MODULES:
         if module in packed_modules_mapping:
@@ -127,8 +121,7 @@ def test_lora_weights_mapping(baichuan_lora_files):
         peft_helper=peft_helper,
         lora_model_id=1,
         device="cpu",
-        embedding_modules=embedding_modules,
-        embedding_padding_modules=embed_padding_modules,
+        model_vocab_size=64000,
         weights_mapper=hf_to_vllm_mapper,
     )
     for name in lora_model.loras:
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 7d20faef541a..7ffffa59d1da 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -6,10 +6,10 @@
 from vllm.lora.models import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
-from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
 
 # Provide absolute path and huggingface lora ids
-lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
+lora_fixture_name = ["qwen3_lora_files", "qwen3_lora_huggingface_id"]
 LLAMA_LORA_MODULES = [
     "qkv_proj",
     "o_proj",
@@ -23,9 +23,7 @@
 @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
 def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
     lora_name = request.getfixturevalue(lora_fixture_name)
-    packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping
-    embedding_modules = LlamaForCausalLM.embedding_modules
-    embed_padding_modules = LlamaForCausalLM.embedding_padding_modules
+    packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping
     expected_lora_modules: list[str] = []
     for module in LLAMA_LORA_MODULES:
         if module in packed_modules_mapping:
@@ -43,8 +41,6 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
         peft_helper=peft_helper,
         lora_model_id=1,
         device="cpu",
-        embedding_modules=embedding_modules,
-        embedding_padding_modules=embed_padding_modules,
     )
 
     # Assertions to ensure the model is loaded correctly
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 24d4dfca46d6..63b366e7b9c3 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -34,7 +34,6 @@
     "lm_head": "output_embeddings",
 }
 
-EMBEDDING_PADDING_MODULES = ["lm_head"]
 
 DEVICES = (
     [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@@ -57,8 +56,6 @@ def test_from_lora_tensors(sql_lora_files, device):
         tensors,
         peft_helper=peft_helper,
         device=device,
-        embedding_modules=EMBEDDING_MODULES,
-        embedding_padding_modules=EMBEDDING_PADDING_MODULES,
     )
     for module_name, lora in lora_model.loras.items():
         assert lora.module_name == module_name
@@ -430,7 +427,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
     worker_adapter_manager = LRUCacheWorkerLoRAManager(
-        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
+        vllm_config, device, EMBEDDING_MODULES
     )
 
     worker_adapter_manager.max_num_seqs = 4
@@ -533,9 +530,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
     vllm_config.scheduler_config.max_num_seqs = 4
     vllm_config.scheduler_config.max_num_batched_tokens = 2
 
-    worker_adapter_manager = WorkerLoRAManager(
-        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
-    )
+    worker_adapter_manager = WorkerLoRAManager(vllm_config, device, EMBEDDING_MODULES)
     worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
     worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
 
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 636f062feb7b..ce3e12fbccde 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -112,9 +112,7 @@ def from_lora_tensors(
         peft_helper: PEFTHelper,
         device: str = "cuda",
         dtype: torch.dtype | None = None,
-        target_embedding_padding: int | None = None,
-        embedding_modules: dict[str, str] | None = None,
-        embedding_padding_modules: list[str] | None = None,
+        model_vocab_size: int | None = None,
         weights_mapper: WeightsMapper | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
@@ -132,22 +130,17 @@ def from_lora_tensors(
                 )
 
             if is_lora_a:
+                if "lora_embedding_A" in tensor_name and model_vocab_size is not None:
+                    assert model_vocab_size == tensor.shape[1], (
+                        f"The embedding LoRA size({tensor.shape[1]}) must be consistent"
+                        f" with the base model's vocabulary size({model_vocab_size})."
+                    )
                 loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
                 if pin_memory:
                     loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
             else:
                 loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
-                assert embedding_padding_modules is not None
-                if (
-                    any(name in module_name for name in embedding_padding_modules)
-                    and target_embedding_padding is not None
-                ):
-                    lora_b = loras[module_name].lora_b
-                    assert target_embedding_padding >= lora_b.shape[0]
-                    addition = target_embedding_padding - lora_b.shape[0]
-                    loras[module_name].lora_b = torch.nn.functional.pad(
-                        lora_b, (0, 0, 0, addition)
-                    )
+
                 if pin_memory:
                     loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
 
@@ -166,9 +159,7 @@ def from_local_checkpoint(
         lora_model_id: int | None = None,
         device: str = "cuda",
         dtype: torch.dtype | None = None,
-        target_embedding_padding: int | None = None,
-        embedding_modules: dict[str, str] | None = None,
-        embedding_padding_modules: list[str] | None = None,
+        model_vocab_size: int | None = None,
         weights_mapper: WeightsMapper | None = None,
         tensorizer_config_dict: dict | None = None,
     ) -> "LoRAModel":
@@ -292,9 +283,7 @@ def check_unexpected_modules(modules: dict):
             peft_helper=peft_helper,
             device=device,
             dtype=dtype,
-            target_embedding_padding=target_embedding_padding,
-            embedding_modules=embedding_modules,
-            embedding_padding_modules=embedding_padding_modules,
+            model_vocab_size=model_vocab_size,
             weights_mapper=weights_mapper,
         )
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 4cc201a6414f..8125428d5b20 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -34,12 +34,10 @@ def __init__(
         vllm_config: VllmConfig,
         device: torch.device,
         embedding_modules: dict[str, str],
-        embedding_padding_modules: list[str],
         lora_model_cls: type[LoRAModel] = LoRAModel,
     ):
         self._lora_model_cls = lora_model_cls
         self.embedding_modules = embedding_modules
-        self.embedding_padding_modules = embedding_padding_modules
         self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
         self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         self.max_num_batched_tokens = (
@@ -121,9 +119,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 lora_model_id=lora_request.lora_int_id,
                 device="cpu",
                 dtype=self.lora_config.lora_dtype,
-                target_embedding_padding=self.vocab_size,
-                embedding_modules=self.embedding_modules,
-                embedding_padding_modules=self.embedding_padding_modules,
+                model_vocab_size=self.vocab_size,
                 tensorizer_config_dict=lora_request.tensorizer_config_dict,
                 weights_mapper=hf_to_vllm_mapper,
             )
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index f38b09bf5506..4a69787af55e 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -482,7 +482,6 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(
         self,
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 4422bb5da98f..1d6493b18c34 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -419,7 +419,6 @@ class BambaForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 99002baa8752..acf651ed2498 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -457,7 +457,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "wte": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 9d2c67d6c4f8..cb710a7ec5cf 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -450,7 +450,6 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 9433f0d1b4a4..83ceb9303cfb 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -510,7 +510,6 @@ class FalconH1ForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index eac9ef9478a6..76519c4660f1 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -400,7 +400,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 02c6c5862141..b038400a1262 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -497,7 +497,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 9d5eeef198a6..1d9c2f5df4a5 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -601,7 +601,6 @@ class GraniteMoeHybridForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index fd346db7e35a..8ad5a7105bb5 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -263,7 +263,6 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 6f6ce32538b7..3e31fea8f754 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -336,10 +336,8 @@ class SupportsLoRA(Protocol):
         There is no need to redefine this flag if this class is in the
         MRO of your model class.
     """
-    # The `embedding_module` and `embedding_padding_modules`
-    # are empty by default.
+    # `embedding_modules` is empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
-    embedding_padding_modules: ClassVar[list[str]] = []
     packed_modules_mapping: dict[str, list[str]] = {}
 
 
@@ -351,7 +349,6 @@ class _SupportsLoRAType(Protocol):
 
     packed_modules_mapping: dict[str, list[str]]
     embedding_modules: dict[str, str]
-    embedding_padding_modules: list[str]
 
 
 @overload
@@ -371,7 +368,6 @@ def supports_lora(
         lora_attrs = (
             "packed_modules_mapping",
             "embedding_modules",
-            "embedding_padding_modules",
         )
         missing_attrs = tuple(attr for attr in lora_attrs if not hasattr(model, attr))
 
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 3a2c98c73dab..b2ad12be1e35 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -480,7 +480,6 @@ class JambaForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 69615f8b6a09..a4a994f97a2f 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -422,7 +422,6 @@ class Lfm2ForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index aaeb2cc38999..c8669de72dd0 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -602,7 +602,6 @@ class Lfm2MoeForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 6dfbde7a17f5..8f5a967cd422 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -528,7 +528,6 @@ class LlamaForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     # Mistral/Llama models can also be loaded with --load-format mistral
     # from consolidated.safetensors checkpoints
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 67911ba8c1c8..67c462f4b25c 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -568,7 +568,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index e6bccfcac4f1..9f3587a6d2fa 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -305,7 +305,6 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 2ac97764dd34..6d0ebf5c9825 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1741,5 +1741,4 @@ def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""):
         # so update values before init is called
         cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
         cls.embedding_modules.update(instance_cls.embedding_modules)
-        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
         return instance_cls(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index e21656dbd635..50ec57e7a805 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -496,7 +496,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 93ad2064a2fc..ffba6c9dfe73 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -439,7 +439,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 8675eff59222..baeb901bbb05 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -713,7 +713,6 @@ class NemotronHForCausalLM(
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     @classmethod
     def get_mamba_state_dtype_from_config(
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 34ea2945b711..9d968dee8711 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -387,7 +387,6 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     # Mistral/Llama models can also be loaded with --load-format mistral
     # from consolidated.safetensors checkpoints
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index a5a669139b2f..49530776f890 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -617,7 +617,6 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index c576154b1ecf..7bef56110cab 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -426,7 +426,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "embed_tokens": "input_embeddings",
         "lm_head": "output_embeddings",
     }
-    embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index b33ce35427f5..f3ebc6da8e30 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -93,7 +93,6 @@ def vllm_flash_attention_forward(
 
 
 class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
-    embedding_padding_modules = ["lm_head"]
     embedding_modules = ["embed_tokens"]  # TODO transformers will have a util to get it
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 37abe5649460..a67246146005 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -43,7 +43,6 @@ def load_lora_model(
             vllm_config,
             device,
             model.embedding_modules,
-            model.embedding_padding_modules,
         )
         return self.lora_manager.create_lora_manager(model)
 

From d8292ae3d713b1a6c4d8839a4e1f51e25a1caafb Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 27 Nov 2025 10:04:57 +0000
Subject: [PATCH 2/8] Fix lora example

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 examples/offline_inference/multilora_inference.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 6c23cf342e06..5e5da2c0144c 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -46,7 +46,6 @@ def create_test_prompts(
                 logprobs=1,
                 prompt_logprobs=1,
                 max_tokens=128,
-                stop_token_ids=[32003],
             ),
             LoRARequest("sql-lora", 1, lora_path),
         ),
@@ -57,7 +56,6 @@ def create_test_prompts(
                 logprobs=1,
                 prompt_logprobs=1,
                 max_tokens=128,
-                stop_token_ids=[32003],
             ),
             LoRARequest("sql-lora2", 2, lora_path),
         ),
@@ -98,7 +96,7 @@ def initialize_engine() -> LLMEngine:
     #   use the same rank, it is recommended to set this as low as possible.
     # max_cpu_loras: controls the size of the CPU LoRA cache.
     engine_args = EngineArgs(
-        model="meta-llama/Llama-2-7b-hf",
+        model="meta-llama/Llama-3.2-3B-Instruct",
         enable_lora=True,
         max_loras=1,
         max_lora_rank=8,
@@ -111,7 +109,7 @@ def initialize_engine() -> LLMEngine:
 def main():
     """Main function that sets up and runs the prompt processing."""
     engine = initialize_engine()
-    lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+    lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
     test_prompts = create_test_prompts(lora_path)
     process_requests(engine, test_prompts)
 

From 5621c3819765e66d1ff4b829e05f78094e56d949 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 27 Nov 2025 15:51:25 +0000
Subject: [PATCH 3/8] Migrate LoRA tests off the Llama-2 SQL adapter fixtures

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/conftest.py          | 17 +++--------------
 tests/lora/test_lora_manager.py |  8 ++++----
 tests/lora/test_peft_helper.py  | 12 ++++++------
 vllm/lora/models.py             |  1 -
 4 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index aa8c36c9a599..abd92d9c910d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -154,17 +154,6 @@ def dummy_model_gate_up() -> nn.Module:
     return model
 
 
-@pytest.fixture(scope="session")
-def llama_2_7b_base_huggingface_id():
-    # used as a base model for testing with sql lora adapter
-    return "meta-llama/Llama-2-7b-hf"
-
-
-@pytest.fixture(scope="session")
-def sql_lora_files():
-    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-
-
 @pytest.fixture(scope="session")
 def mixtral_lora_files():
     # Note: this module has incorrect adapter_config.json to test
@@ -250,14 +239,14 @@ def qwen3_lora_files():
 
 
 @pytest.fixture(scope="session")
-def qwen3_lora_huggingface_id():
+def llama32_lora_huggingface_id():
     # huggingface repo id is used to test lora runtime downloading.
-    return "charent/self_cognition_Alice"
+    return "jeeejeee/llama32-3b-text2sql-spider"
 
 
 @pytest.fixture(scope="session")
 def llama32_lora_files():
-    return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
+    return snapshot_download(repo_id="llama32_lora_huggingface_id")
 
 
 @pytest.fixture
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 63b366e7b9c3..081f14d6fabf 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -45,11 +45,11 @@
 
 
 @pytest.mark.parametrize("device", DEVICES)
-def test_from_lora_tensors(sql_lora_files, device):
-    tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
+def test_from_lora_tensors(qwen3_lora_files, device):
+    tensors = load_file(os.path.join(qwen3_lora_files, "adapter_model.safetensors"))
 
     peft_helper = PEFTHelper.from_local_dir(
-        sql_lora_files, max_position_embeddings=4096
+        qwen3_lora_files, max_position_embeddings=4096
     )
     lora_model = LoRAModel.from_lora_tensors(
         1,
@@ -60,7 +60,7 @@ def test_from_lora_tensors(sql_lora_files, device):
     for module_name, lora in lora_model.loras.items():
         assert lora.module_name == module_name
         assert lora.rank == 8
-        assert lora.lora_alpha == 16
+        assert lora.lora_alpha == 32
         assert lora.lora_a is not None
         assert lora.lora_b is not None
         assert lora.lora_a.device == torch.device(device)
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index 9c55c623d444..c8a632d71478 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -25,14 +25,14 @@
 ]
 
 
-def test_peft_helper_pass(sql_lora_files, tmp_path):
+def test_peft_helper_pass(qwen3_lora_files, tmp_path):
     peft_helper = PEFTHelper.from_local_dir(
-        sql_lora_files, max_position_embeddings=4096
+        qwen3_lora_files, max_position_embeddings=4096
     )
     lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
     peft_helper.validate_legal(lora_config)
     assert peft_helper.r == 8
-    assert peft_helper.lora_alpha == 16
+    assert peft_helper.lora_alpha == 32
     assert peft_helper.target_modules == [
         "q_proj",
         "v_proj",
@@ -49,7 +49,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
     # test RSLoRA
     rslora_config = dict(use_rslora=True)
     test_dir = tmp_path / "test_rslora"
-    shutil.copytree(sql_lora_files, test_dir)
+    shutil.copytree(qwen3_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
@@ -70,14 +70,14 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
 
 @pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
 def test_peft_helper_error(
-    sql_lora_files,
+    qwen3_lora_files,
     tmp_path,
     test_name: str,
     config_change: dict,
     expected_error: str,
 ):
     test_dir = tmp_path / test_name
-    shutil.copytree(sql_lora_files, test_dir)
+    shutil.copytree(qwen3_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index eac7dbf70f62..d9f2dd774add 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -103,7 +103,6 @@ def get_lora(self, module_name: str) -> LoRALayerWeights | None:
     def check_lora_name(self, lora_name: str) -> bool:
         return lora_name in self.loras
 
-    # (yard1): TODO see if we can derive target_embedding_padding automatically
     @classmethod
     def from_lora_tensors(
         cls,

From 6bc4b34dfe58c9b907e782d9dab895a4bc2652d4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 27 Nov 2025 16:33:50 +0000
Subject: [PATCH 4/8] Move forward

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/entrypoints/conftest.py                 |  4 +-
 tests/entrypoints/openai/test_basic.py        |  2 +-
 tests/entrypoints/openai/test_chat.py         | 20 ++++-----
 .../entrypoints/openai/test_chunked_prompt.py |  3 +-
 .../entrypoints/openai/test_lora_adapters.py  | 43 +++++++++----------
 tests/entrypoints/openai/test_models.py       | 12 +++---
 tests/entrypoints/openai/test_orca_metrics.py |  2 +-
 .../openai/test_return_tokens_as_ids.py       |  6 +--
 tests/entrypoints/openai/test_tokenization.py |  2 +-
 tests/entrypoints/openai/test_uds.py          |  2 +-
 tests/entrypoints/sagemaker/conftest.py       |  2 +-
 tests/lora/test_lora_huggingface.py           |  2 +-
 .../test_filesystem_resolver.py               | 10 ++---
 13 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index a52e1cb7df33..1ce83ec55606 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -188,11 +188,11 @@ def sample_sql_statements():
 
 
 @pytest.fixture(scope="session")
-def zephyr_lora_files():
+def qwen3_lora_files():
     """Download zephyr LoRA files once per test session."""
     from huggingface_hub import snapshot_download
 
-    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+    return snapshot_download(repo_id="charent/self_cognition_Alice")
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index e63a6f10cbc7..3d581a300b6a 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -16,7 +16,7 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index d25958f602b3..83e44884c812 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -16,11 +16,11 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):  # noqa: F811
+def server(qwen3_lora_files):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -31,7 +31,7 @@ def server(zephyr_lora_files):  # noqa: F811
         # lora config below
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -54,7 +54,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -78,7 +78,7 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -104,7 +104,7 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -130,7 +130,7 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -239,7 +239,7 @@ async def test_more_than_one_prompt_logprobs_chat(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -284,7 +284,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "qwen3-lora"],
 )
 async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -330,7 +330,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+    ["Qwen/Qwen3-0.6B", "qwen3-lora"],
 )
 async def test_chat_completion_stream_options(
     client: openai.AsyncOpenAI, model_name: str
diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py
index 608e509e59e8..f5c412107775 100644
--- a/tests/entrypoints/openai/test_chunked_prompt.py
+++ b/tests/entrypoints/openai/test_chunked_prompt.py
@@ -8,7 +8,7 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -20,7 +20,6 @@ def server():
         "--max-model-len",
         "8192",
         "--enforce-eager",
-        # lora config below
         "--max-num-seqs",
         "128",
         "--enable-chunked-prefill",
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index c74f805961bc..22461f470db0 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -13,9 +13,8 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+
 
 BADREQUEST_CASES = [
     (
@@ -33,11 +32,11 @@
 
 
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, zephyr_lora_files):
+def server_with_lora_modules_json(request, qwen3_lora_files):
     # Define the json format LoRA module configurations
     lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
+        "name": "qwen3-lora",
+        "path": qwen3_lora_files,
         "base_model_name": MODEL_NAME,
     }
 
@@ -74,7 +73,7 @@ async def client(server_with_lora_modules_json):
 
 
 @pytest.mark.asyncio
-async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -82,17 +81,17 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
     assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[0].id == "qwen3-lora"
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     response = await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files},
+        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
     )
     # Ensure adapter loads before querying /models
     assert "success" in response
@@ -100,9 +99,9 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_file
     models = await client.models.list()
     models = models.data
     dynamic_lora_model = models[-1]
-    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.root == qwen3_lora_files
     assert dynamic_lora_model.parent == MODEL_NAME
-    assert dynamic_lora_model.id == "zephyr-lora-3"
+    assert dynamic_lora_model.id == "qwen3-lora-3"
 
 
 @pytest.mark.asyncio
@@ -134,7 +133,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
 async def test_dynamic_lora_badrequests(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
     test_name: str,
     config_change: dict,
     expected_error: str,
@@ -143,7 +142,7 @@ async def test_dynamic_lora_badrequests(
     test_dir = tmp_path / test_name
 
     # Copy adapter files
-    shutil.copytree(zephyr_lora_files, test_dir)
+    shutil.copytree(qwen3_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
@@ -167,7 +166,7 @@ async def test_dynamic_lora_badrequests(
 
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
@@ -178,7 +177,7 @@ async def load_and_run_adapter(adapter_name: str):
         await client.post(
             "load_lora_adapter",
             cast_to=str,
-            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
         )
         for _ in range(3):
             await client.completions.create(
@@ -199,7 +198,7 @@ async def load_and_run_adapter(adapter_name: str):
 
 @pytest.mark.asyncio
 async def test_loading_invalid_adapters_does_not_break_others(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     invalid_files = tmp_path / "invalid_files"
     invalid_files.mkdir()
@@ -215,7 +214,7 @@ async def run_good_requests(client):
         while not stop_good_requests_event.is_set():
             try:
                 batch = await client.completions.create(
-                    model="zephyr-lora",
+                    model="qwen3-lora",
                     prompt=["Hello there", "Foo bar bazz buzz"],
                     max_tokens=5,
                 )
@@ -254,7 +253,7 @@ async def run_good_requests(client):
     await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "valid", "lora_path": zephyr_lora_files},
+        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
     )
     await client.completions.create(
         model="valid",
@@ -267,7 +266,7 @@ async def run_good_requests(client):
 async def test_beam_search_with_lora_adapters(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
 ):
     """Validate that async beam search can be used with lora."""
 
@@ -275,7 +274,7 @@ async def load_and_run_adapter(adapter_name: str):
         await client.post(
             "load_lora_adapter",
             cast_to=str,
-            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
         )
         for _ in range(3):
             await client.completions.create(
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 7d2968d96506..e5af11edf7fa 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -8,13 +8,13 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(qwen3_lora_files):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -25,7 +25,7 @@ def server(zephyr_lora_files):
         # lora config below
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -45,12 +45,12 @@ async def client(server):
 
 
 @pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
     lora_models = models[1:]
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
+    assert lora_models[0].id == "qwen3-lora"
diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py
index 1ed44a33bf81..f58bbbcb7929 100644
--- a/tests/entrypoints/openai/test_orca_metrics.py
+++ b/tests/entrypoints/openai/test_orca_metrics.py
@@ -8,7 +8,7 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index adbcc1f2430c..cedf6ce16060 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -11,11 +11,11 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
+def default_server_args(qwen3_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -28,7 +28,7 @@ def default_server_args(zephyr_lora_files):
         # lora config
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 7fd32e1c7be1..d23628671a87 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -10,7 +10,7 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_uds.py b/tests/entrypoints/openai/test_uds.py
index 5c39869a794f..c79a4870dea3 100644
--- a/tests/entrypoints/openai/test_uds.py
+++ b/tests/entrypoints/openai/test_uds.py
@@ -10,7 +10,7 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py
index 4c859c2527d2..8b2a7668a150 100644
--- a/tests/entrypoints/sagemaker/conftest.py
+++ b/tests/entrypoints/sagemaker/conftest.py
@@ -9,7 +9,7 @@
 from ...utils import RemoteOpenAIServer
 
 # Model name constants used across tests
-MODEL_NAME_ZEPHYR = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME_ZEPHYR = "Qwen/Qwen3-0.6B"
 MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
 LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"
 
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index 86f4f3692a97..3348d2f8ce65 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -9,7 +9,7 @@
 from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
 
 # Provide absolute path and huggingface lora ids
-lora_fixture_name = ["qwen3_lora_files", "qwen3_lora_huggingface_id"]
+lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
 LLAMA_LORA_MODULES = [
     "qkv_proj",
     "o_proj",
diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
index cd98efdd1390..4615ed966e1a 100644
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
@@ -8,8 +8,8 @@
 
 from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
 
-MODEL_NAME = "mistralai/Mistral-7B-v0.1"
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+LORA_NAME = "charent/self_cognition_Alic"
 PA_NAME = "swapnilbp/llama_tweet_ptune"
 
 
@@ -21,7 +21,7 @@ def adapter_cache(request, tmpdir_factory):
 
 
 @pytest.fixture(scope="module")
-def zephyr_lora_files():
+def qwen3_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
 
 
@@ -31,9 +31,9 @@ def pa_files():
 
 
 @pytest.mark.asyncio
-async def test_filesystem_resolver(adapter_cache, zephyr_lora_files):
+async def test_filesystem_resolver(adapter_cache, qwen3_lora_files):
     model_files = adapter_cache / LORA_NAME
-    shutil.copytree(zephyr_lora_files, model_files)
+    shutil.copytree(qwen3_lora_files, model_files)
 
     fs_resolver = FilesystemResolver(adapter_cache)
     assert fs_resolver is not None

From 6be19a8275f31018df0a14dc9bdcd65338d1489e Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 28 Nov 2025 01:55:54 +0000
Subject: [PATCH 5/8] FIX

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/conftest.py         |  4 ++--
 tests/lora/test_peft_helper.py | 26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index abd92d9c910d..be3ddf693383 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -245,8 +245,8 @@ def llama32_lora_huggingface_id():
 
 
 @pytest.fixture(scope="session")
-def llama32_lora_files():
-    return snapshot_download(repo_id="llama32_lora_huggingface_id")
+def llama32_lora_files(llama32_lora_huggingface_id):
+    return snapshot_download(repo_id=llama32_lora_huggingface_id)
 
 
 @pytest.fixture
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index c8a632d71478..e3035b00e9e0 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -25,31 +25,33 @@
 ]
 
 
-def test_peft_helper_pass(qwen3_lora_files, tmp_path):
+def test_peft_helper_pass(llama32_lora_files, tmp_path):
     peft_helper = PEFTHelper.from_local_dir(
-        qwen3_lora_files, max_position_embeddings=4096
+        llama32_lora_files, max_position_embeddings=4096
     )
     lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
     peft_helper.validate_legal(lora_config)
     assert peft_helper.r == 8
     assert peft_helper.lora_alpha == 32
-    assert peft_helper.target_modules == [
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj",
-        "gate_proj",
-        "up_proj",
+    target_modules = sorted(peft_helper.target_modules)
+
+    assert target_modules == [
         "down_proj",
         "embed_tokens",
+        "gate_proj",
+        "k_proj",
         "lm_head",
+        "o_proj",
+        "q_proj",
+        "up_proj",
+        "v_proj",
     ]
     assert peft_helper.vllm_max_position_embeddings == 4096
 
     # test RSLoRA
     rslora_config = dict(use_rslora=True)
     test_dir = tmp_path / "test_rslora"
-    shutil.copytree(qwen3_lora_files, test_dir)
+    shutil.copytree(llama32_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
@@ -70,14 +72,14 @@ def test_peft_helper_pass(qwen3_lora_files, tmp_path):
 
 @pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
 def test_peft_helper_error(
-    qwen3_lora_files,
+    llama32_lora_files,
     tmp_path,
     test_name: str,
     config_change: dict,
     expected_error: str,
 ):
     test_dir = tmp_path / test_name
-    shutil.copytree(qwen3_lora_files, test_dir)
+    shutil.copytree(llama32_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"

From a7a372f48db006e8ffee925703925b00866ec7ed Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 28 Nov 2025 16:35:14 +0000
Subject: [PATCH 6/8] Fix test

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/entrypoints/openai/test_chat.py         | 28 ++++++++++++-------
 tests/lora/test_olmoe_tp.py                   | 22 +++++++++++----
 .../test_filesystem_resolver.py               |  2 +-
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 83e44884c812..b2909f21e4dd 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -16,11 +16,19 @@
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
 @pytest.fixture(scope="module")
-def server(qwen3_lora_files):  # noqa: F811
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test module."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -31,7 +39,7 @@ def server(qwen3_lora_files):  # noqa: F811
         # lora config below
         "--enable-lora",
         "--lora-modules",
-        f"qwen3-lora={qwen3_lora_files}",
+        f"zephyr-lora={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -54,7 +62,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -78,7 +86,7 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -104,7 +112,7 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -130,7 +138,7 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -239,7 +247,7 @@ async def test_more_than_one_prompt_logprobs_chat(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -284,7 +292,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "qwen3-lora"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
     messages = [
@@ -330,7 +338,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    ["Qwen/Qwen3-0.6B", "qwen3-lora"],
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
 async def test_chat_completion_stream_options(
     client: openai.AsyncOpenAI, model_name: str
diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py
index e3c9816625ba..e10419d244c3 100644
--- a/tests/lora/test_olmoe_tp.py
+++ b/tests/lora/test_olmoe_tp.py
@@ -40,7 +40,10 @@
 
 
 def generate_and_test(
-    llm: vllm.LLM, lora_path: str, lora_id: list[int | None] | int | None
+    llm: vllm.LLM,
+    lora_path: str,
+    lora_id: list[int | None] | int | None,
+    compare_lower: bool = False,
 ) -> None:
     prompts = [
         PROMPT_TEMPLATE.format(context="How many candidates are there?"),
@@ -74,12 +77,18 @@ def generate_and_test(
 
     for i in range(len(EXPECTED_LORA_OUTPUT)):
         req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id
+        generated_text = generated_texts[i]
         expected_output = (
             EXPECTED_LORA_OUTPUT[i]
             if req_lora_id is not None
             else EXPECTED_BASE_MODEL_OUTPUT[i]
         )
-        assert generated_texts[i].startswith(expected_output)
+
+        if compare_lower:
+            generated_text = generated_text.lower()
+            expected_output = expected_output.lower()
+
+        assert generated_text.startswith(expected_output)
 
 
 def test_olmoe_lora(olmoe_lora_files):
@@ -146,6 +155,9 @@ def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras):
         tensor_parallel_size=4,
         fully_sharded_loras=fully_sharded_loras,
     )
-
-    generate_and_test(llm, olmoe_lora_files, lora_id=1)
-    generate_and_test(llm, olmoe_lora_files, lora_id=2)
+    generate_and_test(
+        llm, olmoe_lora_files, lora_id=1, compare_lower=fully_sharded_loras
+    )
+    generate_and_test(
+        llm, olmoe_lora_files, lora_id=2, compare_lower=fully_sharded_loras
+    )
diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
index 4615ed966e1a..d4adf6f84cf0 100644
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
@@ -9,7 +9,7 @@
 from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
 
 MODEL_NAME = "Qwen/Qwen3-0.6B"
-LORA_NAME = "charent/self_cognition_Alic"
+LORA_NAME = "charent/self_cognition_Alice"
 PA_NAME = "swapnilbp/llama_tweet_ptune"
 
 

From 828ee299180d472e0bd2e903354354bbb13b443e Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 29 Nov 2025 00:47:48 +0000
Subject: [PATCH 7/8] Fix test

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/entrypoints/conftest.py                 | 2 +-
 tests/entrypoints/openai/test_orca_metrics.py | 3 ++-
 tests/entrypoints/sagemaker/conftest.py       | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index 1ce83ec55606..9ab50c44aa4a 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -189,7 +189,7 @@ def sample_sql_statements():
 
 @pytest.fixture(scope="session")
 def qwen3_lora_files():
-    """Download zephyr LoRA files once per test session."""
+    """Download Qwen3 LoRA files once per test session."""
     from huggingface_hub import snapshot_download
 
     return snapshot_download(repo_id="charent/self_cognition_Alice")
diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py
index f58bbbcb7929..1ce043df0cd8 100644
--- a/tests/entrypoints/openai/test_orca_metrics.py
+++ b/tests/entrypoints/openai/test_orca_metrics.py
@@ -110,8 +110,9 @@ async def test_single_completion(client: openai.AsyncOpenAI):
     choice = completion.choices[0]
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
+    # With Qwen3-0.6B the prompt tokenizes to 5 tokens: [9707, 11, 847, 829, 374]
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11
+        completion_tokens=5, prompt_tokens=5, total_tokens=10
     )
 
     # test using token IDs
diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py
index 8b2a7668a150..ad219eec18b7 100644
--- a/tests/entrypoints/sagemaker/conftest.py
+++ b/tests/entrypoints/sagemaker/conftest.py
@@ -9,7 +9,6 @@
 from ...utils import RemoteOpenAIServer
 
 # Model name constants used across tests
-MODEL_NAME_ZEPHYR = "Qwen/Qwen3-0.6B"
 MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
 LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"
 

From e190b62922d3bea9eb82c7d9951154a1eb919797 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 29 Nov 2025 01:58:53 +0000
Subject: [PATCH 8/8] Modify assert

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/lora/models.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index d9f2dd774add..f568b8b9ba59 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -129,8 +129,12 @@ def from_lora_tensors(
                 )
 
             if is_lora_a:
-                if "lora_embedding_A" in tensor_name and model_vocab_size is not None:
-                    assert model_vocab_size == tensor.shape[1], (
+                if (
+                    "lora_embedding_A" in tensor_name
+                    and model_vocab_size is not None
+                    and model_vocab_size != tensor.shape[1]
+                ):
+                    raise RuntimeError(
                         f"The embedding LoRA size({tensor.shape[1]}) must be consistent"
                         f" with the base model's vocabulary size({model_vocab_size})."
                     )