Skip to content
6 changes: 2 additions & 4 deletions examples/offline_inference/multilora_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def create_test_prompts(
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003],
),
LoRARequest("sql-lora", 1, lora_path),
),
Expand All @@ -57,7 +56,6 @@ def create_test_prompts(
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003],
),
LoRARequest("sql-lora2", 2, lora_path),
),
Expand Down Expand Up @@ -98,7 +96,7 @@ def initialize_engine() -> LLMEngine:
# use the same rank, it is recommended to set this as low as possible.
# max_cpu_loras: controls the size of the CPU LoRA cache.
engine_args = EngineArgs(
model="meta-llama/Llama-2-7b-hf",
model="meta-llama/Llama-3.2-3B-Instruct",
enable_lora=True,
max_loras=1,
max_lora_rank=8,
Expand All @@ -111,7 +109,7 @@ def initialize_engine() -> LLMEngine:
def main():
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine()
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have removed the LoRA extra-vocab code, so we no longer support this type of LoRA weight. Accordingly, I changed the base model and the LoRA adapter used in this script.

test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts)

Expand Down
6 changes: 3 additions & 3 deletions tests/entrypoints/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ def sample_sql_statements():


@pytest.fixture(scope="session")
def zephyr_lora_files():
"""Download zephyr LoRA files once per test session."""
def qwen3_lora_files():
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a smaller LoRA model can reduce CI pressure.

"""Download Qwen3 LoRA files once per test session."""
from huggingface_hub import snapshot_download

return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
return snapshot_download(repo_id="charent/self_cognition_Alice")


@pytest.fixture(scope="session")
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
Expand Down
8 changes: 8 additions & 0 deletions tests/entrypoints/openai/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def zephyr_lora_files():
"""Download zephyr LoRA files once per test module."""
# Defined in this module because MODEL_NAME here is still zephyr-7b-beta and
# the server fixture below requests this adapter by name.
from huggingface_hub import snapshot_download

return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")


@pytest.fixture(scope="module")
def server(zephyr_lora_files): # noqa: F811
args = [
Expand Down
3 changes: 1 addition & 2 deletions tests/entrypoints/openai/test_chunked_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
Expand All @@ -20,7 +20,6 @@ def server():
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--max-num-seqs",
"128",
"--enable-chunked-prefill",
Expand Down
43 changes: 21 additions & 22 deletions tests/entrypoints/openai/test_lora_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
MODEL_NAME = "Qwen/Qwen3-0.6B"


BADREQUEST_CASES = [
(
Expand All @@ -33,11 +32,11 @@


@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, zephyr_lora_files):
def server_with_lora_modules_json(request, qwen3_lora_files):
# Define the json format LoRA module configurations
lora_module_1 = {
"name": "zephyr-lora",
"path": zephyr_lora_files,
"name": "qwen3-lora",
"path": qwen3_lora_files,
"base_model_name": MODEL_NAME,
}

Expand Down Expand Up @@ -74,35 +73,35 @@ async def client(server_with_lora_modules_json):


@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
models = await client.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert served_model.root == MODEL_NAME
assert served_model.parent is None
assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora"
assert lora_models[0].id == "qwen3-lora"


@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
response = await client.post(
"load_lora_adapter",
cast_to=str,
body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files},
body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
)
# Ensure adapter loads before querying /models
assert "success" in response

models = await client.models.list()
models = models.data
dynamic_lora_model = models[-1]
assert dynamic_lora_model.root == zephyr_lora_files
assert dynamic_lora_model.root == qwen3_lora_files
assert dynamic_lora_model.parent == MODEL_NAME
assert dynamic_lora_model.id == "zephyr-lora-3"
assert dynamic_lora_model.id == "qwen3-lora-3"


@pytest.mark.asyncio
Expand Down Expand Up @@ -134,7 +133,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
async def test_dynamic_lora_badrequests(
client: openai.AsyncOpenAI,
tmp_path,
zephyr_lora_files,
qwen3_lora_files,
test_name: str,
config_change: dict,
expected_error: str,
Expand All @@ -143,7 +142,7 @@ async def test_dynamic_lora_badrequests(
test_dir = tmp_path / test_name

# Copy adapter files
shutil.copytree(zephyr_lora_files, test_dir)
shutil.copytree(qwen3_lora_files, test_dir)

# Load and modify configuration
config_path = test_dir / "adapter_config.json"
Expand All @@ -167,7 +166,7 @@ async def test_dynamic_lora_badrequests(

@pytest.mark.asyncio
async def test_multiple_lora_adapters(
client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
"""Validate that many loras can be dynamically registered and inferenced
with concurrently"""
Expand All @@ -178,7 +177,7 @@ async def load_and_run_adapter(adapter_name: str):
await client.post(
"load_lora_adapter",
cast_to=str,
body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
)
for _ in range(3):
await client.completions.create(
Expand All @@ -199,7 +198,7 @@ async def load_and_run_adapter(adapter_name: str):

@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
invalid_files = tmp_path / "invalid_files"
invalid_files.mkdir()
Expand All @@ -215,7 +214,7 @@ async def run_good_requests(client):
while not stop_good_requests_event.is_set():
try:
batch = await client.completions.create(
model="zephyr-lora",
model="qwen3-lora",
prompt=["Hello there", "Foo bar bazz buzz"],
max_tokens=5,
)
Expand Down Expand Up @@ -254,7 +253,7 @@ async def run_good_requests(client):
await client.post(
"load_lora_adapter",
cast_to=str,
body={"lora_name": "valid", "lora_path": zephyr_lora_files},
body={"lora_name": "valid", "lora_path": qwen3_lora_files},
)
await client.completions.create(
model="valid",
Expand All @@ -267,15 +266,15 @@ async def run_good_requests(client):
async def test_beam_search_with_lora_adapters(
client: openai.AsyncOpenAI,
tmp_path,
zephyr_lora_files,
qwen3_lora_files,
):
"""Validate that async beam search can be used with lora."""

async def load_and_run_adapter(adapter_name: str):
await client.post(
"load_lora_adapter",
cast_to=str,
body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
)
for _ in range(3):
await client.completions.create(
Expand Down
12 changes: 6 additions & 6 deletions tests/entrypoints/openai/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"
# NOTE(review): the remark about needing Mistral-7B-v0.1 as base applied to the
# previous zephyr-7b-beta model; we're not testing generation quality here


@pytest.fixture(scope="module")
def server(zephyr_lora_files):
def server(qwen3_lora_files):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
Expand All @@ -25,7 +25,7 @@ def server(zephyr_lora_files):
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"qwen3-lora={qwen3_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
Expand All @@ -45,12 +45,12 @@ async def client(server):


@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
models = await client.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert served_model.root == MODEL_NAME
assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora"
assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
assert lora_models[0].id == "qwen3-lora"
5 changes: 3 additions & 2 deletions tests/entrypoints/openai/test_orca_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -110,8 +110,9 @@ async def test_single_completion(client: openai.AsyncOpenAI):
choice = completion.choices[0]
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
# When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11
completion_tokens=5, prompt_tokens=5, total_tokens=10
)

# test using token IDs
Expand Down
6 changes: 3 additions & 3 deletions tests/entrypoints/openai/test_return_tokens_as_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files):
def default_server_args(qwen3_lora_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
Expand All @@ -28,7 +28,7 @@ def default_server_args(zephyr_lora_files):
# lora config
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"qwen3-lora={qwen3_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_uds.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
Expand Down
1 change: 0 additions & 1 deletion tests/entrypoints/sagemaker/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from ...utils import RemoteOpenAIServer

# Model name constants used across tests
MODEL_NAME_ZEPHYR = "HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME_SMOLLM = "HuggingFaceTB/SmolLM2-135M-Instruct"
LORA_ADAPTER_NAME_SMOLLM = "jekunz/smollm-135m-lora-fineweb-faroese"

Expand Down
27 changes: 8 additions & 19 deletions tests/lora/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,23 +154,6 @@ def dummy_model_gate_up() -> nn.Module:
return model


@pytest.fixture(scope="session")
def llama_2_7b_base_huggingface_id():
# used as a base model for testing with sql lora adapter
return "meta-llama/Llama-2-7b-hf"


@pytest.fixture(scope="session")
def sql_lora_huggingface_id():
# huggingface repo id is used to test lora runtime downloading.
return "yard1/llama-2-7b-sql-lora-test"


@pytest.fixture(scope="session")
def sql_lora_files(sql_lora_huggingface_id):
return snapshot_download(repo_id=sql_lora_huggingface_id)


@pytest.fixture(scope="session")
def mixtral_lora_files():
# Note: this module has incorrect adapter_config.json to test
Expand Down Expand Up @@ -256,8 +239,14 @@ def qwen3_lora_files():


@pytest.fixture(scope="session")
def llama32_lora_files():
return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
def llama32_lora_huggingface_id():
# Hugging Face repo id of the LoRA adapter; returned as an id (not downloaded
# here) so it can be used to test LoRA runtime downloading.
return "jeeejeee/llama32-3b-text2sql-spider"


@pytest.fixture(scope="session")
def llama32_lora_files(llama32_lora_huggingface_id):
# Pass the repo id supplied by the llama32_lora_huggingface_id fixture to
# snapshot_download and return its result.
return snapshot_download(repo_id=llama32_lora_huggingface_id)


@pytest.fixture
Expand Down
Loading