115 commits
eb16239
checkpoint prototype
Jan 3, 2025
8549fdd
Issue currently is with streaming. The HTTP exception handlers do not…
Jan 3, 2025
77801cd
switch from ValueError -> Exception.
Jan 4, 2025
1bbc3a4
merged
Jan 4, 2025
8eca864
updated
Jan 4, 2025
b8c77b3
stash
Jan 4, 2025
ce9b8ef
stash
Jan 4, 2025
3a760a7
add watchdog
Jan 4, 2025
3024da0
updated
Jan 4, 2025
5af8189
revert spurious changes
Jan 4, 2025
3cb21bb
updated
Jan 4, 2025
7c97308
updated
Jan 4, 2025
ea6824a
updated
Jan 4, 2025
b278065
remove cruft
Jan 4, 2025
c004bd4
cruft
Jan 4, 2025
2556bc4
stash
Jan 4, 2025
db0b9e6
fix llama
Jan 4, 2025
f722589
updated
Jan 4, 2025
de75cc4
cruft
Jan 4, 2025
ba5ca87
cruft
Jan 4, 2025
4f6b68a
updated
Jan 4, 2025
949d425
updated
Jan 4, 2025
f67398b
updated
Jan 4, 2025
b3d2994
updated
Jan 4, 2025
34a997a
update comment
Jan 4, 2025
32cf91b
update comment
Jan 4, 2025
c73801c
fix more
Jan 4, 2025
1188845
updated
Jan 4, 2025
706782c
udpatd
Jan 4, 2025
1cc0915
added exception file
Jan 4, 2025
8db0eee
updated
Jan 4, 2025
2fc8af6
fixt
Jan 4, 2025
de39af1
reduce cruft
Jan 5, 2025
732ba64
reduce cruft
Jan 5, 2025
4372094
cleanup
Jan 5, 2025
b9144a3
updated
Jan 5, 2025
d90e122
cruft
Jan 5, 2025
2bbac31
updated
Jan 5, 2025
c40542a
revert changes to server
Jan 5, 2025
46734eb
revert debug cruft
Jan 5, 2025
f0baffb
fix error
Jan 5, 2025
8a7f18e
added tests
Jan 5, 2025
a662940
revert
Jan 5, 2025
4ee6390
fixed
Jan 5, 2025
3e23ee2
updated
Jan 5, 2025
45456f9
fixed error
Jan 5, 2025
6128b1a
update test coverage
Jan 5, 2025
de24559
stash
Jan 5, 2025
7adf26e
added tests
Jan 6, 2025
bf92854
stash
Jan 7, 2025
8dae5c6
updated
Feb 7, 2025
6b4fe88
updated
Feb 7, 2025
efe85ee
updared
Feb 7, 2025
6195795
fix typo
Feb 7, 2025
0b25586
updated
Feb 7, 2025
0b77b79
updated
Feb 8, 2025
61f3dd7
stash
Feb 8, 2025
fbf19ad
updated
Feb 8, 2025
d25ce5c
updated
Feb 8, 2025
23342d7
remove signal handler
Feb 8, 2025
ebdf8f9
remove signal handler
Feb 8, 2025
6a37020
update comment
Feb 8, 2025
2ed3349
avoid sigusr1
Feb 8, 2025
f9ef3d8
cleanup
Feb 8, 2025
95c249f
cleanup
Feb 8, 2025
030c671
cleanup
Feb 8, 2025
1bdb212
cleanup
Feb 8, 2025
25412a0
updated
Feb 8, 2025
7cf0647
updated
Feb 8, 2025
352da94
it starts?
Feb 8, 2025
a69e040
updated
Feb 8, 2025
8dddc20
updated
Feb 8, 2025
7b48b87
updated
Feb 8, 2025
7400852
updated
Feb 8, 2025
80317a0
updated
Feb 8, 2025
ca37960
nits
Feb 8, 2025
2d41499
fix test for bunched streaming
Feb 8, 2025
4a39d39
tweak typing
Feb 8, 2025
43360f0
Update tests/v1/shutdown/test_forward_error.py
robertgshaw2-redhat Feb 10, 2025
4d0f44f
Merge branch 'main' into api-server-error-handling
robertgshaw2-redhat Feb 10, 2025
218d095
pre commit
Feb 10, 2025
c395634
Update tests/v1/shutdown/test_forward_error.py
robertgshaw2-redhat Feb 10, 2025
042c486
Update vllm/v1/engine/core.py
robertgshaw2-redhat Feb 10, 2025
b5a7b6f
Update vllm/v1/engine/core.py
robertgshaw2-redhat Feb 10, 2025
dab77cf
Update tests/v1/shutdown/test_forward_error.py
robertgshaw2-redhat Feb 10, 2025
37e0c10
Rebase
rafvasq Feb 27, 2025
00974d9
Merge branch 'main' of https://github.com/vllm-project/vllm into api-…
rafvasq Feb 28, 2025
62346fc
Small fixes, splits startup tests to sim OOM
rafvasq Feb 28, 2025
3580322
Lints
rafvasq Feb 28, 2025
ba2fc3a
cleanup
Feb 28, 2025
b072a14
updated
Feb 28, 2025
d85f725
updated
Mar 1, 2025
3478e7a
updated
Mar 1, 2025
e03a211
cleaning up
Mar 1, 2025
57644ad
cleanup pr
Mar 1, 2025
d9496b4
cleanup pr
Mar 1, 2025
7df1ca7
updated
Mar 1, 2025
27751a7
updated
Mar 1, 2025
6e823ad
updated
Mar 1, 2025
7e3ffe8
updated
Mar 1, 2025
f405db8
updatd
Mar 1, 2025
867ff8f
updated
Mar 1, 2025
a8403ac
update to ensure we call shutdown on RPC error
Mar 1, 2025
18a4536
fixed
Mar 1, 2025
ed2759b
updated
Mar 1, 2025
113255e
updated
Mar 1, 2025
f79d23f
updated
Mar 1, 2025
a09ef27
updated
Mar 1, 2025
9b11b6c
updated
Mar 1, 2025
1f7ed2e
updated
Mar 1, 2025
fba8e41
updated
Mar 1, 2025
9a3c861
removed mp=0 tests
Mar 1, 2025
38857d8
removed non mp tests
Mar 1, 2025
ac06927
removed non mp tests
Mar 1, 2025
b002dcf
fixed error
Mar 2, 2025
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -197,6 +197,7 @@ steps:
commands:
# split the test to avoid interference
- VLLM_USE_V1=1 pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/shutdown
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker
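Note: the new CI step simply runs pytest over tests/v1/shutdown with VLLM_USE_V1=1. A rough local equivalent in Python (assuming a CUDA machine, the repo root as the working directory, and access to the test models; not part of this PR) is:

```python
# Rough local equivalent of the new CI step; paths and env handling are
# illustrative, not part of this PR.
import os

import pytest

os.environ["VLLM_USE_V1"] = "1"
# Same selection as the pipeline step: everything under tests/v1/shutdown.
raise SystemExit(pytest.main(["-v", "-s", "tests/v1/shutdown"]))
```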
7 changes: 3 additions & 4 deletions tests/v1/engine/test_async_llm.py
@@ -6,14 +6,14 @@

import pytest

from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG
from vllm import SamplingParams
from vllm.assets.image import ImageAsset
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import PromptType
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError

if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.",
@@ -88,10 +88,11 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
apc_engine_args = AsyncEngineArgs(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.8,
enforce_eager=True,
disable_log_requests=True)
engine = AsyncLLM.from_engine_args(apc_engine_args)
try:
with pytest.raises(ValueError) as excinfo:
with pytest.raises(EngineGenerateError):
# Issue a request with prompt logprobs enabled, which should fail
await asyncio.create_task(
generate(engine,
@@ -100,8 +101,6 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc(
output_kind,
10,
prompt_logprobs=5))
# Validate exception string is correct
assert str(excinfo.value) == PLP_APC_UNSUPPORTED_MSG
finally:
# Shut down engine
engine.shutdown()
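Note: the test now expects EngineGenerateError rather than matching the raw ValueError string. The new vllm/v1/engine/exceptions module itself is not shown in this section; a minimal sketch of the two exception types the tests rely on (illustrative only, the real definitions may carry extra state and messages) could look like:

```python
# Illustrative sketch only; see vllm/v1/engine/exceptions.py in this PR for
# the actual definitions.
class EngineGenerateError(Exception):
    """A single generate() request failed (e.g. invalid input); the engine
    itself remains healthy and keeps serving other requests."""


class EngineDeadError(Exception):
    """The EngineCore process has died; all in-flight and future requests
    fail with this error."""
```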
117 changes: 117 additions & 0 deletions tests/v1/shutdown/test_forward_error.py
@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle an Error in model forward and shutdown."""

import asyncio

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from vllm import LLM, SamplingParams
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import GiB_bytes, cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError


def evil_forward(self, *args, **kwargs):
"""Evil forward method that raises an exception after 10 calls."""
NUMBER_OF_GOOD_PASSES = 10

if not hasattr(self, "num_calls"):
self.num_calls = 0

if (self.num_calls == NUMBER_OF_GOOD_PASSES
and get_tensor_model_parallel_rank() == 0):
raise Exception("Simulated illegal memory access on Rank 0!")
self.num_calls += 1

return self.model(*args, **kwargs)


@pytest.mark.asyncio
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
async def test_async_llm_model_error(monkeypatch, tensor_parallel_size):

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Monkeypatch an error in the model.
m.setattr(LlamaForCausalLM, "forward", evil_forward)

engine_args = AsyncEngineArgs(
model="meta-llama/Llama-3.2-1B",
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)
async_llm = AsyncLLM.from_engine_args(engine_args)

async def generate(request_id: str):
generator = async_llm.generate("Hello my name is",
request_id=request_id,
sampling_params=SamplingParams())
try:
async for _ in generator:
pass
except Exception as e:
return e

NUM_REQS = 3
tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)]
outputs = await asyncio.gather(*tasks)

# Every request should get an EngineDeadError.
for output in outputs:
assert isinstance(output, EngineDeadError)

# AsyncLLM should be errored.
assert async_llm.errored

# We should not be able to make another request.
with pytest.raises(EngineDeadError):
async for _ in async_llm.generate(
"Hello my name is",
request_id="abc",
sampling_params=SamplingParams()):
raise Exception("We should not get here.")

# Confirm all the processes are cleaned up.
wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)

# NOTE: in a server deployment the API server handles shutdown when an
# exception occurs; since this test drives AsyncLLM directly, we call
# shutdown() ourselves.
async_llm.shutdown()


@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
def test_llm_model_error(monkeypatch, tensor_parallel_size):

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Monkeypatch an error in the model.
m.setattr(LlamaForCausalLM, "forward", evil_forward)

llm = LLM(model="meta-llama/Llama-3.2-1B",
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)

with pytest.raises(EngineDeadError):
llm.generate("Hello my name is Robert and I")

# Confirm all the processes are cleaned up.
wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)
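Note: both tests rely on wait_for_gpu_memory_to_clear from tests/utils to confirm that the worker processes actually released GPU memory. A rough, self-contained approximation using pynvml (an illustration, not the helper's actual implementation) is:

```python
# Illustrative approximation of tests/utils.wait_for_gpu_memory_to_clear.
import time

import pynvml


def wait_for_gpu_memory_to_clear_sketch(devices, threshold_bytes, timeout_s):
    """Poll NVML until each device's used memory drops below the threshold."""
    pynvml.nvmlInit()
    try:
        deadline = time.monotonic() + timeout_s
        while True:
            used = {
                dev: pynvml.nvmlDeviceGetMemoryInfo(
                    pynvml.nvmlDeviceGetHandleByIndex(dev)).used
                for dev in devices
            }
            if all(u < threshold_bytes for u in used.values()):
                return
            if time.monotonic() > deadline:
                raise TimeoutError(f"GPU memory did not clear: {used}")
            time.sleep(1.0)
    finally:
        pynvml.nvmlShutdown()
```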
65 changes: 65 additions & 0 deletions tests/v1/shutdown/test_processor_error.py
@@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
"""Test error handling in Processor. Should not impact other reqs."""

import asyncio

import pytest

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError


@pytest.mark.asyncio
async def test_async_llm_processor_error(monkeypatch):

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
enforce_eager=True)
async_llm = AsyncLLM.from_engine_args(engine_args)

async def generate(request_id: str):
# [] is not allowed and will raise a ValueError in Processor.
generator = async_llm.generate(TokensPrompt(prompt_token_ids=[]),
request_id=request_id,
sampling_params=SamplingParams())
try:
async for _ in generator:
pass
except Exception as e:
return e

NUM_REQS = 3
tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)]
outputs = await asyncio.gather(*tasks)

# Every request should get an EngineGenerateError.
for output in outputs:
with pytest.raises(EngineGenerateError):
raise output

# AsyncLLM should not be errored; the engine is still healthy.
assert not async_llm.errored

# A new request should still complete without issues.
EXPECTED_TOKENS = 5
outputs = []
async for out in async_llm.generate(
"Hello my name is",
request_id="abc",
sampling_params=SamplingParams(
max_tokens=EXPECTED_TOKENS,
output_kind=RequestOutputKind.DELTA)):
outputs.append(out)

generated_tokens = []
for out in outputs:
generated_tokens.extend(out.outputs[0].token_ids)
assert len(generated_tokens) == EXPECTED_TOKENS

async_llm.shutdown()
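Note: the contrast with the forward-error test is that a Processor failure is scoped to a single request; the ValueError is surfaced as EngineGenerateError and the engine stays usable. A GPU-free toy illustration of that wrapping pattern (hypothetical names, not vLLM's actual code path):

```python
class EngineGenerateError(Exception):
    """Per-request failure; the engine itself is still healthy."""


def process_inputs(prompt_token_ids: list[int]) -> list[int]:
    # Stand-in for the Processor's input validation.
    if not prompt_token_ids:
        raise ValueError("prompt_token_ids must be non-empty")
    return prompt_token_ids


def generate(prompt_token_ids: list[int]) -> list[int]:
    try:
        return process_inputs(prompt_token_ids)
    except ValueError as e:
        # Chain the validation error so callers can still inspect the cause.
        raise EngineGenerateError("request failed in input processing") from e


try:
    generate([])
except EngineGenerateError as e:
    assert isinstance(e.__cause__, ValueError)
```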
152 changes: 152 additions & 0 deletions tests/v1/shutdown/test_startup_error.py
@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from vllm import LLM
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import GiB_bytes, cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM


def evil_forward(self, *args, **kwargs):
"""Evil forward method that raises an exception."""

if get_tensor_model_parallel_rank() == 0:
raise RuntimeError("Simulated Error during forward pass!")

return self.model(*args, **kwargs)


def evil_load_weights(self, *args, **kwargs):
"""Evil load_weights method that raises an exception."""

raise RuntimeError("Simulated OOM Error during weight loading!")


MODELS = [
"meta-llama/Llama-3.2-1B",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
def test_async_llm_forward_pass_error(monkeypatch, model,
tensor_parallel_size):
"""Test failure during first forward pass"""

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Simulate an error during the forward pass.
m.setattr(LlamaForCausalLM, "forward", evil_forward)

engine_args = AsyncEngineArgs(
model=model,
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)

# Confirm we get an exception.
with pytest.raises(Exception,
match="EngineCore initialization failed"):
_ = AsyncLLM.from_engine_args(engine_args)

# Confirm all the processes are cleaned up.
wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
def test_async_llm_weight_loading_failure(monkeypatch, model,
tensor_parallel_size):
"""Test failure during first forward pass"""

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Simulate an error during weight loading.
m.setattr(LlamaForCausalLM, "load_weights", evil_load_weights)

engine_args = AsyncEngineArgs(
model=model,
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)

# Confirm we get an exception.
with pytest.raises(Exception,
match="EngineCore initialization failed"):
_ = AsyncLLM.from_engine_args(engine_args)

# Confirm all the processes are cleaned up.
wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
def test_llm_forward_pass_failure(monkeypatch, model, tensor_parallel_size):
"""Test failure during first forward pass (after IPC setup)."""

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Simulate error during forward pass
m.setattr(LlamaForCausalLM, "forward", evil_forward)

with pytest.raises(Exception,
match="EngineCore initialization failed"):
_ = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)

wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tensor_parallel_size", [2, 1])
def test_llm_weight_loading_failure(monkeypatch, model, tensor_parallel_size):
"""Test failure during weight loading (before IPC setup)."""

if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

# Simulate error during weight loading
m.setattr(LlamaForCausalLM, "load_weights", evil_load_weights)

with pytest.raises(Exception,
match="EngineCore initialization failed"):
_ = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tensor_parallel_size)

wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)),
threshold_bytes=2 * GiB_bytes,
timeout_s=60,
)
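Note: all of these startup tests use the same fault-injection technique: monkeypatch.setattr on a method of the model class, so that any worker constructing the model hits the error. A GPU-free toy version of the pattern (hypothetical Model class, not vLLM code):

```python
import pytest


class Model:
    """Stand-in for LlamaForCausalLM."""

    def forward(self, x: int) -> int:
        return x * 2


def evil_forward(self, x: int) -> int:
    raise RuntimeError("Simulated error during forward pass!")


def test_forward_failure(monkeypatch: pytest.MonkeyPatch) -> None:
    # Patching the class (not an instance) affects every instance created
    # afterwards, mirroring how the real tests patch LlamaForCausalLM.
    monkeypatch.setattr(Model, "forward", evil_forward)
    with pytest.raises(RuntimeError, match="Simulated"):
        Model().forward(1)
```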