14 changes: 14 additions & 0 deletions tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is the first paragraph of Moby Dick?",
reasoning={"effort": "low"},
max_output_tokens=30,
)
assert response is not None
assert response.status == "incomplete"
assert response.incomplete_details.reason == "max_output_tokens"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
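For reference, a minimal client-side sketch of what the new test exercises. The base URL and model name below are placeholders for a local vLLM deployment, not values taken from this PR:

```python
from openai import OpenAI

# Placeholder endpoint and model; point these at your own vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="my-harmony-model",
    input="What is the first paragraph of Moby Dick?",
    reasoning={"effort": "low"},
    max_output_tokens=30,
)

# With such a small token budget the generation is cut off, so the response
# should report why it is incomplete instead of claiming completion.
assert response.status == "incomplete"
print(response.incomplete_details.reason)  # expected: "max_output_tokens"
```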
25 changes: 15 additions & 10 deletions vllm/entrypoints/context.py
@@ -112,6 +112,7 @@ def __init__(
available_tools: list[str],
):
self._messages = messages
self.finish_reason: Optional[str] = None
self.available_tools = available_tools
self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ def _update_num_reasoning_tokens(self):
if self.parser.current_channel in {"analysis", "commentary"}:
self.num_reasoning_tokens += 1

def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
output_token_ids = output.outputs[0].token_ids
self.parser = get_streamable_parser_for_assistant()
@@ -150,25 +152,27 @@ def append_output(self, output) -> None:
# Move current turn to previous turn for next turn's calculations
self.previous_turn = self.current_turn.copy()
output_msgs = self.parser.messages
# The response's finish reason is set in the last message
self.finish_reason = output.outputs[0].finish_reason
else:
# Tool output.
output_msgs = output
self._messages.extend(output_msgs)

def _update_prefill_token_usage(self, output: RequestOutput) -> None:
"""Update token usage statistics for the prefill phase of generation.

The prefill phase processes the input prompt tokens. This method:
1. Counts the prompt tokens for this turn
2. Calculates tool output tokens for multi-turn conversations
3. Updates cached token counts
4. Tracks state for next turn calculations

Tool output tokens are calculated as:
current_prompt_tokens - last_turn_prompt_tokens -
last_turn_output_tokens
This represents tokens added between turns (typically tool responses).

Args:
output: The RequestOutput containing prompt token information
"""
@@ -214,18 +218,18 @@ def _update_prefill_token_usage(self, output: RequestOutput) -> None:

def _update_decode_token_usage(self, output: RequestOutput) -> int:
"""Update token usage statistics for the decode phase of generation.

The decode phase processes the generated output tokens. This method:
1. Counts output tokens from all completion outputs
2. Updates the total output token count
3. Tracks tokens generated in the current turn

In streaming mode, this is called for each token generated.
In non-streaming mode, this is called once with all output tokens.

Args:
output: The RequestOutput containing generated token information

Returns:
int: Number of output tokens processed in this call
"""
@@ -385,7 +389,8 @@ def __init__(self, *args, **kwargs):
def messages(self) -> list:
return self.parser.messages

def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
# append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message.
4 changes: 3 additions & 1 deletion vllm/entrypoints/harmony_utils.py
@@ -387,7 +387,9 @@ def parse_remaining_state(
id=f"msg_{random_uuid()}",
content=[output_text],
role="assistant",
status="completed",
# if the parser still has messages (ie if the generator got cut
# abruptly), this should be incomplete
status="incomplete",
Contributor Author:

If the parser still has messages (i.e. the generator got cut off abruptly), this should be incomplete rather than completed.

Collaborator:

Makes sense, please move your comment to the code :)

type="message",
)
return [text_item]
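A toy illustration of the intent behind this change, using plain dicts rather than the real openai/harmony types (all names here are hypothetical):

```python
def flush_remaining(parser_buffer: str) -> list[dict]:
    """If generation stopped mid-message, emit the leftover text as incomplete."""
    if not parser_buffer:
        return []
    return [{
        "id": "msg_placeholder",  # the real code uses a random UUID
        "type": "message",
        "role": "assistant",
        # Not "completed": the generator was cut off before finishing.
        "status": "incomplete",
        "content": [{"type": "output_text", "text": parser_buffer}],
    }]
```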
17 changes: 13 additions & 4 deletions vllm/entrypoints/openai/protocol.py
@@ -30,7 +30,7 @@
from openai.types.responses import (ResponseFormatTextConfig as
ResponseTextConfig)

from openai.types.responses.response import ToolChoice
from openai.types.responses.response import IncompleteDetails, ToolChoice
from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
created_at: int = Field(default_factory=lambda: int(time.time()))
# error: Optional[ResponseError] = None
# incomplete_details: Optional[IncompleteDetails] = None
incomplete_details: Optional[IncompleteDetails] = None
instructions: Optional[str] = None
metadata: Optional[Metadata] = None
model: str
@@ -1904,9 +1904,18 @@ def from_request(
status: ResponseStatus,
usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse":

incomplete_details: Optional[IncompleteDetails] = None
if status == 'incomplete':
incomplete_details = IncompleteDetails(reason='max_output_tokens')
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
Comment on lines +1911 to +1913

Collaborator:

What is missing from the current logic, by the way?

Contributor Author:

I don't think vLLM's baseline implementation currently supports a content filter as an abort reason: https://github.com/vllm-project/vllm/blob/main/vllm/v1/request.py#L206


return cls(
id=request.request_id,
created_at=created_time,
incomplete_details=incomplete_details,
instructions=request.instructions,
metadata=request.metadata,
model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):

class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""

@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to_language: Optional[str] = None
"""The language of the output audio we transcribe to.

Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api.
"""

18 changes: 15 additions & 3 deletions vllm/entrypoints/openai/serving_responses.py
@@ -27,7 +27,7 @@
ResponseReasoningItem,
ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent,
response_text_delta_event)
ResponseStatus, response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob,
LogprobTopLogprob)
# yapf: enable
@@ -461,10 +461,22 @@ async def responses_full_generator(
# TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e))

# NOTE: Implementation of status is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed"

if self.use_harmony:
assert isinstance(context, HarmonyContext)
output = self._make_response_output_items_with_harmony(context)
num_tool_output_tokens = context.num_tool_output_tokens
if len(output) > 0:
if context.finish_reason == "length":
status = "incomplete"
elif context.finish_reason == "abort":
status = "cancelled"
else:
status = "incomplete"
else:
assert isinstance(context, SimpleContext)
final_res = context.last_output
@@ -501,7 +513,7 @@ async def responses_full_generator(
model_name=model_name,
created_time=created_time,
output=output,
status="completed",
status=status,
usage=usage,
)

@@ -658,7 +670,7 @@ def _make_response_output_items_with_harmony(
self,
context: HarmonyContext,
) -> list[ResponseOutputItem]:
output_items = []
output_items: list[ResponseOutputItem] = []
num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg))
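Distilling the status handling added in this file into one place, as a sketch (the helper names below are not part of the PR, and "length"/"abort" are assumed to be the only finish reasons of interest):

```python
from typing import Optional

def derive_status(finish_reason: Optional[str], num_output_items: int) -> str:
    """Map a vLLM finish reason onto a Responses API status."""
    if num_output_items == 0:
        # Nothing usable was produced, so the response cannot be "completed".
        return "incomplete"
    if finish_reason == "length":
        return "incomplete"  # hit max_output_tokens
    if finish_reason == "abort":
        return "cancelled"
    # "completed" stays the catch-all while the implementation is WIP.
    return "completed"

def derive_incomplete_reason(status: str) -> Optional[str]:
    # Only max_output_tokens is reported for now; content_filter would first
    # need vLLM to support content filtering as an abort reason.
    return "max_output_tokens" if status == "incomplete" else None
```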
8 changes: 4 additions & 4 deletions vllm/v1/core/sched/utils.py
@@ -10,19 +10,19 @@

def remove_all(lst: list, items_to_remove: set) -> list:
"""Remove all items from a list that are in the items_to_remove set.

This method optimizes for the common case of removing a single item,
falling back to list comprehension for multiple items.

Args:
lst: The list to remove items from
items_to_remove: Set of items to remove

Returns:
Either the modified original list (for single item removal) or
a new list (for multiple item removal). Callers should use the
returned value.

Note:
For single item removal, this modifies the original list in-place
and returns it. For multiple items, it creates and returns a new list.
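A minimal sketch consistent with the behavior this docstring describes (not necessarily the exact implementation in the file):

```python
def remove_all(lst: list, items_to_remove: set) -> list:
    """Drop every element of lst that appears in items_to_remove."""
    if len(items_to_remove) == 1:
        # Fast path: mutate the original list in place and return it.
        item = next(iter(items_to_remove))
        while item in lst:
            lst.remove(item)
        return lst
    # General case: build and return a new filtered list.
    return [x for x in lst if x not in items_to_remove]

# Example: remove_all([1, 2, 3, 2], {2}) -> [1, 3]
```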
6 changes: 3 additions & 3 deletions vllm/v1/engine/output_processor.py
@@ -373,17 +373,17 @@ def process_outputs(
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.

* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.

NOTE FOR DEVELOPERS

vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.

If you need to touch every element of the batch, do it from