vllm/entrypoints/logger.py (3 additions, 0 deletions)

@@ -17,6 +17,9 @@ class RequestLogger:
     def __init__(self, *, max_log_len: int | None) -> None:
         self.max_log_len = max_log_len
 
+    def log_arrival(self, request_id: str) -> None:
+        logger.info("Arrived request %s", request_id)
+
     def log_inputs(
         self,
         request_id: str,
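
The new log_arrival method only wraps a single logger.info call. A minimal, illustrative sketch of exercising it directly (assuming only the RequestLogger class shown above; the exact log-record prefix depends on vLLM's logging configuration):

from vllm.entrypoints.logger import RequestLogger

# Construct the logger exactly as the diff's signature allows (keyword-only max_log_len).
request_logger = RequestLogger(max_log_len=None)
request_logger.log_arrival("cmpl-123")
# Emits an INFO record whose message reads "Arrived request cmpl-123".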
vllm/entrypoints/openai/serving_chat.py (9 additions, 8 deletions)

@@ -179,6 +179,15 @@ async def create_chat_completion(
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        request_id = (
+            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
+        )
+        self._log_arrival(request_id)
+
+        request_metadata = RequestResponseMetadata(request_id=request_id)
+        if raw_request:
+            raw_request.state.request_metadata = request_metadata
+
         try:
             lora_request = self._maybe_get_adapters(
                 request, supports_default_mm_loras=True
@@ -257,14 +266,6 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
-        request_id = (
-            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
-        )
-
-        request_metadata = RequestResponseMetadata(request_id=request_id)
-        if raw_request:
-            raw_request.state.request_metadata = request_metadata
-
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[RequestOutput, None]] = []
         try:
vllm/entrypoints/openai/serving_classification.py (1 addition, 0 deletions)

@@ -146,6 +146,7 @@ async def create_classify(
     ) -> ClassificationResponse | ErrorResponse:
         model_name = self.models.model_name()
         request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
 
         ctx = ClassificationServeContext(
             request=request,
vllm/entrypoints/openai/serving_completion.py (2 additions, 0 deletions)

@@ -107,6 +107,8 @@ async def create_completion(
             )
 
         request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
+        self._log_arrival(request_id)
+
         created_time = int(time.time())
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
vllm/entrypoints/openai/serving_embedding.py (1 addition, 0 deletions)

@@ -629,6 +629,7 @@ async def create_embedding(
             f"{self.request_id_prefix}-"
             f"{self._base_request_id(raw_request, request.request_id)}"
         )
+        self._log_arrival(request_id)
 
         ctx = EmbeddingServeContext(
             request=request,
vllm/entrypoints/openai/serving_engine.py (5 additions, 0 deletions)

@@ -1255,6 +1255,11 @@ def _get_prompt_components(
 
         return get_prompt_components(prompt)  # type: ignore[arg-type]
 
+    def _log_arrival(self, request_id: str) -> None:
+        if self.request_logger is None:
+            return
+        self.request_logger.log_arrival(request_id)
+
     def _log_inputs(
         self,
         request_id: str,
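
For reference, a minimal sketch (a simplified stand-in, not the real OpenAIServing class) of how the _log_arrival helper added above behaves: it is a silent no-op whenever no RequestLogger is configured, so endpoints can call it unconditionally.

class ServingSketch:
    # Stand-in holding only the attribute that _log_arrival touches.
    def __init__(self, request_logger=None):
        self.request_logger = request_logger  # None when request logging is disabled

    def _log_arrival(self, request_id: str) -> None:
        if self.request_logger is None:
            return  # nothing configured: skip arrival logging
        self.request_logger.log_arrival(request_id)

ServingSketch(request_logger=None)._log_arrival("score-42")  # no-op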
vllm/entrypoints/openai/serving_pooling.py (2 additions, 0 deletions)

@@ -93,6 +93,8 @@ async def create_pooling(
         model_name = self.models.model_name()
 
         request_id = f"pool-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
+
         created_time = int(time.time())
 
         is_io_processor_request = isinstance(request, IOProcessorRequest)
vllm/entrypoints/openai/serving_score.py (4 additions, 0 deletions)

@@ -351,6 +351,8 @@ async def create_score(
             return error_check_ret
 
         request_id = f"score-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
+
         created_time = int(time.time())
 
         try:
@@ -393,6 +395,8 @@ async def do_rerank(
             return error_check_ret
 
         request_id = f"rerank-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
+
         documents = request.documents
         top_n = (
             request.top_n
vllm/entrypoints/openai/serving_tokenization.py (2 additions, 0 deletions)

@@ -60,6 +60,7 @@ async def create_tokenize(
             return error_check_ret
 
         request_id = f"tokn-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
 
         try:
             lora_request = self._maybe_get_adapters(request)
@@ -135,6 +136,7 @@ async def create_detokenize(
             return error_check_ret
 
         request_id = f"tokn-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
 
         lora_request = self._maybe_get_adapters(request)
 
vllm/entrypoints/openai/speech_to_text.py (1 addition, 0 deletions)

@@ -159,6 +159,7 @@ async def _create_speech_to_text(
         )
 
         request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"
+        self._log_arrival(request_id)
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request: