Add metrics to RequestOutput #2876

Merged · 5 commits · Feb 21, 2024
Changes from 1 commit
3 changes: 2 additions & 1 deletion tests/async_engine/test_request_tracker.py
@@ -64,7 +64,8 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished), 0, 0, 0, 0,
+                      0))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
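The five trailing zeros in the updated test correspond to the new timing parameters. A minimal sketch of the equivalent keyword-argument call, under the constructor signature added later in this diff (the values are placeholders, not meaningful timestamps):

```python
from vllm.outputs import RequestOutput

# Placeholder timings; a real caller would pass time.time() values
# recorded by the scheduler and engine.
output = RequestOutput(
    request_id="2",
    prompt="output",
    prompt_token_ids=[],
    prompt_logprobs=[],
    outputs=[],
    finished=True,
    arrival_time=0.0,
    first_scheduled_time=0.0,
    first_token_time=0.0,
    time_in_queue=0.0,
    finished_time=0.0,
)
```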
5 changes: 5 additions & 0 deletions vllm/core/scheduler.py
@@ -365,10 +365,15 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         # This function call changes the internal states of the scheduler
         # such as self.running, self.swapped, and self.waiting.
         scheduler_outputs = self._schedule()
+        now = time.time()
 
         # Create input data structures.
         seq_group_metadata_list: List[SequenceGroupMetadata] = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
+            if seq_group.first_scheduled_time is None:
+                seq_group.first_scheduled_time = now
+                seq_group.time_in_queue = now - seq_group.arrival_time
+
             seq_data: Dict[int, SequenceData] = {}
             block_tables: Dict[int, List[int]] = {}
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
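The scheduler change stamps first_scheduled_time only on the first time a sequence group is scheduled, so later reschedules (e.g. after preemption) do not overwrite the queueing metric; time_in_queue is then the gap between arrival and that first scheduling decision. A minimal standalone sketch of the pattern, using a hypothetical stub rather than the real vLLM SequenceGroup:

```python
import time
from typing import Optional


class SeqGroupStub:
    """Hypothetical stand-in for SequenceGroup's timing fields."""

    def __init__(self, arrival_time: float) -> None:
        self.arrival_time = arrival_time
        self.first_scheduled_time: Optional[float] = None
        self.time_in_queue: Optional[float] = None


def mark_scheduled(seq_group: SeqGroupStub, now: float) -> None:
    # Stamp the first-schedule time exactly once; repeated scheduling
    # of the same group leaves the queueing metric untouched.
    if seq_group.first_scheduled_time is None:
        seq_group.first_scheduled_time = now
        seq_group.time_in_queue = now - seq_group.arrival_time


group = SeqGroupStub(arrival_time=time.time())
time.sleep(0.05)  # the request waits in the queue
mark_scheduled(group, time.time())
print(f"time in queue: {group.time_in_queue:.3f}s")
```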
29 changes: 28 additions & 1 deletion vllm/outputs.py
@@ -1,4 +1,5 @@
 from typing import List, Optional
+import time
 
 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
                            SequenceStatus)
@@ -60,6 +61,11 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        arrival_time: The time when the request arrived.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -71,6 +77,11 @@ def __init__(
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        arrival_time: float,
+        first_scheduled_time: float,
+        first_token_time: float,
+        time_in_queue: float,
+        finished_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
@@ -79,6 +90,11 @@ def __init__(
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.arrival_time = arrival_time
+        self.first_scheduled_time = first_scheduled_time
+        self.first_token_time = first_token_time
+        self.time_in_queue = time_in_queue
+        self.finished_time = finished_time
         self.lora_request = lora_request
 
     @classmethod
@@ -115,12 +131,18 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   arrival_time=seq_group.arrival_time,
+                   first_scheduled_time=seq_group.first_scheduled_time,
+                   first_token_time=seq_group.first_token_time,
+                   time_in_queue=seq_group.time_in_queue,
+                   finished_time=finished_time,
                    lora_request=seq_group.lora_request)
 
     def __repr__(self) -> str:
@@ -130,4 +152,9 @@ def __repr__(self) -> str:
f"prompt_logprobs={self.prompt_logprobs}, "
f"outputs={self.outputs}, "
f"finished={self.finished}, "
f"lora_request={self.lora_request})")
f"arrival_time={self.arrival_time}, "
f"first_scheduled_time={self.first_scheduled_time}, "
f"first_token_time={self.first_token_time}, "
f"time_in_queue={self.time_in_queue}, "
f"finished_time={self.finished_time}, "
f"lora_request={self.lora_request})")
10 changes: 8 additions & 2 deletions vllm/sequence.py
@@ -252,8 +252,11 @@ def __init__(
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.arrival_time: float = arrival_time
+        self.last_token_time: float = arrival_time
+        self.first_scheduled_time: Optional[float] = None
+        self.first_token_time: Optional[float] = None
+        self.time_in_queue: Optional[float] = None
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -276,6 +279,9 @@ def lora_int_id(self) -> int:

     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
+        if self.first_token_time is None:
+            self.first_token_time = now
+
         latency = now - self.last_token_time
         self.last_token_time = now
         return latency
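Because get_last_latency is called once per decode step, the first call after scheduling is by construction the first generated token, which is why the method now stamps first_token_time as a side effect. A standalone sketch of the behavior, using a hypothetical mirror of SequenceGroup rather than the vLLM type:

```python
import time
from typing import Optional


class LatencyTracker:
    """Hypothetical mirror of SequenceGroup's token-timing bookkeeping."""

    def __init__(self, arrival_time: float) -> None:
        self.arrival_time = arrival_time
        self.last_token_time = arrival_time
        self.first_token_time: Optional[float] = None

    def get_last_latency(self, now: float) -> float:
        # The first latency reading corresponds to the first token.
        if self.first_token_time is None:
            self.first_token_time = now
        latency = now - self.last_token_time
        self.last_token_time = now
        return latency


tracker = LatencyTracker(arrival_time=time.time())
for step in range(3):
    time.sleep(0.01)  # stand-in for one decode iteration
    print(f"step {step}: inter-token latency "
          f"{tracker.get_last_latency(time.time()):.3f}s")
```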