From 9783bcb2b27668787aa629126c72e4404e0525fb Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 11 Dec 2024 22:51:39 -0800
Subject: [PATCH 1/3] fix

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 4d96b323d166..02d9838e21d5 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -105,7 +105,7 @@ def shutdown(self):
 
     def __del__(self):
         self.shutdown()
 
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
         self.engine_core.profile(is_start)
 

From 3da89841f60fcf774b0f0896e88c397006ce8973 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 12 Dec 2024 00:13:24 -0800
Subject: [PATCH 2/3] fix for SyncMPClient

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 02d9838e21d5..f13993fc9ff5 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -206,7 +206,7 @@ def add_request(self, request: EngineCoreRequest) -> None:
 
     def abort_requests(self, request_ids: List[str]) -> None:
         self._send_input(EngineCoreRequestType.ABORT, request_ids)
 
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
         self._send_input(EngineCoreRequestType.PROFILE,
                          EngineCoreProfile(is_start))

From 93e05e2e308f7dbe281525c2ec8610fc89408e9a Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 12 Dec 2024 00:13:37 -0800
Subject: [PATCH 3/3] refactor example script

Signed-off-by: Roger Wang
---
 examples/offline_inference_with_profiler.py | 31 +++++++++++++--------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference_with_profiler.py
index 1f00d2680877..abcfa8e8f2f2 100644
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
@@ -1,4 +1,5 @@
 import os
+import time
 
 from vllm import LLM, SamplingParams
 
@@ -15,19 +16,25 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
+if __name__ == "__main__":
 
-llm.start_profile()
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
 
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    llm.start_profile()
 
-llm.stop_profile()
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
 
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    llm.stop_profile()
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Add a buffer to wait for profiler in the background process
+    # (in case MP is on) to finish writing profiling output.
+    time.sleep(10)
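
A note on why patches 1 and 2 drop the `async` keyword: both profile methods
are invoked with plain, synchronous calls. In Python, calling an `async def`
method without `await` merely creates a coroutine object and discards it; the
method body never runs, and the interpreter only emits a "coroutine was never
awaited" RuntimeWarning when the object is garbage collected. A minimal
standalone sketch of that failure mode (the `Client` class and method names
here are hypothetical, not vllm code):

    import asyncio


    class Client:
        async def profile_async(self, is_start=True) -> None:
            # Body only runs when the coroutine is awaited.
            print(f"profile(is_start={is_start})")

        def profile_sync(self, is_start=True) -> None:
            # Body runs on a plain call.
            print(f"profile(is_start={is_start})")


    client = Client()
    client.profile_sync(True)    # prints immediately
    client.profile_async(True)   # silently does nothing: the call returns
                                 # an un-awaited coroutine object
    asyncio.run(client.profile_async(True))  # prints only when awaited

Dropping `async` makes the plain call sites in the sync clients behave as
intended: `self.engine_core.profile(is_start)` and the PROFILE `_send_input`
actually execute.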
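
A note on the patch 3 refactor: when the engine core runs in a background
process (the multiprocessing path), a child started with the "spawn" start
method re-imports the main module, so unguarded module-level code in the
example would execute again inside the worker; hence the
`if __name__ == "__main__":` guard. The trailing `time.sleep(10)` gives that
background process time to finish writing the profiler trace before the
parent exits, as the in-diff comment says. A standalone sketch of the
re-import behavior the guard protects against (toy `work` function, not vllm
code):

    import multiprocessing as mp
    import os


    def work(x: int) -> int:
        return x * x


    # Unguarded top-level code: runs in the parent AND in every spawned child.
    print(f"module imported in pid {os.getpid()}")

    if __name__ == "__main__":
        mp.set_start_method("spawn", force=True)
        with mp.Pool(2) as pool:
            # Guarded: executes only in the parent process.
            print(pool.map(work, [1, 2, 3]))

Running this prints the "module imported" line three times (parent plus two
pool workers), which is exactly the duplication the guard prevents for
expensive work such as constructing an `LLM`.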