From 9783bcb2b27668787aa629126c72e4404e0525fb Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 11 Dec 2024 22:51:39 -0800
Subject: [PATCH 1/3] fix

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 4d96b323d166..02d9838e21d5 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -105,7 +105,7 @@ def shutdown(self):
 
     def __del__(self):
         self.shutdown()
 
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
         self.engine_core.profile(is_start)
 

From 3da89841f60fcf774b0f0896e88c397006ce8973 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 12 Dec 2024 00:13:24 -0800
Subject: [PATCH 2/3] fix for SyncMPClient

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 02d9838e21d5..f13993fc9ff5 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -206,7 +206,7 @@ def add_request(self, request: EngineCoreRequest) -> None:
 
     def abort_requests(self, request_ids: List[str]) -> None:
         self._send_input(EngineCoreRequestType.ABORT, request_ids)
 
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
         self._send_input(EngineCoreRequestType.PROFILE,
                          EngineCoreProfile(is_start))

From 93e05e2e308f7dbe281525c2ec8610fc89408e9a Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 12 Dec 2024 00:13:37 -0800
Subject: [PATCH 3/3] refactor example script

Signed-off-by: Roger Wang
---
 examples/offline_inference_with_profiler.py | 31 +++++++++++++--------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference_with_profiler.py
index 1f00d2680877..abcfa8e8f2f2 100644
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
@@ -1,4 +1,5 @@
 import os
+import time
 
 from vllm import LLM, SamplingParams
 
@@ -15,19 +16,25 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
+if __name__ == "__main__":
 
-llm.start_profile()
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
 
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    llm.start_profile()
 
-llm.stop_profile()
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
 
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    llm.stop_profile()
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Add a buffer to wait for profiler in the background process
+    # (in case MP is on) to finish writing profiling output.
+    time.sleep(10)
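
A note on why patches 1 and 2 drop the `async` keyword: both profile methods
are invoked with plain, synchronous calls. In Python, calling an `async def`
method without `await` merely creates a coroutine object and discards it; the
method body never runs, and the interpreter only emits a "coroutine was never
awaited" RuntimeWarning when the object is garbage collected. A minimal
standalone sketch of that failure mode (the `Client` class and method names
here are hypothetical, not vllm code):

    import asyncio


    class Client:
        async def profile_async(self, is_start=True) -> None:
            # Body only runs when the coroutine is awaited.
            print(f"profile(is_start={is_start})")

        def profile_sync(self, is_start=True) -> None:
            # Body runs on a plain call.
            print(f"profile(is_start={is_start})")


    client = Client()
    client.profile_sync(True)    # prints immediately
    client.profile_async(True)   # silently does nothing: the call returns
                                 # an un-awaited coroutine object
    asyncio.run(client.profile_async(True))  # prints only when awaited

Dropping `async` makes the plain call sites in the sync clients behave as
intended: `self.engine_core.profile(is_start)` and the PROFILE `_send_input`
actually execute.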
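
A note on the patch 3 refactor: when the engine core runs in a background
process (the multiprocessing path), a child started with the "spawn" start
method re-imports the main module, so unguarded module-level code in the
example would execute again inside the worker; hence the
`if __name__ == "__main__":` guard. The trailing `time.sleep(10)` gives that
background process time to finish writing the profiler trace before the
parent exits, as the in-diff comment says. A standalone sketch of the
re-import behavior the guard protects against (toy `work` function, not vllm
code):

    import multiprocessing as mp
    import os


    def work(x: int) -> int:
        return x * x


    # Unguarded top-level code: runs in the parent AND in every spawned child.
    print(f"module imported in pid {os.getpid()}")

    if __name__ == "__main__":
        mp.set_start_method("spawn", force=True)
        with mp.Pool(2) as pool:
            # Guarded: executes only in the parent process.
            print(pool.map(work, [1, 2, 3]))

Running this prints the "module imported" line three times (parent plus two
pool workers), which is exactly the duplication the guard prevents for
expensive work such as constructing an `LLM`.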