Commit 96f1742

add fastdeploy engine support for api serve
1 parent b507eeb commit 96f1742

File tree

3 files changed: +47 -3 lines

llm/benchmark/rl/README.md
llm/benchmark/rl/api_serve.py
llm/benchmark/rl/scripts/api_serve.sh

llm/benchmark/rl/README.md

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,7 @@ python api_serve.py \
     --tokenizer "Qwen/Qwen2.5-7B-Instruct-1M" \
     --input_file ./data/gsm8k/instruct/train.parquet \
     --output_dir ${output_dir} \
+    --use_fastdeploy true \
     --rollout_input_batch_size 8 \
     --rollout_n 8 \
     --top_p 1.0 \
@@ -65,6 +66,7 @@ python api_serve.py \
 * **`--tokenizer`**: Path or name of the tokenizer.
 * **`--input_file`**: Path to the input dataset file.
 * **`--output_dir`**: Directory to save output results.
+* **`--use_fastdeploy`**: Use FastDeploy if true, otherwise use vLLM (default: true).
 * **`--rollout_input_batch_size`**: The batch size for API requests.
 * **`--rollout_n`**: Number of responses to generate for each input query.
 * **`--max_dec_len`**: Maximum decoding length for responses.
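Since the new flag defaults to FastDeploy, passing --use_fastdeploy false is how the benchmark falls back to the vLLM path. A minimal invocation sketch (the output directory is a placeholder and other required flags are omitted for brevity):

    python api_serve.py \
        --tokenizer "Qwen/Qwen2.5-7B-Instruct-1M" \
        --input_file ./data/gsm8k/instruct/train.parquet \
        --output_dir ./outputs \
        --use_fastdeploy false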

llm/benchmark/rl/api_serve.py

Lines changed: 44 additions & 3 deletions
@@ -125,7 +125,43 @@ def batch_process(self, dataframe: pd.DataFrame):
             yield batch_prompts
             batch_prompts = []

-    async def call(self, request: RequestPayload) -> Tuple[str, float]:
+    async def fastdeploy_call(self, request: RequestPayload) -> Tuple[str, float]:
+        client = self.get_client()
+        try:
+            async with self.semaphore:
+                start_time = time.time()
+                response = await client.chat.completions.create(
+                    model=self.model,
+                    messages=[{"role": "user", "content": request.prompt}],
+                    temperature=self.args.temperature,
+                    top_p=self.args.top_p,
+                    max_tokens=self.args.max_response_length,
+                    n=1,
+                    stream=True,
+                    timeout=60*60,
+                    metadata={
+                        "training": True,
+                        "raw_request": False,
+                    }
+                )
+                # Streaming text is stored in a list of chunks
+                chunks = []
+                # Streaming responses
+                async for chunk in response:
+                    delta = chunk.choices[0].delta
+                    if delta and delta.content:
+                        chunks.append(delta.content)
+                text = "".join(chunks)
+                end_time = time.time()
+                elapsed_time = end_time - start_time
+                logger.debug("Streaming response took %.2f seconds", elapsed_time)
+                return text, round(elapsed_time, 2)
+
+        except Exception as e:
+            logger.error("Error while streaming: %s", e)
+            raise ValueError(e)
+
+    async def vllm_call(self, request: RequestPayload) -> Tuple[str, float]:
         client = self.get_client()
         try:
             async with self.semaphore:
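The new fastdeploy_call streams the completion and joins the accumulated delta chunks into the final text, timing the whole exchange. For reference, a self-contained sketch of the same accumulate-while-streaming pattern against an OpenAI-compatible endpoint (the base URL, API key, and model name below are illustrative assumptions, not values from this commit):

    import asyncio
    import time

    from openai import AsyncOpenAI  # OpenAI-compatible async client


    async def stream_once(prompt: str) -> tuple[str, float]:
        # Hypothetical endpoint, key, and model -- for illustration only.
        client = AsyncOpenAI(base_url="http://localhost:8188/v1", api_key="key1")
        start = time.time()
        response = await client.chat.completions.create(
            model="Qwen2.5-7B-Instruct-1M",
            messages=[{"role": "user", "content": prompt}],
            stream=True,
        )
        chunks = []
        async for chunk in response:
            delta = chunk.choices[0].delta
            if delta and delta.content:
                chunks.append(delta.content)  # accumulate streamed text pieces
        return "".join(chunks), round(time.time() - start, 2)


    if __name__ == "__main__":
        text, seconds = asyncio.run(stream_once("What is 2 + 2?"))
        print(f"{seconds}s: {text}")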
@@ -157,7 +193,12 @@ async def call(self, request: RequestPayload) -> Tuple[str, float]:

     async def group_call(self, request: RequestPayload) -> ResponsePayload:
         """Performs n complete token generation rollouts for the given query."""
-        tasks = [self.call(request) for _ in range(request.num_responses)]
+        if self.args.use_fastdeploy == "true":
+            call = self.fastdeploy_call
+        else:
+            call = self.vllm_call
+
+        tasks = [call(request) for _ in range(request.num_responses)]

         result = ResponsePayload()
         result.idx = request.idx
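group_call now binds the engine-specific coroutine once and fans out request.num_responses concurrent rollouts. The commit does not show how tasks is ultimately awaited; a toy sketch of the same dispatch-then-gather shape, with hypothetical names (EngineRouter is not the real class):

    import asyncio


    class EngineRouter:
        """Toy stand-in for the dispatch in group_call; not the real ApiTask."""

        def __init__(self, use_fastdeploy: str):
            # argparse lowercases the flag, so comparing against "true" is safe.
            self.use_fastdeploy = use_fastdeploy

        async def fastdeploy_call(self, prompt: str) -> str:
            return f"[fastdeploy] {prompt}"

        async def vllm_call(self, prompt: str) -> str:
            return f"[vllm] {prompt}"

        async def group_call(self, prompt: str, n: int) -> list:
            # Pick the bound method once, then launch n concurrent rollouts.
            call = self.fastdeploy_call if self.use_fastdeploy == "true" else self.vllm_call
            return await asyncio.gather(*(call(prompt) for _ in range(n)))


    print(asyncio.run(EngineRouter("true").group_call("hi", 3)))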
@@ -341,9 +382,9 @@ def parse_args():
     parser.add_argument(
         "--limit_rows", type=int, default=-1, help="Maximum number of rows to read from the dataset (-1 means all)"
     )
+    parser.add_argument("--use_fastdeploy", type=str.lower, choices=["true", "false"], default="true", help="Engine selection (true=FastDeploy, false=vLLM, default: true)")
     return parser.parse_args()

-
 if __name__ == "__main__":
     args = parse_args()
     task = ApiTask(args)
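Note that the flag is parsed as a lowercased string constrained by choices rather than with type=bool: bool() on any non-empty string, including "false", is True, so a boolean type would silently ignore --use_fastdeploy false. A quick demonstration of the pitfall the commit's pattern avoids:

    import argparse

    parser = argparse.ArgumentParser()
    # Naive version: bool("false") is True, so the flag could never be turned off.
    parser.add_argument("--naive", type=bool, default=True)
    # Commit's pattern: normalize case, restrict values, compare against "true" later.
    parser.add_argument("--use_fastdeploy", type=str.lower, choices=["true", "false"], default="true")

    args = parser.parse_args(["--naive", "false", "--use_fastdeploy", "False"])
    print(args.naive)                     # True  -- the pitfall
    print(args.use_fastdeploy == "true")  # False -- correctly selects vLLM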

llm/benchmark/rl/scripts/api_serve.sh

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ python api_serve.py \
     --api_keys "key1" "key2" \
     --model "Qwen2.5-7B-Instruct-1M" \
     --tokenizer "Qwen/Qwen2.5-7B-Instruct-1M" \
+    --use_fastdeploy true \
     --input_file your_file \
     --output_dir ${output_dir} \
     --rollout_input_batch_size 8 \
