
Commit 04fcf5d

Authored on Mar 17, 2025
[LLM] add port check and fix bugs (#10153)
* [LLM] add port check and fix bugs
* [LLM] fix stream output
* [LLM] delete unused param
* [LLM] fix start script
1 parent 7758b90 commit 04fcf5d

8 files changed (+36 −14 lines)

‎llm/server/dockerfiles/Dockerfile_serving_cuda118_cudnn8

+1

@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
+    && apt update && apt install net-tools \
     && apt-get clean && rm -rf /var/lib/apt/lists/*

 # clone paddle & paddlenlp source code (the code version should match the versions installed above)
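Note: net-tools supplies the netstat binary that the new check_port_occupied helper in start_server.sh relies on, which is why both serving images now install it. A minimal sanity check after building an image, as a sketch only — the llm-serving:cuda118 tag is an example, not part of this commit:

    # confirm netstat is present inside the freshly built image
    docker run --rm llm-serving:cuda118 netstat --version
    # list listening TCP/UDP sockets, the same query the start script performs
    docker run --rm llm-serving:cuda118 netstat -tuln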

‎llm/server/dockerfiles/Dockerfile_serving_cuda124_cudnn9

+1

@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
     && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
+    && apt update && apt install net-tools \
     && apt-get clean && rm -rf /var/lib/apt/lists/*

 # clone paddle & paddlenlp source code (the code version should match the versions installed above)

‎llm/server/docs/deploy_usage_tutorial.md

+1

@@ -416,6 +416,7 @@ docker build --network=host -f ./dockerfiles/Dockerfile_serving_cuda124_cudnn9 -
 | USE_CACHE_KV_INT8 | int | Whether to configure INT8 as the KV Cache type || 0 | Must be set to 1 for c8 quantized models |
 | MODEL_DIR | str | Model file path || /models/ | |
 | model_name | str | Model name ||| Used to support static-graph model downloads; see the documentation for the specific names (#./static_models.md) |
+| OUTPUT_LOG_TO_CONSOLE | str | Whether to redirect output to the console log file || 0 | |

 ## Request Parameters

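The new OUTPUT_LOG_TO_CONSOLE switch is consumed by start_server.sh (see the script diff below): when it is "1", tritonserver output is redirected into log/console.log. A minimal usage sketch, assuming the server is launched from the directory that contains the scripts folder and the start script name from this commit:

    # send tritonserver output to log/console.log instead of the terminal
    export OUTPUT_LOG_TO_CONSOLE=1
    bash scripts/start_server.sh
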
‎llm/server/server/scripts/start_server.sh

+24 −4

@@ -55,6 +55,21 @@ export SERVICE_GRPC_PORT=${GRPC_PORT:-${SERVICE_GRPC_PORT:-"8811"}}
 export INTER_PROC_PORT=${INTER_QUEUE_PORT:-${INTER_PROC_PORT:-"8813"}}
 export SERVICE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-${SERVICE_HTTP_PORT:-"9965"}}
 
+check_port_occupied() {
+    local port=$1
+    if netstat -tuln | grep -q ":${port}\b"; then
+        echo "PORT: ${port} occupied! Please change the port!"
+        exit 1
+    fi
+}
+
+check_port_occupied ${HEALTH_HTTP_PORT}
+check_port_occupied ${METRICS_HTTP_PORT}
+check_port_occupied ${SERVICE_GRPC_PORT}
+check_port_occupied ${INTER_PROC_PORT}
+check_port_occupied ${SERVICE_HTTP_PORT}
+
+
 
 if [ ! -d "llm_model" ];then
     ln -s /opt/source/PaddleNLP/llm/server/server/llm_model llm_model
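
The new check_port_occupied helper runs before anything is launched and aborts startup as soon as one of the five configured ports is already bound (netstat comes from the net-tools package added to both Dockerfiles above). A quick way to exercise the check locally, as a sketch only — the throwaway python3 -m http.server listener and the working directory are assumptions, and 9965 is simply the default SERVICE_HTTP_PORT:

    # occupy the default push-mode HTTP port with a dummy listener
    python3 -m http.server 9965 &
    # the start script now prints "PORT: 9965 occupied! Please change the port!" and exits with code 1
    bash scripts/start_server.sh
    # clean up the dummy listener
    kill %1
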
@@ -83,15 +98,20 @@ else
     sleep ${SERVER_WAITTING_TIME:-"25"}
 fi
 
-
-
-tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
+OUTPUT_LOG_TO_CONSOLE=${OUTPUT_LOG_TO_CONSOLE:-"0"}
+# Set the log redirection based on whether logs should be output to the console
+LOG_REDIRECT=""
+# If OUTPUT_LOG_TO_CONSOLE is set to "1", redirect logs to the console log file
+if [ "$OUTPUT_LOG_TO_CONSOLE" == "1" ]; then
+    LOG_REDIRECT="> log/console.log 2>&1"
+fi
+eval tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
     --cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
     --cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
     --pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
     --allow-http false \
     --grpc-port=${SERVICE_GRPC_PORT} \
     --metrics-port=${METRICS_HTTP_PORT} \
-    --log-file log/server.log --log-info true &
+    --log-file log/server.log --log-info true $LOG_REDIRECT &
 
 echo "The logs for the model service, please check" ${PWD}"/log/server.log and "${PWD}"/log/workerlog.0"

‎llm/server/server/server/data/processor.py

+3 −1

@@ -186,7 +186,9 @@ def process_response(self, response_dict, **kwargs):
         response_dict["usage"] = {"completion_tokens" : response_dict["send_idx"] + 1}
 
         if is_end:
-            response_dict["tokens_all"] = self.clear_request_status(req_id)
+            self.clear_request_status(req_id)
+            token_ids = response_dict.get("tokens_all_ids", [])
+            response_dict["tokens_all"] = self.ids2tokens(token_ids, response_dict["req_id"])
         return response_dict
 
     def text2ids(self, text):

‎llm/server/server/server/engine/infer.py

+1 −1

@@ -612,7 +612,7 @@ def run(self):
             engine_healthy_recorded_time_array,
         ) = self.initialize_engine_healthy_recorded_time_flag()
         engine_healthy_recorded_time_array[0] = time.time()
-        # infer_live_flag_shm = self.initialize_engine_live_flag()
+        infer_live_flag_shm = self.initialize_engine_live_flag()
         infer_seed_increment = paddle.full(shape=[self.args.max_batch_size, 1], fill_value=4, dtype="int64")
         # thread_executor = ThreadPoolExecutor(max_workers=1)
         real_bsz = None

‎llm/server/server/server/engine/token_processor.py

+3 −6

@@ -100,13 +100,12 @@ def process_sampling_results(self):
             except Exception as e:
                 model_server_logger.info("while get input_data error: {0} {1}".format(e, str(traceback.format_exc())))
 
-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         """
         single post-processing function
 
         Args:
             batch_result (list): batch results
-            exist_finished_task (bool): whether there is a finished task
         """
         result_dir = "./generate_token_results"
         if not os.path.exists(result_dir):
@@ -213,7 +212,6 @@ def _process_batch_output(self):
         accept_num = tokens[2 : batch + 2]
 
         batch_result = list()
-        exist_finished_task = False
         for i in range(batch):
             if self.resource_manager.stop_flags[i]:
                 continue
@@ -248,11 +246,10 @@
                     f"Speculate accept ratio: {1 - self.total_step * 1.0 / self.number_of_output_tokens}"
                     f" total step: {self.total_step}. total_output_token_num: {self.number_of_output_tokens}"
                 )
-                exist_finished_task = True
                 break
             batch_result.append(result)
 
-        self.postprocess(batch_result, exist_finished_task)
+        self.postprocess(batch_result)
 
 
 class WarmUpTokenProcessor(TokenProcessor):
@@ -265,7 +262,7 @@ def __init__(self, cfg):
         self._is_running = True
         self._is_blocking = True
 
-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         pass
 
     def process_sampling_results(self):

‎llm/server/server/server/triton_server.py

+2 −2

@@ -111,12 +111,12 @@ def _cache_special_tokens(self, batch_result):
                     ["req_id"]] + batch_result[i]["token_scores"]
                 del self.score_buffer[batch_result[i]["req_id"]]
 
-    def postprocess(self, batch_result, exist_finished_task=False):
+    def postprocess(self, batch_result):
         """
         single postprocess for triton
         """
         try:
-            self._cache_special_tokens(batch_result)
+            # self._cache_special_tokens(batch_result)
             self.cached_generated_tokens.put(batch_result)
         except Exception as e:
             model_server_logger.info(
