[LLM] add port check and fix bugs #10153

Merged
1 change: 1 addition & 0 deletions llm/server/dockerfiles/Dockerfile_serving_cuda118_cudnn8
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
&& apt-get update && apt-get install -y net-tools \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# clone paddle & paddlenlp source code (the code version should match the installed versions above)
1 change: 1 addition & 0 deletions llm/server/dockerfiles/Dockerfile_serving_cuda124_cudnn9
@@ -8,6 +8,7 @@ RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.or
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddle-qa.bj.bcebos.com/paddlenlp/wheel/2f85a64edd4aa9911c94ccb5ce53e83ac41ce22b/paddlenlp-3.0.0b3.post20250123-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir --force-reinstall https://paddlepaddle-inference-banchmark.bj.bcebos.com/paddlenlp_ops-0.0.0-py3-none-any.whl \
&& python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
&& apt-get update && apt-get install -y net-tools \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# clone paddle & paddlenlp source code (the code version should match the installed versions above)
1 change: 1 addition & 0 deletions llm/server/docs/deploy_usage_tutorial.md
@@ -416,6 +416,7 @@ docker build --network=host -f ./dockerfiles/Dockerfile_serving_cuda124_cudnn9 -
| USE_CACHE_KV_INT8 | int | Whether to use INT8 as the KV Cache data type || 0 | Must be set to 1 for c8-quantized models |
| MODEL_DIR | str | Path to the model files || /models/ | |
| model_name | str | Model name ||| Used to support static-graph model downloads; see the document (#./static_models.md) for the specific names |
| OUTPUT_LOG_TO_CONSOLE | str | Whether to redirect log output to the console log file || 0 | |

## Request parameters

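For context (not part of this diff), a minimal sketch of how the new `OUTPUT_LOG_TO_CONSOLE` switch might be used; only the variable name and the `log/console.log` target come from this PR, while the working directory and the `tail` step are assumptions:

```bash
# Sketch only: redirect the tritonserver console output into log/console.log.
export OUTPUT_LOG_TO_CONSOLE=1
bash llm/server/server/scripts/start_server.sh
tail -f log/console.log   # assumed way to follow the redirected output
```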
28 changes: 24 additions & 4 deletions llm/server/server/scripts/start_server.sh
@@ -55,6 +55,21 @@ export SERVICE_GRPC_PORT=${GRPC_PORT:-${SERVICE_GRPC_PORT:-"8811"}}
export INTER_PROC_PORT=${INTER_QUEUE_PORT:-${INTER_PROC_PORT:-"8813"}}
export SERVICE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-${SERVICE_HTTP_PORT:-"9965"}}

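# Fail fast if any of the service ports configured above is already in use (relies on netstat from the net-tools package installed in the Dockerfiles).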
check_port_occupied() {
local port=$1
if netstat -tuln | grep -q ":${port}\b"; then
echo "PORT: ${port} occupied! Please change the port!"
exit 1
fi
}

check_port_occupied ${HEALTH_HTTP_PORT}
check_port_occupied ${METRICS_HTTP_PORT}
check_port_occupied ${SERVICE_GRPC_PORT}
check_port_occupied ${INTER_PROC_PORT}
check_port_occupied ${SERVICE_HTTP_PORT}



if [ ! -d "llm_model" ];then
ln -s /opt/source/PaddleNLP/llm/server/server/llm_model llm_model
@@ -83,15 +98,20 @@ else
sleep ${SERVER_WAITTING_TIME:-"25"}
fi



tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
OUTPUT_LOG_TO_CONSOLE=${OUTPUT_LOG_TO_CONSOLE:-"0"}
# Set the log redirection based on whether logs should be output to the console
LOG_REDIRECT=""
# If OUTPUT_LOG_TO_CONSOLE is set to "1", redirect logs to the console log file
if [ "$OUTPUT_LOG_TO_CONSOLE" == "1" ]; then
LOG_REDIRECT="> log/console.log 2>&1"
fi
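# eval is needed so the shell interprets the redirection stored in $LOG_REDIRECT; without it, "> log/console.log 2>&1" would be passed to tritonserver as literal arguments.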
eval tritonserver --exit-timeout-secs 100000 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
--cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
--cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
--pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
--allow-http false \
--grpc-port=${SERVICE_GRPC_PORT} \
--metrics-port=${METRICS_HTTP_PORT} \
--log-file log/server.log --log-info true &
--log-file log/server.log --log-info true $LOG_REDIRECT &

echo "The logs for the model service, please check" ${PWD}"/log/server.log and "${PWD}"/log/workerlog.0"
4 changes: 3 additions & 1 deletion llm/server/server/server/data/processor.py
@@ -186,7 +186,9 @@ def process_response(self, response_dict, **kwargs):
response_dict["usage"] = {"completion_tokens" : response_dict["send_idx"] + 1}

if is_end:
response_dict["tokens_all"] = self.clear_request_status(req_id)
self.clear_request_status(req_id)
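# Detokenize the complete list of generated token ids for the final response.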
token_ids = response_dict.get("tokens_all_ids", [])
response_dict["tokens_all"] = self.ids2tokens(token_ids, response_dict["req_id"])
return response_dict

def text2ids(self, text):
2 changes: 1 addition & 1 deletion llm/server/server/server/engine/infer.py
@@ -612,7 +612,7 @@ def run(self):
engine_healthy_recorded_time_array,
) = self.initialize_engine_healthy_recorded_time_flag()
engine_healthy_recorded_time_array[0] = time.time()
# infer_live_flag_shm = self.initialize_engine_live_flag()
infer_live_flag_shm = self.initialize_engine_live_flag()
infer_seed_increment = paddle.full(shape=[self.args.max_batch_size, 1], fill_value=4, dtype="int64")
# thread_executor = ThreadPoolExecutor(max_workers=1)
real_bsz = None
9 changes: 3 additions & 6 deletions llm/server/server/server/engine/token_processor.py
@@ -100,13 +100,12 @@ def process_sampling_results(self):
except Exception as e:
model_server_logger.info("while get input_data error: {0} {1}".format(e, str(traceback.format_exc())))

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
"""
single post-processing function

Args:
batch_result (list): batch results
exist_finished_task (bool): whether there is a finished task
"""
result_dir = "./generate_token_results"
if not os.path.exists(result_dir):
@@ -213,7 +212,6 @@ def _process_batch_output(self):
accept_num = tokens[2 : batch + 2]

batch_result = list()
exist_finished_task = False
for i in range(batch):
if self.resource_manager.stop_flags[i]:
continue
@@ -248,11 +246,10 @@ def _process_batch_output(self):
f"Speculate accept ratio: {1 - self.total_step * 1.0 / self.number_of_output_tokens}"
f" total step: {self.total_step}. total_output_token_num: {self.number_of_output_tokens}"
)
exist_finished_task = True
break
batch_result.append(result)

self.postprocess(batch_result, exist_finished_task)
self.postprocess(batch_result)


class WarmUpTokenProcessor(TokenProcessor):
@@ -265,7 +262,7 @@ def __init__(self, cfg):
self._is_running = True
self._is_blocking = True

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
pass

def process_sampling_results(self):
4 changes: 2 additions & 2 deletions llm/server/server/server/triton_server.py
@@ -111,12 +111,12 @@ def _cache_special_tokens(self, batch_result):
["req_id"]] + batch_result[i]["token_scores"]
del self.score_buffer[batch_result[i]["req_id"]]

def postprocess(self, batch_result, exist_finished_task=False):
def postprocess(self, batch_result):
"""
single postprocess for triton
"""
try:
self._cache_special_tokens(batch_result)
# self._cache_special_tokens(batch_result)
self.cached_generated_tokens.put(batch_result)
except Exception as e:
model_server_logger.info(