In [3]:
# 可根据需要升级awscli和sagemaker，目前不升级保持默认版本可以部署成功

import sagemaker
import boto3

# IAM role the notebook runs under; reused below as the model's execution role.
role = sagemaker.get_execution_role()
sess = sagemaker.session.Session()

# Default SageMaker bucket of this session; the deployment tarball is uploaded here.
model_bucket = sess.default_bucket()

# Use the public attribute instead of the private `_region_name`.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:

# Location of the chatglm3-6b model files; the model must already be uploaded here.
# Derived from the session's default bucket so no account id is hard-coded --
# change the bucket/prefix if your model lives somewhere else.
s3_model_path = f"s3://{model_bucket}/chatglm3/chatglm3-6b/"
# S3 key prefix under which the deployment-script tarball is uploaded.
# Change this to your own prefix if needed.
s3_code_prefix = "chatglm3/deploy_code/"

### 2. 生成模型部署脚本

In [5]:
# Local scratch directory that holds the deployment script and model configuration
!mkdir -p llm_chatglm3_deploy_code

In [6]:
%%writefile llm_chatglm3_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os

from transformers import pipeline, AutoModel, AutoTokenizer

# Module-level cache: both stay None until handle() loads them on its first call.
model = None
tokenizer = None

def load_model(properties):
    """Load the model and tokenizer described by the serving properties.

    Args:
        properties: mapping of serving properties. ``model_dir`` is required;
            when ``model_id`` is present it takes precedence as the location
            to load from.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is converted to half
        precision, moved to CUDA and switched to eval mode.

    Raises:
        KeyError: if ``model_dir`` is missing from ``properties``.
    """
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # trust_remote_code=True lets transformers run the modelling code shipped with the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_location, trust_remote_code=True).half().cuda()
    model = model.eval()

    logging.info(f"Finished Loading model in {model_location}")

    return model, tokenizer

def handle(inputs: Input):
    """Handle one inference request by running a single chat turn.

    The model and tokenizer are loaded on the first call and cached in the
    module-level globals, so later calls reuse them.

    Request JSON fields:
        inputs: the prompt passed to ``model.chat``. Required.
        parameters: optional dict of extra keyword arguments forwarded to
            ``model.chat``; treated as empty when missing or null.
        history: optional list of previous conversation turns; treated as
            empty when missing or null.

    Returns:
        ``None`` when the request is empty (the model is still loaded first),
        otherwise an ``Output`` whose JSON carries ``outputs`` (the reply)
        and ``history`` (the updated conversation).

    Raises:
        KeyError: if the request JSON has no ``inputs`` field.
    """
    logging.info("Start inference request")

    global model, tokenizer
    # Explicit None check: the truthiness of a model object is not a reliable "loaded" flag.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    # Both fields are optional for callers; `or` also covers an explicit JSON null.
    params = data.get("parameters") or {}
    history = data.get("history") or []

    response, history = model.chat(tokenizer, input_sentences, history=history, **params)
    result = {"outputs": response, "history" : history}

    logging.info("Finished inference request")

    return Output().add_as_json(result)

Writing llm_chatglm3_deploy_code/model.py


In [7]:
# Show the model's S3 location so it can be copied into serving.properties.
print(f"option.s3url ==> {s3_model_path}")

option.s3url ==> s3://sagemaker-cn-north-1-086238767671/chatglm3/chatglm3-6b/


#### 注意: 下面的option.s3url 需要修改成自己的S3路径, 可以直接拷贝上一个cell的输出

In [8]:
%%writefile llm_chatglm3_deploy_code/serving.properties
# Serving configuration packaged next to model.py.
# NOTE(review): presumably these options reach model.py through inputs.get_properties() -- confirm against the DJL docs.
engine=Python
option.tensor_parallel_degree=1
# S3 location of the model files -- replace with your own path.
option.s3url = s3://sagemaker-cn-north-1-086238767671/chatglm3/chatglm3-6b/

Writing llm_chatglm3_deploy_code/serving.properties


In [9]:
%%writefile llm_chatglm3_deploy_code/requirements.txt
# Extra Python packages packaged with model.py; installed from the Tsinghua PyPI mirror.
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.30.2
# NOTE(review): cpm_kernels is unpinned -- consider pinning a version for reproducible deployments.
cpm_kernels

Writing llm_chatglm3_deploy_code/requirements.txt


In [10]:
# 本地打包并上传部署脚本到S3
!rm model.tar.gz
!cd llm_chatglm3_deploy_code
!tar czvf model.tar.gz llm_chatglm3_deploy_code

s3_code_artifact = sess.upload_data("model.tar.gz", model_bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

llm_chatglm3_deploy_code/
llm_chatglm3_deploy_code/serving.properties
llm_chatglm3_deploy_code/model.py
llm_chatglm3_deploy_code/requirements.txt
S3 Code or Model tar ball uploaded to --- > s3://sagemaker-cn-north-1-086238767671/chatglm3/deploy_code/model.tar.gz


### 3. 部署模型

In [12]:
# Default container image; no need to change it unless you have special requirements.
# The registry host ends in amazonaws.com.cn; only the region part is filled in from `region`.
inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.24.0-deepspeed0.10.0-cu118"

In [13]:
# Create the SageMaker model
from sagemaker.utils import name_from_base
import boto3

model_name = name_from_base("chatglm3")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# The container pairs the inference image with the uploaded deployment tarball.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

chatglm3-2023-11-23-10-52-52-747
Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.24.0-deepspeed0.10.0-cu118
Created Model: arn:aws-cn:sagemaker:cn-north-1:086238767671:model/chatglm3-2023-11-23-10-52-52-747


In [14]:
# Create the endpoint config.
# For chatglm3-6b, ml.g4dn.2xlarge or ml.p3.2xlarge and larger (16 GB GPU memory) is recommended.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Single production variant; the startup health-check timeout is set to 10 minutes.
poctest_variant = {
    "VariantName": "poctest",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.2xlarge",
    "InitialInstanceCount": 1,
    "ContainerStartupHealthCheckTimeoutInSeconds": 10 * 60,
}
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[poctest_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint-config/chatglm3-2023-11-23-10-52-52-747-config',
 'ResponseMetadata': {'RequestId': '5212e42c-9821-4bf4-814d-b438ee891761',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5212e42c-9821-4bf4-814d-b438ee891761',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '124',
   'date': 'Thu, 23 Nov 2023 10:53:01 GMT'},
  'RetryAttempts': 0}}

In [15]:
# Deploy the model: create the endpoint from the endpoint config named by endpoint_config_name.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/chatglm3-2023-11-23-10-52-52-747-endpoint


#### 持续检测模型部署进度(一般不超过10分钟)

In [16]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Any terminal status other than InService means the endpoint cannot be invoked;
# stop here and surface the reason instead of failing later in the test cell.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/chatglm3-2023-11-23-10-52-52-747-endpoint
Status: InService


### 4. 测试模型

In [17]:
import json
import boto3

# Runtime client used to invoke the endpoint (re-created here so this section can run on its own).
smr_client = boto3.client("sagemaker-runtime")

# Sent as the "parameters" field of the request; model.py forwards it to model.chat as keyword arguments.
parameters = {
  "temperature": 0.1
}

In [20]:
prompt_test = '你好'

# Request body in the shape model.py reads: prompt, generation parameters, empty chat history.
request_body = json.dumps(
    {
        "inputs": prompt_test,
        "parameters": parameters,
        "history": [],
    }
)
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType="application/json",
)

# Display the raw JSON text returned by the endpoint.
response_model['Body'].read().decode('utf8')

'{\n  "outputs":"你好👋！我是人工智能助手 ChatGLM3-6B，很高兴见到你，欢迎问我任何问题。",\n  "history":[\n    {\n      "role":"user",\n      "content":"你好"\n    },\n    {\n      "role":"assistant",\n      "metadata":"",\n      "content":"你好👋！我是人工智能助手 ChatGLM3-6B，很高兴见到你，欢迎问我任何问题。"\n    }\n  ]\n}'

### 5. 清理资源(也可以登录SageMaker Console手动删除)

In [None]:
# !aws sagemaker delete-endpoint --endpoint-name <your endpoint name> 

In [None]:
# !aws sagemaker delete-endpoint-config --endpoint-config-name <your endpoint config name> 

In [None]:
# !aws sagemaker delete-model --model-name <your model name> 