### 1.安装 HuggingFace 并下载模型到本地

In [1]:
!pip install huggingface-hub -Uq

In [2]:
from huggingface_hub import snapshot_download
from pathlib import Path

local_model_path = Path("./LLM_chatglm2_model")
local_model_path.mkdir(exist_ok=True)
model_name = "THUDM/chatglm2-6b"

In [3]:
snapshot_download(repo_id=model_name, cache_dir=local_model_path)

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading (…)2a579/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)a6c2a579/config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)c2a579/MODEL_LICENSE:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading (…)iguration_chatglm.py:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)5fa6c2a579/README.md:   0%|          | 0.00/7.69k [00:00<?, ?B/s]

Downloading (…)/modeling_chatglm.py:   0%|          | 0.00/47.1k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading (…)l-00004-of-00007.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00007.bin:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Downloading (…)a579/quantization.py:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading (…)l-00005-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Downloading (…)l-00001-of-00007.bin:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)enization_chatglm.py:   0%|          | 0.00/9.29k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

'LLM_chatglm2_model/models--THUDM--chatglm2-6b/snapshots/e186c891cf64310ac66ef10a87e6635fa6c2a579'

### 2.SageMaker 初始化配置

In [4]:
import sagemaker
import boto3
import os
from sagemaker import image_uris

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
region = sess._region_name
account_id = sess.account_id()

### 3. 把模型拷贝到 S3 存储桶为后续部署做准备

In [5]:
s3_model_prefix = "LLM_chatglm2_model"  # folder where model checkpoint will go
model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]
s3_code_prefix = "LLM_chatglm2_deploy_code"

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM_chatglm2_deploy_code
model_snapshot_path: LLM_chatglm2_model/models--THUDM--chatglm2-6b/snapshots/e186c891cf64310ac66ef10a87e6635fa6c2a579


In [6]:
s3_client = boto3.client("s3")

for root, dirs, files in os.walk(model_snapshot_path):
    for file in files:
        local_path = os.path.join(root, file)
        s3_key = s3_model_prefix + '/' + os.path.relpath(local_path, model_snapshot_path)
        s3_client.upload_file(local_path, bucket, s3_key)

### 3.模型部署准备

* 推理容器镜像

In [7]:
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.22.1-deepspeed0.9.2-cu118"
)

# 中国区需要替换为下面的image_uri
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.9.2-cu118"
# )

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.22.1-deepspeed0.9.2-cu118


In [8]:
chatglm2_deploy_code_path = Path("./LLM_chatglm2_deploy_code")
chatglm2_deploy_code_path.mkdir(exist_ok=True)

* Entrypoint 脚本 model.py

In [9]:
%%writefile LLM_chatglm2_deploy_code/model.py
from djl_python import Input, Output
from transformers import AutoModel, AutoTokenizer
import logging

def load_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_location, trust_remote_code=True).half().cuda()
    model.eval()
    
    return model, tokenizer

model = None
tokenizer = None

def handle(inputs: Input):
    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()
    
    input_sentences = data["inputs"]
    params = data["parameters"]
    history = data["history"]
    
    response, history = model.chat(tokenizer, input_sentences, history=history, **params)
    
    result = {"outputs": response, "history" : history}
    return Output().add_as_json(result)

Writing LLM_chatglm2_deploy_code/model.py


* serving.properties 配置文件

In [10]:
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

option.s3url ==> s3://sagemaker-us-east-1-091166060467/LLM_chatglm2_model/


> 需要修改按照上述步骤的 s3url 修改 option.s3url

In [11]:
%%writefile LLM_chatglm2_deploy_code/serving.properties
engine=Python
option.tensor_parallel_degree=1
option.s3url = s3://sagemaker-us-east-1-091166060467/LLM_chatglm2_model/

Writing LLM_chatglm2_deploy_code/serving.properties


* 将配置文件压缩后上传 S3 存储桶

In [12]:
import tarfile

folder_path = 'LLM_chatglm2_deploy_code'
output_filename = 'model.tar.gz'

with tarfile.open(output_filename, "w:gz") as tar:
    tar.add(folder_path, arcname=os.path.basename(folder_path))

In [13]:
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-091166060467/LLM_chatglm2_deploy_code/model.tar.gz


### 4. 模型部署

In [14]:
from sagemaker.model import Model

def create_model(model_name, model_s3_url):
    model = Model(
        image_uri=inference_image_uri,
        model_data=model_s3_url,
        role=role,
        name=model_name,
        sagemaker_session=sess,
    )
    return model

In [15]:
from sagemaker import serializers, deserializers

def deploy_model(model, _endpoint_name):
    model.deploy(
        initial_instance_count=1,
        instance_type="ml.g4dn.2xlarge",
        endpoint_name=_endpoint_name
    )
    predictor = sagemaker.Predictor(
        endpoint_name=_endpoint_name,
        sagemaker_session=sess,
        serializer=serializers.JSONSerializer(),
        deserializer=deserializers.JSONDeserializer()
    )
    return predictor

In [16]:
from sagemaker.utils import name_from_base

_model_name = name_from_base(f"chatglm2") # Append a timestamp to the provided string
_model_s3_url = s3_code_artifact
_endpoint_name = f"{_model_name}-endpoint"

model = create_model(_model_name, _model_s3_url)
predictor = deploy_model(model, _endpoint_name)

--------------!

### 5. 模型测试

In [17]:
parameters = {
  "max_length": 4096,
  "temperature": 0.01,
  "top_p": 0.7,
}
history = [['你是气象专家智能问答机器人，了解各种气象知识和气象信息，可以自由回答问题，像人类一样思考和表达。当我向你提问时你必须使用，“您好，我是气象专家智能问答机器人”这句话作为开头','好的']]

In [18]:
prompts1 = """北京是不是夏天雨水比较多？"""

reponse = predictor.predict(
    {
        "inputs" : prompts1, 
        "parameters": parameters,
        "history" : history
    }
)
history.extend(reponse['history'])

In [19]:
print(reponse)

{'outputs': '您好，我是气象专家智能问答机器人。北京属于温带季风气候，夏季气温较高，降雨量较大。因此，北京在夏季雨水较多。', 'history': [['你是气象专家智能问答机器人，了解各种气象知识和气象信息，可以自由回答问题，像人类一样思考和表达。当我向你提问时你必须使用，“您好，我是气象专家智能问答机器人”这句话作为开头', '好的'], ['北京是不是夏天雨水比较多？', '您好，我是气象专家智能问答机器人。北京属于温带季风气候，夏季气温较高，降雨量较大。因此，北京在夏季雨水较多。']]}


In [20]:
# print(reponse['outputs'])

In [21]:
prompts2 = """你说的是真的吗？举个具体例子吧"""

reponse = predictor.predict(
    {
        "inputs" : prompts2, 
        "parameters": parameters,
        "history" : history
    }
)

In [22]:
print(reponse)

{'outputs': '好的，比如2021年7月，北京平均降水量达到了181.1毫米，属于历史同期最高值。', 'history': [['你是气象专家智能问答机器人，了解各种气象知识和气象信息，可以自由回答问题，像人类一样思考和表达。当我向你提问时你必须使用，“您好，我是气象专家智能问答机器人”这句话作为开头', '好的'], ['你是气象专家智能问答机器人，了解各种气象知识和气象信息，可以自由回答问题，像人类一样思考和表达。当我向你提问时你必须使用，“您好，我是气象专家智能问答机器人”这句话作为开头', '好的'], ['北京是不是夏天雨水比较多？', '您好，我是气象专家智能问答机器人。北京属于温带季风气候，夏季气温较高，降雨量较大。因此，北京在夏季雨水较多。'], ['你说的是真的吗？举个具体例子吧', '好的，比如2021年7月，北京平均降水量达到了181.1毫米，属于历史同期最高值。']]}


In [23]:
# print(reponse['outputs'])