### 1. 安装依赖（huggingface-hub、sagemaker）并下载模型到本地

In [None]:
# Install/upgrade the Hugging Face Hub client (-qq silences pip output) and the SageMaker Python SDK.
!pip install huggingface-hub -Uqq
!pip install -U sagemaker

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Local directory passed to snapshot_download as its cache_dir; created if it does not exist yet.
local_model_path = Path("./LLM_chatglm2_model")
local_model_path.mkdir(exist_ok=True)
# Hugging Face Hub repo id and the exact revision (commit hash) to download.
model_name = "THUDM/chatglm2-6b"
commit_hash = "b259b27320263629b0afccef134c54028233673d"

In [None]:
# Download the pinned revision of the model repo into the local cache directory.
snapshot_download(repo_id=model_name, revision=commit_hash, cache_dir=local_model_path)

### 2. 把模型拷贝到S3为后续部署做准备

In [None]:
import sagemaker
from sagemaker.model import Model
from sagemaker import serializers, deserializers
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # IAM execution role used for the model/endpoint
sess = sagemaker.session.Session()  # SageMaker session for interacting with the AWS APIs
bucket = sess.default_bucket()  # default S3 bucket that houses the artifacts

# Read the region through the public attribute instead of the private `_region_name`.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, SageMaker control plane, and SageMaker runtime (invocations).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
import jinja2
from pathlib import Path
jinja_env = jinja2.Environment() # Jinja environment used to render the serving.properties template

In [None]:
s3_model_prefix = "LLM-RAG/workshop/LLM_chatglm2_model"  # S3 prefix the model checkpoint is copied to
s3_code_prefix = "LLM-RAG/workshop/LLM_chatglm2_sb_deploy_code"  # S3 prefix for the packaged serving code

# The glob expects a ".../snapshots/<revision>/" directory somewhere under the
# download directory. Fail with a clear message instead of a bare IndexError
# when nothing has been downloaded yet.
snapshot_dirs = list(local_model_path.glob("**/snapshots/*"))
if not snapshot_dirs:
    raise FileNotFoundError(
        f"No model snapshot found under {local_model_path}; run the download cell first."
    )
model_snapshot_path = snapshot_dirs[0]

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Recursively copy the downloaded snapshot directory to the model prefix in the S3 bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

In [None]:
# S3 URI (with trailing slash) of the uploaded model; it is rendered into serving.properties later on.
s3_model_location = "s3://{}/{}/".format(bucket, s3_model_prefix)
print(f"s3_model_location => {s3_model_location}")

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# Build the DJL inference (DeepSpeed) image URI for the current region; the registry account and tag are hard-coded
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118"
)

# For the AWS China regions, replace it with the image_uri below
# inference_image_uri = (
#     f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"
# )

print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
# Create the local directory that holds the serving code and configuration (no error if it already exists).
!mkdir -p LLM_chatglm2_sb_deploy_code

In [None]:
%%writefile LLM_chatglm2_sb_deploy_code/serving.template
# Template for serving.properties; the s3url placeholder is replaced with the model's S3 location when rendered.
engine=DeepSpeed
option.entryPoint=model.py
option.tensor_parallel_degree=1
option.s3url={{ s3url }}
# Keep in sync with the batch_size global in model.py.
batch_size=4
# NOTE(review): presumably the maximum wait for filling a batch, in milliseconds - confirm against the DJL Serving docs.
max_batch_delay=1000
#option.task=text-generation
#option.device_map=auto
#option.load_in_8bit=TRUE #lyb: quantize directly?

In [None]:
# we plug in the appropriate model location into our `serving.properties` file based on the region in which this notebook is running
template = jinja_env.from_string(Path("LLM_chatglm2_sb_deploy_code/serving.template").open().read())
Path("LLM_chatglm2_sb_deploy_code/serving.properties").open("w").write(template.render(s3url=s3_model_location))
!pygmentize LLM_chatglm2_sb_deploy_code/serving.properties | cat -n

In [None]:
%%writefile LLM_chatglm2_sb_deploy_code/model.py
from djl_python import Input, Output
import os
import logging
import torch
import deepspeed
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel
import re

# Populated by handle() on the first request.
model = None
# Must be set to the same value as batch_size in serving.properties; handle() uses it to cap the client-side batch size it reports.
batch_size = 4

def process_response(response):
    """Clean up a decoded generation for display.

    Strips surrounding whitespace, replaces the ``[[训练时间]]`` placeholder
    with ``2023年``, and converts the ASCII punctuation marks ``, ! : ; ?`` to
    their full-width forms when they directly follow or precede a CJK
    ideograph (U+4E00-U+9FFF).

    Args:
        response: Decoded model output.

    Returns:
        The cleaned-up string.
    """
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    # (regex for the ASCII mark, full-width replacement). Raw strings keep the
    # escaped question mark a valid pattern without relying on an invalid
    # string escape sequence.
    punkts = [
        (r",", "，"),
        (r"!", "！"),
        (r":", "："),
        (r";", "；"),
        (r"\?", "？"),
    ]
    for ascii_mark, full_width in punkts:
        # Mark right after an ideograph first, then mark right before one.
        response = re.sub(r"([\u4e00-\u9fff])%s" % ascii_mark, r"\1%s" % full_width, response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % ascii_mark, r"%s\1" % full_width, response)
    return response

def load_model(properties):
    """Load the tokenizer and model and wrap the model for DeepSpeed inference.

    Args:
        properties: Mapping of serving properties. Reads ``model_dir``,
            ``tensor_parallel_degree`` and, when present, ``model_id`` (which
            takes precedence over ``model_dir`` as the load location).

    Returns:
        A ``(model, tokenizer)`` tuple. The model is the object returned by
        ``deepspeed.init_inference``, switched to eval mode with gradients
        disabled.

    Raises:
        KeyError: If ``model_dir`` or ``tensor_parallel_degree`` is missing.
        ValueError: If ``tensor_parallel_degree`` is not an integer value.
    """
    model_location = properties['model_dir']
    # Property values may arrive as strings; mp_size is meant to be an integer.
    tensor_parallel = int(properties["tensor_parallel_degree"])

    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)

    # Load the weights in half precision.
    model = AutoModel.from_pretrained(
        model_location,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    print("----------model dtype is {0}---------".format(model.dtype))

    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=torch.half,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )

    # Inference only: no gradients, eval mode.
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer

def handle(inputs: Input) -> "Optional[Output]":
    """Serve one request from the model server.

    The model and tokenizer are loaded on the first call and cached in module
    globals. Two request shapes are handled:

    * Server-side (dynamic) batch: each batched item contributes its
      ``inputs`` field, fixed generation settings are used, and every decoded
      continuation is added to the output under the ``generate_text`` key.
    * Single request: ``inputs`` must be a non-empty string or list, and the
      optional ``parameters`` dict is forwarded to ``model.generate``. The
      reply is ``{"code": 0, "msg": "ok", "data": <text>}``; only the first
      sequence's continuation is returned.

    Args:
        inputs: Request object supplied by the model server.

    Returns:
        ``None`` for the empty warm-up call, otherwise an ``Output``. Failures
        are reported as ``{"code": -1, "msg": <reason>}`` instead of raised.
    """
    global model, tokenizer

    try:
        # Identity check: "not yet loaded" is exactly the None sentinel.
        if model is None:
            model, tokenizer = load_model(inputs.get_properties())

        if inputs.is_empty():
            # Model server makes an empty call to warmup the model on startup
            return None

        if inputs.is_batch():
            # The demo code is only suitable for a single sample per client request.
            bs = inputs.get_batch_size()
            logging.info(f"Dynamic batching size: {bs}.")
            prompts = [item.get_as_json().get("inputs") for item in inputs.get_batches()]

            # Prompts in one batch generally differ in length, so they must be
            # padded to form a rectangular tensor.
            encoded = tokenizer(prompts, return_tensors="pt", padding=True).to('cuda')

            # For server-side batches the generation parameters are fixed
            # rather than taken from the individual requests.
            gen_kwargs = {"max_length": 512, "do_sample": True, "temperature": 1.0}
            results = model.generate(**encoded, **gen_kwargs)

            outputs = Output()
            for i, sequence in enumerate(results):
                # Drop the (padded) prompt tokens and keep only the continuation.
                # NOTE(review): assumes the tokenizer pads on the left - confirm.
                prompt_len = len(encoded["input_ids"][i])
                decode_result = tokenizer.decode(sequence[prompt_len:])
                outputs.add(decode_result, key="generate_text", batch_index=i)

            return outputs

        payload = inputs.get_as_json()
        data = payload.get("inputs")
        if not data:
            return Output().add_as_json({"code": -1, "msg": "input field can't be null"})

        # An explicit JSON null for "parameters" must not break the ** expansion below.
        params = payload.get("parameters") or {}

        # For pure client-side batches.
        if isinstance(data, str):
            bs = 1
        elif isinstance(data, list):
            bs = min(len(data), batch_size)
        else:
            return Output().add_as_json({"code": -1, "msg": "input has wrong type"})

        print("client side batch size is ", bs)

        # Padding is a no-op for a single string and lets a list of prompts be tokenized.
        encoded = tokenizer(data, return_tensors="pt", padding=True).to('cuda')

        generated = model.generate(**encoded, **params)
        # Keep only the newly generated tokens of the first sequence.
        new_tokens = generated.tolist()[0][len(encoded["input_ids"][0]):]
        result = process_response(tokenizer.decode(new_tokens))
        return Output().add({"code": 0, "msg": "ok", "data": result})
    except Exception as e:
        # An exception object is not JSON-serialisable: log the traceback and
        # report its text so the error reply itself cannot fail.
        logging.exception("Inference request failed")
        return Output().add_as_json({"code": -1, "msg": str(e)})

In [None]:
%%writefile LLM_chatglm2_sb_deploy_code/requirements.txt
transformers==4.28.1

In [None]:
!rm model.tar.gz
!cd LLM_chatglm2_sb_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf model.tar.gz LLM_chatglm2_sb_deploy_code

In [None]:
# Upload the archive under the code prefix of the bucket; the returned value is used below as the model's ModelDataUrl.
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# The model name is hard-coded here (the name_from_base call is left commented out); the endpoint
# config and endpoint names further down are derived from it, and the cleanup cell uses the same literal names.
model_name = "chatglm2-sb-2023-07-26-02-32-59-520"#name_from_base(f"chatglm2-sb") #Note: Need to specify model_name
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Register the model: the container image plus the uploaded code archive as its model data.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

In [None]:
# Endpoint config and endpoint names are derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Note: ml.g4dn.2xlarge is also an option
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.2xlarge",
            "InitialInstanceCount": 2,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,  # 15 minutes, expressed in seconds
        },
    ],
)
# Bare expression: displays the API response as the cell output.
endpoint_config_response

In [None]:
# Start creating the endpoint from the endpoint configuration and report its ARN.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print("Created Endpoint: {}".format(create_endpoint_response["EndpointArn"]))

#### 持续检测模型部署进度

In [None]:
import time

# Poll the endpoint once a minute, printing each observed status, until it is no longer "Creating".
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

# Final summary of the endpoint.
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

### 5. 模型测试

In [None]:
%%time
import json
import boto3

# Runtime client used to invoke the endpoint.
smr_client = boto3.client("sagemaker-runtime")

# Generation parameters sent with the request; for single requests model.py forwards them to model.generate().
parameters = {
  "max_length": 2048,
  "temperature": 0.01,
  "num_beams": 1, # values >1 may fail with "probability tensor contains either `inf`, `nan` or element < 0"; even remove_invalid_values=True does not fix it
  "do_sample": False,
  "top_p": 0.7,
  "logits_processor" : None,
  # "remove_invalid_values" : True
}

In [None]:
# Sample prompt: an FAQ excerpt followed by a question about it.
prompts1 = """AWS Clean Rooms 的FAQ文档有提到 Q: 是否发起者和数据贡献者都会被收费？A: 是单方收费，只有查询的接收方会收费。
请问AWS Clean Rooms是多方都会收费吗？
"""
# Send the prompt and generation parameters as a JSON body.
# Note: the handler in model.py only reads "inputs" and "parameters"; "history" is not used there.
response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
            {
                "inputs": prompts1,
                "parameters": parameters,
                "history" : []
            }
            ),
            ContentType="application/json",
        )

# Read the response stream and decode it as UTF-8; shown as the cell output.
response_model['Body'].read().decode('utf8')

#### 清除 Endpoint、Endpoint config 和模型

In [None]:
!aws sagemaker delete-endpoint --endpoint-name chatglm2-sb-2023-07-26-02-32-59-520-endpoint
!aws sagemaker delete-endpoint-config --endpoint-config-name chatglm2-sb-2023-07-26-02-32-59-520-config
!aws sagemaker delete-model --model-name chatglm2-sb-2023-07-26-02-32-59-520