# Deploy the chatglm model on Amazon SageMaker

Now that we have fine-tuned the model, we will show how to deploy it on SageMaker.

In this notebook, we explore how to host a large language model on SageMaker using the [Large Model Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints-large-model-inference.html) container that is optimized for hosting large models using DJLServing. DJLServing is a high-performance universal model serving solution powered by the Deep Java Library (DJL) that is programming language agnostic. To learn more about DJL and DJLServing, you can refer to our recent [blog post](https://aws.amazon.com/blogs/machine-learning/deploy-large-models-on-amazon-sagemaker-using-djlserving-and-deepspeed-model-parallel-inference/).

## Create a SageMaker Model for Deployment
As a first step, we'll import the relevant libraries and configure several global variables, such as the hosting image that will be used and the S3 location of our model artifacts.

In [None]:
import sagemaker
from sagemaker.model import Model
from sagemaker import serializers, deserializers
from sagemaker import image_uris
import boto3
import os
import time
import json
import jinja2
from pathlib import Path

In [None]:
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region through the session's public attribute instead of the
# private `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment

s3_client = boto3.client("s3")  # client to interact with the S3 API
sm_client = boto3.client("sagemaker")  # client to interact with SageMaker
smr_client = boto3.client("sagemaker-runtime")  # client to interact with SageMaker Endpoints
jinja_env = jinja2.Environment()  # jinja environment to generate model configuration templates

The container image below can be replaced with a newer version.

In [None]:
# Look up the DJL DeepSpeed inference image URI for the current region.
# `image_uris.retrieve` resolves the registry for the region; if the installed
# SageMaker SDK does not know this version/region combination it raises
# ValueError, in which case we fall back to the explicit URI.
try:
    djl_inference_image_uri = image_uris.retrieve(
        framework="djl-deepspeed", region=region, version="0.22.1"
    )
except ValueError:
    djl_inference_image_uri = (
        f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.22.1-deepspeed0.9.2-cu118"
    )

In [None]:
!pip install huggingface_hub

In [None]:
from pathlib import Path

from huggingface_hub import snapshot_download

# Download the model from the Hugging Face Hub. If the model is already in S3,
# this step can be skipped: go straight to the deploy section.
model_name = "THUDM/chatglm-6b"

# Only download pytorch checkpoint files
allow_patterns = ["*.json", "*.pt", "*.bin", "*.model", "*.py"]

# Local directory handed to `snapshot_download` as its cache directory.
local_cache_path = Path("./model")
local_cache_path.mkdir(exist_ok=True)

model_download_path = snapshot_download(
    repo_id=model_name, cache_dir=local_cache_path, allow_patterns=allow_patterns
)

In [None]:
# Get the model files path: the directory under ./model that holds config.json
import os
from glob import glob

local_model_path = None

for root, _dirs, files in os.walk("./model"):
    if "config.json" in files:
        print(os.path.join(root, "config.json"))
        # Joining with "" yields the directory with a trailing separator
        # (the same value the original obtained by slicing off "config.json").
        local_model_path = os.path.join(root, "")
        print(local_model_path)

if local_model_path is None:
    print("Model download may failed, please check prior step!")

In [None]:
%%script env sagemaker_default_bucket=$bucket local_model_path=$local_model_path bash

# Stop at the first failing command so the local copy of the model is only
# removed after the upload to S3 has succeeded.
set -e

chmod +x ./s5cmd
./s5cmd sync "${local_model_path}" "s3://${sagemaker_default_bucket}/chatglm-6b/"

rm -rf model

In [None]:
# S3 prefix that holds the model artifacts; change this if your model files
# live at a different S3 path.
pretrained_model_location = f"s3://{bucket}/chatglm-6b/"
print(f"Pretrained model will be downloaded from ---- > {pretrained_model_location}")

## Let's try DeepSpeed for LMI.

In [None]:
def deploy_model(image_uri, model_data, role, endpoint_name, instance_type, sagemaker_session):
    """Create the SageMaker Model and Endpoint and return a JSON predictor for it.

    Args:
        image_uri: URI of the inference container image.
        model_data: S3 location of the model/code tarball.
        role: IAM execution role used by the model.
        endpoint_name: Name of the endpoint to create.
        instance_type: Instance type hosting the endpoint.
        sagemaker_session: Session used for both the model and the predictor.

    Returns:
        A ``sagemaker.Predictor`` bound to ``endpoint_name`` that serializes
        requests and deserializes responses as JSON.
    """
    model = Model(
        image_uri=image_uri,
        model_data=model_data,
        role=role,
        # Use the caller's session so the model and the predictor talk to the
        # same account/region instead of an implicitly created default session.
        sagemaker_session=sagemaker_session,
    )

    model.deploy(
        initial_instance_count=1,
        instance_type=instance_type,
        endpoint_name=endpoint_name,
        # Allow up to 10 minutes for the container to pass its health check.
        container_startup_health_check_timeout=60 * 10,
    )

    # our requests and responses will be in json format so we specify the serializer and the deserializer
    predictor = sagemaker.Predictor(
        endpoint_name=endpoint_name,
        sagemaker_session=sagemaker_session,
        serializer=serializers.JSONSerializer(),
        deserializer=deserializers.JSONDeserializer(),
    )

    return predictor

In [None]:
# Start from an empty directory for the serving code.
!rm -rf deepspeed_src && mkdir deepspeed_src

In [None]:
%%writefile deepspeed_src/serving.template
# Template for serving.properties; the s3url placeholder below is filled in
# with the S3 location of the model artifacts when the template is rendered.
engine=DeepSpeed
option.s3url={{ s3url }}
option.tensor_parallel_degree=1
# Keep batch_size in sync with the batch_size global defined in model.py.
batch_size=4
# NOTE(review): presumably a delay in milliseconds - confirm against the DJLServing docs.
max_batch_delay=50

In [None]:
# we plug in the appropriate model location into our `serving.properties` file based on the region in which this notebook is running
template = jinja_env.from_string(Path("deepspeed_src/serving.template").open().read())
Path("deepspeed_src/serving.properties").open("w").write(template.render(s3url=pretrained_model_location))
!pygmentize deepspeed_src/serving.properties | cat -n

In [None]:
%%writefile deepspeed_src/model.py
from djl_python import Input, Output
import os
import logging
import torch
import deepspeed
import transformers
# from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline, AutoTokenizer, AutoModel
import re

# Lazily initialised by `handle` on the first request.
model = None
tokenizer = None
# This global must match the `batch_size` value in the serving.properties file.
batch_size = 4

def process_response(response):
    """Clean up generated text.

    Strips surrounding whitespace, replaces the ``[[训练时间]]`` placeholder
    with ``2023年``, and converts ASCII punctuation (``, ! : ; ?``) to its
    full-width form whenever it directly follows or precedes a character in
    the ``\\u4e00-\\u9fff`` range.

    Args:
        response: Raw decoded model output.

    Returns:
        The cleaned-up string.
    """
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    # (ASCII mark, full-width replacement)
    punkts = [
        (",", "，"),
        ("!", "！"),
        (":", "："),
        (";", "；"),
        ("?", "？"),
    ]
    cjk = r"[\u4e00-\u9fff]"
    for ascii_mark, wide_mark in punkts:
        # re.escape avoids hand-written escapes such as "\?" in a non-raw
        # string, which is an invalid escape sequence in Python.
        escaped = re.escape(ascii_mark)
        response = re.sub(r"(%s)%s" % (cjk, escaped), r"\1%s" % wide_mark, response)
        response = re.sub(r"%s(%s)" % (escaped, cjk), r"%s\1" % wide_mark, response)
    return response

def load_model(properties):
    """Load the tokenizer and the model, and wrap the model with DeepSpeed inference.

    Args:
        properties: Mapping of serving properties. Reads ``model_dir``,
            optionally ``model_id`` (which takes precedence as the model
            location when present) and ``tensor_parallel_degree``.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the object returned
        by ``deepspeed.init_inference``.
    """
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # Property values may arrive as strings; `mp_size` is a count, so convert
    # explicitly and default to a single device when the key is absent.
    tensor_parallel = int(properties.get("tensor_parallel_degree", 1))

    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)

    model = AutoModel.from_pretrained(
        model_location,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    print("----------model dtype is {0}---------".format(model.dtype))

    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=torch.half,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )

    # Inference only: no gradients, evaluation mode.
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer

def _generate(prompts, gen_kwargs):
    """Generate one post-processed completion for each prompt in the list `prompts`."""
    # Pad so that prompts of different lengths can share one tensor.
    encoded = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
    # Every row has the same (padded) prompt length; drop it from each output row.
    prompt_len = encoded["input_ids"].shape[1]
    generated = model.generate(**encoded, **gen_kwargs)
    return [
        process_response(tokenizer.decode(row[prompt_len:]))
        for row in generated.tolist()
    ]


def handle(inputs: Input) -> "Output | None":
    """Entry point called by the model server for every request.

    Loads the model on first use, then serves one of three request shapes:

    * empty request (warm-up): returns ``None``;
    * server-side dynamic batch: returns an ``Output`` holding one
      ``generate_text`` entry per batch item;
    * single request with JSON body ``{"input": str | list[str], "params": {...}}``:
      returns ``{"code": 0, "msg": "ok", "data": ...}`` where ``data`` is a
      string for a string input and a list of strings for a list input.
      ``params`` entries override the default generation arguments.

    Any failure is reported as ``{"code": -1, "msg": <error text>}``.
    """
    global model, tokenizer

    try:
        if model is None:
            model, tokenizer = load_model(inputs.get_properties())

        if inputs.is_empty():
            # Model server makes an empty call to warmup the model on startup
            return None

        if inputs.is_batch():
            # Server-side batching: each batch item carries a single prompt.
            bs = inputs.get_batch_size()
            logging.info(f"Dynamic batching size: {bs}.")
            prompts = [item.get_as_json().get("input") for item in inputs.get_batches()]

            # For server-side batches the generation parameters are fixed.
            gen_kwargs = {"max_length": 128, "do_sample": True, "temperature": 1.0}
            results = _generate(prompts, gen_kwargs)

            outputs = Output()
            # One output per batch item (not one per character of a single string).
            for i, text in enumerate(results):
                outputs.add(text, key="generate_text", batch_index=i)
            return outputs

        request = inputs.get_as_json()
        data = request.get("input")
        if not data:
            return Output().add_as_json({"code": -1, "msg": "input field can't be null"})
        params = request.get("params") or {}

        # Client-side batch: a list of prompts is processed in chunks of at
        # most `batch_size` prompts.
        if isinstance(data, str):
            prompts = [data]
        elif isinstance(data, list):
            prompts = data
        else:
            return Output().add_as_json({"code": -1, "msg": "input has wrong type"})

        bs = min(len(prompts), batch_size)
        print("client side batch size is ", bs)

        # Defaults, overridable through the request's "params" field.
        gen_kwargs = {"max_length": 50, "do_sample": True, "temperature": 0.5}
        gen_kwargs.update(params)

        results = []
        for start in range(0, len(prompts), bs):
            results.extend(_generate(prompts[start:start + bs], gen_kwargs))

        result = results[0] if isinstance(data, str) else results
        return Output().add_as_json({"code": 0, "msg": "ok", "data": result})
    except Exception as e:
        logging.exception("Inference request failed")
        # The exception object itself is not JSON serializable; send its text.
        return Output().add_as_json({"code": -1, "msg": str(e)})

In [None]:
# Package the serving code directory into a gzip-compressed tarball.
!tar -czvf ds_model.tar.gz deepspeed_src/

In [None]:
# S3 key prefix under which the packaged serving code is uploaded.
s3_code_prefix = "chatglm/deploy/code"

code_artifact = sess.upload_data(
    path="ds_model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

In [None]:
# Derive a unique endpoint name from a fixed base name.
endpoint_base_name = "chatglm-6B-ds"
endpoint_name = sagemaker.utils.name_from_base(endpoint_base_name)
print(f"Our endpoint will be called {endpoint_name}")

In [None]:
# Deployment will take about 10 minutes.
ds_predictor = deploy_model(
    image_uri=djl_inference_image_uri,
    model_data=code_artifact,
    role=role,
    endpoint_name=endpoint_name,
    instance_type="ml.g4dn.xlarge",
    sagemaker_session=sess,
)

In [None]:
%%time
ds_predictor.predict({ 
                    "input" : "请介绍下你自己", 
#                     "params": { "max_length": 50, "temperature": 0.5}
                })

## Clear resources

In [None]:
# Delete the SageMaker model as well as the endpoint so nothing is left behind.
# The model is removed first because the predictor presumably resolves the
# model name through the endpoint configuration, which `delete_endpoint()`
# removes by default - confirm against the SDK docs.
ds_predictor.delete_model()
ds_predictor.delete_endpoint()

-------

[sagemaker-hosting/Large-Language-Model-Hosting/](https://github.com/aws-samples/sagemaker-hosting/tree/main/Large-Language-Model-Hosting)