In [349]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

#### 参数定义

In [350]:
role = sagemaker.get_execution_role() 
sess = sagemaker.session.Session() 
bucket = sess.default_bucket()  
model_bucket = sess.default_bucket()  

s3_code_prefix_accelerate = "baichuan/Baichuan2-7B-Base-Code"  

#模型需要提前上传到这个路径
s3_model_prefix = "baichuan/Baichuan2-7B-Base"

region = sess._region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [351]:
pretrained_model_location = f"s3://{model_bucket}/{s3_model_prefix}/"
print(f"Pretrained model will be uploaded to ---- > {pretrained_model_location}")

Pretrained model will be uploaded to ---- > s3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7B-Base/


#### 创建推理代码

In [352]:
!rm -rf code_baichuan2-7b-accelerate
!mkdir -p code_baichuan2-7b-accelerate

In [353]:
%%writefile ./code_baichuan2-7b-accelerate/serving.properties
#engine=DeepSpeed
engine=Python
#engine=FasterTransformer
#option.entryPoint=djl_python.huggingface
#option.entryPoint=djl_python.fastertransformer
#option.entryPoint=djl_python.deepspeed
#option.task=text-generation
#option.dtype=bf16
option.tensor_parallel_degree=4
#option.trust_remote_code=true
#option.low_cpu_mem_usage=true
#option.device_map=auto
option.s3url = {{s3url}}


Writing ./code_baichuan2-7b-accelerate/serving.properties


In [354]:
%%writefile ./code_baichuan2-7b-accelerate/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.29.1
cpm_kernels
xformers
einops
accelerate>=0.17.1
transformers_stream_generator

Writing ./code_baichuan2-7b-accelerate/requirements.txt


In [355]:
template = jinja_env.from_string(Path("code_baichuan2-7b-accelerate/serving.properties").open().read())
Path("code_baichuan2-7b-accelerate/serving.properties").open("w").write(
    template.render(s3url=pretrained_model_location)
)
!pygmentize code_baichuan2-7b-accelerate/serving.properties | cat -n

     1	[37m#engine=DeepSpeed[39;49;00m[37m[39;49;00m
     2	[36mengine[39;49;00m=[33mPython[39;49;00m[37m[39;49;00m
     3	[37m#engine=FasterTransformer[39;49;00m[37m[39;49;00m
     4	[37m#option.entryPoint=djl_python.huggingface[39;49;00m[37m[39;49;00m
     5	[37m#option.entryPoint=djl_python.fastertransformer[39;49;00m[37m[39;49;00m
     6	[37m#option.entryPoint=djl_python.deepspeed[39;49;00m[37m[39;49;00m
     7	[37m#option.task=text-generation[39;49;00m[37m[39;49;00m
     8	[37m#option.dtype=bf16[39;49;00m[37m[39;49;00m
     9	[36moption.tensor_parallel_degree[39;49;00m=[33m4[39;49;00m[37m[39;49;00m
    10	[37m#option.trust_remote_code=true[39;49;00m[37m[39;49;00m
    11	[37m#option.low_cpu_mem_usage=true[39;49;00m[37m[39;49;00m
    12	[37m#option.device_map=auto[39;49;00m[37m[39;49;00m
    13	[36moption.s3url[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33ms3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7

In [356]:
%%writefile ./code_baichuan2-7b-accelerate/model.py
from djl_python import Input, Output
import os
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import warnings
import logging
import deepspeed


model = None
tokenizer = None

def load_model(properties):
        
    mp_size = int(properties["tensor_parallel_degree"])

    model_location = properties["model_dir"]
    if "model_id" in properties:
        model_location = properties["model_id"]
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location, use_fast=False, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_location, low_cpu_mem_usage=True, device_map="auto", trust_remote_code=True)
    
    return model, tokenizer


def handle(inputs: Input):

    logging.info("inference request received")
    
    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    
    data = inputs.get_as_json()
    input_sentences = data["inputs"]
    params = data["parameters"]
    
    inputs = tokenizer(input_sentences, return_tensors='pt')    
    inputs = inputs.to('cuda:0')
    
    pred = model.generate(**inputs, **params)
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
    
    logging.info('inference request completed: ' + response)
    
    return Output().add_as_json(response)


Writing ./code_baichuan2-7b-accelerate/model.py


#### 上传代码到S3

In [357]:
!rm -f model.tar.gz
!tar czvf model.tar.gz -C code_baichuan2-7b-accelerate .

s3_code_artifact_accelerate = sess.upload_data("model.tar.gz", bucket, s3_code_prefix_accelerate)
print(f"S3 Code or Model tar for accelerate uploaded to --- > {s3_code_artifact_accelerate}")

./
./serving.properties
./model.py
./requirements.txt
S3 Code or Model tar for accelerate uploaded to --- > s3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7B-Base-Code/model.tar.gz


#### 部署推理节点


In [358]:
# inference_image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/djl-ds:latest"
inference_image_uri = (
    #f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.24.0-deepspeed0.10.0-cu118"
    f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117"
)
print(f"Image going to be used is ---- > {inference_image_uri}")


Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [359]:
model_name_acc = name_from_base(f"baichuan2-7b-model-acc")
print(model_name_acc)

baichuan2-7b-model-acc-2023-11-03-07-44-59-762


In [360]:
model_environment = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT':'600', 
    'SAGEMAKER_MODEL_SERVER_WORKERS':'1', 
#    'PYTORCH_CUDA_ALLOC_CONF':'max_split_size_mb:1024'
}

create_model_response = sm_client.create_model(
    ModelName=model_name_acc,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri, 
        "ModelDataUrl": s3_code_artifact_accelerate,
        "Environment": model_environment
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

Created Model: arn:aws-cn:sagemaker:cn-north-1:086238767671:model/baichuan2-7b-model-acc-2023-11-03-07-44-59-762


In [361]:
model_name = model_name_acc
print(f"Building EndpointConfig and Endpoint for: {model_name}")

Building EndpointConfig and Endpoint for: baichuan2-7b-model-acc-2023-11-03-07-44-59-762


In [362]:
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "baichuan2-7b",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.12xlarge",
            "InitialInstanceCount": 1,
            "ContainerStartupHealthCheckTimeoutInSeconds": 10*60,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint-config/baichuan2-7b-model-acc-2023-11-03-07-44-59-762-config',
 'ResponseMetadata': {'RequestId': '468f058d-21b4-45b6-8500-a643e4df537d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '468f058d-21b4-45b6-8500-a643e4df537d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '138',
   'date': 'Fri, 03 Nov 2023 07:45:01 GMT'},
  'RetryAttempts': 0}}

In [363]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/baichuan2-7b-model-acc-2023-11-03-07-44-59-762-endpoint


In [364]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/baichuan2-7b-model-acc-2023-11-03-07-44-59-762-endpoint
Status: InService


#### 推理测试

In [365]:
import json
import boto3

smr_client = boto3.client("sagemaker-runtime")

parameters={
    "max_new_tokens" : 100,
    "repetition_penalty" : 1.1
}

prompt_test = '我有10元钱，买了一瓶水花了5元钱，我现在还剩下多少钱？'

response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
                {
                    "inputs": prompt_test,
                    "parameters": parameters
                }
            ),
            ContentType="application/json",
        )

response_model['Body'].read().decode('utf8')

'"我有10元钱，买了一瓶水花了5元钱，我现在还剩下多少钱？\\n解：10-5=5（元）答：我还有5元钱．"'

#### 清理资源

In [366]:
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': 'a7da0f97-35c2-4cbb-8333-85091e2383c9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a7da0f97-35c2-4cbb-8333-85091e2383c9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Fri, 03 Nov 2023 07:55:03 GMT'},
  'RetryAttempts': 0}}