In [138]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

#### 参数定义

In [139]:
# IAM role and SageMaker session shared by the cells below.
role = sagemaker.get_execution_role()
sess = sagemaker.session.Session()

# Bucket for the packaged inference code and bucket holding the model weights;
# both are the session's default bucket here.
bucket = sess.default_bucket()
model_bucket = sess.default_bucket()

# Key prefix under which the inference code archive (model.tar.gz) is uploaded.
s3_code_prefix_accelerate = "baichuan/Baichuan2-7B-Chat-Code"

# The model weights must already be uploaded to this prefix.
s3_model_prefix = "baichuan/Baichuan2-7B-Chat"

# Public Session attribute; replaces the private `sess._region_name`.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

# Used to render the {{s3url}} placeholder in serving.properties.
jinja_env = jinja2.Environment()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [140]:
# S3 URI of the model weights; later rendered into serving.properties as option.s3url.
pretrained_model_location = "s3://{}/{}/".format(model_bucket, s3_model_prefix)
print("Pretrained model will be uploaded to ---- > " + pretrained_model_location)

Pretrained model will be uploaded to ---- > s3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7B-Chat/


#### 创建推理代码

In [141]:
# Start from an empty directory for the inference code bundle.
!rm -rf code_baichuan2-7b-accelerate
!mkdir -p code_baichuan2-7b-accelerate

In [142]:
%%writefile ./code_baichuan2-7b-accelerate/serving.properties

engine=Python
option.tensor_parallel_degree=2
option.s3url = {{s3url}}
option.task = text-generation

option.max_idle_time = 86400
option.predict_timeout = 600

Writing ./code_baichuan2-7b-accelerate/serving.properties


In [143]:
%%writefile ./code_baichuan2-7b-accelerate/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
regex==2023.10.3 
xformers
safetensors==0.4.0
sympy==1.12 
tqdm==4.66.1
transformers==4.33.1
accelerate==0.24.1
colorama
bitsandbytes
sentencepiece
transformers_stream_generator==0.0.4
cpm_kernels
tokenizers==0.13.3

Writing ./code_baichuan2-7b-accelerate/requirements.txt


In [144]:
template = jinja_env.from_string(Path("code_baichuan2-7b-accelerate/serving.properties").open().read())
Path("code_baichuan2-7b-accelerate/serving.properties").open("w").write(
    template.render(s3url=pretrained_model_location)
)
!pygmentize code_baichuan2-7b-accelerate/serving.properties | cat -n

     1	[36mengine[39;49;00m=[33mPython[39;49;00m[37m[39;49;00m
     2	[36moption.tensor_parallel_degree[39;49;00m=[33m2[39;49;00m[37m[39;49;00m
     3	[36moption.s3url[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33ms3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7B-Chat/[39;49;00m[37m[39;49;00m
     4	[36moption.task[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33mtext-generation[39;49;00m[37m[39;49;00m
     5	[37m[39;49;00m
     6	[36moption.max_idle_time[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m86400[39;49;00m[37m[39;49;00m
     7	[36moption.predict_timeout[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m600[39;49;00m[37m[39;49;00m


In [145]:
%%writefile ./code_baichuan2-7b-accelerate/model.py
from djl_python import Input, Output
import os
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import warnings
import logging
import deepspeed


# Module-level cache: filled by the first handle() call and reused afterwards.
model = None
tokenizer = None
config = None

def load_model(properties):
    """Load the tokenizer, the causal-LM model and its default generation config.

    Args:
        properties: Mapping of serving properties. ``model_dir`` is required;
            ``model_id`` takes precedence over it when present.

    Returns:
        Tuple ``(model, tokenizer, generation_config)``.
    """
    # The `tensor_parallel_degree` property used to be parsed into a local that
    # was never read; device placement is requested via device_map="auto" below.
    model_location = properties["model_dir"]
    if "model_id" in properties:
        model_location = properties["model_id"]
    logging.info(f"Loading model in {model_location}")

    # NOTE(review): trust_remote_code=True is presumably needed because the
    # chat() method used by handle() comes from the checkpoint's own code.
    tokenizer = AutoTokenizer.from_pretrained(
        model_location, use_fast=False, trust_remote_code=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_location,
        low_cpu_mem_usage=True,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )

    # Attach the checkpoint's generation defaults so callers can copy them.
    model.generation_config = GenerationConfig.from_pretrained(
        model_location, trust_remote_code=True
    )

    return model, tokenizer, model.generation_config


def handle(inputs: Input):
    """Serve one single-turn chat request.

    The model is loaded on the first call and cached in module globals. The
    JSON body must contain ``inputs`` (the user message); ``parameters`` is an
    optional dict of generation-config overrides.

    Args:
        inputs: Request object providing ``get_properties()``, ``is_empty()``
            and ``get_as_json()``.

    Returns:
        ``None`` when the input is empty, otherwise an ``Output`` carrying the
        model's reply as JSON.

    Raises:
        KeyError: If the JSON body has no ``inputs`` key.
    """
    global model, tokenizer, config

    logging.info("inference request received")

    # Identity check: the cache is "empty" only while it is still None.
    if model is None:
        model, tokenizer, config = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None

    data = inputs.get_as_json()
    input_sentences = data["inputs"]
    # `parameters` may be omitted or null; fall back to the model defaults
    # instead of failing with KeyError.
    params = data.get("parameters") or {}

    messages = [{"role": "user", "content": input_sentences}]

    # Work on a copy so per-request overrides never leak into the cached config.
    temp_config = GenerationConfig.from_dict(config.to_dict())
    temp_config.update(**params)

    # NOTE(review): the third positional argument is presumably `stream` --
    # confirm against the model's chat() signature.
    response = model.chat(tokenizer, messages, False, temp_config)

    logging.info('inference request completed: ' + response)

    return Output().add_as_json(response)


Writing ./code_baichuan2-7b-accelerate/model.py


#### 上传代码到S3

In [146]:
!rm -f model.tar.gz
!tar czvf model.tar.gz -C code_baichuan2-7b-accelerate .

s3_code_artifact_accelerate = sess.upload_data("model.tar.gz", bucket, s3_code_prefix_accelerate)
print(f"S3 Code or Model tar for accelerate uploaded to --- > {s3_code_artifact_accelerate}")

./
./serving.properties
./model.py
./requirements.txt
S3 Code or Model tar for accelerate uploaded to --- > s3://sagemaker-cn-north-1-086238767671/baichuan/Baichuan2-7B-Chat-Code/model.tar.gz


#### 部署推理节点


In [147]:
# Container image for the endpoint. The registry host is hard-coded with the
# `.amazonaws.com.cn` suffix; only the region is substituted.
inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.25.0-deepspeed0.11.0-cu118"
print("Image going to be used is ---- > " + inference_image_uri)


Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.25.0-deepspeed0.11.0-cu118


In [148]:
# Derive a unique model name from the fixed base string.
model_name_acc = name_from_base("baichuan2-7b-model-acc")
print(model_name_acc)

baichuan2-7b-model-acc-2024-01-04-18-59-59-382


In [149]:

# The container definition points at the image and at the uploaded code archive.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact_accelerate,
}
create_model_response = sm_client.create_model(
    ModelName=model_name_acc,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print("Created Model: " + model_arn)

Created Model: arn:aws-cn:sagemaker:cn-north-1:086238767671:model/baichuan2-7b-model-acc-2024-01-04-18-59-59-382


In [150]:
# Alias used by the endpoint-config, endpoint and cleanup cells.
model_name = model_name_acc
print("Building EndpointConfig and Endpoint for: " + model_name)

Building EndpointConfig and Endpoint for: baichuan2-7b-model-acc-2024-01-04-18-59-59-382


In [151]:
endpoint_config_name = model_name + "-config"
endpoint_name = model_name + "-endpoint"

# Single production variant; the startup health-check timeout is 10 minutes.
production_variant = {
    "VariantName": "baichuan2-7b-chat",
    "ModelName": model_name,
    "InstanceType": "ml.g4dn.12xlarge",
    "InitialInstanceCount": 1,
    "ContainerStartupHealthCheckTimeoutInSeconds": 10 * 60,
}
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint-config/baichuan2-7b-model-acc-2024-01-04-18-59-59-382-config',
 'ResponseMetadata': {'RequestId': 'fb59083e-4080-48ad-811c-bd3c53e3f6c7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fb59083e-4080-48ad-811c-bd3c53e3f6c7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '138',
   'date': 'Thu, 04 Jan 2024 19:00:02 GMT'},
  'RetryAttempts': 0}}

In [152]:
# Create the endpoint from the configuration registered above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print("Created Endpoint: {}".format(create_endpoint_response["EndpointArn"]))

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/baichuan2-7b-model-acc-2024-01-04-18-59-59-382-endpoint


In [153]:
# Poll once a minute until the endpoint leaves the "Creating" state.
# (`time` is imported in the notebook's first cell.)
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here with the reported reason rather than letting the later
# invoke_endpoint cell fail against an endpoint that never came up.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-north-1:086238767671:endpoint/baichuan2-7b-model-acc-2024-01-04-18-59-59-382-endpoint
Status: InService


#### 推理测试

In [170]:
import json
import boto3

# Runtime client used to call the deployed endpoint.
smr_client = boto3.client("sagemaker-runtime")

# Generation overrides sent along with the prompt.
parameters = {"temperature": 0.01}

prompt_test = '''你是一名Amazon EC2的专家，请从以下内容中提取EC2实例，处理器，睿频这三种信息

全新 Amazon EC2 R7iz 实例针对高 CPU 性能、内存密集型工作负载进行了优化 | 亚马逊AWS官方博客 Skip to Main Content单击此处以返回 Amazon Web Services 主页联系我们 支持  中文（简体） 我的账户  登录  创建 AWS 账户 产品解决方案定价文档了解合作伙伴网络AWS Marketplace客户支持活动探索更多信息  关闭 عربيBahasa IndonesiaDeutschEnglishEspañolFrançaisItalianoPortuguêsTiếng ViệtTürkçeΡусскийไทย日本語한국어中文 (简体)中文 (繁體) 关闭 我的配置文件注销 AWS Builder IDAWS 管理控制台账户设置账单与成本管理安全证书AWS Personal Health Dashboard 关闭 支持中心专家帮助知识中心AWS Support 概述AWS re:Post单击此处以返回 Amazon Web Services 主页  免费试用   联系我们  产品  解决方案  定价  AWS 简介  入门  文档  培训和认证  开发人员中心  客户成功案例  合作伙伴网络  AWS Marketplace  支持  AWS re:Post  登录控制台  下载移动应用  博客主页 版本   关闭 中国版日本版한국 에디션기술 블로그Edisi Bahasa IndonesiaAWS 泰语博客Édition FrançaiseDeutsche EditionEdição em PortuguêsEdición en EspañolВерсия на русскомTürkçe Sürüm亚马逊AWS官方博客全新 Amazon EC2 R7iz 实例针对高 CPU 性能、内存密集型工作负载进行了优化        by        Veliswa Boya | on        11 9月 2023 | in        Compute |        Permalink |         Share        今天，我们宣布 Amazon EC2 R7iz 实例正式上市。R7iz 实例是云中速度最快的基于第四代 Intel Xeon 可扩展（Sapphire Rapids）的实例，具有 3.9 GHz 持续全核睿频频率。R7iz 实例适用于需要更多内存来处理额外数据、需要更大的实例大小来纵向扩展、需要更高的计算和内存性能来缩短完成时间，以及需要更高的网络和 Amazon Elastic Block Store（Amazon EBS）性能来缩短延迟的工作负载。R7iz 实例的高计算性能与大量内存相结合，可提高应用程序的整体性能，包括前端电子设计自动化（EDA）、单位内核许可费用较高的关系数据库工作负载以及财务、精算和数据分析模拟工作负载。这可以帮助您加快产品开发的上市速度，同时降低许可成本。R7iz 实例R7iz 实例的规格如下。vCPU 数量 内存（GiB） 网络带宽 EBS 带宽 r7iz.large216最高 12.5 Gbps最高 10 Gbpsr7iz.xlarge432最高 12.5 Gbps最高 10 Gbpsr7iz.2xlarge864最高 12.5 Gbps最高 10 Gbpsr7iz.4xlarge16128最高 12.5 Gbps最高 10 Gbpsr7iz.8xlarge3225612.5 Gbps10 Gbpsr7iz.12xlarge4838425 Gbps19 Gbpsr7iz.16xlarge6451225 Gbps20 Gbpsr7iz.32xlarge128102450 Gbps40 Gbps最多可以向每个 R7iz 实例连接 88 个 EBS 卷；相比之下，z1d 实例实例最多允许您连接 28 个卷。我们还准备推出两种大小的裸机 R7iz 实例：vCPU 数量 内存（GiB） 网络带宽 EBS 带宽 r7iz.metal-16xl6451225 Gbps20 Gbpsr7iz.metal-32xl128102450 Gbps40 Gbps 内置加速器 R7iz 实例还包括四个内置加速器：高级矩阵扩展（AMX）、Intel 数据流加速器（DSA）、Intel 内存分析加速器（IAA）和 Intel QuickAssist 技术（QAT）。其中一些加速器需要使用特定的内核版本、驱动程序和/或编译器。高级矩阵扩展适用于所有大小的 R7iz 实例，而 Intel QAT、Intel IAA 和 Intel DSA 加速器将在 r7iz.metal-16xl 和 r7iz.metal-32xl 
实例（即将推出）上提供。现已推出 R7iz 实例现已在 AWS 区域美国东部（弗吉尼亚州北部）和美国西部（俄勒冈州）正式发布。和 Amazon EC2 一样，您只需为实际使用的资源付费。有关更多信息，请参阅 Amazon EC2 定价。要了解更多信息，请访问我们的 Amazon EC2 R7iz 实例页面并发送反馈至 EC2 的 AWS re:Post，或使用您的常用 AWS Support 联系方式。– Veliswa  登录控制台  了解有关 AWS 的信息什么是 AWS？什么是云计算？AWS 包容性、多样性和公平性什么是 DevOps？什么是容器？什么是数据湖？AWS 云安全性最新资讯博客新闻稿 AWS 资源入门培训和认证AWS 解决方案库架构中心产品和技术常见问题分析报告AWS 合作伙伴 AWS 上的开发人员开发人员中心软件开发工具包与工具运行于 AWS 上的 .NET运行于 AWS 上的 Python运行于 AWS 上的 Java运行于 AWS 上的 PHP运行于 AWS 上的 JavaScript 帮助联系我们获取专家帮助提交支持工单AWS re:PostKnowledge CenterAWS Support 概览法律人员亚马逊云科技诚聘英才  创建账户                 Amazon 是一个倡导机会均等的雇主：          反对少数族裔、妇女、残疾人士、退伍军人、性别认同和性取向歧视。语言عربيBahasa IndonesiaDeutschEnglishEspañolFrançaisItalianoPortuguêsTiếng ViệtTürkçeΡусскийไทย日本語한국어中文 (简体)中文 (繁體)隐私|网站条款| Cookie 首选项 |© 2023, Amazon Web Services, Inc. 或其联属公司。保留所有权利。

将输出结果格式化成JSON，包含以下key
instance_family
processor_type
turbo_boost_frenquency
'''

# Send the prompt and the generation overrides as one JSON body.
request_body = json.dumps({"inputs": prompt_test, "parameters": parameters})
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=request_body,
    ContentType="application/json",
)

# Read the response body, decode it as UTF-8 and parse the JSON payload.
raw_body = response_model['Body'].read()
result = json.loads(raw_body.decode('utf8'))
print(result)

{
  "instance_family": "R7iz",
  "processor_type": "Intel Xeon",
  "turbo_boost_frequency": 3.9
}


#### 清理资源

In [171]:
# Tear down in reverse order of creation: endpoint, endpoint config, model.
# The last call's response is what the cell displays.
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': '03562f69-c459-439b-a0ab-e579c139eae5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '03562f69-c459-439b-a0ab-e579c139eae5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 04 Jan 2024 20:25:16 GMT'},
  'RetryAttempts': 0}}