# 使用 LMI Streaming 部署 ChatGLM2

## 上传自定义推理代码

In [None]:
# Remove any archive left over from a previous run.
!rm -f llm_model.tar.gz
# Pack the contents of src/llm (not the directory itself) so they sit at the archive root.
!tar czvf llm_model.tar.gz -C src/llm .
# Upload the archive to S3; the deployment cell reads the model from this same S3 URI.
!aws s3 cp llm_model.tar.gz s3://cloudbeer-aigc-works/search_bot/llm_model.tar.gz

## 部署模型

### 声明变量等

下面使用 DJL DeepSpeed（LMI）推理容器部署模型。

In [37]:
import boto3
import sagemaker
from sagemaker import Model, serializers, deserializers, Predictor


# Look up the ARN of the IAM role that is handed to SageMaker as the model's role.
iam_client = boto3.client("iam")
role_record = iam_client.get_role(RoleName="HuggingfaceExecuteRole")
role = role_record["Role"]["Arn"]

# Settings shared with the deployment, test and teardown cells.
region = "us-east-1"
llm_model = "s3://cloudbeer-aigc-works/search_bot/llm_model.tar.gz"
endpoint_name = "chatglm2-lmi-model"

# Resolve the container image URI for the djl-deepspeed framework, version 0.23.0.
image_uri = sagemaker.image_uris.retrieve(
    version="0.23.0",
    region=region,
    framework="djl-deepspeed",
)
print(image_uri)


763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118


### 开始部署

In [38]:
# Instance type that hosts the endpoint.
instance_type = "ml.g5.2xlarge"

# Bundle the container image, the S3 model archive and the IAM role into one Model.
model = Model(
    role=role,
    model_data=llm_model,
    image_uri=image_uri,
)

# Create the endpoint with a single instance and a 900-second startup
# health-check timeout.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=900,
)

-------------!

## 测试预测

In [29]:
import io

class StreamScanner:
    """Accumulate streamed byte chunks and hand back complete lines.

    Chunks are appended to an in-memory buffer with ``write``. ``readlines``
    yields every newline-terminated line that has not been consumed yet. A
    trailing fragment without a newline stays in the buffer until a later
    ``write`` completes it, so a record split across two chunks is returned
    intact exactly once.
    """

    def __init__(self):
        self.buff = io.BytesIO()
        # Offset of the first byte that has not been yielded yet.
        self.read_pos = 0

    def write(self, content):
        """Append ``content`` (a bytes-like object) to the end of the buffer."""
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def readlines(self):
        """Yield each complete unread line, without its trailing newline.

        Only lines that end in ``b'\\n'`` are consumed. An incomplete final
        line is left unread so that it can be completed by the next ``write``.
        """
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Indexing bytes yields an int, so comparing ``line[-1]`` with
            # ``b'\n'`` can never detect a newline; use ``endswith`` instead.
            if not line.endswith(b'\n'):
                # Only the last line can lack a newline: keep it buffered.
                break
            self.read_pos += len(line)
            yield line[:-1]

    def reset(self):
        """Rewind the read position so buffered lines are yielded again."""
        self.read_pos = 0

In [35]:
import boto3
import json

smr = boto3.client("sagemaker-runtime")

# Generation settings forwarded to the model inside the request body.
parameters = {"max_length": 4092, "temperature": 0.01, "top_p": 0.8}

# Prompt sent as the "inputs" field of the request.
prompt = """翻译下面的文档为莎士比亚风格的英文：
白日依山尽，黄河入海流
    """
request_body = json.dumps(
    {"inputs": prompt, "parameters": parameters, "history": []}
)

response_model = smr.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# Feed every payload chunk into the scanner and print the text of each line
# that parses as JSON.
event_stream = response_model["Body"]
scanner = StreamScanner()
for event in event_stream:
    scanner.write(event["PayloadPart"]["Bytes"])
    for line in scanner.readlines():
        try:
            resp = json.loads(line)
            print(resp.get("outputs")["outputs"], end="")
        except Exception:
            # Best effort: any line that fails to parse or print is skipped.
            # NOTE(review): this also hides unexpected response shapes.
            continue

The sun doth set, and the river flows,
As doth end, and night doth grow.

In [36]:

from sagemaker import  Predictor
# Handle to the deployed endpoint, used here only for teardown.
predictor = Predictor(endpoint_name=endpoint_name)

# Delete the model first, then the endpoint.
predictor.delete_model()
predictor.delete_endpoint()