In [33]:
# Start from an empty ./code directory; the following cells write the endpoint's source files into it.
!rm -rf code && mkdir code

In [34]:
%%writefile code/requirements.txt
# Dependencies for code/inference.py; packages are resolved from the Tsinghua PyPI mirror given by -i.
# NOTE(review): the deployment cell sets framework_version '1.13.1' while this file asks for torch>=2.0 - confirm which torch version is intended.
-i https://pypi.tuna.tsinghua.edu.cn/simple
diffusers
ftfy
spacy
boto3
sagemaker
nvgpu
sentencepiece
protobuf
transformers==4.30.2
icetk
cpm_kernels
accelerate
torch>=2.0

Writing code/requirements.txt


In [35]:
%%writefile code/inference.py
import os
import json
import uuid
import io
import sys

import traceback

from PIL import Image

import requests
import boto3
import sagemaker
import torch


from torch import autocast
from transformers import AutoTokenizer, AutoModel


# Local directory that receives the ChatGLM2 model files.
parent_dir = "/opt/amazon/var/run"
directory = "chatglm2"
path = os.path.join(parent_dir, directory)
# makedirs(exist_ok=True) rather than mkdir: this module-level code runs on every
# import, and a second import (or a missing parent directory) must not raise.
os.makedirs(path, exist_ok=True)

# Single source of truth for the local model directory (same value as before).
LLM_NAME = path

# Replace with your own model path, either by editing the default below or by
# setting the CHATGLM2_S3_LOCATION environment variable.
s3_location = os.environ.get(
    "CHATGLM2_S3_LOCATION",
    "s3://sagemaker-cn-north-1-086238767671/chatglm2/chatglm2-6b/",
)

# Copy the model files from S3 and fail fast when the copy does not succeed;
# otherwise the failure only shows up later as a confusing tokenizer-load error.
exit_status = os.system(f"aws s3 cp {s3_location} {LLM_NAME} --recursive")
if exit_status != 0:
    raise RuntimeError(
        f"Copying model files from {s3_location} to {LLM_NAME} failed "
        f"(exit status {exit_status})"
    )


tokenizer = AutoTokenizer.from_pretrained(LLM_NAME, trust_remote_code=True)


def preprocess(text):
    """Return `text` with newline and tab characters replaced by the
    two-character sequences backslash-n and backslash-t."""
    escaped = text
    for literal, escape in (("\n", "\\n"), ("\t", "\\t")):
        escaped = escaped.replace(literal, escape)
    return escaped

def postprocess(text):
    """Return `text` with the two-character sequences backslash-n and
    backslash-t turned back into real newline and tab characters."""
    restored = text.replace("\\n", "\n")
    restored = restored.replace("\\t", "\t")
    return restored

def answer(text, history=None, sample=True, top_p=0.45, temperature=0.01, model=None):
    """Ask the chat model one question and return its reply.

    Args:
        text: The user's prompt. It is passed through `preprocess` first.
        history: Earlier conversation turns handed to `model.chat`.
            Defaults to a fresh empty list on every call.
        sample: Forwarded to `model.chat` as `do_sample`.
        top_p: Forwarded to `model.chat` as `top_p`.
        temperature: Forwarded to `model.chat` as `temperature`.
        model: The loaded model object; must provide a `chat` method.

    Returns:
        A `(reply, history)` tuple: the reply after `postprocess`, and the
        history value returned by `model.chat`.

    Raises:
        ValueError: If `model` is None.
    """
    if model is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'chat'".
        raise ValueError("answer() needs a loaded model; got model=None")
    if history is None:
        # A None default avoids sharing one list object between calls.
        history = []

    text = preprocess(text)
    # `sample` and `top_p` used to be accepted but silently dropped; forward them
    # so the function's own defaults and caller overrides take effect.
    response, history = model.chat(
        tokenizer,
        text,
        history=history,
        do_sample=sample,
        top_p=top_p,
        temperature=temperature,
    )

    return postprocess(response), history


def model_fn(model_dir):
    """
    Load the model used for inference and return it in eval mode.

    The model is read from the module-level LLM_NAME directory with
    float16 weights and device_map='auto'; `model_dir` is accepted but
    not used by this function.
    """
    print("=================model_fn_Start=================")
    # Earlier variant kept for reference:
    #   AutoModel.from_pretrained(LLM_NAME, trust_remote_code=True).half().cuda()
    llm = AutoModel.from_pretrained(
        LLM_NAME,
        torch_dtype=torch.float16,
        device_map='auto',
        trust_remote_code=True,
    ).eval()
    print("=================model_fn_End=================")
    return llm


def input_fn(request_body, request_content_type):
    """
    Parse the JSON request body into the input dict.

    Example payload: {"ask": "<prompt text>"}
    When the payload has no 'ask' key, a built-in default prompt is
    filled in. `request_content_type` is only logged.
    """
    print(f"=================input_fn=================\n{request_content_type}\n{request_body}")
    payload = json.loads(request_body)
    if 'ask' in payload:
        return payload
    # No prompt supplied: fall back to the default prompt.
    payload['ask'] = "写一个文章，题目是未来城市"
    return payload


def predict_fn(input_data, model):
    """
    Apply the model to the incoming request.

    Args:
        input_data: Request dict. Reads 'ask' (required), 'history'
            (optional, defaults to []) and 'temperature' (optional,
            defaults to 0.01).
        model: The model object passed on to `answer`.

    Returns:
        The reply text from `answer`, or the string 'Not found answer'
        when anything in the request handling raises; the exception is
        printed to stdout rather than propagated.
    """
    print("=================predict_fn=================")

    print('input_data: ', input_data)

    try:
        history = input_data.get('history', [])
        temperature = input_data.get('temperature', 0.01)
        # The request's temperature used to be read but never handed to
        # answer(), so it had no effect; pass it through explicitly.
        result, _ = answer(
            input_data['ask'],
            history=history,
            temperature=temperature,
            model=model,
        )
        print(f'====result {result}====')
        return result

    except Exception as ex:
        # Best-effort by design: log the failure and fall through to the
        # fixed fallback reply below.
        traceback.print_exc(file=sys.stdout)
        print(f"=================Exception================={ex}")

    return 'Not found answer'


def output_fn(prediction, content_type):
    """
    Serialize the prediction as a JSON string of the form
    {"answer": <prediction>}. `content_type` is only printed.
    """
    print(content_type)
    payload = {'answer': prediction}
    return json.dumps(payload)

Writing code/inference.py


In [36]:
import boto3
import sagemaker

# Look up the caller's account and region, then the SageMaker session,
# execution role and default bucket used by the cells below.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
region_name = boto3.session.Session().region_name

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

print(role, bucket, sep='\n')


!touch dummy
!tar czvf model.tar.gz dummy
assets_dir = 's3://{0}/{1}/assets/'.format(bucket, 'llm_chatglm2')
model_data = 's3://{0}/{1}/assets/model.tar.gz'.format(bucket, 'llm_chatglm2')
!aws s3 cp model.tar.gz $assets_dir
!rm -f dummy model.tar.gz

from sagemaker.pytorch.model import PyTorchModel

# Settings for the SageMaker PyTorch model that wraps code/inference.py.
model_name = None
entry_point = 'inference.py'
# NOTE(review): code/requirements.txt asks for torch>=2.0 while this selects
# framework version 1.13.1 - confirm which torch version is intended.
framework_version = '1.13.1'
py_version = 'py39'
model_environment = dict(
    SAGEMAKER_MODEL_SERVER_TIMEOUT='600',
    SAGEMAKER_MODEL_SERVER_WORKERS='1',
)

model = PyTorchModel(
    model_data=model_data,
    role=role,
    entry_point=entry_point,
    source_dir='./code',
    framework_version=framework_version,
    py_version=py_version,
    name=model_name,
    env=model_environment,
)

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Endpoint settings: name, instance type and instance count.
endpoint_name = 'pytorch-inference-chatglm2'
instance_type = 'ml.g4dn.12xlarge'
instance_count = 1

# Deploy the model; requests and responses are exchanged as JSON.
predictor = model.deploy(
    initial_instance_count=instance_count,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

arn:aws-cn:iam::086238767671:role/service-role/AmazonSageMaker-ExecutionRole-20230112T104094
sagemaker-cn-north-1-086238767671
dummy
upload: ./model.tar.gz to s3://sagemaker-cn-north-1-086238767671/llm_chatglm2/assets/model.tar.gz
------------!

### 等待10分钟后测试，期间可以检查CloudWatch日志有无异常


In [39]:
# Send one greeting prompt to the deployed endpoint and print the reply text.
inputs = dict(ask='你好')

response = predictor.predict(inputs)
print(response["answer"])



你好👋！我是人工智能助手 ChatGLM2-6B，很高兴见到你，欢迎问我任何问题。


### 删除 SageMaker Endpoint
删除推理服务

In [7]:
# Left commented out on purpose: uncomment and run to delete the endpoint created by model.deploy() above.
#predictor.delete_endpoint()