In [2]:
import sagemaker
import boto3

# Shared handles for the rest of the notebook: the SageMaker session and the
# region its underlying boto session is bound to.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Session bucket used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# No explicit bucket name is given here, so the session default is used.
default_bucket = None
if sess is not None and default_bucket is None:
    default_bucket = sess.default_bucket()

# Resolve the execution role. When get_execution_role() raises ValueError,
# fall back to looking up a role named 'sagemaker_execution_role' through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']


# Summarise the resolved configuration, one item per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {default_bucket}",
    f"sagemaker session region: {region}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::310850127430:role/NotebookStack-SmartSearchNotebookRole6F6BB12B-690JW6F9FRZD
sagemaker bucket: sagemaker-us-east-1-310850127430
sagemaker session region: us-east-1


In [37]:
code_tarname = 'acc_llama2_model'

!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
!tar czvf {code_tarname}.tar.gz {code_tarname}/

# copy the deployment configs tar to a path (different from hf model artifacts)
s3_code_artifact = sess.upload_data(f"{code_tarname}.tar.gz", default_bucket, sagemaker.utils.name_from_base("tmp0625/v1"))
print(s3_code_artifact)

from sagemaker.model import Model
from sagemaker import serializers, deserializers
from sagemaker import image_uris
import boto3

# specify a inference container version, found at: 
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers

inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.22.1-deepspeed0.9.2-cu118"

# image_uri in china region
#inference_image_uri = f"727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.22.1-deepspeed0.9.2-cu118"

# endpoint_name = sagemaker.utils.name_from_base(code_tarname.replace('_','-'))


endpoint_name = "llama2-chinese-v2"

try:
    client = boto3.client('sagemaker')
    client.delete_endpoint_config(EndpointConfigName=endpoint_name)
except:
    pass

model = Model(image_uri=inference_image_uri,
              model_data=s3_code_artifact, 
              role=role)

instance_type = 'ml.g5.4xlarge'
if region.find('cn') >=0:
    instance_type = 'ml.g4dn.4xlarge'

model.deploy(initial_instance_count = 1,
             instance_type = instance_type, 
             endpoint_name = endpoint_name,
             container_startup_health_check_timeout = 480
            )

acc_llama2_model/
acc_llama2_model/requirements.txt
acc_llama2_model/model.py
acc_llama2_model/serving.properties
s3://sagemaker-us-east-1-310850127430/tmp0625/v1-2023-08-13-13-13-17-026/acc_llama2_model.tar.gz
------------------------!

In [38]:
# Client-side handle for the deployed endpoint. Requests and responses go
# through the JSON serializer / deserializer pair configured here.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

# Request carrying only the question; the commented-out keys are left disabled.
request_payload = {
    "ask": "如何快速入睡",
    # "parameters": {"max_length": 100, "top_p":0.45, "temperature":0.45},
    # "history":[]
}

# Kept as the last expression of the cell so the response is displayed.
predictor.predict(request_payload)

{'answer': "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n            If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n如何快速入睡 [/INST]  以下是一些快速入睡的方法：\n1. 建立规律的睡眠时间表。\n2. 避免在睡前饮用含咖啡因的饮料。\n3. 创造一个安静、舒适的睡眠环境。\n4. 放松身体和头脑，例如泡个热水澡或冥想。\n5. 避免在床上看电视或使用电子设备。\n6. 避免在床上工作或学习。\n7. 尝试使用呼吸练习或深度放松技巧。\n8. 尝试使用柔和的音乐或自然声音。\n9. 尝试使用香薰或蜡烛。\n10. 如果以上方法都不起作用，可以考虑咨询医生或专业人士。 "}