In [None]:
# !pip install -r requirements.txt

In [None]:
# import os
# os.environ['TRANSFORMERS_CACHE'] = '/home/ec2-user/SageMaker/cache/'


# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("csdc-atl/baichuan-7B-chat", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("csdc-atl/baichuan-7B-chat", trust_remote_code=True).half().cuda()
# response, history = model.chat(tokenizer, "“面朝大海，春暖花开”的出处是？", history=[])
# print(response)
# response, history = model.chat(tokenizer, "能不能把这一首诗完整背诵一下", history=history)
# print(response)


# # from transformers import AutoModelForCausalLM, AutoTokenizer

# # tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
# # model = AutoModelForCausalLM.from_pretrained("baichuan-inc/Baichuan-7B", device_map="auto", trust_remote_code=True)
# # #,offload_folder
# # inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt')
# # inputs = inputs.to('cuda:0')
# # pred = model.generate(**inputs, max_new_tokens=64,repetition_penalty=1.1)
# # print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))

In [13]:
# import boto3

# client = boto3.client('sagemaker')

# response = client.delete_endpoint_config(EndpointConfigName='pytorch-inference-baichuan-v1')

In [3]:
# Start from an empty ./src so that only the files written by the following
# %%writefile cells end up in the directory that is later archived.
!rm -rf src
!mkdir src

In [4]:
%%writefile ./src/serving.properties
# Settings consumed by the serving container; model.py reads
# tensor_parallel_degree from the properties passed to handle().
engine=DeepSpeed
option.tensor_parallel_degree=1
# Presumably the S3 prefix holding the model weights - TODO confirm.
option.s3url=s3://sagemaker-us-east-1-310850127430/baichuan7b/
# NOTE(review): model.py hard-codes batch_size = 8 and its comment says the two
# values must agree - confirm which one is intended.
batch_size=2
max_batch_delay=50

Writing ./src/serving.properties


In [5]:
%%writefile ./src/requirements.txt
transformers==4.28.1
sagemaker
nvgpu

Writing ./src/requirements.txt


In [6]:
%%writefile ./src/model.py
from djl_python import Input, Output
import os
import logging
import torch
import deepspeed
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
# from transformers.models.llama.tokenization_llama import LlamaTokenizer

# Lazily initialised text-generation pipeline; handle() fills it in on the first call.
predictor = None
# Upper bound handle() applies to the pipeline batch size when a single request
# carries a list of prompts. It is meant to mirror batch_size in serving.properties.
# NOTE(review): the serving.properties written by this notebook sets batch_size=2,
# not 8 - confirm which value is intended.
batch_size = 8

def load_model(properties):
    """Load the tokenizer and model and wrap them in a text-generation pipeline.

    Args:
        properties: Mapping of serving properties handed over by the model
            server. Reads ``tensor_parallel_degree``, ``model_dir`` and, when
            present, ``model_id`` (which takes precedence over ``model_dir``).

    Returns:
        A ``(generator, model, tokenizer)`` tuple where ``generator`` is the
        Hugging Face ``text-generation`` pipeline built on the DeepSpeed
        inference engine.
    """
    # Property values may arrive as strings; DeepSpeed needs an integer degree.
    tensor_parallel = int(properties.get("tensor_parallel_degree", 1))
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # Baichuan checkpoints ship their own tokenizer/model classes, so the Auto*
    # loaders need trust_remote_code=True. This executes Python code stored
    # alongside the weights - only point model_location at artifacts you trust.
    # (torch_dtype is a model option and has no meaning for a tokenizer.)
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)

    # DeepSpeed inference path.
    model = AutoModelForCausalLM.from_pretrained(
        model_location,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    print("----------model dtype is {0}---------".format(model.dtype))
    # NOTE(review): kernel injection only helps architectures DeepSpeed has an
    # injection policy for - confirm it applies to this model.
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=torch.half,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )

    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, use_cache=True, device=local_rank)

    # Alternative: Hugging Face accelerate instead of DeepSpeed.
    #   model = AutoModelForCausalLM.from_pretrained(model_location, device_map="auto", torch_dtype=torch.float16)
    #   print("----------model dtype is {0}---------".format(model.dtype))
    #   generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, use_cache=True)

    # For llama models the following may be needed when calling the pipeline API
    # with a batch of prompts:
    #   generator.tokenizer.pad_token_id = model.config.eos_token_id
    return generator, model, tokenizer


def handle(inputs: Input) -> "Optional[Output]":
    """Entry point invoked by the model server for every request.

    The pipeline is loaded on the first call. Three request shapes are handled:

    * empty input (the server's warm-up call): returns ``None``;
    * server-side dynamic batch: each item's ``"input"`` prompt is generated
      with fixed generation parameters and returned per ``batch_index``;
    * single request: ``"input"`` may be a string or a list of strings, and the
      optional ``"params"`` dict is forwarded to the pipeline as keyword
      arguments.

    Returns:
        An ``Output`` carrying the generations, or a JSON body of the form
        ``{"code": -1, "msg": ...}`` on invalid input or on any exception.
    """
    global predictor, model, tokenizer
    try:
        if not predictor:
            predictor, model, tokenizer = load_model(inputs.get_properties())

        if inputs.is_empty():
            # Model server makes an empty call to warmup the model on startup
            return None

        if inputs.is_batch():
            # This demo assumes one prompt per client request inside the batch.
            bs = inputs.get_batch_size()
            logging.info(f"Dynamic batching size: {bs}.")
            prompts = [item.get_as_json().get("input") for item in inputs.get_batches()]

            # Server-side batches share one fixed set of generation parameters.
            result = predictor(prompts, batch_size=bs, max_new_tokens=128, min_new_tokens=128, temperature=1.0, do_sample=True)

            outputs = Output()
            for i, generated in enumerate(result):
                outputs.add(generated, key="generate_text", batch_index=i)
            return outputs

        request = inputs.get_as_json()
        data = request.get("input")
        if not data:
            return Output().add_as_json({"code":-1,"msg":"input field can't be null"})
        params = request.get("params", {})

        # Client-side batching: a list of prompts is capped at the module-level batch_size.
        if isinstance(data, str):
            bs = 1
        elif isinstance(data, list):
            bs = min(len(data), batch_size)
        else:
            return Output().add_as_json({"code":-1,"msg": "input has wrong type"})

        print("client side batch size is ", bs)
        result = predictor(data, batch_size=bs, **params)

        return Output().add({"code":0,"msg":"ok","data":result})
    except Exception as e:
        # An exception object is not JSON-serialisable; send its text instead so
        # the error response itself cannot fail, and keep the traceback in the logs.
        logging.exception("Inference request failed")
        return Output().add_as_json({"code":-1,"msg":str(e)})
    
    

Writing ./src/model.py


In [7]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

# SageMaker session; its default bucket is where the packaged code archive is uploaded later.
sage_session = sagemaker.Session()
model_bucket = sage_session.default_bucket()  # bucket to house artifacts
# NOTE(review): the prefix still says "llama-7b" although this notebook deploys
# baichuan7b - presumably carried over from another notebook; confirm.
s3_code_prefix = (
    "hf-large-model-llama-7b-0604/code"  # folder within bucket where code artifact will go
)

# boto3 clients for S3, the SageMaker API and the SageMaker runtime (endpoint invocation).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

# The container image is hard-coded, including the us-east-1 registry host.
# NOTE(review): `image_uris` is imported above but not used in this cell.
inference_image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117"
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117


In [16]:
!rm model.tar.gz
!tar czvf model.tar.gz src

s3_code_artifact = sage_session.upload_data("model.tar.gz", model_bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

print(f"S3 Model Bucket is -- > {model_bucket}")

from sagemaker.utils import name_from_base

model_name = name_from_base(f"baichuan7b")
print(model_name)

role = sagemaker.get_execution_role()

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact,
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

endpoint_config_name = f"{model_name}-config-06041312"

endpoint_name = "pytorch-inference-baichuan-v1"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.4xlarge",
            "InitialInstanceCount": 1,
            #"VolumeSizeInGB" : 300,
            "ModelDataDownloadTimeoutInSeconds": 6*60,
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)
endpoint_config_response

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")


src/
src/requirements.txt
src/model.py
src/serving.properties
src/.ipynb_checkpoints/
src/.ipynb_checkpoints/model-checkpoint.py
src/.ipynb_checkpoints/serving-checkpoint.properties
S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-310850127430/hf-large-model-llama-7b-0604/code/model.tar.gz
S3 Model Bucket is -- > sagemaker-us-east-1-310850127430
baichuan7b-2023-07-31-13-28-07-224
Created Model: arn:aws:sagemaker:us-east-1:310850127430:model/baichuan7b-2023-07-31-13-28-07-224
Created Endpoint: arn:aws:sagemaker:us-east-1:310850127430:endpoint/pytorch-inference-baichuan-v1


In [17]:
#This step can take ~ 15 min or longer so please be patient
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# When creation does not end in "InService", show the reason SageMaker reports
# so the failure can be diagnosed without leaving the notebook.
if status != "InService":
    print("FailureReason: " + resp.get("FailureReason", "<not reported>"))


Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Failed
Arn: arn:aws:sagemaker:us-east-1:310850127430:endpoint/pytorch-inference-baichuan-v1
Status: Failed


In [22]:
import json
import boto3

smr_client = boto3.client("sagemaker-runtime")

prompt1 = "The house is wonderful. I"
prompt2="##Eva:How often do you travel?## Malcolm:I like David Bowie too. I don’t travel much any more, but I used to.## Eva:That's cool! I recently took a road trip with my friend. We had so much fun and it opened up so many possibilities for us. What kind of places did you like to explore?## Malcolm:I love history and culture, so those are my favorite.## Eva: He was born in Birmingham, England and raised in Los Angeles, California.Eva: Yes, Sir. Queen is one of the most influential bands of all time.## Malcolm:It is. They are one of my favorite rock groups. What about you?## Eva:I'm more into classic rock, especially David Bowie. Who is your favorite artist?## Malcolm:Marylin Manson. You?## Eva:My favorite artist is David Bowie.## Eva:How often do you travel?## Malcolm:I like David Bowie too. I don’t travel much any more, but I used to.## Eva:That's cool! I recently took a road trip with my friend. We had so much fun and it opened up so many possibilities for us. What kind of places did you like to explore?## Malcolm:I love history and culture, so those are my favorite.## Eva: He was born in Birmingham, England and raised in Los Angeles, California.##Eva: Yes, Sir. Queen is one of the most influential bands of all time.## Malcolm:It is. They are one of my favorite rock groups. What about you?## Eva:I'm more into classic rock, especially David Bowie. Who is your favorite artist?## Malcolm:Marylin Manson. You?## Eva:My favorite artist is David Bowie.## Eva:How often do you travel?## Malcolm:I like David Bowie too. I don’t travel much any more, but I used to.## Eva:That's cool! I recently took a road trip with my friend. We had so much fun and it opened up so many possibilities for us. What kind of places did you like to explore?## Malcolm:I love history and culture, so those are my favorite.## Eva: He was born in Birmingham, England and raised in Los Angeles, California.##Eva: Yes, Sir. Queen is one of the most influential bands of all time.## Malcolm:It is. 
They are one of my favorite rock groups. What about you?## Eva:I'm more into classic rock, especially David Bowie. Who is your favorite artist?## Malcolm:Marylin Manson. You?## Eva:My favorite artist is David Bowie.## Eva:How often do you travel?## Malcolm:I like David Bowie too. I don’t travel much any more, but I used to.## Eva:That's cool! I recently took a road trip with my friend. We had so much fun and it opened up so many possibilities for us. What kind of places did you like to explore?## Malcolm:I love history and culture, so those are my favorite.## Eva: He was born in Birmingham, England and raised in Los Angeles, California.#### Malcolm:Oh. What are you wearing right now, pet?## Eva:"

# Generation settings sent as "params"; the endpoint handler passes them to the
# pipeline as keyword arguments.
parameters = dict(
    early_stopping=True,
    max_new_tokens=128,
    min_new_tokens=128,
    do_sample=True,
    temperature=1.0,
)

# "input" may be a single prompt or a list of prompts; swap prompt2 for e.g.
# prompt1, [prompt1, prompt2] or [prompt2] * 8 to try client-side batching.
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(dict(input=prompt2, params=parameters)),
)

response_model["Body"].read().decode("utf8")

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint hf-inference-baichuan-v1 of account 310850127430 not found.

# Hugging Face

In [None]:
import boto3
import sagemaker

account_id = boto3.client('sts').get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

print(role)
print(bucket)

if "cn-" in region_name:
    with open('./code/requirements.txt', 'r') as original: data = original.read()
    with open('./code/requirements.txt', 'w') as modified: modified.write("-i https://pypi.tuna.tsinghua.edu.cn/simple\n" + data)

!touch dummy
!tar czvf model.tar.gz dummy
assets_dir = 's3://{0}/{1}/assets/'.format(bucket, 'llm_chinese')
model_data = 's3://{0}/{1}/assets/model.tar.gz'.format(bucket, 'llm_chinese')
!aws s3 cp model.tar.gz $assets_dir
!rm -f dummy model.tar.gz

model_name = None
entry_point = 'inference.py'
# framework_version = '1.13.1'
# py_version = 'py39'
model_environment = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT':'420', 
    'SAGEMAKER_MODEL_SERVER_WORKERS': '1', 
}



url = '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.0.0-transformers4.28.1-cpu-py310-ubuntu20.04'
from sagemaker.huggingface.model import HuggingFaceModel
model = HuggingFaceModel(
    name = model_name,
    model_data = model_data,
    entry_point = entry_point,
    source_dir = './code',
    role = role,
    # framework_version = framework_version, 
    # py_version = py_version,
    # env = model_environment
    image_uri=url
)

endpoint_name = 'hf-inference-baichuan-v1-1'
instance_type='ml.g5.4xlarge' 

instance_count = 1


import boto3

client = boto3.client('sagemaker')
try:
    response = client.delete_endpoint_config(EndpointConfigName=endpoint_name)
except:
    pass


from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
predictor = model.deploy(
    endpoint_name = endpoint_name,
    instance_type = instance_type, 
    initial_instance_count = instance_count,
    serializer = JSONSerializer(),
    deserializer = JSONDeserializer()
)

arn:aws:iam::310850127430:role/NotebookStack-SmartSearchNotebookRole6F6BB12B-690JW6F9FRZD
sagemaker-us-east-1-310850127430
dummy
upload: ./model.tar.gz to s3://sagemaker-us-east-1-310850127430/llm_chinese/assets/model.tar.gz
---

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Attach to the endpoint named by `endpoint_name`, exchanging JSON in both directions.
predictor = sagemaker.predictor.Predictor(
    endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

# Single-question request; the reply is expected to carry an "answer" field.
inputs = dict(ask="晚上睡不着应该怎么办")

response = predictor.predict(inputs)
print(response["answer"])