# Deploy the finetuned vicuna model on Amazon SageMaker with djl server batch

## Create a SageMaker Model for Deployment
As a first step, we'll import the relevant libraries and configure several global variables such as the hosting image that will be used nd the S3 location of our model artifacts

In [1]:
import sagemaker
from sagemaker.model import Model
from sagemaker import serializers, deserializers
from sagemaker import image_uris
import boto3
import os
import time
import json
import jinja2
from pathlib import Path

In [2]:
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

region = sess._region_name # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment

s3_client = boto3.client("s3") # client to intreract with S3 API
sm_client = boto3.client("sagemaker")  # client to intreract with SageMaker
smr_client = boto3.client("sagemaker-runtime") # client to intreract with SageMaker Endpoints
jinja_env = jinja2.Environment() # jinja environment to generate model configuration templates

In [None]:
# lookup the inference image uri based on our current region
djl_inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117"
)

In [None]:
pretrained_model_location = "s3://sagemaker-us-west-2-687912291502/llama/output/2023-05-10-12-43-02/llama_out/"# Change to the model artifact path in S3 which we get from the fine tune job
print(f"Pretrained model will be downloaded from ---- > {pretrained_model_location}")

## Build the inference contianer image

In [3]:
%%writefile Dockerfile.inference
## You should change below region code to the region you used, here sample is use us-west-2
#From 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117
From 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118

ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

## Install transfomers version which support LLaMaTokenizer
#RUN python3 -m pip install git+https://github.com/huggingface/transformers.git@68d640f7c368bcaaaecfc678f11908ebbd3d6176
## Install transfomers version which support vicuna v1.1 LLaMaTokenizer
#RUN python3 -m pip install transformers==4.29.0
#RUN python3 -m pip install transformers==4.28.1
RUN python3 -m pip install transformers==4.30.2

## Make all local GPUs visible
ENV NVIDIA_VISIBLE_DEVICES="all"

Overwriting Dockerfile.inference


In [4]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [5]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sagemaker-vicuna-inference-severbatch-demo"

In [13]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
# The name of our algorithm
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} -f Dockerfile.inference .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon   22.6GB
Step 1/6 : From 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118
 ---> a4515b81505b
Step 2/6 : ENV LANG=C.UTF-8
 ---> Using cache
 ---> 57b814a63c1c
Step 3/6 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 353ffb632ace
Step 4/6 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 072e2ee21c51
Step 5/6 : RUN python3 -m pip install transformers==4.30.2
 ---> Using cache
 ---> a9f148b3e05a
Step 6/6 : ENV NVIDIA_VISIBLE_DEVICES="all"
 ---> Using cache
 ---> b88d0cb3f10a
Successfully built b88d0cb3f10a
Successfully tagged sagemaker-vicuna-inference-severbatch-demo:latest
The push refers to repository [687912291502.dkr.ecr.us-west-2.amazonaws.com/sagemaker-vicuna-inference-severbatch-demo]
aa3fb0261bc7: Preparing
8232c22b63e4: Preparing
08f3a337b655: Preparing
49cbf9c77aed: Preparing
48878ad231c8: Preparing
ecbe7b57a5a3: Preparing
d50b7a306b3a: Preparing
1e85726aa032: Preparing
d2f

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [14]:
## The image uri which is build and pushed above
inference_image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account_id, region, repo_name)
inference_image_uri

'687912291502.dkr.ecr.us-west-2.amazonaws.com/sagemaker-vicuna-inference-severbatch-demo:latest'

## Deploying a Large Language Model using deepspeed engine
The DJL Inference Image which we will be utilizing ships with a number of built-in inference handlers for a wide variety of tasks including:
- `text-generation`
- `question-answering`
- `text-classification`
- `token-classification`

You can refer to this [GitRepo](https://github.com/deepjavalibrary/djl-serving/tree/master/engines/python/setup/djl_python) for a list of additional handlers and available NLP Tasks. <br>
These handlers can be utilized as is without having to write any custom inference code. We simply need to create a `serving.properties` text file with our desired hosting options and package it up into a `tar.gz` artifact.

Lets take a look at the `serving.properties` file that we'll be using for our first example

In [15]:
!mkdir deepspeed_src

mkdir: cannot create directory ‘deepspeed_src’: File exists


In [2]:
!aws s3 cp s3://sagemaker-us-west-2-687912291502/llm/models/LLM_llama2_7b/config.json ./

download: s3://sagemaker-us-west-2-687912291502/llm/models/LLM_llama2_7b/config.json to ./config.json


In [29]:
%%writefile deepspeed_src/serving.properties
engine=DeepSpeed
option.entryPoint=model.py
option.tensor_parallel_degree=1
#option.model_id=helloollel/vicuna-7b
option.s3url=s3://sagemaker-us-west-2-687912291502/llm/models/LLM_llama2_7b/
batch_size=16
max_batch_delay=100
#option.task=text-generation
#option.device_map=auto
option.load_in_8bit=TRUE

Overwriting deepspeed_src/serving.properties


There are a few options specified here. Lets go through them in turn<br>
1. `engine` - specifies the engine that will be used for this workload. In this case we'll be hosting a model using the [DJL Python Engine](https://github.com/deepjavalibrary/djl-serving/tree/master/engines/python)
2. `option.entryPoint` - specifies the entrypoint code that will be used to host the model. djl_python.huggingface refers to the `huggingface.py` module from [djl_python repo](https://github.com/deepjavalibrary/djl-serving/tree/master/engines/python/setup/djl_python).  
3. `option.s3url` - specifies the location of the model files. Alternativelly an `option.model_id` option can be used instead to specifiy a model from Hugging Face Hub (e.g. `EleutherAI/gpt-j-6B`) and the model will be automatically downloaded from the Hub. The s3url approach is recommended as it allows you to host the model artifact within your own environment and enables faster deployments by utilizing optimized approach within the DJL inference container to transfer the model from S3 into the hosting instance 
4. `option.task` - This is specific to the `huggingface.py` inference handler and specifies for which task this model will be used
5. `option.device_map` - Enables layer-wise model partitioning through [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map). With `option.device_map=auto`, Accelerate will determine where to put each **layer** to maximize the use of your fastest devices (GPUs) and offload the rest on the CPU, or even the hard drive if you don’t have enough GPU RAM (or CPU RAM). Even if the model is split across several devices, it will run as you would normally expect.
6. `option.load_in_8bit` - Quantizes the model weights to int8 thereby greatly reducing the memory footprint of the model from the initial FP32. See this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration) from Hugging Face for additional information 

For more information on the available options, please refer to the [SageMaker Large Model Inference Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints-large-model-configuration.html)

Our initial approach here is to utilize the built-in functionality within Hugging Face Transformers to enable Large Language Model hosting. 

In [31]:
%%writefile deepspeed_src/model.py
from djl_python import Input, Output
import os
import logging
import math
import deepspeed
import torch
import torch.distributed as dist
import sys
import subprocess
import time
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.tokenization_llama import LlamaTokenizer
import time

print("transformers version=="+transformers.__version__)

predictor = None

#for deepspeed engine

def load_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    #logging.info(f"Loading model in {model_location}")
    
    print("----------tensor parallel is {0}---------".format(tensor_parallel))
    
    #model_location = "EleutherAI/gpt-neo-2.7B"
    logging.info(f"Loading model in {model_location}")
    print(f"Loading model in {model_location}")
    model = AutoModelForCausalLM.from_pretrained(model_location, torch_dtype=torch.float16)
    tokenizer = LlamaTokenizer.from_pretrained(model_location, torch_dtype=torch.float16)

    print("----------model dtype is {0}---------".format(model.dtype))
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=model.dtype,
        # dtype=torch.half,
        # dtype=torch.int8,
        replace_method="auto",
        replace_with_kernel_inject=True,
        #replace_method="auto",
        #replace_with_kernel_inject=True,
    )
    
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank, use_cache=True
    )
    generator.tokenizer.pad_token_id = model.config.eos_token_id
    return generator, model, tokenizer


def handle(inputs: Input) -> None:
    global predictor, model, tokenizer
    try:
        if not predictor:
            predictor,model,tokenizer = load_model(inputs.get_properties())

        print(inputs)
        if inputs.is_empty():
            # Model server makes an empty call to warmup the model on startup
            return None
        
        if inputs.is_batch():
            batch_size = inputs.get_batch_size()
            logging.info(f"Dynamic batching size: {batch_size}.")
            batch = inputs.get_batches()
            print(batch)
            tmp_inputs = []
            for _, item in enumerate(batch):
                tmp_item = item.get_as_json()
                tmp_inputs.append(tmp_item.get("inputs"))
                
                ##use the unified parameters for batch prediction###
                params = tmp_item.get("parameters",{})
            
            t1 = time.time()
            result = predictor(tmp_inputs, batch_size=batch_size, **params)
            t2 = time.time()
            print("Batched: the inference time is {} sec, and size is {}.".format(t2 - t1, batch_size))
            outputs = Output()
            for i in range(len(result)):
                outputs.add(result[i], key="generate_text", batch_index=i)
            return outputs
        else:
            inputs = inputs.get_as_json()
            if not inputs.get("inputs"):
                return Output().add_as_json({"code":-1,"msg":"input field can't be null"})


            #input data
            data = inputs.get("inputs")
            params = inputs.get("parameters",{})
            print(params)
            
            #predictor
            t1 = time.time()
            result = predictor(data, **params)
            t2 = time.time()
            print("the inference time is {} sec.".format(t2 - t1))
            #return
            return Output().add({"code":0,"msg":"ok","data":result})
    except Exception as e:
        return Output().add_as_json({"code":-1,"msg":e})




Overwriting deepspeed_src/model.py


In [32]:
%%writefile deepspeed_src/requirements.txt
protobuf==3.20
#accelerate=>0.17.0

Overwriting deepspeed_src/requirements.txt


We place the `serving.properties` file into a tarball and upload it to S3

In [33]:
!tar czvf acc_model.tar.gz deepspeed_src/ 

deepspeed_src/
deepspeed_src/.ipynb_checkpoints/
deepspeed_src/.ipynb_checkpoints/model-checkpoint.py
deepspeed_src/.ipynb_checkpoints/requirements-checkpoint.txt
deepspeed_src/.ipynb_checkpoints/serving-checkpoint.properties
deepspeed_src/model.py
deepspeed_src/serving.template
deepspeed_src/serving.properties
deepspeed_src/requirements.txt


In [34]:
s3_code_prefix = "llama/deploy/code"

code_artifact = sess.upload_data("acc_model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-687912291502/llama/deploy/code/acc_model.tar.gz


## Deploy Model to a SageMaker Endpoint
With a helper function we can now deploy our endpoint and invoke it with some sample inputs

In [35]:
from sagemaker.utils import name_from_base

model_name = name_from_base(f"vicuna-7B")

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": code_artifact
    }
)
model_arn = create_model_response["ModelArn"]
print(model_arn)

endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.2xlarge",
            "InitialInstanceCount": 1,
        }
    ]
)
print(endpoint_config_response)

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response)

arn:aws:sagemaker:us-west-2:687912291502:model/vicuna-7b-2023-08-30-01-34-19-580
{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint-config/vicuna-7b-2023-08-30-01-34-19-580-config', 'ResponseMetadata': {'RequestId': '83dabf07-1813-4e8f-97d6-b32f2d018696', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '83dabf07-1813-4e8f-97d6-b32f2d018696', 'content-type': 'application/x-amz-json-1.1', 'content-length': '121', 'date': 'Wed, 30 Aug 2023 01:34:20 GMT'}, 'RetryAttempts': 0}}
{'EndpointArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint/vicuna-7b-2023-08-30-01-34-19-580-endpoint', 'ResponseMetadata': {'RequestId': '70f96d4f-09c2-4fad-8555-44dbff2ef4b9', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '70f96d4f-09c2-4fad-8555-44dbff2ef4b9', 'content-type': 'application/x-amz-json-1.1', 'content-length': '110', 'date': 'Wed, 30 Aug 2023 01:34:20 GMT'}, 'RetryAttempts': 0}}


Let's run an example with a basic text generation prompt Large model inference is

In [36]:
from dotenv import load_dotenv
load_dotenv(override=True)
import boto3
import json
runtime = boto3.client('runtime.sagemaker')
from joblib import Parallel, delayed

prompt_a = f"""You are an AI shopping assistant for e-commerce company AAA.|||I want to eat hotpot, could you recommend products for hotpot. It would be nice if the products covers all types of hotpot items and total price exceed the free shipping threshold.|||\n=========\n[[yumei dongbei style hot pot,\nchinese,\ninstant food,shou xiguo,]]\n=========\n we chose to persuade a purchase in no more than 40 words.\nAnswer:
"""
print("prompt_a=="+str(len(prompt_a)))

prompt_b = f"""
You are an AI shopping assistant expert for e-commerce company AAA.\nUser is asking a product exploration question (delimited by |||), we already found related products provided as context (delimited by [[]].\n|||navigate to Zion|||\n=========\n[[israel sharon fruit, 12-14ct gift box,\nchinese,\nfruits,\npersimmons,\nfuyu,\n\n\nairborne vitamin c with zinc effervescent tablets 27 tablets,\nmainstream,\nhealth,\ndietary supplements,\n]]\n=========\nPlease comment on user’s question  more than 40 words.\nAnswer:
"""
print("prompt_b=="+str(len(prompt_b)))

prompts = [prompt_a, prompt_b]

prompt="""你正在一个聊天室里和不同国家的人们聊天，你能读懂所有国家的语言，你负责通过聊天记录分析所有聊天者的性格和有效信息，具体步骤如下：
1.阅读他们的聊天记录
2.总结他们聊天里面的重要信息
3.抽象他们的人设
4.使用评分体系抽象他们之间的人际关系，然后给一个评分，范围1-10分，分越高关系越好"""

prompt=prompt+"""聊天记录如下：
                WaRGazmo : "you lucked out there buddy" 
                WarLord : "suerte? eso no existe " 
                WarLord : "soy más rápido que la luz " 
                WaRGazmo : "it exists.. or karma" 
                DirtyE1bow : "so you was a planned birth ?" 
                WaRGazmo : "thats what she said bruh" 
                WarLord : "te amo mi amor " 
                Manowarik : "Мир вам,люди добрые.." 
                kotofei : "и тебе боярин, что не подался в челядь королю)" 
                XxNORxXMithra : "God morgen folkens :) " 
                kotofei : "и прочие жители галактики " 
                XxNORxXMithra : "Ja de også forsåvidt :) " 
                Manowarik : "Котофей-это который по цепи кругом?Песни там,сказки?😆😆" 
                kotofei : "не, то дальний убогий родственник " 
                Manowarik : "Эххх..Лукоморье мимо..((" 
                kipl : "Котофей он из сказки Лиса и Котофей Иванович. " 
                kipl : "Межвидовой брак и крышевание леса" 
                kotofei : "лиса 🦊 мералиса и Котофей Иваныч " 
                leister : "😆" XxFoxyQBAxX : "po co tyle zrobiłeś?" """

prompts = [prompt]

def call_endpoint(prompt):
    input = {"inputs": prompt, "parameters": {"temperature": 0, "max_new_tokens": 100,
                                              "return_full_text": False}}
    input = json.dumps(input).encode('utf-8')

    response = runtime.invoke_endpoint(EndpointName="vicuna-7B-2023-08-30-01-03-19-806-endpoint",
                                       ContentType='application/json',
                                       Accept='application/json',
                                       Body=input)
    results = json.loads(response['Body'].read().decode())
    print(results)


results = Parallel(n_jobs=10, prefer='threads', verbose=1, )(
    delayed(call_endpoint)(prompt)
    for prompt in prompts
)

prompt_a==1842
prompt_b==1461


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


{'code': 0, 'msg': 'ok', 'data': [{'generated_text': '\n                leister : "😆" XxFoxyQBAxX : "Ja, to co tyle zrobiłeś?" \n                XxNORxXMithra : "Ja, to co tyle zrobiłeś?" \n                XxNORxXMithra : "Ja, to co tyle zrobiłeś?" \n                XxNORx'}]}


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.6s finished


In [None]:
# Clean up the endpoint before proceeding
predictor.delete_endpoint()

## Reference

[sagemaker-hosting/Large-Language-Model-Hosting/](https://github.com/aws-samples/sagemaker-hosting/tree/main/Large-Language-Model-Hosting)