In [1]:
!pip install sagemaker boto3 huggingface_hub awscli --upgrade --quiet

In [2]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

  import scipy.sparse


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [3]:
# Make us-east-1 the region of boto3's default session.
boto3.setup_default_session(region_name="us-east-1")

# Execution role for the endpoint.
role = sagemaker.get_execution_role()

# SageMaker session for interacting with different AWS APIs.
sess = sagemaker.session.Session()

# Bucket to house artifacts.
bucket = sess.default_bucket()

In [4]:
# S3 locations for the artifacts of this deployment.
model_bucket = sess.default_bucket()  # bucket to house model artifacts
s3_code_prefix = "hf-large-model-djl/meta-llama/Llama-2-7b-fp16/code"  # folder within bucket where code artifact will go

s3_model_prefix = "hf-large-model-djl/meta-llama/Llama-2-7b-fp16/model"  # folder within bucket where model artifact will go

# Read the region through the public boto session attribute instead of the
# private `Session._region_name`; this is the same accessor the image-URI
# cells use when calling `image_uris.retrieve`.
region = sess.boto_session.region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane, and the SageMaker runtime.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()

In [5]:
# Resolve the "djl-deepspeed" container image (version 0.26.0) for the
# region of the current SageMaker session.
deepspeed_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.26.0",
    region=sess.boto_session.region_name,
)

# Container environment for the 'generation' task. Every value is a string;
# the keys are passed to the container unchanged.
env_generation = dict(
    HUGGINGFACE_HUB_CACHE="/tmp",
    TRANSFORMERS_CACHE="/tmp",
    SERVING_LOAD_MODELS="test::Python=/opt/ml/model",
    OPTION_MODEL_ID="TheBloke/Llama-2-7B-Chat-fp16",
    OPTION_TRUST_REMOTE_CODE="true",
    OPTION_TENSOR_PARALLEL_DEGREE="max",
    OPTION_ROLLING_BATCH="vllm",
    OPTION_MAX_ROLLING_BATCH_SIZE="32",
    OPTION_DTYPE="fp16",
)

In [6]:
# Resolve the "djl-tensorrtllm" container image (version 0.26.0) for the
# region of the current SageMaker session.
trtllm_image_uri = image_uris.retrieve(
    framework="djl-tensorrtllm",
    version="0.26.0",
    region=sess.boto_session.region_name,
)

# Container environment for the 'summarization' task. Every value is a
# string; the keys are passed to the container unchanged.
env_summarization = dict(
    HUGGINGFACE_HUB_CACHE="/tmp",
    TRANSFORMERS_CACHE="/tmp",
    SERVING_LOAD_MODELS="test::MPI=/opt/ml/model",
    OPTION_MODEL_ID="TheBloke/Llama-2-7b-fp16",
    OPTION_TENSOR_PARALLEL_DEGREE="max",
    OPTION_ROLLING_BATCH="trtllm",
    OPTION_MAX_ROLLING_BATCH_SIZE="64",
)

In [7]:
# Select the serving configuration with ONE switch so that the environment
# variables and the container image can never be mismatched:
#   "generation"    -> env_generation    + deepspeed_image_uri
#   "summarization" -> env_summarization + trtllm_image_uri
#                      (high input and medium output sizes)
task = "generation"

serving_options = {
    "generation": (env_generation, deepspeed_image_uri),
    "summarization": (env_summarization, trtllm_image_uri),
}
if task not in serving_options:
    # Fail here, before any AWS resource is created with a bad configuration.
    raise ValueError(
        f"Unknown task {task!r}; expected one of {sorted(serving_options)}"
    )

# `env` tunes the deployment server; `inference_image_uri` is the container.
env, inference_image_uri = serving_options[task]

print(f"Environment variables are ---- > {env}")
print(f"Image going to be used is ---- > {inference_image_uri}")

Environment variables are ---- > {'HUGGINGFACE_HUB_CACHE': '/tmp', 'TRANSFORMERS_CACHE': '/tmp', 'SERVING_LOAD_MODELS': 'test::Python=/opt/ml/model', 'OPTION_MODEL_ID': 'TheBloke/Llama-2-7B-Chat-fp16', 'OPTION_TRUST_REMOTE_CODE': 'true', 'OPTION_TENSOR_PARALLEL_DEGREE': 'max', 'OPTION_ROLLING_BATCH': 'vllm', 'OPTION_MAX_ROLLING_BATCH_SIZE': '32', 'OPTION_DTYPE': 'fp16'}
Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121


In [8]:
# Derive a model name from the "lmi-llama2-7b" base and show it.
model_name = sagemaker.utils.name_from_base("lmi-llama2-7b")
print(model_name)

# The model is just the selected container image plus its environment.
primary_container = {"Image": inference_image_uri, "Environment": env}

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

lmi-llama2-7b-2024-06-21-10-35-51-213
Created Model: arn:aws:sagemaker:us-east-1:705247044519:model/lmi-llama2-7b-2024-06-21-10-35-51-213


In [9]:
# Both names are derived from the model name.
endpoint_config_name = model_name + "-config"
endpoint_name = model_name + "-endpoint"

# Single production variant: one ml.g5.2xlarge instance serving the model.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.2xlarge",
    "InitialInstanceCount": 1,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 2400,
    "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:705247044519:endpoint-config/lmi-llama2-7b-2024-06-21-10-35-51-213-config',
 'ResponseMetadata': {'RequestId': '223e24b2-52e7-4fba-bff8-c93b838c3082',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '223e24b2-52e7-4fba-bff8-c93b838c3082',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '125',
   'date': 'Fri, 21 Jun 2024 10:35:55 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Create the endpoint from the endpoint configuration and report its ARN.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws:sagemaker:us-east-1:705247044519:endpoint/lmi-llama2-7b-2024-06-21-10-35-51-213-endpoint


In [11]:
# Poll the endpoint until it leaves the "Creating" state, then fail loudly if
# it did not reach "InService" so that later cells never invoke a broken
# endpoint. (`time` is imported with the other imports at the top of the
# notebook, so it is not re-imported here.)
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)  # poll once per minute
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    # The loop also exits on terminal states such as "Failed"; surface the
    # reason reported by the service instead of silently continuing.
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:705247044519:endpoint/lmi-llama2-7b-2024-06-21-10-35-51-213-endpoint
Status: InService


In [12]:
# Request inputs for a chatbot, QA, or open-ended generation task.
prompt = "Amazon.com is the best"
params = dict(max_new_tokens=100, do_sample=False)

# # - use these for Summarization use case test 
# prompt = """Briefly summarize this paragraph: Amazon Comprehend uses natural language processing (NLP) to extract insights about the content of documents. It develops insights by recognizing the entities, key phrases, language, sentiments, and other common elements in a document. Use Amazon Comprehend to create new products based on understanding the structure of documents. For example, using Amazon Comprehend you can search social networking feeds for mentions of products or scan an entire document repository for key phrases.
# You can access Amazon Comprehend document analysis capabilities using the Amazon Comprehend console or using the Amazon Comprehend APIs. You can run real-time analysis for small workloads or you can start asynchronous analysis jobs for large document sets. You can use the pre-trained models that Amazon Comprehend provides, or you can train your own custom models for classification and entity recognition.
# All of the Amazon Comprehend features accept UTF-8 text documents as the input. In addition, custom classification and custom entity recognition accept image files, PDF files, and Word files as input.
# Amazon Comprehend can examine and analyze documents in a variety of languages, depending on the specific feature. For more information, see Languages supported in Amazon Comprehend. Amazon Comprehend’s Dominant language capability can examine documents and determine the dominant language for a far wider selection of languages."""
# params = { "max_new_tokens":64, "temperature":0.1}

In [15]:
%%time
# Serialize the prompt and generation parameters as the JSON request body.
request_body = json.dumps({"inputs": prompt, "parameters": params})

response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# Read the response body and decode it to text; shown as the cell output.
response_model["Body"].read().decode("utf8")

CPU times: user 9.74 ms, sys: 4.78 ms, total: 14.5 ms
Wall time: 3.13 s


'{"generated_text": " place to buy a new laptop. According to CNET, Amazon offers a wide selection of laptops from top brands like Dell, HP, Lenovo, and more. Amazon also offers free shipping, streaming, and customer reviews to help you find the right laptop for your needs. Amazon.com has a wide selection of laptops from top brands like Dell, HP, Lenovo, and more. Additionally, Amazon offers free shipping and customer reviews to"}'

In [16]:
# Tear down what this notebook created, in order: the endpoint, then its
# configuration, then the model. The last response is shown as cell output.
sm_client.delete_endpoint(
    EndpointName=endpoint_name,
)
sm_client.delete_endpoint_config(
    EndpointConfigName=endpoint_config_name,
)
sm_client.delete_model(
    ModelName=model_name,
)

{'ResponseMetadata': {'RequestId': '5e655787-12af-4c36-b156-f9444a9b5267',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5e655787-12af-4c36-b156-f9444a9b5267',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 21 Jun 2024 11:34:36 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}