In [1]:
import io
import sagemaker
import boto3
import json
import time 

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Shared SageMaker/AWS handles and S3 locations used by the deployment cells.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
# Use the public region accessor rather than the private `_region_name` attribute.
region = session.boto_region_name
client = boto3.client("sagemaker-runtime")
session_bucket = 'notebookllama-project'
bucket_prefix = "async-llm-output"
async_output_path = f"s3://{session_bucket}/{bucket_prefix}/output"

# Four-digit suffix appended to endpoint names to tell separate runs apart.
# Built from whole seconds so it is always exactly four digits; slicing the
# string form of the float timestamp can pick up the decimal point
# (e.g. "34.5"), which is not a legal character in an endpoint name.
time_str = f"{int(time.time()) % 10_000:04d}"
print('ouput s3 path:', async_output_path)
print('role', role)
print(region)
print(session_bucket)
print(client)
print(time_str)

ouput s3 path: s3://notebookllama-project/async-llm-output/output
role arn:aws:iam::867521064370:role/sagemaker
us-east-1
notebookllama-project
<botocore.client.SageMakerRuntime object at 0x7f91b917ce50>
1368


## Deploy Llama models via SageMaker JumpStart

In [4]:
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig

# Configuration object for asynchronous inference: output is directed to
# `async_output_path`, with at most 10 concurrent invocations per instance.
async_config = AsyncInferenceConfig(
    max_concurrent_invocations_per_instance=10,
    output_path=async_output_path,
)

In [5]:
from sagemaker.jumpstart.model import JumpStartModel

# JumpStart model IDs and the instance type each one is deployed on.
# The two lists are parallel: entry i of one pairs with entry i of the other.
model_id_list = [
    "meta-textgeneration-llama-3-2-1b-instruct",
    "meta-textgeneration-llama-3-1-8b-instruct",
    "meta-textgeneration-llama-3-1-70b-instruct",
]
instance_type_list = ['ml.g5.12xlarge', 'ml.g5.24xlarge', 'ml.g5.48xlarge']

# Fail before any endpoint is created if the lists have drifted out of sync,
# instead of erroring part-way through after some deployments already ran.
if len(model_id_list) != len(instance_type_list):
    raise ValueError(
        "model_id_list and instance_type_list must have the same length "
        f"({len(model_id_list)} != {len(instance_type_list)})"
    )

# Keep a handle to every deployed endpoint, keyed by model ID. `predictor`
# alone only ever refers to the most recently deployed one.
predictors = {}
for model_id, instance_type in zip(model_id_list, instance_type_list):
    # No model_version is given, so the SDK resolves the wildcard version '*';
    # pass model_version=... here to pin a specific release.
    model = JumpStartModel(model_id=model_id)
    predictor = model.deploy(
        # Uncomment to deploy as an asynchronous endpoint using `async_config`.
        # async_inference_config=async_config,
        instance_type=instance_type,
        # Endpoint name: "<model id>-endpoint-<run suffix>".
        endpoint_name=model_id + '-endpoint-' + time_str,
        # Explicitly accept the model's end-user license agreement.
        accept_eula=True,
    )
    predictors[model_id] = predictor

Using model 'meta-textgeneration-llama-3-2-1b-instruct' with wildcard version identifier '*'. You can pin to version '1.0.4' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


--------------!

Using model 'meta-textgeneration-llama-3-1-8b-instruct' with wildcard version identifier '*'. You can pin to version '2.2.4' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


------------!

Using model 'meta-textgeneration-llama-3-1-70b-instruct' with wildcard version identifier '*'. You can pin to version '2.2.5' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


----------------!

## Deploy Qwen2-72B-Instruct with the DJL LMI container

In [6]:
# Model to serve, the instance type hosting it, and the endpoint name
# (suffixed with `time_str`).
model_id = 'Qwen/Qwen2-72B-Instruct'
instance_type = 'ml.g5.48xlarge'
endpoint_name = f"Qwen2-72B-Instruct-endpoint-{time_str}"

# Look up the "djl-lmi" 0.28.0 container image URI for the current region.
container_uri = sagemaker.image_uris.retrieve(
    framework="djl-lmi",
    version="0.28.0",
    region=region,
)

# The container is configured through environment variables: the model ID to
# load and the dtype option ("fp16").
model = sagemaker.Model(
    image_uri=container_uri,
    role=role,
    env=dict(HF_MODEL_ID=model_id, OPTION_DTYPE="fp16"),
)

# Create the endpoint on a single instance.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    # async_inference_config=async_config
)


--------------------!