In [1]:
%pip install "sagemaker==2.163.0" --upgrade --quiet --index-url https://pypi.python.org/simple

Note: you may need to restart the kernel to use updated packages.


In [1]:
import sagemaker
import boto3
# Temporary session, needed only to look up the default bucket below.
sess = sagemaker.Session()

# Bucket that receives uploaded data, models and logs. Put a bucket name here
# to override; while it is None the session's default bucket is used, which
# SageMaker creates automatically when it does not exist yet.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Prefer the execution role reported by the SDK; when that lookup raises
# ValueError, read the ARN of the role named 'sagemaker_execution_role' from IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Rebuild the session so that it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::802376408542:role/Admin
sagemaker session region: us-west-2


In [3]:
from sagemaker import Session
from sagemaker import image_uris
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Lookup 1: the HuggingFace helper, pinned to container version 0.8.2.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")
print(llm_image)

# Lookup 2: the generic image_uris API, using the region of a default session.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# retrieve the HuggingFace LLM DLC URI
image_uri = image_uris.retrieve(
    framework="huggingface-llm",
    version="0.8.2",
    image_scope="inference",
    region=sagemaker_session.boto_region_name,
)
print(image_uri)

763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04
763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04


In [9]:
from concurrent import futures

import boto3
from botocore.config import Config
from sagemaker.session import Session
import sagemaker
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel



# How many deploy/tear-down cycles to run in total.
NUM_DEPLOYMENTS = 30
# Desired ceiling on the number of deployments in flight at once.
MAX_CONCURRENT_DEPLOYMENTS = 10

# botocore settings for the SageMaker client: 5 s connect timeout, 60 s read
# timeout and up to 20 attempts per call.
_SM_CLIENT_CONFIG = Config(
    connect_timeout=5,
    read_timeout=60,
    retries={"max_attempts": 20},
)

# Session shared by every deployment; it wraps the client configured above.
SM_SESSION = Session(sagemaker_client=boto3.client("sagemaker", config=_SM_CLIENT_CONFIG))

# Role ARN handed to each HuggingFaceModel.
SM_ROLE = sagemaker.get_execution_role()

def deploy_llm_endpoint(
        hf_model_id: str,
        instance_type: str = "ml.g5.48xlarge",
        number_of_gpu: int = 4,
        max_input_length: int = 1024,
        max_total_tokens: int = 2048,
        health_check_timeout: int = 600
) -> bool:
    """Deploy a Hugging Face LLM to a SageMaker endpoint, then tear it down.

    The function is a smoke test for deployments: it creates the model and
    endpoint, and as soon as ``deploy`` returns it deletes both again.

    Args:
        hf_model_id: Model id exported to the container as ``HF_MODEL_ID``.
        instance_type: Instance type passed to ``deploy``.
        number_of_gpu: Exported to the container as ``SM_NUM_GPUS``.
        max_input_length: Exported to the container as ``MAX_INPUT_LENGTH``.
        max_total_tokens: Exported to the container as ``MAX_TOTAL_TOKENS``.
        health_check_timeout: Passed to ``deploy`` as
            ``container_startup_health_check_timeout``.

    Returns:
        True when deployment and clean-up both finished without raising,
        False otherwise. Exceptions are printed, not propagated.
    """
    success = False
    try:
        llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")
        env = {
            "HF_MODEL_ID": hf_model_id,
            "SM_NUM_GPUS": str(number_of_gpu),
            "MAX_INPUT_LENGTH": str(max_input_length),
            "MAX_TOTAL_TOKENS": str(max_total_tokens),
        }
        model = HuggingFaceModel(role=SM_ROLE, image_uri=llm_image, env=env, sagemaker_session=SM_SESSION)
        predictor = model.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            container_startup_health_check_timeout=health_check_timeout,
        )
        try:
            # Same order as before: the model is deleted while the endpoint still exists.
            predictor.delete_model()
        finally:
            # Always attempt the endpoint deletion once the endpoint exists, so
            # a failed model deletion cannot leave the instance running.
            predictor.delete_endpoint()
        success = True
        print("\nSuccessful deployment.")
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it via the return value.
        print(f"\nError with deploying model: {e}")
    return success


# Run NUM_DEPLOYMENTS deployments of the same model through a thread pool that
# is bounded by MAX_CONCURRENT_DEPLOYMENTS, so that at most that many
# endpoints are being created at any one time.
with futures.ThreadPoolExecutor(max_workers=MAX_CONCURRENT_DEPLOYMENTS) as executor:
    # Materialise the lazy map iterator right away so `results` is a plain
    # list that can be read repeatedly by later cells.
    results = list(
        executor.map(deploy_llm_endpoint, ["tiiuae/falcon-40b-instruct"] * NUM_DEPLOYMENTS)
    )

print(results)
# Fraction of successful deployments (each True counts as 1).
print(sum(results) / len(results))


Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying a model

Error with deploying model: Role can not be null for deploying 

In [8]:
# Show the per-deployment success flags held in `results`.
# NOTE(review): the saved `[]` output suggests `results` was an already
# exhausted iterator when this cell last ran — TODO confirm with a clean
# Restart & Run All.
print(list(results))

[]


In [3]:


# Resolve the URI of the HuggingFace LLM container, pinned to version 0.8.2.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Show which ECR image was resolved.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04


In [4]:
import json
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
health_check_timeout = 300  # passed to deploy() as container_startup_health_check_timeout

# TGI config: environment variables handed to the container. Values are
# serialised with json.dumps so every entry is a string.
# NOTE(review): the recorded deployment of this configuration failed its
# health check — confirm in the CloudWatch logs that this model can be sharded
# across `number_of_gpu` GPUs by this container version.
config = {
  'HF_MODEL_ID': "tiiuae/falcon-7b-instruct", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  # Key spelling corrected from 'MAX_TOTEL_TOKENS' so it matches the
  # MAX_TOTAL_TOKENS variable used elsewhere in this notebook.
  'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# create HuggingFaceModel
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [5]:
# Create the endpoint for llm_model on a single instance of the configured type.
# volume_size is intentionally not passed: on instances with local SSD storage
# it must be None (e.g. p4 but not p3); otherwise something like
# volume_size=400 could be added here.
llm = llm_model.deploy(
  instance_type=instance_type,
  initial_instance_count=1,
  # time the container is given to start up and pass its health check
  container_startup_health_check_timeout=health_check_timeout,
)

--------------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-06-08-21-53-38-431: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [None]:
# Opening turn of the conversation: a short preamble followed by the first
# user question and the cue for the assistant's answer.
prompt = """You are an helpful Assistant, called Falcon. Knowing everyting about AWS.

User: Can you tell me something about Amazon SageMaker?
Falcon:"""

# Generation settings sent along with the prompt.
generation_parameters = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["\nUser:", "<|endoftext|>", "</s>"],
)
payload = dict(inputs=prompt, parameters=generation_parameters)

# Query the deployed endpoint.
response = llm.predict(payload)

# Drop the first len(prompt) characters of the returned text and show the rest.
first_generation = response[0]["generated_text"]
assistant = first_generation[len(prompt):]
print(assistant)

 Sure! Amazon SageMaker is a fully managed platform for building, training, and deploying machine learning models at scale. It provides a range of tools and services to help data scientists and developers create and deploy ML models quickly and easily. These include pre-built algorithms, data processing tools, and integrated development environments. Additionally, SageMaker provides the ability to automate the end-to-end ML workflow, from data preparation to model training and deployment, making it easier to build and deploy ML models in production environments.


In [None]:
import time

# Second turn: previous prompt + previous answer + the next user question.
new_prompt = f"""{prompt}{assistant}
User: How would you recommend start using Amazon SageMaker? If i am new to Machine Learning?
Falcon:"""
# update payload
payload["inputs"] = new_prompt

# Time only the request itself. perf_counter() is a monotonic clock meant for
# measuring intervals, and the elapsed time is taken before any printing.
t0 = time.perf_counter()
response = llm.predict(payload)
elapsed = time.perf_counter() - t0

# Drop the first len(new_prompt) characters of the returned text.
new_assistant = response[0]["generated_text"][len(new_prompt):]
word_count = len(new_assistant.split())

print(new_assistant)
print(word_count)
# Throughput in whitespace-separated words (not model tokens) per second.
print(word_count / elapsed)

 If you are new to machine learning, you may want to start with the SageMaker JumpStart program, which provides hands-on experience with the platform and a range of ML algorithms. You can also explore the SageMaker Quick Start guides, which provide step-by-step instructions for building and deploying ML models using popular frameworks such as TensorFlow and PyTorch. Additionally, you may want to consider taking online courses or attending workshops to learn more about machine learning and how to use SageMaker. Finally, don't be afraid to experiment with different algorithms and approaches to find the best solution for your specific problem.
100
14.789933982576365
