In [1]:
%pip install "sagemaker==2.163.0" huggingface_hub safetensors loguru --upgrade --quiet --index-url https://pypi.python.org/simple

Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

try:
	role = sagemaker.get_execution_role()
except ValueError:
	iam = boto3.client('iam')
	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Hub Model configuration. https://huggingface.co/models
hub = {
	'HF_MODEL_ID':'tiiuae/falcon-7b',
	'SM_NUM_GPUS': json.dumps(4)
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="0.8.2"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g5.12xlarge",
	container_startup_health_check_timeout=400,
  )
  
# send request
predictor.predict({
	"inputs": "My name is Julien and I like to",
})

------------------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-06-14-12-16-29-069: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [1]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::802376408542:role/Admin
sagemaker session region: us-west-2


In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    version="0.8.2"
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04


In [12]:
import json

from sagemaker import image_uris
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer


# model config
model_id = "tiiuae/falcon-40b-instruct"
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
max_input_length = 1024
max_total_tokens = 2048
health_check_timeout = 300

sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# retrieve the HuggingFace LLM DLC URI
image_uri = image_uris.retrieve(
    framework="huggingface-llm",
    region=sagemaker_session.boto_region_name,
    version="0.8.2",
    image_scope="inference",
)

# define environment variables for TGI config
env = {
    'HF_MODEL_ID': model_id,
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    'MAX_INPUT_LENGTH': json.dumps(max_input_length),
    'MAX_TOTEL_TOKENS': json.dumps(max_total_tokens),
}

model = Model(
    image_uri=image_uri,
    role=aws_role,
    env=env,
    predictor_cls=Predictor,
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

------------------!

In [14]:
prompt = """You are an helpful Assistant, called Falcon. Knowing everyting about AWS.

User: Can you tell me something about Amazon SageMaker?
Falcon:"""
prompt = "write a Python program to compute fibonacci sequence"

# hyperparameters for llm
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["\nUser:","<|endoftext|>","</s>"]
  }
}

# send request to endpoint
response = predictor.predict(payload)

# print assistant respond
assistant = response[0]["generated_text"][len(prompt):]
print(response)

[{'generated_text': "\nHere's a Python program to compute Fibonacci sequence:\n\n```python\ndef fibonacci(n):\n    a, b = 0, 1\n    for i in range(n-1):\n        c = a + b\n        a, b = b, c\n    return c\n\nif __name__ == '__main__':\n    print(fibonacci(10))\n```\n\nThis program defines a function `fibonacci` that takes an integer argument `n` and returns the nth Fibonacci number. The program then calls the function with `n=10`, prints the result, and exits."}]


In [15]:
predictor.delete_model()
predictor.delete_endpoint()

ClientError: An error occurred (ExpiredTokenException) when calling the DescribeEndpoint operation: The security token included in the request is expired

In [1]:
import datetime
import torch

from collections import defaultdict
from loguru import logger
from pathlib import Path
from safetensors.torch import save_file
from safetensors import safe_open
from typing import Dict, List


def check_file_size(source_file: Path, target_file: Path):
    """
    Check that two files are close in size
    """
    source_file_size = source_file.stat().st_size
    target_file_size = target_file.stat().st_size

    if (source_file_size - target_file_size) / source_file_size > 0.01:
        raise RuntimeError(
            f"""The file size different is more than 1%:
         - {source_file}: {source_file_size}
         - {target_file}: {target_file_size}
         """
        )


def remove_shared_pointers(tensors: Dict[str, torch.Tensor]):
    """
    For a Dict of tensors, check if two or more tensors point to the same underlying memory and
    remove them
    """
    ptrs = defaultdict(list)
    for k, v in tensors.items():
        ptrs[v.data_ptr()].append(k)

    # Iterate over all found memory addresses
    for ptr, names in ptrs.items():
        if len(names) > 1:
            # Multiple tensors are point to the same memory
            # Only keep the first tensor
            for name in names[1:]:
                tensors.pop(name)


def convert_file(pt_file: Path, sf_file: Path):
    """
    Convert a pytorch file to a safetensors file
    """
    logger.info(f"Convert {pt_file} to {sf_file}.")

    pt_state = torch.load(pt_file, map_location="cpu")
    if "state_dict" in pt_state:
        pt_state = pt_state["state_dict"]

    remove_shared_pointers(pt_state)

    # Tensors need to be contiguous
    pt_state = {k: v.contiguous() for k, v in pt_state.items()}

    sf_file.parent.mkdir(parents=True, exist_ok=True)
    save_file(pt_state, str(sf_file), metadata={"format": "pt"})

    # Check that both files are close in size
    check_file_size(pt_file, sf_file)

    # Load safetensors state
    for k in pt_state:
        pt_tensor = pt_state[k]
        with safe_open(sf_file, framework="pt") as f:
            sf_tensor = f.get_tensor(k)
            if not torch.equal(pt_tensor, sf_tensor):
                raise RuntimeError(f"The output tensors do not match for key {k}")


def convert_files(pt_files: List[Path], sf_files: List[Path]):
    assert len(pt_files) == len(sf_files)

    N = len(pt_files)
    # We do this instead of using tqdm because we want to parse the logs with the launcher

    for i, (pt_file, sf_file) in enumerate(zip(pt_files, sf_files)):
        start = datetime.datetime.now()
        convert_file(pt_file, sf_file)
        elapsed = datetime.datetime.now() - start
        logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import subprocess
from pathlib import Path
from distutils.dir_util import copy_tree

from huggingface_hub import snapshot_download


model_id = "tiiuae/falcon-40b-instruct"
model_dir = Path.cwd() / "models" / model_id.split('/')[1]
model_tarball_path = Path.cwd() / "models" / f"{model_id.split('/')[1]}.tar.gz"

# print("Downloading snapshot of HuggingFace repository ...")
# snapshot_dir = snapshot_download(repo_id=model_id)

# print("Copying snapshot to a model directory ...")
# copy_tree(snapshot_dir, str(model_dir))

# local_pt_files =  list(model_dir.glob("*.bin"))
# local_st_files = [p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files]
# convert_files(local_pt_files, local_st_files)

print("Compressing model snapshot directory ...")
command = [
    'tar',
    '--use-compress-program="pigz --best --recursive"',
    '-cvf',
    str(model_tarball_path),
    '-C',
    str(model_dir),
    '.'
]
subprocess.run(" ".join(command), shell=True, check=True)

print("Uploading compressed model snapshot to S3 ...")
model_s3_uri = f"s3://sagemaker-jumpstart-cache-contributor-staging/jumpstart-1p/ulrichkr/{model_tarball_path.name}"
subprocess.run(f"aws s3 cp {model_tarball_path} {model_s3_uri}".split(" "))

Compressing model snapshot directory ...
./
./tokenizer_config.json
./README.md
./handler.py
./model-00008-of-00009.safetensors
./tokenizer.json
./model-00002-of-00009.safetensors
./modelling_RW.py
./model-00009-of-00009.safetensors
./config.json
./pytorch_model.bin.index.json
./model-00005-of-00009.safetensors
./model-00006-of-00009.safetensors
./model-00007-of-00009.safetensors
./.gitattributes
./model-00004-of-00009.safetensors
./model-00003-of-00009.safetensors
./configuration_RW.py
./special_tokens_map.json
./generation_config.json
./model-00001-of-00009.safetensors
Uploading compressed model snapshot to S3 ...
upload: models/falcon-40b-instruct.tar.gz to s3://sagemaker-jumpstart-cache-contributor-staging/jumpstart-1p/ulrichkr/falcon-40b-instruct.tar.gz


CompletedProcess(args=['aws', 's3', 'cp', '/home/ubuntu/code/aws/amazon-sagemaker-examples/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-benchmarking/models/falcon-40b-instruct.tar.gz', 's3://sagemaker-jumpstart-cache-contributor-staging/jumpstart-1p/ulrichkr/falcon-40b-instruct.tar.gz'], returncode=0)

In [10]:
import json
from pathlib import Path

from sagemaker import image_uris
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer


model_id = "tiiuae/falcon-40b-instruct"
model_dir = Path.cwd() / "models" / model_id.split('/')[1]
model_tarball_path = Path.cwd() / "models" / f"{model_id.split('/')[1]}.tar.gz"
model_s3_uri = f"s3://sagemaker-jumpstart-cache-contributor-staging/jumpstart-1p/ulrichkr/{model_tarball_path.name}"

# model config
model_id = "tiiuae/falcon-40b-instruct"
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
max_input_length = 1024
max_total_tokens = 2048
health_check_timeout = 1200

sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# retrieve the HuggingFace LLM DLC URI
image_uri = image_uris.retrieve(
    framework="huggingface-llm",
    region=sagemaker_session.boto_region_name,
    version="0.8.2",
    image_scope="inference",
)

# define environment variables for TGI config
env = {
    'HF_MODEL_ID': "/opt/ml/model",
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    'MAX_INPUT_LENGTH': json.dumps(max_input_length),
    'MAX_TOTEL_TOKENS': json.dumps(max_total_tokens),
}

model = Model(
    image_uri=image_uri,
    model_data=model_s3_uri,
    role=aws_role,
    env=env,
    predictor_cls=Predictor,
    enable_network_isolation=True,
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

--------------------------!

In [4]:
import json
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
health_check_timeout = 300

# TGI config
config = {
  'HF_MODEL_ID': "tiiuae/falcon-40b-instruct", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  'MAX_TOTEL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# create HuggingFaceModel
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [5]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

-------------------!

In [10]:
prompt = """You are an helpful Assistant, called Falcon. Knowing everyting about AWS.

User: Can you tell me something about Amazon SageMaker?
Falcon:"""

# hyperparameters for llm
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["\nUser:","<|endoftext|>","</s>"]
  }
}

# send request to endpoint
response = llm.predict(payload)

# print assistant respond
assistant = response[0]["generated_text"][len(prompt):]
print(assistant)

 Sure! Amazon SageMaker is a fully managed platform for building, training, and deploying machine learning models at scale. It provides a range of tools and services to help data scientists and developers create and deploy ML models quickly and easily. These include pre-built algorithms, data processing tools, and integrated development environments. Additionally, SageMaker provides the ability to automate the end-to-end ML workflow, from data preparation to model training and deployment, making it easier to build and deploy ML models in production environments.


In [15]:
new_prompt = f"""{prompt}{assistant}
User: How would you recommend start using Amazon SageMaker? If i am new to Machine Learning?
Falcon:"""
# update payload
payload["inputs"] = new_prompt

import time
t0 = time.time()

# send request to endpoint
response = llm.predict(payload)

# print assistant respond
new_assistant = response[0]["generated_text"][len(new_prompt):]
print(new_assistant)
print(len(new_assistant.split()))
print(len(new_assistant.split()) / (time.time() - t0))

 Amazon SageMaker provides several resources to help users get started with machine learning. Here are some suggestions:

1. Start with the Getting Started tutorial on the SageMaker website, which guides you through the basics of creating a model.

2. Use SageMaker's pre-built algorithms and tools, such as Amazon Rekognition, to get started with image and video analysis, natural language processing, and more.

3. Join the AWS Machine Learning Community, where you can connect with other ML practitioners and get help from AWS experts.

4. Consider taking an online course or attending a workshop to learn more about machine learning concepts and how to apply them using SageMaker.

5. Finally, experiment with different models and algorithms to see what works best for your specific use case. SageMaker provides a range of tools and services to help you optimize and fine-tune your models over time.
143
13.836271420701376
