In [2]:
%%sh
pip install -Uq sagemaker

In [4]:
import json
import os
from getpass import getpass

import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the IAM role used by the endpoint. Outside of a SageMaker-managed
# environment get_execution_role() raises ValueError, so fall back to looking
# up a role by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Never hard-code the Hugging Face access token in the notebook: read it from
# the environment, and prompt for it (without echo) when it is not set.
hf_token = os.environ.get('HUGGING_FACE_HUB_TOKEN') or getpass('Hugging Face access token: ')
if not hf_token:
    raise ValueError("You have to provide a token.")

# Hub Model configuration. https://huggingface.co/models
# NOTE(review): SM_NUM_GPUS is 1 while the instance type chosen below looks
# like a multi-GPU size — confirm that using a single GPU is intended.
hub = {
    'HF_MODEL_ID': 'meta-llama/Meta-Llama-3-8B-Instruct',
    'SM_NUM_GPUS': json.dumps(1),
    'HUGGING_FACE_HUB_TOKEN': hf_token,
}

# Container image is pinned explicitly instead of being resolved through
# get_huggingface_llm_image_uri(). The URI embeds the us-east-1 region, so it
# has to be adapted when deploying elsewhere.
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04-v2.0'

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    image_uri=image_uri,
    env=hub,
    role=role,
)

# deploy model to SageMaker Inference
# NOTE(review): if the endpoint fails the ping health check, inspect the
# CloudWatch logs; the startup timeout below may need to be raised so the
# container has time to download and load the model weights.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    container_startup_health_check_timeout=300,
)


-----------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2024-04-21-10-43-27-770: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same model that was configured for the endpoint,
# so the chat template always matches the deployed model. The repository is
# accessed with the token stored in the hub configuration.
tokenizer = AutoTokenizer.from_pretrained(
    hub['HF_MODEL_ID'],
    token=hub['HUGGING_FACE_HUB_TOKEN'],
)

# Prompt to generate
messages = [
    {"role": "system", "content": "You are a friendly ML engineer answering developer questions"},
    {"role": "user", "content": "How is Llama3 a better model than Llama2"},
]

# Render the conversation as plain text (tokenize=False) and append the
# generation prompt so the model continues with its own turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Token ids on which generation should stop: the tokenizer's EOS token and
# the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [None]:
# Generation settings, sent as the "parameters" field of every request.
# NOTE(review): "eos_token_id" is passed through as-is — confirm that the
# serving container honours this parameter name.
payload = dict(
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    return_full_text=False,
)

# Synchronous inference

In [None]:
# Restore the JSON deserializer on the predictor.
# Only needed when switching back from streaming to synchronous inference.
from sagemaker.base_deserializers import JSONDeserializer

json_deserializer = JSONDeserializer()
predictor.deserializer = json_deserializer

In [None]:
%%time
response = predictor.predict({"inputs":prompt, "parameters":payload})

In [None]:
import pprint

# The response is indexed as a list; show the text generated for the first
# (and only) input.
generated_text = response[0]['generated_text']
pprint.pprint(generated_text)

# Streaming inference

In [None]:
import io

# source: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/
# https://github.com/aws-samples/sagemaker-hosting/tree/main/GenAI-Hosting/Large-Language-Model-Hosting/LLM-Streaming/llama-2-hf-tgi

class LineIterator:
    r"""
    Iterator that yields newline-delimited lines from a response event stream.

    The output of the model is expected as newline-terminated byte lines, e.g.:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```

    While usually each PayloadPart event from the event stream will contain a
    byte array with a full json, this is not guaranteed and some of the json
    objects may be split across PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```

    This class accounts for this by appending the bytes of every PayloadPart
    event to an internal buffer and returning one complete line per
    iteration step, without its trailing '\n'. `read_pos` records how much of
    the buffer has already been returned so that no bytes are exposed twice.

    Events without a 'PayloadPart' key are reported on stdout and skipped.
    If the stream ends with bytes that are not terminated by '\n', those
    bytes are returned as a final line before iteration stops.
    """

    def __init__(self, stream):
        """Wrap `stream`, an iterable of event dicts."""
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line (bytes, newline stripped) or raise StopIteration."""
        while True:
            # Try to read a complete line starting at the first unread byte.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord('\n'):
                self.read_pos += len(line)
                return line[:-1]
            # No complete line buffered yet: pull the next event.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream is exhausted but unterminated bytes remain.
                    # Flush them exactly once; retrying would spin forever
                    # because no newline can arrive any more.
                    self.read_pos += len(line)
                    return line
                raise
            if 'PayloadPart' not in chunk:
                # `chunk` is a dict, so it cannot be concatenated to a str.
                print('Unknown event type:', chunk)
                continue
            # Append at the end of the buffer; the read position is restored
            # by the seek at the top of the loop.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk['PayloadPart']['Bytes'])

In [None]:
def print_event_stream(event_stream):
    """
    Print the generated tokens of a streaming response as they arrive.

    Every line read from `event_stream` that contains a '{' is parsed as JSON
    starting at that character; the text found under ['token']['text'] is
    printed without a trailing newline, except for the '<|eot_id|>' marker.
    """
    json_start = b'{'
    end_of_turn = '<|eot_id|>'

    for raw_line in LineIterator(event_stream):
        offset = raw_line.find(json_start)
        if offset < 0:
            # Empty line or line without a JSON object: nothing to print.
            continue
        event = json.loads(raw_line[offset:].decode('utf-8'))
        text = event['token']['text']
        if text == end_of_turn:
            continue
        print(text, end='')

In [None]:
from sagemaker.base_deserializers import StreamDeserializer

# Switch the predictor over to the streaming deserializer.
predictor.deserializer = StreamDeserializer()

# Same prompt and parameters as the synchronous request, with the
# additional "stream" flag set.
body = dict(
    inputs=prompt,
    parameters=payload,
    stream=True,
)

In [None]:
# Call the endpoint through the low-level runtime client and print the
# tokens from the response body as they are streamed back.
smr = boto3.client("sagemaker-runtime")
response = smr.invoke_endpoint_with_response_stream(
    EndpointName=predictor.endpoint_name,
    ContentType='application/json',
    Body=json.dumps(body),
)
print_event_stream(response['Body'])

In [None]:
# Clean up: remove the model first, then the endpoint, so that the
# deployment does not keep running after the notebook is finished.
predictor.delete_model()
predictor.delete_endpoint()

In [None]:
# HF Version

In [6]:
import json
import os
from getpass import getpass

import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the IAM role used by the endpoint. Outside of a SageMaker-managed
# environment get_execution_role() raises ValueError, so fall back to looking
# up a role by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Never hard-code the Hugging Face access token in the notebook: read it from
# the environment, and prompt for it (without echo) when it is not set.
hf_token = os.environ.get('HUGGING_FACE_HUB_TOKEN') or getpass('Hugging Face access token: ')
if not hf_token:
    raise ValueError("You have to provide a token.")

# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID': 'meta-llama/Meta-Llama-3-8B-Instruct',
    'SM_NUM_GPUS': json.dumps(1),
    'HUGGING_FACE_HUB_TOKEN': hf_token,
}

# create Hugging Face Model Class
# The container image is resolved by the SDK from the requested version.
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.4.2"),
    env=hub,
    role=role,
)

# deploy model to SageMaker Inference
# NOTE(review): if the endpoint fails the ping health check, inspect the
# CloudWatch logs; the startup timeout below may need to be raised so the
# container has time to download and load the model weights.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)

# send request
predictor.predict({
    "inputs": "Hey my name is Julien! How are you?",
})

----------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2024-04-21-10-55-14-300: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html