In [1]:
%%capture --no-display

!pip install boto3
!pip install sagemaker --upgrade

In [2]:
import sagemaker
import boto3

session = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=session)
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
sagemaker.__version__

'2.226.1'

In [10]:
from sagemaker.jumpstart.model import JumpStartModel

model = JumpStartModel(model_id = "meta-textgeneration-llama-3-8b-instruct",
                       model_version="2.2.0", # to keep it stable
                       instance_type="ml.g4dn.2xlarge")

predictor = model.deploy(endpoint_name = "llama-3-8B-Instruct",
                         initial_instance_count=1,
                         accept_eula=True)

Overriding instance type to ml.g4dn.2xlarge


---------------------------------------

KeyboardInterrupt: 

In [None]:
payload = {
    "inputs": "What was Lincoln like?",
   "parameters":{"max_new_tokens":64, "top_p":0.9, "temperature":0.6}
}

predictor.predict(payload)

In [15]:
from sagemaker.huggingface.model import HuggingFaceModel

# Hub model configuration <https://huggingface.co/models>
hub = {
  'HF_MODEL_ID':'meta-llama/Meta-Llama-3.1-8B-Instruct',   # model_id from hf.co/models (unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit)
  'HF_TASK':'question-answering',                          # NLP task you want to use for predictions (https://huggingface.co/docs/transformers/main_classes/pipelines)
  "HF_API_TOKEN": "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" # Llama 2 is a gated model and requires a Hugging Face Hub token.  
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
   env=hub,                                                # configuration for loading model from Hub
   role=role,                                              # IAM role with permissions to create an endpoint                                            # IAM role with permissions to create an endpoint
   transformers_version="4.43.1",                             # Transformers version used
   pytorch_version="2.1.0",                                  # PyTorch version used
   py_version='py310',                                      # Python version used
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
   endpoint_name="llama-3-1-8B-Instruct",
   initial_instance_count=1,
   instance_type="ml.g4dn.2xlarge"
)

ValueError: Unsupported huggingface version: 4.43.1. You may need to upgrade your SDK version (pip install -U sagemaker) for newer huggingface versions. Supported huggingface version(s): 4.6.1, 4.10.2, 4.11.0, 4.12.3, 4.17.0, 4.26.0, 4.28.1, 4.37.0, 4.6, 4.10, 4.11, 4.12, 4.17, 4.26, 4.28, 4.37.

In [4]:
data = {
    "inputs": {
    "question": "Who was the best president of the USA ever?",
    "context": "Don't include Lincoln"
    }
}
predictor.predict(data)

ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value: {'inputs': {'question': 'Who was the best president of the USA ever?', 'context': "Don't include Lincoln"}}, type: <class 'dict'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object

In [3]:
from sagemaker.predictor import Predictor
predictor = Predictor("huggingface-pytorch-inference-2024-07-29-18-32-33-243")

In [16]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::283774148357:role/sagemaker-role-prod
sagemaker session region: eu-west-1


In [17]:
# COMMENT IN WHEN PR (https://github.com/aws/sagemaker-python-sdk/pull/4314) IS MERGED
# from sagemaker.huggingface import get_huggingface_llm_image_uri

# # retrieve the llm image uri
# llm_image = get_huggingface_llm_image_uri(
#   "huggingface",
#   version="2.0.0"
# )
llm_image = f"763104351884.dkr.ecr.{sess.boto_region_name}.amazonaws.com/huggingface-pytorch-tgi-inference:2.1-tgi2.0-gpu-py310-cu121-ubuntu22.04"

# print ecr image uri
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1-tgi2.0-gpu-py310-cu121-ubuntu22.04


In [19]:
import json
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.2xlarge"
health_check_timeout = 1800

# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "meta-llama/Meta-Llama-3-8B-Instruct", # model_id from hf.co/models
  'MAX_INPUT_LENGTH': "2048",  # Max length of input text
  'MAX_TOTAL_TOKENS': "4096",  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': "8192",  # Limits the number of tokens that can be processed in parallel during the generation
  'MESSAGES_API_ENABLED': "true", # Enable the messages API
  'HUGGING_FACE_HUB_TOKEN': "<REPLACE WITH YOUR TOKEN>",
  "HF_API_TOKEN": "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", # Llama 2 is a gated model and requires a Hugging Face Hub token.  
"HUGGING_FACE_HUB_TOKEN": "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"
}

# check if token is set
assert config['HUGGING_FACE_HUB_TOKEN'] != "<REPLACE WITH YOUR TOKEN>", "Please set your Hugging Face Hub token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [20]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

------------!

In [21]:
# Prompt to generate
messages=[
    { "role": "system", "content": "You are a helpful assistant." },
    { "role": "user", "content": "What is deep learning?" }
  ]

# Generation arguments
parameters = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct", # placholder, needed
    "top_p": 0.6,
    "temperature": 0.9,
    "max_tokens": 512,
    "stop": ["<|eot_id|>"],
}

In [22]:
chat = llm.predict({"messages" :messages, **parameters})

print(chat["choices"][0]["message"]["content"].strip())

Deep learning is a subfield of machine learning that involves the use of artificial neural networks to model and analyze complex data. It is a type of machine learning that is inspired by the structure and function of the human brain, where artificial neural networks are designed to mimic the way the brain processes information.

In deep learning, neural networks are composed of multiple layers of interconnected nodes or "neurons," which process and transform the input data in a hierarchical manner. Each layer builds upon the previous one, allowing the network to learn increasingly complex and abstract representations of the data.

Deep learning has many applications in areas such as:

1. Computer Vision: Deep learning is used in image and video analysis, object detection, facial recognition, and image segmentation.
2. Natural Language Processing: Deep learning is used in language translation, speech recognition, sentiment analysis, and text summarization.
3. Speech Recognition: Deep l