In [None]:
pip install sagemaker -U

# Using HuggingFace TGI for Serving

### Deploy the DeepSeek-R1-Distill-Llama-70B model to a SageMaker Endpoint

In [5]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the IAM role used for the model. If the SDK cannot determine the
# execution role (signalled by ValueError), fall back to looking up the role
# named 'sagemaker_execution_role' through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Container environment: which Hub model to load (https://huggingface.co/models)
# and how many GPUs to use. Keep SM_NUM_GPUS in step with the instance type
# chosen in the deploy call below (ml.g6e.48xlarge has 8 GPUs).
hub = dict(
    HF_MODEL_ID='deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
    SM_NUM_GPUS=json.dumps(8),
)

# Model definition backed by the Hugging Face LLM serving image, version 2.3.1.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    image_uri=get_huggingface_llm_image_uri("huggingface", version="2.3.1"),
)

# Create the SageMaker endpoint on a single ml.g6e.48xlarge instance
# (8 GPUs with a total GPU memory of 384 GB). The startup health-check
# timeout is raised to 3600 to leave room for loading the 70B model.
predictor = huggingface_model.deploy(
    instance_type="ml.g6e.48xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=3600,
)


----------------!

### Invoke the SageMaker Endpoint

In [None]:
# Build a prompt and send it to the endpoint deployed above.
system_message="""
You are a Chatty Assitant
"""
query = "What is the most expensive gem?"

# True  -> DeepSeek template; the response includes the reasoning step (<think></think>).
# False -> default Llama 3.3 template; skips the reasoning step, which otherwise
#          increases response latency and the number of response tokens.
use_reasoning_template = True

if not use_reasoning_template:
    user_question = system_message + "\n\nHere is a question from the user: " + query
    prompt = f"""<｜begin▁of▁sentence｜><|start_header_id|>user<|end_header_id|>{user_question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""

    # Alternative: pass the system message as its own turn
    # prompt = f"""<｜begin▁of▁sentence｜><|start_header_id|>system<|end_header_id|>{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
else:
    # DeepSeek does not recommend using system prompts and recommends adding
    # the system instructions to the user prompt instead.
    user_question = f"{system_message}\n\n{query}"
    prompt = f"""<｜begin▁of▁sentence｜><｜User｜>{user_question}<｜Assistant｜>"""

    # Alternative: pass the system message as a separate template parameter
    # prompt = f"""<｜begin▁of▁sentence｜>{system_message}<｜User｜>{query}<｜Assistant｜>"""

# Both templates share the same generation parameters.
payload = {
    'inputs': prompt,
    'parameters': {'max_new_tokens': 500, 'top_p': 0.9, 'temperature': 0.1, "return_full_text": False},
}

response = predictor.predict(payload)

# The response is indexed as a list; print the generated text of its first entry.
print(response[0]['generated_text'])

### To invoke an existing SageMaker Endpoint

In [None]:
from sagemaker.predictor import Predictor
# Attach to an endpoint that already exists; replace the placeholder with its name.
predictor1 = Predictor(endpoint_name="ENDPOINT NAME")

system_message = """
You are a Chatty Assitant
"""
query = "What is the most expensive gem?"

# The system message is prepended to the user turn of the DeepSeek template.
user_question = f"{system_message}\n\n{query}"
payload = {
    'inputs': f"""<｜begin▁of▁sentence｜><｜User｜>{user_question}<｜Assistant｜>""",
    'parameters': dict(max_new_tokens=500, top_p=0.9, temperature=0.1, return_full_text=False),
}

# This Predictor is created without an explicit serializer/deserializer, so the
# request is JSON-encoded by hand (with a matching content type) and the
# response is parsed with json.loads before reading the generated text.
response = predictor1.predict(
    json.dumps(payload),
    initial_args={"ContentType": "application/json"},
)

print(json.loads(response)[0]['generated_text'])

# Using DJL with vLLM for Serving

In [2]:
import sagemaker
from sagemaker.djl_inference.model import DJLModel
import boto3
import json



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
# Hub identifier of the model; it will be downloaded from the Hugging Face Hub.
model_id = 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B'

# Serving options passed to the DJL container as environment variables:
#   TENSOR_PARALLEL_DEGREE   - use 8 GPUs; modify based on the instance type
#   OPTION_ROLLING_BATCH     - use vLLM for rolling batching
#   OPTION_TRUST_REMOTE_CODE - enabled ("true")
env = dict(
    TENSOR_PARALLEL_DEGREE="8",
    OPTION_ROLLING_BATCH="vllm",
    OPTION_TRUST_REMOTE_CODE="true",
)

role = sagemaker.get_execution_role()

# DJL model definition; the endpoint itself is created by a later deploy() call.
model = DJLModel(
    role=role,
    env=env,
    model_id=model_id,
)

In [5]:
# ml.g6e.48xlarge has 8 GPUs with a total GPU memory of 384 GB.
instance_type = "ml.g6e.48xlarge"

# Create the endpoint on one instance under a fixed endpoint name; the startup
# health-check timeout is raised to 3600 to leave room for loading the model.
predictor = model.deploy(
    endpoint_name="djl-llama-70-distil-r1",
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=3600,
)

-------------------!

In [14]:
%%time

## Use Inference API Schema
# The request carries a prompt under "inputs" and the generation settings
# under "parameters".
generation_request = {
    "inputs": "What are the planets in our solar suystem?",
    "parameters": {"max_new_tokens": 128, 'temperature': 0.6},
}
response = predictor.predict(generation_request)
print(response['generated_text'])

 Well, let me think. There's Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Wait, didn't Pluto used to be considered a planet? Yeah, I remember hearing that it was reclassified as a dwarf planet a while back. So, now we have eight planets in total. 

Let me list them in order from the Sun. Mercury is the closest, then Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. That makes sense. I think Jupiter is the largest planet, followed by Saturn with its beautiful rings. Uranus and Neptune are the ice giants,
CPU times: user 489 μs, sys: 3.51 ms, total: 4 ms
Wall time: 4.39 s


In [18]:
%%time

## Use Chat Completions API Schema
# The conversation is a list of role/content messages: a system instruction
# followed by the user's question.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is deep learning?"},
]

response = predictor.predict(
    {"messages": chat_messages, "max_tokens": 256, "temperature": 0.6}
)

# Print the list of returned choices.
print(response['choices'])

[{'index': 0, 'message': {'role': 'assistant', 'content': "<think>\nOkay, so the user is asking about not receiving a promotion on Robinhood. I need to figure out how to address this. Let me break down what the user might be experiencing. Promotions on Robinhood could relate to stock bonuses, referral rewards, or other incentives. The user might be expecting something but hasn't received it.\n\nFirst, I should consider possible reasons. Maybe the user signed up for a promotion but didn't meet the criteria, like funding requirements or timing. It's also possible there's an issue with their account verification or the promotion has expired.\n\nLooking at the available agents, the help_center_librarian can provide official info on promotions. Then, account_management_librarian can check their account status, and account_money_movement_reporter can look into any recent transactions related to the promotion. \n\nI'll structure the plan to first get general info from the help center, then ch