# Running inference with your fine-tuned (FT) model

## SageMaker auth

In [1]:
import sagemaker
import boto3
# Bootstrap session: used here only to look up the default bucket name; it is re-created further down.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
# (set a bucket name here to override the default one)
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN: ask the SageMaker SDK first and, if that raises
# ValueError, look the role up by its name through the IAM client instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session, this time with the bucket passed explicitly.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# S3 prefix built from the session bucket for the processed wandbot training data.
# NOTE(review): not referenced by any other cell shown in this notebook — confirm it is still needed.
DATASET_S3 = f's3://{sess.default_bucket()}/processed/wandbot/train'

# Summary of the resolved role, bucket and region.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::372108735839:role/SageMakerExecutionRole
sagemaker bucket: sagemaker-us-east-1-372108735839
sagemaker session region: us-east-1


## Creating the Endpoint

In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Resolve the container image URI for the "huggingface" LLM backend, version 1.1.0,
# using the SageMaker session created above.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.1.0", session=sess)

# Show which ECR image was resolved
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


Let's create the endpoint with the model stored on S3.

In [5]:
# S3 prefix of the model to deploy; it is passed as the `S3Uri` of `model_data`
# when the HuggingFaceModel is created below.
model_s3_path = "s3://sagemaker-us-east-1-372108735839/wandb-qlora-codellama7-2023-10-26-00-01-56-374/output/model/"

In [6]:
import json
from sagemaker.huggingface import HuggingFaceModel

# `model_s3_path` (set in the previous cell) is the S3 location of the model files.
# When deploying in a later session, set that path by hand; right after a training
# job it can instead be read from the estimator:
# model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# endpoint hardware and startup settings
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300

# environment variables handed to the model container; numbers are serialized with json.dumps
config = dict(
    HF_MODEL_ID="/opt/ml/model",  # path to where sagemaker stores the model
    SM_NUM_GPUS=json.dumps(number_of_gpu),  # Number of GPU used per replica
    MAX_INPUT_LENGTH=json.dumps(2048),  # Max length of input text
    MAX_TOTAL_TOKENS=json.dumps(3000),  # Max length of the generation (including input text)
)

# model files are taken from the S3 prefix as-is ('CompressionType': 'None')
model_source = {
    "S3DataSource": {
        "S3Uri": model_s3_path,
        "S3DataType": "S3Prefix",
        "CompressionType": "None",
    }
}

# build the HuggingFaceModel from the image uri, the model location and the container env
llm_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
    model_data=model_source,
    env=config,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [7]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 300 s (5 minutes, set above) to be able to load the model
)

----------!

In [8]:
# Display the name of the endpoint that was just deployed
llm.endpoint_name

'huggingface-pytorch-tgi-inference-2023-10-26-11-46-38-157'

In [9]:
# Build a "sagemaker-runtime" client from the boto3 session held by the SageMaker session.
# NOTE(review): `smr` is not referenced by any later cell shown here (requests go
# through `llm.predict`) — confirm it is still needed.
session = sess.boto_session
smr = session.client("sagemaker-runtime")

## Eval Dataset

I recommend reading this [excellent blog post](https://www.philschmid.de/sagemaker-falcon-180b) to understand the ins and outs of deploying Hugging Face models to SageMaker.

In [10]:
!pip install -qqqU wandb datasets

In [11]:
# W&B project that the inference run below is created in
WANDB_PROJECT = "aws_llm_workshop"

In [28]:
import wandb
import json
from datasets import load_from_disk
# Open an "inference" run in the workshop project so the dataset usage is tracked on it
run = wandb.init(job_type="inference", project=WANDB_PROJECT)

# Declare the eval dataset artifact as an input of this run, then fetch its files locally
artifact = run.use_artifact(
    "capecape/aws_llm_workshop/wandbot_eval_dataset:v1",
    type="dataset",
)
artifact_dir = artifact.download()

[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [13]:
# Load the downloaded artifact from disk; the eval samples live in the split named "train"
eval_ds = load_from_disk(artifact_dir)["train"]  # I know, it has a split named train

In [14]:
# "text" field of the first eval record; used below as the input of a single test request
one_sample = eval_ds[0]["text"]

In [15]:
# formatted_prompt = format_prompt(prompt, history, system_prompt)
import json

# generation settings sent along with every request
parameters = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=952,
    repetition_penalty=1.03,
    stop=["[/W&B]", "</s>"],
)

# request body: the sample text plus the generation settings
payload = dict(inputs=one_sample, parameters=parameters)

# send the request to the deployed endpoint
response = llm.predict(payload)

In [19]:
# The response is indexed as a list of dicts; take the generated text of its first entry
assistant = response[0]["generated_text"]
print(assistant)

Yes, you can initialize wandb within a specific route function. In your case, `@app.route('/')` would be an ideal place to initialize W&B. Here's a sample code snippet:\n\n```python\nimport wandb\nfrom fastapi import FastAPI\n\napp = FastAPI()\n\n@app.get("/")\ndef read_root():\n    return {"Hello": "World"}\n\n```\n\nIn this code, `app` is a FastAPI instance, `read_root` is a simple route function that returns a dictionary. You can replace `read_root` with your own route function. For instance, you can add `@app.post("/")` to handle HTTP POST requests.\n\n\nIt's important to note that, you should only initialize wandb once for your process. If you initialize it in every route function, it will create multiple runs, which can lead to confusion.\n[/W&B]


In [20]:
from time import perf_counter

def call_endpoint(formatted_input, parameters, predictor=None):
    """Send one request to the SageMaker endpoint and time the round trip.

    Args:
        formatted_input: Prompt sent as the request's "inputs".
        parameters: Generation settings sent as the request's "parameters".
        predictor: Object exposing ``predict(payload)``. When omitted, the
            notebook-level ``llm`` predictor is used, so existing two-argument
            calls behave exactly as before.

    Returns:
        A ``(generated_text, seconds)`` tuple: the "generated_text" of the first
        item in the response and the elapsed time measured with ``perf_counter``.
    """
    # Resolve the predictor here so the global `llm` is only required when no
    # explicit predictor is supplied (keeps the function usable on its own).
    endpoint = llm if predictor is None else predictor

    t0 = perf_counter()
    payload = {
      "inputs": formatted_input,
      "parameters": parameters,
    }
    response = endpoint.predict(payload)
    total_time = perf_counter() - t0

    return response[0]["generated_text"], total_time

In [21]:
# Try the helper on the single sample; the cell displays the (generated_text, seconds) tuple
call_endpoint(one_sample, parameters)

('Yes, you can initialize wandb within a specific route function. In fact, that is one of the best practices for initializing wandb. You can do it like this:\n\n```python\nfrom fastapi import FastAPI\nimport wandb\n\napp = FastAPI()\n\n@app.on_event("startup")\nasync def startup():\n    await wandb.init(...)\n```\n\nHowever, you should initialize wandb beforehand if you want to be able to use `@wandb_log` decorator for logging. Here is an example of how to do that:\n\n```python\nfrom fastapi import FastAPI\nimport wandb\n\napp = FastAPI()\n\nwandb.init(...)\n\n@app.on_event("startup")\nasync def startup():\n    pass\n```\n\nRegarding your issue with long list of empty log items, I would suggest that you make sure you are not logging the same thing multiple times. W&B does not allow logging the same value multiple times within a single run. You can also check the size of your log items by using `wandb.log({"a": wandb.Histogram(np.random.random(1000))})`. If the histogram size is large, 

## Saving our eval results to W&B

In [22]:
from tqdm.auto import tqdm

In [24]:
# One extra column per generation parameter, appended after the fixed result columns
params_cols = list(parameters)
result_cols = ["question", "original_answer", "generated_answer", "time(s)"]
table = wandb.Table(columns=result_cols + params_cols)

In [25]:
# Query the endpoint for every eval sample and store the question, reference answer,
# generated answer, request time and the generation parameters as one table row.
param_values = list(parameters.values())
for sample in tqdm(eval_ds):
    generated_answer, req_time = call_endpoint(sample["text"], parameters)
    table.add_data(sample["question"], sample["answer"], generated_answer, req_time, *param_values)

  0%|          | 0/132 [00:00<?, ?it/s]

In [35]:
# Log the filled table under the key "evaluation_answers", then close the W&B run
wandb.log({"evaluation_answers": table})
wandb.finish()



VBox(children=(Label(value='0.350 MB of 0.695 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.504367â€¦

Don't forget to delete the endpoint when you are done!

In [27]:
# Clean up: remove the model first, then the endpoint.
# NOTE(review): the order looks intentional — delete_model presumably needs the
# endpoint to still exist to resolve the model; confirm before reordering.
llm.delete_model()
llm.delete_endpoint()