# Running Inference with your Fine-Tuned (FT) Model

## SageMaker auth

In [None]:
import sagemaker
import boto3
# Bootstrap session: used here only to look up the default bucket.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
# Set a bucket name here to override the default one.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback when get_execution_role() raises ValueError (presumably when the
    # notebook runs outside a SageMaker-managed environment — TODO confirm):
    # look the role up in IAM by its hard-coded name. Change
    # 'sagemaker_execution_role' if your role is named differently.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session, this time pinned to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# S3 prefix of the processed training split, built from the session's default bucket.
DATASET_S3 = f's3://{sess.default_bucket()}/processed/wandbot/train'

# Echo the resolved identity/location so a misconfigured account is visible early.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Creating the Endpoint

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
# The container version is pinned to "1.1.0"; `session=sess` ties the lookup to
# the session created above (presumably for its region — TODO confirm).
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.1.0",
  session=sess,
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

Let's create the endpoint from the model stored on S3.

In [None]:
# S3 prefix holding the fine-tuned model files; it is passed as the model's 'S3Uri' below.
# NOTE(review): this path points at one specific bucket and training job —
# replace it with the output location of your own training job.
model_s3_path = "s3://sagemaker-us-east-1-372108735839/wandb-qlora-codellama7-2023-10-26-00-01-56-374/output/model/"

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

# `model_s3_path` (set in the previous cell) is where the trained model lives.
# If you deploy from the same session that ran the training job, you can read
# the location from the estimator instead:
# model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# Endpoint sizing
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300  # forwarded to deploy() as the container startup health-check timeout

# Environment variables for the serving container; json.dumps renders the
# numeric settings as strings.
config = dict(
  HF_MODEL_ID="/opt/ml/model",  # path to where sagemaker stores the model
  SM_NUM_GPUS=json.dumps(number_of_gpu),  # Number of GPU used per replica
  MAX_INPUT_LENGTH=json.dumps(2048),  # Max length of input text
  MAX_TOTAL_TOKENS=json.dumps(3000),  # Max length of the generation (including input text)
)

# Model definition: the LLM container image plus the S3 prefix to load from.
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data={
    'S3DataSource': {
      'S3Uri': model_s3_path,
      'S3DataType': 'S3Prefix',
      'CompressionType': 'None',
    }
  },
  env=config,
)

In [None]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  # NOTE: health_check_timeout is set to 300 in the config cell; raise it there
  # if the container needs longer to load the model.
  container_startup_health_check_timeout=health_check_timeout, # time allowed for the container to start and load the model
)

In [None]:
# Display the name of the endpoint that was just deployed (last expression of the cell).
llm.endpoint_name

In [None]:
# Low-level "sagemaker-runtime" client built from the SageMaker session's boto session.
# NOTE(review): `smr` is not used by the cells shown below, which call `llm.predict` instead.
session = sess.boto_session
smr = session.client("sagemaker-runtime")

## Eval Dataset

I recommend reading this [excellent blog post](https://www.philschmid.de/sagemaker-falcon-180b) to understand the ins and outs of deploying Hugging Face models to SageMaker.

In [None]:
!pip install -qqqU wandb datasets

In [None]:
# W&B project that the inference run below is logged to.
WANDB_PROJECT = "aws_llm_workshop"

In [None]:
import wandb
import json
from datasets import load_from_disk
# Start a W&B run tagged as an inference job.
run = wandb.init(project=WANDB_PROJECT, job_type="inference")
# Register the eval dataset artifact (version v1) as an input of this run...
artifact = run.use_artifact('capecape/aws_llm_workshop/wandbot_eval_dataset:v1', type='dataset')
# ...and download it; the returned directory is what `load_from_disk` reads next.
artifact_dir = artifact.download()

In [None]:
# Load the downloaded dataset from disk; the eval examples are stored under the split named "train".
eval_ds = load_from_disk(artifact_dir)["train"]  # I know, it has a split named train

In [None]:
# The "text" field of the first eval example, used as a single test prompt below.
one_sample = eval_ds[0]["text"]

In [None]:
# formatted_prompt = format_prompt(prompt, history, system_prompt)
import json

# Generation settings sent along with every request
parameters = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=952,
    repetition_penalty=1.03,
    stop=["[/W&B]", "</s>"],
)

# Request body: the prompt plus the generation settings
payload = {"inputs": one_sample, "parameters": parameters}

# Query the deployed endpoint once
response = llm.predict(payload)

In [None]:
# The response is indexed as a list of dicts; take the generated text of the first item.
assistant = response[0]["generated_text"]
print(assistant)

In [None]:
from time import perf_counter

def call_endpoint(formatted_input, parameters, predictor=None):
    """Call the SM endpoint and parse the output in string format.

    Args:
        formatted_input: Prompt text sent as the request's "inputs" field.
        parameters: Dict of generation settings sent as the request's
            "parameters" field.
        predictor: Object exposing ``predict(payload)``. Defaults to the
            notebook-level ``llm`` predictor, looked up at call time, so
            existing two-argument calls behave exactly as before. Passing it
            explicitly lets the helper target another endpoint or a stub.

    Returns:
        A ``(generated_text, total_time)`` tuple: the "generated_text" of the
        first item in the response, and the wall-clock seconds spent building
        the payload and waiting for the endpoint.
    """
    if predictor is None:
        # Backward-compatible default: the predictor created by `llm_model.deploy`.
        predictor = llm

    t0 = perf_counter()
    payload = {
      "inputs": formatted_input,
      "parameters": parameters,
    }
    response = predictor.predict(payload)
    total_time = perf_counter() - t0

    return response[0]["generated_text"], total_time

In [None]:
# Smoke-test the helper on the single sample; the cell displays the (generated_text, seconds) tuple.
call_endpoint(one_sample, parameters)

## Saving our eval results to W&B

In [None]:
from tqdm.auto import tqdm

In [None]:
# One extra column per generation setting, so every row records how it was produced.
params_cols = list(parameters)
table = wandb.Table(
    columns=["question", "original_answer", "generated_answer", "time(s)", *params_cols]
)

In [None]:
# Query the endpoint for every eval example and append one table row per example:
# question, reference answer, generated answer, latency, then the generation settings.
for sample in tqdm(eval_ds):
    generated_answer, req_time = call_endpoint(sample["text"], parameters)
    table.add_data(
        sample["question"],
        sample["answer"],
        generated_answer,
        req_time,
        *parameters.values(),
    )

In [None]:
# Log the filled table under the key "evaluation_answers", then finish the W&B run.
wandb.log({"evaluation_answers": table})
wandb.finish()

Don't forget to delete the endpoint when you are done!

In [None]:
# Tear down what `llm_model.deploy` created: first the model, then the endpoint.
llm.delete_model()
llm.delete_endpoint()