# Running Inference with Your Fine-Tuned Model

## SageMaker auth

In [36]:
import sagemaker
import boto3
# Temporary session; it is only used to look up the default bucket name and is
# replaced by a bucket-bound session further down.
sess = sagemaker.Session()
# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # fall back to the session's default bucket when no bucket name is given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. When get_execution_role() raises ValueError,
# the role is looked up by name through IAM instead.
# NOTE(review): the fallback role name 'sagemaker_execution_role' is hard-coded —
# confirm that a role with this name exists in the target account.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session, now explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# S3 prefix built from the session bucket.
# NOTE(review): DATASET_S3 is not referenced by any later cell visible here.
DATASET_S3 = f's3://{sess.default_bucket()}/processed/wandbot/train'

# Echo the resolved role, bucket and region so a wrong account/region is spotted early.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::372108735839:role/SageMakerExecutionRole
sagemaker bucket: sagemaker-us-east-1-372108735839
sagemaker session region: us-east-1


## Creating the Endpoint

In [37]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Resolve the ECR URI of the Hugging Face LLM inference container (version 1.1.0)
# for the region of the current session.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.1.0", session=sess)

# Show which container image the endpoint will run.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


Let's create the endpoint with the model stored on S3.

In [20]:
from wandb import Api
# Look up the 'model'-type artifact `codellama7_wandb:latest` through the W&B public API.
# NOTE(review): `at` is not used by any later cell visible here — the model location
# is hard-coded in `model_s3_path` instead. Confirm whether this lookup is still needed.
api = Api()
at = api.artifact('capecape/aws_llm_workshop/codellama7_wandb:latest', type='model')

In [38]:
# S3 prefix passed to HuggingFaceModel as the model_data S3Uri.
# NOTE(review): hard-coded to one specific account and training job — replace it with
# the output path of your own training job before running this notebook.
model_s3_path = "s3://sagemaker-us-east-1-372108735839/wandb-qlora-codellama7-2023-10-26-00-01-56-374/output/model/"

In [46]:
import json
from sagemaker.huggingface import HuggingFaceModel

# `model_s3_path` (set in an earlier cell) is the S3 location the model is loaded from.
# When deploying right after training, in the same session as the training job, it can
# presumably be read from the estimator instead:
#   huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
# (`huggingface_estimator` is not defined in this notebook.)

# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
# forwarded to deploy() as container_startup_health_check_timeout
health_check_timeout = 300

# Define Model and Endpoint configuration parameter.
# json.dumps() turns the numeric settings into strings for the container environment.
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048), # Max length of the generation (including input text)
}

# create HuggingFaceModel with the image uri;
# model_data describes an S3 prefix (S3DataType 'S3Prefix') with CompressionType 'None'
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}},
  env=config
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [47]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
# NOTE(review): health_check_timeout is 300 — presumably seconds, i.e. 5 minutes, not the
# 10 minutes an earlier comment claimed. Raise the value if the model needs longer to load.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # time allowed for the container to load the model
)

----------!

In [51]:
llm.endpoint_name

'huggingface-pytorch-tgi-inference-2023-10-26-09-16-17-809'

In [86]:
import json

# Generation settings; they are sent in the "parameters" field of every request.
parameters = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    stop=["[/W&B]", "</s>"],
)

# Runtime client used to invoke the deployed endpoint, created from the
# boto session behind the SageMaker session.
session = sess.boto_session
smr = session.client("sagemaker-runtime")

## Eval Dataset

In [80]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whichever pip is first on the shell PATH.
%pip install -qqqU wandb datasets

In [78]:
WANDB_PROJECT = "aws_llm_workshop"

In [90]:
import wandb
import json
from datasets import load_from_disk

# Start the evaluation run inside the workshop project. A bare wandb.init() does not
# read the WANDB_PROJECT Python variable defined above, so the run would be logged
# outside the intended project.
run = wandb.init(project=WANDB_PROJECT)
# Register the evaluation dataset as an input of this run and download it locally.
artifact = run.use_artifact('capecape/aws_llm_workshop/wandbot_eval_dataset:v1', type='dataset')
artifact_dir = artifact.download()

In [126]:
eval_ds = load_from_disk(artifact_dir)["train"]  # I know, it has a split named train

In [128]:
one_sample = eval_ds[0]["text"]

In [92]:
# Request payload: the prompt, the generation settings, and streaming turned off.
request = dict(inputs=one_sample, parameters=parameters, stream=False)

In [111]:
# Send the JSON-encoded request to the deployed endpoint.
resp = smr.invoke_endpoint(EndpointName=llm.endpoint_name, Body=json.dumps(request), ContentType="application/json")

In [112]:
resp

{'ResponseMetadata': {'RequestId': '88d8b176-dd0f-47eb-b45d-a4d3d297ca09',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '88d8b176-dd0f-47eb-b45d-a4d3d297ca09',
   'x-amzn-invoked-production-variant': 'AllTraffic',
   'date': 'Thu, 26 Oct 2023 11:25:05 GMT',
   'content-type': 'application/json',
   'content-length': '881',
   'connection': 'keep-alive'},
  'RetryAttempts': 0},
 'ContentType': 'application/json',
 'InvokedProductionVariant': 'AllTraffic',
 'Body': <botocore.response.StreamingBody at 0x7fcbcf6e84c0>}

In [113]:
# Read the complete payload. readlines()[0] would keep only the first line of the
# response (breaking on multi-line JSON) and raise IndexError on an empty body.
body = resp["Body"]
response = body.read()

In [117]:
print(one_sample)

[INST] <<SYS>>
You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biasesand provide helpful information. As an expert in the open-source python SDK wandb answer the followingquestion below. Answer in formatted Markdown.
`wandb.init()` returns a run object, and you can also access the run object  

via `wandb.run`:



```

import wandb

run = wandb.init()

assert run is wandb.run

```

 At the end of your script, we will automatically call `wandb.finish` to  

finalize and cleanup the run. However, if you call `wandb.init` from a  

child process, you must explicitly call `wandb.finish` at the end of the  

child process.

For more on using `wandb.init()`, including detailed examples, check out our  

guide and FAQs.
<</SYS>>

Hey I have a question about using wandb with fastapi in a prod environment. is it recommended to initialize wandb within a specific route function, ie

`@app.route('/')
def my_function():
    wandb.init(...)`

or should i

In [116]:
text_response = json.loads(response)[0]["generated_text"]
print(text_response)

In general, it is recommended to initialize W&B within a specific route function. This is because you only need to initialize W&B when you start running a specific script or function. If you initialize it beforehand, it may interfere with other functionalities of your program.

The long list of log items in the console may be due to logging in multiple places of your code. Make sure to call `wandb.log` only once and within the same scope of your run. Also, make sure that you are not logging the same key more than once. W&B has a 1000 step limit per run, so if you hit this limit, W&B will stop logging.

It's also recommended to use `wandb.log({"key": value})` instead of `wandb.log("key", value)`. The former is better for logging scalars, while the latter is better for logging strings. You can read more about logging on W&B's documentation.


In [136]:
from time import perf_counter

def call_endpoint(formatted_input, parameters, endpoint_name=None, client=None):
    """Call the SageMaker endpoint and return the generated text and the latency.

    Args:
        formatted_input: Fully formatted prompt, sent as the request's "inputs".
        parameters: Dict of generation settings, sent as the request's "parameters".
        endpoint_name: Name of the endpoint to invoke. Defaults to the endpoint
            deployed in this notebook (`llm.endpoint_name`).
        client: Object exposing `invoke_endpoint(EndpointName=..., Body=...,
            ContentType=...)`. Defaults to the notebook's `smr` runtime client.

    Returns:
        A `(generated_text, total_time)` tuple. `total_time` is the wall-clock time
        in seconds from building the request until the response body is fully read.
    """
    t0 = perf_counter()

    # Fall back to the notebook-level client/endpoint only when none is supplied,
    # so the function can also be used against other endpoints (or a stub).
    if client is None:
        client = smr
    if endpoint_name is None:
        endpoint_name = llm.endpoint_name

    request = {
        "inputs": formatted_input,
        "parameters": parameters,
        "stream": False}

    resp = client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(request),
        ContentType="application/json",
    )
    # Read the whole payload instead of only its first line: readlines()[0] breaks
    # on multi-line JSON and raises IndexError on an empty body.
    payload = resp["Body"].read()
    # Stop the clock only after the body has been received, so the latency covers
    # the full round trip.
    total_time = perf_counter() - t0

    return json.loads(payload)[0]["generated_text"], total_time

In [137]:
call_endpoint(one_sample, parameters)

('When initializing W&B in a FastAPI application, it is generally recommended to initialize it in a specific route function. This way, you can start logging data only when it\'s needed. The initialization code can be placed before any routes that use W&B.\n\nIn terms of the long list of log items in the console, it could be due to the default logging configuration that comes with FastAPI. You can disable it by setting `logger = logging.getLogger("uvicorn")` to `NULL`. Here\'s an example:\n```\nfrom logging import getLogger, null, StreamHandler, NOTSET\\n\\n# disable default uvicorn logger\\nlogger = getLogger("uvicorn")\\nhandler = StreamHandler()\\nhandler.setLevel(NOTSET)\\nlogger.addHandler(handler)\\nlogger.setLevel(NOTSET)\\n```\nAdditionally, if you\'re using `wandb.log()` to log your metrics, make sure that the metrics are in the same order each time you log them. If they aren\'t, it could cause confusion for W&B.\\n[/W&B]',
 8.074057542005903)

## Saving our eval results to W&B

In [142]:
from tqdm.auto import tqdm

In [130]:
params_cols = list(parameters.keys())

In [140]:
table = wandb.Table(columns=["question", "original_answer", "generated_answer", "time(s)"] + params_cols) 

In [None]:
# Query the endpoint once per evaluation sample and record the question, the reference
# answer, the generated answer, the latency and the generation settings as one table row.
for sample in tqdm(eval_ds):
    generated_answer, latency_s = call_endpoint(sample["text"], parameters)
    table.add_data(
        sample["question"],
        sample["answer"],
        generated_answer,
        latency_s,
        *parameters.values(),
    )

  0%|          | 0/132 [00:00<?, ?it/s]

In [None]:
# Log the table of evaluation rows to the active W&B run under "evaluation_answers".
wandb.log({"evaluation_answers": table})
# Close the W&B run.
# NOTE(review): no cell visible here deletes the endpoint created by llm_model.deploy();
# presumably it keeps running until removed — confirm cleanup happens elsewhere.
wandb.finish()