## Stability AI's FreeWilly2

- model hub : https://huggingface.co/stabilityai/FreeWilly2


### Prepare the model

In [2]:
!pip install -q transformers accelerate sentencepiece bitsandbytes

In [3]:
import sagemaker
import transformers
print(sagemaker.__version__)
print(transformers.__version__)

2.167.0
4.31.0


In [4]:
from huggingface_hub import snapshot_download
from pathlib import Path
import os

local_model_path = Path("./pretrained-models")
local_model_path.mkdir(exist_ok=True)
model_name = "stabilityai/FreeWilly2"
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model", "*.py"]

model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_model_path,
    allow_patterns=allow_patterns,
)

Fetching 37 files:   0%|          | 0/37 [00:00<?, ?it/s]

Downloading (…)l-00004-of-00029.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)l-00001-of-00029.bin:   0%|          | 0.00/9.44G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)0870d317/config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading (…)7/llama2/LICENSE.txt:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading (…)l-00006-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00008-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00009-of-00029.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00010-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00011-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00016-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00012-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00013-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00015-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00014-of-00029.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00017-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00018-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00019-of-00029.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00020-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00021-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00022-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00023-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00024-of-00029.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00027-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00029-of-00029.bin:   0%|          | 0.00/7.56G [00:00<?, ?B/s]

Downloading (…)l-00026-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading (…)l-00028-of-00029.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/66.7k [00:00<?, ?B/s]

Downloading (…)0d317/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)l-00025-of-00029.bin:   0%|          | 0.00/9.33G [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [5]:
print(f"Local model download path: {model_download_path}")

Local model download path: pretrained-models/models--stabilityai--FreeWilly2/snapshots/aceb6bfa684ccb3fb47c885aab0da3a60870d317


In [6]:
s3_model_prefix = "llm/stabilityai/model"  # folder where model checkpoint will go

In [7]:
base_model_s3 = f"{s3_model_prefix}/freewilly2-70b"

In [8]:
sagemaker_session = sagemaker.Session()
s3_model_artifact = sagemaker_session.upload_data(path=model_download_path, key_prefix=base_model_s3)

In [11]:
print(f"Model s3 uri : {s3_model_artifact}")

Model s3 uri : s3://sagemaker-us-west-2-723597067299/llm/stabilityai/model/freewilly2-70b


### Deployment with DJL

### Model deployment using HF LLM container to SageMaker endpoint


- HF LLM currently not works for FreeWilly2 (It fails)

In [44]:
import boto3
import json
import sagemaker
from sagemaker.utils import name_from_base
from sagemaker import image_uris

In [45]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
sm_client = sagemaker_session.sagemaker_client
sm_runtime_client = sagemaker_session.sagemaker_runtime_client
s3_client = boto3.client('s3')

In [46]:
inference_image_uri = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04"

In [47]:
# sagemaker config
instance_type = "ml.g5.48xlarge"
number_of_gpu = 8

# TGI config
config = {
  'HF_MODEL_ID': "stabilityai/FreeWilly2", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  'MAX_TOTEL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

In [48]:
model_name = name_from_base(f"freewilly2-70b-hf-llm")
print(model_name)

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "Environment" : config
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

freewilly2-70b-hf-llm-2023-07-24-06-07-58-206
Created Model: arn:aws:sagemaker:us-west-2:723597067299:model/freewilly2-70b-hf-llm-2023-07-24-06-07-58-206


In [49]:
default_bucket = sagemaker_session.default_bucket()
async_output_uri = f"s3://{default_bucket}/llm/outputs/{model_name}/"
print(async_output_uri)

s3://sagemaker-us-west-2-723597067299/llm/outputs/freewilly2-70b-hf-llm-2023-07-24-06-07-58-206/


In [50]:
endpoint_config_name = f"{model_name}-async-config"
endpoint_name = f"{model_name}-async-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": instance_type,
            "InitialInstanceCount": 1,
            "ContainerStartupHealthCheckTimeoutInSeconds": 1800,
        },
    ],
    AsyncInferenceConfig={
        "OutputConfig": {
            "S3OutputPath": async_output_uri,
        },
        "ClientConfig": {
            "MaxConcurrentInvocationsPerInstance": 1
        }
    }
)
print(endpoint_config_response)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:723597067299:endpoint-config/freewilly2-70b-hf-llm-2023-07-24-06-07-58-206-async-config', 'ResponseMetadata': {'RequestId': '65700614-6d09-4b99-b811-d8754505082d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '65700614-6d09-4b99-b811-d8754505082d', 'content-type': 'application/x-amz-json-1.1', 'content-length': '139', 'date': 'Mon, 24 Jul 2023 06:07:59 GMT'}, 'RetryAttempts': 0}}


In [51]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}",
    EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-west-2:723597067299:endpoint/freewilly2-70b-hf-llm-2023-07-24-06-07-58-206-async-endpoint


In [None]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Failed
Arn: arn:aws:sagemaker:us-west-2:723597067299:endpoint/freewilly2-70b-hf-llm-2023-07-24-06-07-58-206-async-endpoint
Status: Failed


In [53]:
# user_utter = "How can I learn spear fishing in korea?"
user_utter = "Could you recommend the best route to travel korea at winter with my two kids?"

In [54]:
# define payload
prompt = f"""### System:
You are Free Willy, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.

### User: {user_utter}

### Assistant:
"""

# hyperparameters for llm
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.95,
    "top_k": 0,
    "temperature": 0.8,
    "max_new_tokens": 1024
  }
}

print(payload)


{'inputs': "### System:\nYou are Free Willy, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n### User: Could you recommend the best route to travel korea at winter with my two kids?\n\n### Assistant:\n", 'parameters': {'do_sample': True, 'top_p': 0.95, 'temperature': 0.8, 'max_new_tokens': 1024}}


In [55]:
import uuid

# Upload input data onto the S3
s3_uri = f"llm/inputs/{model_name}/{uuid.uuid4()}.json"
s3_client.put_object(
    Bucket=default_bucket,
    Key=s3_uri,
    Body=json.dumps(payload))

input_data_uri = f"s3://{default_bucket}/{s3_uri}"
input_location = input_data_uri

In [56]:
response = sm_runtime_client.invoke_endpoint_async(
    EndpointName=endpoint_name, 
    InputLocation=input_location,
    ContentType="application/json"
)
output_location = response["OutputLocation"]
print(output_location)
output_key_uri = "/".join(output_location.split("/")[3:])

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpointAsync operation: Endpoint freewilly2-70b-hf-llm-2023-07-24-06-07-58-206-async-endpoint of account 723597067299 not found.

In [None]:
try:
    exists = s3_client.head_object(Bucket=default_bucket, Key=output_key_uri)['ResponseMetadata']['HTTPStatusCode'] == 200
    if exists:
        text_obj = s3_client.get_object(Bucket=default_bucket, Key=output_key_uri)['Body'].read()
        text = text_obj.decode('utf-8')
        raw_output = json.loads(text)[0]["generated_text"]
        output = raw_output[len(prompt):]
        print(output)
except:
    print("Data is not exist yet. Wait until inference finished or check the CW log")