# Deploy RWKV Hugging Face models to Amazon SageMaker using LMI and DeepSpeed

References:

https://huggingface.co/RWKV

https://sagemaker.readthedocs.io/en/stable/frameworks/djl/using_djl.html

https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-dlc.html



## Settings

In [1]:
#upgrade sdk library
!pip install -qU sagemaker
!pip install -qU boto3
!pip install -qU botocore

In [2]:
# sagemaker environment setting
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface
from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor

# Bootstrap session, needed only to look up the default bucket below.
sagemaker_session = sagemaker.Session()

# Bucket used for uploading data, models and logs. Leave as None to fall back to
# the session's default bucket (SageMaker creates it automatically if missing).
sagemaker_session_bucket = None
if sagemaker_session is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the execution role ARN used for the deployment.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up by name through IAM instead.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Final session bound to the chosen bucket; bucket and region are read back from it.
sagemaker_session = sagemaker.Session(default_bucket=sagemaker_session_bucket)
bucket, region = sagemaker_session.default_bucket(), sagemaker_session.boto_region_name

# Echo the resolved settings so they are visible in the notebook output.
for label, value in (("role arn", role), ("bucket", bucket), ("session region", region)):
    print(f"sagemaker {label}: {value}")

sagemaker role arn: arn:aws:iam::432088571089:role/AmazonSageMaker-ExecutionRole-20210324T123126
sagemaker bucket: sagemaker-us-east-1-432088571089
sagemaker session region: us-east-1


In [3]:
# Local directory that receives requirements.txt and is later passed to the
# model constructor as `source_dir`.
source_dir = 'source_dir'
# exist_ok=True makes the cell idempotent without a separate check-then-create step.
os.makedirs(source_dir, exist_ok=True)
#entry_point = 'entry_point.py'

### NOTE: Transformers has supported RWKV since v4.29.0, so the built-in Transformers (4.26.0) needs to be upgraded.

In [4]:
%%writefile $source_dir/requirements.txt
# transformers is pinned to a release newer than 4.29.0, the first version with RWKV support.
transformers==4.30.2
boto3
sagemaker
sentencepiece
nvgpu==0.9.0
pynvml==11.4.1

Overwriting source_dir/requirements.txt


## Download model files from the Hugging Face Hub, then upload them to an S3 bucket

In [5]:
# Fetch the s5cmd v2.0.0 Linux 64-bit release tarball (following redirects) and
# unpack it into the current working directory.
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz
# Disabled: would move the s5cmd binary into source_dir.
#!mv s5cmd ./$source_dir/s5cmd

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4176k  100 4176k    0     0  21.5M      0 --:--:-- --:--:-- --:--:-- 21.5M


In [6]:
# Hugging Face Hub repository to download. Alternatives noted by the author:
# "RWKV/rwkv-raven-14b", "RWKV/rwkv-4-169m-pile", "RWKV/rwkv-raven-7b"
model_id = "RWKV/rwkv-raven-3b"

In [7]:
!pip install -qU huggingface_hub

In [8]:
from huggingface_hub import snapshot_download
from pathlib import Path

local_model_path = Path("./model")
local_model_path.mkdir(exist_ok=True)
# Only download pytorch checkpoint files
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model"]
# - Leverage the snapshot library to download the model since the model is stored in repository using LFS
model_download_path = snapshot_download(
    repo_id=model_id,
    cache_dir=local_model_path,
    allow_patterns=allow_patterns,
)

# define a variable to contain the s3url of the location that has the model
#stabilityai--stable-diffusion-2-1
pretrained_model_location = f"s3://internal-modelzoo-us-east-1/RWKV/{model_id.split('/')[1]}/"

#model_artifact = sess.upload_data(path=model_download_path, key_prefix=s3_model_prefix)
!chmod +x ./s5cmd
!./s5cmd sync {model_download_path}/ {pretrained_model_location}

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)49458f67/config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)58f67/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00007.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00007.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Downloading (…)l-00001-of-00007.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

Downloading (…)l-00006-of-00007.bin:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00007.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00007.bin:   0%|          | 0.00/515M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00007.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/generation_config.json s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/generation_config.json
cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/config.json s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/config.json
cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/special_tokens_map.json s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/special_tokens_map.json
cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/tokenizer.json s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/tokenizer.json
cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/tokenizer_config.json s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/tokenizer_config.json
cp model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67/pytorch_model.bin.in

In [9]:
# Show where the snapshot was downloaded to and where it was uploaded.
print(model_download_path)
print(pretrained_model_location)
# Remove the local download directory. rmtree with ignore_errors=True matches
# `rm -fr` (no error if the directory is already gone) without spawning a shell
# or interpolating the path into a command line.
shutil.rmtree(local_model_path, ignore_errors=True)

model/models--RWKV--rwkv-raven-3b/snapshots/1ddeea6a7313c8ba8824645d7aa88d5449458f67
s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/


## LMI: create a model using the DeepSpeed backend, then run inference

In [11]:
# LMI + Create a model using the DeepSpeed backend.
# Deploy from the S3 prefix the model files were synced to, rather than repeating
# the URI as a literal that silently goes stale when a different model is selected.
model_id = pretrained_model_location
print(f"model_id:{model_id}")

deepspeed_model = DeepSpeedModel(
    model_id,  # S3 URI here; this can also be a HuggingFace Hub model id
    role,
    dtype="fp16",
    task="text-generation",
    tensor_parallel_degree=1,  # number of gpus to partition the model across using tensor parallelism
    #entry_point = entry_point,
    source_dir=source_dir,  # ships requirements.txt alongside the model
)

# Deploy the model to an Amazon SageMaker Endpoint and get a Predictor.
print("Deploying..., please wait for 3-10 minutes!")
deepspeed_predictor = deepspeed_model.deploy(
    "ml.g5.2xlarge",
    initial_instance_count=1,
    model_data_download_timeout=10*60,  # seconds
    container_startup_health_check_timeout=15*60,  # seconds
)

# Keep the generated endpoint name so later cells can refer to the endpoint.
endpoint_name = deepspeed_predictor.endpoint_name
print("")
print(f"endpoint_name:{endpoint_name}")

model_id:s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/
Deploying..., please wait for 3-10 minutes!
-------------!
endpoint_name:djl-inference-2023-08-10-06-10-14-302


In [12]:
# Send one text-generation request to the DeepSpeed endpoint and print the response.
request_payload = {
    "inputs": "American election is",
    "parameters": {"max_length": 50},
}
generation = deepspeed_predictor.predict(request_payload)
print(generation)

[{'generated_text': 'American election is a very important event. It is a chance for the people of the United States to have a say in who will be the next president. It is a chance for the people of the United States to have a say in who will be'}]


In [16]:

# Disabled alternative: the whole cell is commented out. Uncommenting it would
# build the same S3-hosted model with HuggingFaceAccelerateModel instead of
# DeepSpeedModel, deploy it to its own endpoint and send one test request.
# # LMI + Create a model using the HuggingFace Accelerate backend
# model_id = "s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/"
# print(f"model_id:{model_id}")

# hf_accelerate_model = HuggingFaceAccelerateModel(
#     model_id, # This can also be a HuggingFace Hub model id
#     role,
#     dtype="fp16",
#     task="text-generation",
#     number_of_partitions=1, # number of gpus to partition the model across
#     #entry_point = entry_point,
#     source_dir = source_dir
# )
# # Deploy the model to an Amazon SageMaker Endpoint and get a Predictor
# print(f"Deploying..., please wait for 3-10 minutes!")

# hf_accelerate_predictor = hf_accelerate_model.deploy("ml.g5.2xlarge",
#                                                      initial_instance_count=1,
#                                                      model_data_download_timeout=10*60,
#                                                      container_startup_health_check_timeout=15*60)
# #predict
# print(hf_accelerate_predictor.predict(
#     { 
#         "inputs" : "Large model inference is", 
#         "parameters": { "max_length": 50 },
#     }
# ))

model_id:s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/
Deploying..., please wait for 3-10 minutes!
------------![{'generated_text': 'Large model inference is a challenging task due to the high dimensionality of the model and the large number of parameters. In this paper, we propose a novel approach to model inference that leverages the power of deep learning. Our approach is based on the'}]


## ONLY for re-invoking an already-created endpoint

In [14]:
# Only for re-invoking an already-created endpoint: attach a predictor to it by
# name instead of deploying again.
from sagemaker.djl_inference.model import DJLPredictor
from sagemaker import Model, image_uris, serializers, deserializers

# Name of the existing endpoint to call; replace with your own endpoint name.
# Note: this overrides any endpoint_name set by the deployment cell.
endpoint_name = "djl-inference-2023-08-10-06-10-14-302"

# JSON in / JSON out, using the notebook's existing SageMaker session.
predictor = DJLPredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

# Send one text-generation request and print the response.
print(predictor.predict(
    {
        "inputs": "Today is sunny,",
        "parameters": {"max_length": 50},
    }
))


[{'generated_text': 'Today is sunny, but the wind is blowing hard.\n\nBob: Can you tell me more about the weather forecast for tomorrow?\n\nAlice: Sure, I can provide you with the weather forecast for tomorrow. The weather is expected to'}]


## Clean up

In [None]:
# Tear-down calls, all commented out: fill in the names and uncomment to delete
# the endpoint, its endpoint config and the model through the SageMaker session.
# NOTE(review): delete_model is passed endpoint_name although a separate
# model_name placeholder is declared and never used — confirm which name the
# model was registered under before running this.
#endpoint_name = ""
#model_name = ""
#sagemaker_session.delete_endpoint(endpoint_name)
#sagemaker_session.delete_endpoint_config(endpoint_name)
#sagemaker_session.delete_model(endpoint_name)