# Deploying a serverless endpoint

## Creating the model from a previous training run stored in S3

In [4]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role used by SageMaker; it needs permission to create an endpoint.
role = sagemaker.get_execution_role()

# Describe the model to serve: the trained archive in S3, the framework
# versions it was built with, and the custom inference script.
huggingface_model = HuggingFaceModel(
    # Path to the trained SageMaker model archive
    model_data="s3://sagemaker-violentometro/saturdaysAI/beto-gru/model/model.tar.gz",
    # IAM role with permissions to create an endpoint
    role=role,
    # Transformers / PyTorch / Python versions used
    transformers_version="4.17.0",
    pytorch_version="1.10.2",
    py_version="py38",
    # Inference script and the directory that contains it
    entry_point="inference_nlp.py",
    source_dir="code",
)

## Deploying the model (probably not needed)

In [5]:
# Deploy the model to SageMaker Inference and keep the returned predictor
# so the endpoint can be queried from this notebook.
predictor = huggingface_model.deploy(
    initial_instance_count=1,      # number of instances
    instance_type="ml.t2.medium",  # instance type backing the endpoint
)

-----------!

## Testing deployed model

In [7]:
# Sample request payload
data = {"sentence": "hola"}

# Send the request through the predictor; the result is displayed as the
# cell output.
predictor.predict(data)

{'prediction': 0.6604039669036865}

In [8]:
import boto3

# `sm` is used below to create the endpoint configuration and the endpoint;
# `sm_rt` is used to invoke the endpoint.
sm = boto3.client("sagemaker")
sm_rt = boto3.client("sagemaker-runtime")



In [9]:
from time import gmtime, strftime

def name_with_timestamp(name):
    """Return ``name`` followed by ``-`` and the current UTC time.

    The time is rendered as ``YYYY-MM-DD-HH-MM-SS``.
    """
    timestamp = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
    return f'{name}-{timestamp}'

# Timestamped names for the model, the endpoint configuration and the
# endpoint, generated in that order.
(
    huggingface_model_name,
    huggingface_epc_name,
    huggingface_endpoint_name,
) = [
    name_with_timestamp(prefix)
    for prefix in (
        'huggingface-serverless',
        'huggingface-serverless-epc',
        'huggingface-serverless-ep',
    )
]

## Creating serverless endpoint configuration

In [None]:
# Name of the SageMaker model that the serverless variant will serve.
# `huggingface_model.name` holds the name of the model created from
# `huggingface_model` (it is populated when the model is deployed), so a fresh
# run picks up the model it just created instead of one left over from an
# earlier session. If the model object has no name yet, fall back to the
# previously hard-coded model, which must then already exist in the account.
serverless_model_name = (
    huggingface_model.name
    or 'huggingface-pytorch-inference-2022-08-07-21-36-49-967'
)

# Create an endpoint configuration with a single serverless variant.
endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=huggingface_epc_name,
    ProductionVariants=[
        {
            'VariantName': 'single-variant',
            'ModelName': serverless_model_name,
            'ServerlessConfig': {
                'MemorySizeInMB': 2048,  # memory allocated to the serverless endpoint
                'MaxConcurrency': 8,     # maximum concurrent invocations
            },
        },
    ],
)

# Display the ARN of the new endpoint configuration.
endpoint_config_response['EndpointConfigArn']

## Creating serverless endpoint

In [None]:
# Create the endpoint from the endpoint configuration named
# `huggingface_epc_name`.
create_endpoint_response = sm.create_endpoint(
    EndpointConfigName=huggingface_epc_name,
    EndpointName=huggingface_endpoint_name,
)

# Display the ARN of the new endpoint.
create_endpoint_response['EndpointArn']

In [14]:
# Block until the endpoint is reported as in service.
waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=huggingface_endpoint_name)

## Invoking serverless endpoint

In [15]:
import boto3, json

# Runtime client used to invoke the endpoint.
sm_rt = boto3.client("sagemaker-runtime")

In [17]:
# Sample request payload
test_data = {"sentence": "eres un malparido"}

# Invoke the endpoint with the payload serialized as JSON.
response = sm_rt.invoke_endpoint(
    EndpointName=huggingface_endpoint_name,
    ContentType="application/json",
    Body=json.dumps(test_data),
)

# Read the response body and print the raw bytes.
print(response["Body"].read())

b'{"prediction": 0.9597054719924927}'
