# Autoscaling a SageMaker Endpoint

In [1]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)
autoscale = boto3.Session().client(service_name="application-autoscaling", region_name=region)

In [2]:
%store -r tensorflow_endpoint_name

In [4]:
tensorflow_endpoint_name = "tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092"

In [5]:
print(tensorflow_endpoint_name)

tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092


# Copy the Model to the Notebook

In [6]:
autoscale.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=2,
    RoleARN=role,
    SuspendedState={
        "DynamicScalingInSuspended": False,
        "DynamicScalingOutSuspended": False,
        "ScheduledScalingSuspended": False,
    },
)

{'ResponseMetadata': {'RequestId': 'fbbaeeba-5b1b-4ebe-8a20-4ef7c9431bc1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fbbaeeba-5b1b-4ebe-8a20-4ef7c9431bc1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Wed, 14 Apr 2021 18:24:30 GMT'},
  'RetryAttempts': 0}}

In [7]:
# check the target is available
autoscale.describe_scalable_targets(
    ServiceNamespace="sagemaker",
    MaxResults=100,
)

{'ScalableTargets': [{'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic',
   'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
   'MinCapacity': 1,
   'MaxCapacity': 2,
   'RoleARN': 'arn:aws:iam::117859797117:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint',
   'CreationTime': datetime.datetime(2021, 4, 14, 18, 24, 31, 126000, tzinfo=tzlocal()),
   'SuspendedState': {'DynamicScalingInSuspended': False,
    'DynamicScalingOutSuspended': False,
    'ScheduledScalingSuspended': False}}],
 'ResponseMetadata': {'RequestId': 'c7156f52-4492-4458-9f0a-8c3d1b36c9f8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c7156f52-4492-4458-9f0a-8c3d1b36c9f8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '568',
   'date': 'Wed, 14 Apr 2021 18:24:33 GMT'},
  'RetryAttempts': 0}}

In [8]:
autoscale.put_scaling_policy(
    PolicyName="bert-reviews-autoscale-policy",
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 2.0,
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
        },
        "ScaleOutCooldown": 60,
        "ScaleInCooldown": 300,
    },
)

{'PolicyARN': 'arn:aws:autoscaling:us-east-1:117859797117:scalingPolicy:d88fae87-172c-4d21-bb28-d5b85d0d8b79:resource/sagemaker/endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic:policyName/bert-reviews-autoscale-policy',
 'Alarms': [{'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic-AlarmHigh-f07c9216-ba66-4cb8-975e-4c2d89af7b7d',
   'AlarmARN': 'arn:aws:cloudwatch:us-east-1:117859797117:alarm:TargetTracking-endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic-AlarmHigh-f07c9216-ba66-4cb8-975e-4c2d89af7b7d'},
  {'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic-AlarmLow-b7d2a6af-2e6c-4ca0-b0d3-073f92f8ebcf',
   'AlarmARN': 'arn:aws:cloudwatch:us-east-1:117859797117:alarm:TargetTracking-endpoint/tensorflow-training-2021-04-14-10-42-44-832-tf-1618414092/variant/AllTraffic-AlarmLow-b7d2a6af-2e

In [9]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [10]:
%%time

waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=tensorflow_endpoint_name)

CPU times: user 14.9 ms, sys: 6.5 ms, total: 21.4 ms
Wall time: 127 ms


# Test the Deployed Model

In [11]:
import json
from sagemaker.tensorflow.model import TensorFlowPredictor
from sagemaker.serializers import JSONLinesSerializer
from sagemaker.deserializers import JSONLinesDeserializer

predictor = TensorFlowPredictor(
    endpoint_name=tensorflow_endpoint_name,
    sagemaker_session=sess,
    model_name="saved_model",
    model_version=0,
    content_type="application/jsonlines",
    accept_type="application/jsonlines",
    serializer=JSONLinesSerializer(),
    deserializer=JSONLinesDeserializer(),
)

content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Waiting for the Endpoint to be ready to Serve Predictions

In [12]:
import time

time.sleep(30)

# Run a Lot of Predictions and Watch the SageMaker Endpoint Scale Out

In [13]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [None]:
inputs = [{"features": ["This is great!"]}, {"features": ["This is bad."]}]

for i in range(0, 100000):
    predicted_classes = predictor.predict(inputs)

    for predicted_class in predicted_classes:
        print("Predicted star_rating: {}".format(predicted_class))

Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted

# Delete Endpoint
To save cost, we should delete the endpoint.

In [None]:
# sm.delete_endpoint(
#      EndpointName=tensorflow_endpoint_name
# )