# Autoscaling a SageMaker Endpoint

In [4]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)
autoscale = boto3.Session().client(service_name="application-autoscaling", region_name=region)

In [5]:
%store -r tensorflow_endpoint_name

In [6]:
try:
    tensorflow_endpoint_name
    print("[OK]")
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the previous notebook before you continue.")
    print("+++++++++++++++++++++++++++++++")

[OK]


In [7]:
print(tensorflow_endpoint_name)

tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787


# Copy the Model to the Notebook

In [8]:
autoscale.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=2,
    RoleARN=role,
    SuspendedState={
        "DynamicScalingInSuspended": False,
        "DynamicScalingOutSuspended": False,
        "ScheduledScalingSuspended": False,
    },
)

{'ResponseMetadata': {'RequestId': '6f4b857b-3c94-4304-a90f-fc799ab65f42',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6f4b857b-3c94-4304-a90f-fc799ab65f42',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Tue, 06 Apr 2021 10:38:33 GMT'},
  'RetryAttempts': 0}}

In [9]:
# check the target is available
autoscale.describe_scalable_targets(
    ServiceNamespace="sagemaker",
    MaxResults=100,
)

{'ScalableTargets': [{'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic',
   'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
   'MinCapacity': 1,
   'MaxCapacity': 2,
   'RoleARN': 'arn:aws:iam::117859797117:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint',
   'CreationTime': datetime.datetime(2021, 4, 6, 10, 38, 33, 650000, tzinfo=tzlocal()),
   'SuspendedState': {'DynamicScalingInSuspended': False,
    'DynamicScalingOutSuspended': False,
    'ScheduledScalingSuspended': False}}],
 'ResponseMetadata': {'RequestId': 'd8c6669c-c93b-4ebb-8f97-9dd1959b62f2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd8c6669c-c93b-4ebb-8f97-9dd1959b62f2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '567',
   'date': 'Tue, 06 Apr 2021 10:38:43 GMT'},
  'RetryAttempts': 0}}

In [10]:
autoscale.put_scaling_policy(
    PolicyName="bert-reviews-autoscale-policy",
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + tensorflow_endpoint_name + "/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 2.0,
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
        },
        "ScaleOutCooldown": 60,
        "ScaleInCooldown": 300,
    },
)

{'PolicyARN': 'arn:aws:autoscaling:us-east-1:117859797117:scalingPolicy:9cd7ef02-709f-4a1f-b80f-de36ad4a1518:resource/sagemaker/endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic:policyName/bert-reviews-autoscale-policy',
 'Alarms': [{'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic-AlarmHigh-96a23cf1-6926-4773-a4aa-fc44f96de03f',
   'AlarmARN': 'arn:aws:cloudwatch:us-east-1:117859797117:alarm:TargetTracking-endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic-AlarmHigh-96a23cf1-6926-4773-a4aa-fc44f96de03f'},
  {'AlarmName': 'TargetTracking-endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic-AlarmLow-dc470410-1e2c-4c30-bd54-e6c24e289f27',
   'AlarmARN': 'arn:aws:cloudwatch:us-east-1:117859797117:alarm:TargetTracking-endpoint/tensorflow-training-2021-04-05-11-23-57-968-tf-1617701787/variant/AllTraffic-AlarmLow-dc470410-1e

In [11]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [12]:
%%time

waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=tensorflow_endpoint_name)

CPU times: user 6.72 ms, sys: 16.2 ms, total: 23 ms
Wall time: 137 ms


# Test the Deployed Model

In [13]:
import json
from sagemaker.tensorflow.model import TensorFlowPredictor
from sagemaker.serializers import JSONLinesSerializer
from sagemaker.deserializers import JSONLinesDeserializer

predictor = TensorFlowPredictor(
    endpoint_name=tensorflow_endpoint_name,
    sagemaker_session=sess,
    model_name="saved_model",
    model_version=0,
    content_type="application/jsonlines",
    accept_type="application/jsonlines",
    serializer=JSONLinesSerializer(),
    deserializer=JSONLinesDeserializer(),
)

content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Waiting for the Endpoint to be ready to Serve Predictions

In [14]:
import time

time.sleep(30)

# Run a Lot of Predictions and Watch the SageMaker Endpoint Scale Out

In [15]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(
            region, tensorflow_endpoint_name
        )
    )
)

In [None]:
inputs = [{"features": ["This is great!"]}, {"features": ["This is bad."]}]

for i in range(0, 100000):
    predicted_classes = predictor.predict(inputs)

    for predicted_class in predicted_classes:
        print("Predicted star_rating: {}".format(predicted_class))

Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted_label': 1}
Predicted star_rating: {'predicted_label': 5}
Predicted star_rating: {'predicted

# Delete Endpoint
To save cost, we should delete the endpoint.

In [None]:
# sm.delete_endpoint(
#      EndpointName=tensorflow_endpoint_name
# )

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}