# A/B testing, traffic shifting and autoscaling

In [None]:
# Install pinned versions of the SageMaker SDK, PyTorch and transformers quietly (-q).
!pip install --disable-pip-version-check -q sagemaker==2.35.0
!conda install -q -y pytorch==1.6.0 -c pytorch
!pip install --disable-pip-version-check -q transformers==3.5.1

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session plus the default bucket, execution role and region
# that the rest of the notebook refers to.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level boto3 clients, each created from its own boto3 session in the
# same order as before: SageMaker, CloudWatch, Application Auto Scaling.
sm, cw, autoscale = (
    boto3.Session().client(service_name=service, region_name=region)
    for service in ("sagemaker", "cloudwatch", "application-autoscaling")
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Canary Rollouts and A/B Testing

Canary rollouts are used to release new models safely to only a small subset of users, such as 5%. They are useful if you want to test in live production without affecting the entire user base. Because the majority of traffic goes to the existing model, the cluster for the canary model can be relatively small: it only receives about 5% of the traffic.

Instead of `deploy()`, we can create an `Endpoint Configuration` with multiple variants for canary rollouts and A/B testing.

Create a custom `SentimentPredictor` that encapsulates a JSONLines serializer and deserializer.

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONLinesSerializer
from sagemaker.deserializers import JSONLinesDeserializer

class SentimentPredictor(Predictor):
    """Predictor that exchanges JSON Lines payloads with an endpoint.

    Requests are serialized with ``JSONLinesSerializer`` and responses are
    parsed with ``JSONLinesDeserializer``.
    """

    def __init__(self, endpoint_name, sagemaker_session):
        """Bind the predictor to ``endpoint_name`` using ``sagemaker_session``."""
        jsonlines_io = {
            "serializer": JSONLinesSerializer(),
            "deserializer": JSONLinesDeserializer(),
        }
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            **jsonlines_io,
        )

In [None]:
import time
from sagemaker.pytorch.model import PyTorchModel

# Whole-second Unix timestamp; used below as a suffix for the model,
# endpoint-config and endpoint names.
timestamp = int(time.time())

In [None]:
# S3 locations of the packaged model artifacts (model.tar.gz) for each variant.
model_a_s3_uri = 's3://dlai-practical-data-science/models/ab/variant_a/model.tar.gz'
model_b_s3_uri = 's3://dlai-practical-data-science/models/ab/variant_b/model.tar.gz'

In [None]:
# Variant A's model name carries the shared run timestamp as a suffix.
model_name_a = '{}-{}'.format('a', timestamp)

# Settings for the PyTorch model object backing variant A.
model_a_settings = dict(
    name=model_name_a,
    model_data=model_a_s3_uri,
    predictor_cls=SentimentPredictor,
    entry_point='inference.py',
    source_dir='src',
    framework_version='1.6.0',
    py_version='py3',
    role=role,
)
model_a = PyTorchModel(**model_a_settings)

In [None]:
# Variant B's model name carries the shared run timestamp as a suffix.
model_name_b = '{}-{}'.format('b', timestamp)

# Settings for the PyTorch model object backing variant B.
model_b_settings = dict(
    name=model_name_b,
    model_data=model_b_s3_uri,
    predictor_cls=SentimentPredictor,
    entry_point='inference.py',
    source_dir='src',
    framework_version='1.6.0',
    py_version='py3',
    role=role,
)
model_b = PyTorchModel(**model_b_settings)

In [None]:
# Instance type used for the image lookup and for both production variants.
inference_instance_type = 'ml.m5.large'

In [None]:
# Look up the PyTorch 1.6.0 inference container image for this region and
# instance type, then print the resolved URI.
image_lookup = {
    "framework": "pytorch",
    "region": region,
    "version": "1.6.0",
    "py_version": 'py3',
    "instance_type": inference_instance_type,
    "image_scope": "inference",
}
inference_image_uri = sagemaker.image_uris.retrieve(**image_lookup)
print(inference_image_uri)

In [None]:
# Container definition pairing the inference image with variant A's artifact.
container_a = sagemaker.container_def(
    image_uri=inference_image_uri,
    model_data_url=model_a_s3_uri,
)

# Register model A with SageMaker under the timestamped name.
sess.create_model(name=model_name_a, role=role, container_defs=container_a)

In [None]:
# Container definition pairing the inference image with variant B's artifact.
container_b = sagemaker.container_def(
    image_uri=inference_image_uri,
    model_data_url=model_b_s3_uri,
)

# Register model B with SageMaker under the timestamped name.
sess.create_model(name=model_name_b, role=role, container_defs=container_b)

In [None]:
from sagemaker.session import production_variant

# Both variants start with one instance of the same type and an equal
# weight of 50, i.e. an even traffic split.
shared_variant_settings = dict(
    instance_type=inference_instance_type,
    initial_instance_count=1,
    initial_weight=50,
)

variantA = production_variant(
    model_name=model_name_a,
    variant_name='VariantA',
    **shared_variant_settings,
)
print(variantA)

variantB = production_variant(
    model_name=model_name_b,
    variant_name='VariantB',
    **shared_variant_settings,
)
print(variantB)

In [None]:
# Endpoint configuration that lists both variants, named with the run timestamp.
endpoint_config_name = '{}-{}'.format('ab', timestamp)
ab_production_variants = [variantA, variantB]

endpoint_config = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=ab_production_variants,
)

In [None]:
from pprint import pprint

# Create the A/B endpoint from the configuration above and show the API response.
model_ab_endpoint_name = '{}-{}'.format('ab', timestamp)

create_endpoint_request = {
    "EndpointName": model_ab_endpoint_name,
    "EndpointConfigName": endpoint_config_name,
}
endpoint_response = sm.create_endpoint(**create_endpoint_request)

pprint(endpoint_response)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the endpoint configuration created above.
endpoint_config_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpointConfig/{}".format(
    region, endpoint_config_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(
    HTML(
        '<b>Review <a target="_blank" href="{}">REST Endpoint Configuration</a></b>'.format(
            endpoint_config_console_url
        )
    )
)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the A/B endpoint.
endpoint_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}".format(
    region, model_ab_endpoint_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_console_url)))


Wait for the endpoint to deploy.

### _This cell will take approximately 5-10 minutes to run._

In [None]:
%%time

# Block until the 'endpoint_in_service' waiter reports success for the A/B endpoint.
sm.get_waiter('endpoint_in_service').wait(EndpointName=model_ab_endpoint_name)

_Wait until the ^^ Endpoint ^^ is deployed_

# Test Model
Here, we will pass sample strings of text to the endpoint to see the predicted sentiment. We give you one example of each sentiment; feel free to play around and change the strings yourself!

In [None]:
# One request record per sample review; each record wraps the text in a
# single-element "features" list.
inputs = [
    {"features": [review_text]}
    for review_text in (
        "I love this product!",
        "OK, but not great.",
        "This is not the right product.",
    )
]

predictor = SentimentPredictor(
    endpoint_name=model_ab_endpoint_name,
    sagemaker_session=sess,
)

predicted_classes = predictor.predict(inputs)

# Report the label and probability returned for each input.
for predicted_class in predicted_classes:
    label = predicted_class['predicted_label']
    probability = predicted_class['probability']
    print("Predicted class {} with probability {}".format(label, probability))

# Review the REST Endpoint Performance Metrics in a Dataframe

Amazon SageMaker emits metrics such as Latency and Invocations (full list of metrics [here](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html)) for each variant in Amazon CloudWatch. Let’s query CloudWatch for the per-variant metrics to show how invocations are split across variants.

In [None]:
from datetime import datetime, timedelta

import boto3
import pandas as pd


def get_invocation_metrics_for_endpoint_variant(
    endpoint_name, namespace_name, metric_name, variant_name, start_time, end_time
):
    """Fetch one CloudWatch metric for a single endpoint variant as a DataFrame.

    Queries the module-level CloudWatch client ``cw`` for the ``Sum`` statistic
    of ``metric_name`` in ``namespace_name`` over 60-second periods, restricted
    to the given ``EndpointName`` / ``VariantName`` dimensions.

    Args:
        endpoint_name: Value for the ``EndpointName`` dimension.
        namespace_name: CloudWatch namespace to query.
        metric_name: CloudWatch metric name to query.
        variant_name: Value for the ``VariantName`` dimension; also used as
            the name of the resulting data column.
        start_time: Start of the query window.
        end_time: End of the query window.

    Returns:
        A DataFrame indexed by ``Timestamp`` (ascending) whose ``Sum`` column
        is renamed to ``variant_name``, or an empty DataFrame when CloudWatch
        returns no datapoints.
    """
    dimensions = [
        {"Name": "EndpointName", "Value": endpoint_name},
        {"Name": "VariantName", "Value": variant_name},
    ]
    response = cw.get_metric_statistics(
        Namespace=namespace_name,
        MetricName=metric_name,
        StartTime=start_time,
        EndTime=end_time,
        Period=60,
        Statistics=["Sum"],
        Dimensions=dimensions,
    )

    datapoints = response["Datapoints"]
    if not datapoints:
        return pd.DataFrame()

    frame = pd.DataFrame(datapoints)
    frame = frame.sort_values("Timestamp")
    frame = frame.set_index("Timestamp")
    frame = frame.drop("Unit", axis=1)
    return frame.rename(columns={"Sum": variant_name})


def plot_endpoint_metrics_for_variants(endpoint_name, namespace_name, metric_name, start_time=None):
    """Plot a CloudWatch metric for VariantA and VariantB of an endpoint.

    The metric is fetched per variant with
    ``get_invocation_metrics_for_endpoint_variant`` and the per-variant series
    are outer-joined on their timestamps before plotting.

    This is a best-effort helper: it never raises for ordinary errors. It
    prints ``Metrics not yet available`` when CloudWatch has no datapoints for
    either variant, and prints the error if anything else goes wrong.

    Args:
        endpoint_name: Name of the endpoint whose variants are plotted.
        namespace_name: CloudWatch namespace to query.
        metric_name: CloudWatch metric name to query.
        start_time: Start of the query window; defaults to 60 minutes before
            now. The window always ends at the current time.

    Returns:
        None.
    """
    try:
        start_time = start_time or datetime.now() - timedelta(minutes=60)
        end_time = datetime.now()

        # Collect only the variants that actually have datapoints.
        variant_frames = []
        for variant in (variantA, variantB):
            variant_metrics = get_invocation_metrics_for_endpoint_variant(
                # Use the caller's endpoint rather than the notebook-level global.
                endpoint_name=endpoint_name,
                namespace_name=namespace_name,
                metric_name=metric_name,
                variant_name=variant["VariantName"],
                start_time=start_time,
                end_time=end_time,
            )
            if not variant_metrics.empty:
                variant_frames.append(variant_metrics)

        if not variant_frames:
            # Plotting an empty frame would raise; tell the reader what happened instead.
            print("Metrics not yet available")
            return

        metrics_variants = variant_frames[0]
        for variant_metrics in variant_frames[1:]:
            metrics_variants = metrics_variants.join(variant_metrics, how="outer")
        metrics_variants.plot()
    except Exception as error:
        # Keep the notebook running, but do not hide the failure.
        # KeyboardInterrupt is not an Exception subclass, so it still propagates.
        print(
            "Unable to plot {} ({}) for endpoint {}: {!r}".format(
                metric_name, namespace_name, endpoint_name, error
            )
        )

Run some predictions and view the invocation metrics.

_This will take 1-2 minutes._

In [None]:
%%time

# Send the same batch 100 times; only the last response is kept.
for _request_number in range(100):
    predicted_classes = predictor.predict(inputs)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the A/B endpoint.
endpoint_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}".format(
    region, model_ab_endpoint_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_console_url)))


# Show the Metrics for Each Variant
If you see `Metrics not yet available`, please be patient, as metrics may take a few minutes to appear in CloudWatch.

Also, make sure the predictions ran successfully above.

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(20)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="/aws/sagemaker/Endpoints", metric_name="CPUUtilization"
# )

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="Invocations"
# )

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="InvocationsPerInstance"
# )

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="ModelLatency"
# )

# Shift All Traffic to Variant B
_**No downtime** occurs during this traffic-shift activity._

This may take a few minutes.  Please be patient.

In [None]:
# Desired routing weights: VariantA gets 0 and VariantB gets 100.
updated_endpoint_config = [
    {"VariantName": variant["VariantName"], "DesiredWeight": desired_weight}
    for variant, desired_weight in ((variantA, 0), (variantB, 100))
]

In [None]:
# Apply the new weights to the live endpoint.
sm.update_endpoint_weights_and_capacities(
    EndpointName=model_ab_endpoint_name,
    DesiredWeightsAndCapacities=updated_endpoint_config,
)

# Wait for the ^^ Endpoint Update ^^ to Complete Above.

_There is no down-time while the update is applying._ 

This may take a few minutes.  Please be patient.

![](img/autoscale-endpoint-updating.png)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the A/B endpoint.
endpoint_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}".format(
    region, model_ab_endpoint_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_console_url)))


In [None]:
# Block until the "endpoint_in_service" waiter reports success after the weight update.
sm.get_waiter("endpoint_in_service").wait(EndpointName=model_ab_endpoint_name)

Run some predictions and view the invocation metrics.

_This will take 1-2 minutes._

In [None]:
%%time

# Send the same batch 100 times; only the last response is kept.
for _request_number in range(100):
    predicted_classes = predictor.predict(inputs)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the A/B endpoint.
endpoint_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}".format(
    region, model_ab_endpoint_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_console_url)))


In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Pause 20 seconds before querying CloudWatch, then plot CPU utilization per variant.
time.sleep(20)
plot_endpoint_metrics_for_variants(
    endpoint_name=model_ab_endpoint_name,
    namespace_name="/aws/sagemaker/Endpoints",
    metric_name="CPUUtilization",
)

# Show the Metrics for Each Variant
If you see `Metrics not yet available`, please be patient, as metrics may take a few minutes to appear in CloudWatch.

Also, make sure the predictions ran successfully above.

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Pause 20 seconds before querying CloudWatch, then plot CPU utilization per variant.
time.sleep(20)
plot_endpoint_metrics_for_variants(
    endpoint_name=model_ab_endpoint_name,
    namespace_name="/aws/sagemaker/Endpoints",
    metric_name="CPUUtilization",
)

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="Invocations"
# )

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="InvocationsPerInstance"
# )

In [None]:
# import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# time.sleep(5)
# plot_endpoint_metrics_for_variants(
#     endpoint_name=model_ab_endpoint_name, namespace_name="AWS/SageMaker", metric_name="ModelLatency"
# )

# Configure Variant B to Autoscale (but not Variant A since A is no longer taking traffic)

In [None]:
# Resource identifier of VariantB on the A/B endpoint.
variant_b_resource_id = "endpoint/" + model_ab_endpoint_name + "/variant/VariantB"

# Every suspension flag is False.
scaling_suspended_state = {
    flag: False
    for flag in (
        "DynamicScalingInSuspended",
        "DynamicScalingOutSuspended",
        "ScheduledScalingSuspended",
    )
}

# Register VariantB's instance count as scalable between 1 and 2 instances.
autoscale.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=variant_b_resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=2,
    RoleARN=role,
    SuspendedState=scaling_suspended_state,
)

In [None]:
# check the target is available
# List up to 100 scalable targets in the "sagemaker" namespace.
autoscale.describe_scalable_targets(ServiceNamespace="sagemaker", MaxResults=100)

In [None]:
# Target-tracking settings: a target value of 2.0 on the predefined
# invocations-per-instance metric, with a 60-second scale-out cooldown and a
# 300-second scale-in cooldown.
invocations_tracking_config = {
    "TargetValue": 2.0,
    "PredefinedMetricSpecification": {
        "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
    },
    "ScaleOutCooldown": 60,
    "ScaleInCooldown": 300,
}

autoscale.put_scaling_policy(
    PolicyName="bert-reviews-autoscale-policy",
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + model_ab_endpoint_name + "/variant/VariantB",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration=invocations_tracking_config,
)

# Generate Traffic
This may take a few minutes.  Please be patient.

In [None]:
%%time

# Send the same batch 100 times; only the last response is kept.
for _request_number in range(100):
    predicted_classes = predictor.predict(inputs)

Review the autoscaling.

![](img/autoscale-instance-count.png)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Console URL for the A/B endpoint.
endpoint_console_url = "https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}".format(
    region, model_ab_endpoint_name
)

# target="_blank" is the HTML keyword for "open in a new tab"; plain "blank"
# only names a window called "blank".
display(HTML('<b>Review <a target="_blank" href="{}">SageMaker REST Endpoint</a></b>'.format(endpoint_console_url)))
