In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deploying an auto-scaling model with AI Platform Prediction 

This notebook demonstrates how to deploy a pre-trained model to the AI Platform Prediction service. The notebook will show how to create a new model as well as a new model version. The model version will have auto-scaling settings turned on, so that new nodes will be created and removed as the load changes.

We will use a [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-large/5) model from TensorFlow Hub. This model encodes text into sentence-level embeddings and has been trained on a variety of data sources.

The notebook itself is adapted from the Universal Sentence Encoder [sample notebook](https://colab.sandbox.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb).

The main changes to the sample notebook are:
* Creation of AI Platform Prediction model and model version
* Update to `embed()` function to use AI Platform Prediction for inference, rather than the local model
* Streamlining of some non-essential content

## Constants

In [None]:
# Change these parameters! They are specific to your Google Cloud environment.

PROJECT = 'YOUR-PROJECT-ID'  # Update with your project (used to build the prediction resource name)
BUCKET = 'gs://YOUR-BUCKET-NAME'  # Update with your bucket (passed as --staging-bucket when creating the version)
REGION = 'us-central1'  # Update with your region (used by the gcloud commands and the regional API endpoint)

In [None]:
# These parameters don't need to be changed

# TensorFlow Hub handle of the Universal Sentence Encoder model to download
MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
# Name of the AI Platform Prediction model resource created (and later deleted) by this notebook
MODEL_NAME = 'universal_sentence_encoder'

## Imports

In [None]:
from google.api_core.client_options import ClientOptions
from googleapiclient import discovery

import tensorflow_hub as hub

import datetime
import logging
import numpy as np
import seaborn as sns

## Download TensorFlow Hub Model

In [None]:
# Reduce logging output: only show TensorFlow messages at ERROR level or above
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Download model and return path.
# The variable `model` is interpolated later as the --origin of the model
# version, so it must keep this name.
model = hub.resolve(MODULE_URL)

print(f"model file {model} saved")

## Deploy AI Platform Prediction model and model version

In [None]:
# Create AI Platform Prediction model.
# IPython substitutes {MODEL_NAME} and {REGION} with the Python values before
# the command is handed to the shell; the single quotes keep each value as one
# shell argument.

!gcloud ai-platform models create '{MODEL_NAME}' \
  --region='{REGION}'

In [None]:
# Build the model version name: the letter 'v' followed by the current local
# timestamp laid out as month, day, year, hour, minute, second (MMDDYYYYhhmmss).

now = datetime.datetime.now()
MODEL_VERSION = f"v{now:%m%d%Y%H%M%S}"

By default, the service uses **60% utilization** as the threshold for deciding whether to add or remove nodes. This threshold can be changed by setting metric targets for CPU usage, GPU duty cycle, or both. For this notebook, we will add these parameters to the `config.yaml` file, which will then be passed to the `gcloud` CLI with the `--config` parameter.

Alternatively, you can use [gcloud beta ai-platform versions create](https://cloud.google.com/sdk/gcloud/reference/beta/ai-platform/versions/create#--metric-targets) to specify the parameters directly without `config.yaml`:
```
  --metric-targets cpu-usage=80 \
  --metric-targets gpu-duty-cycle=80 \
  --min-nodes 2 \
  --max-nodes 4
```

In [None]:
# Write scaling parameters to config.yaml
#
# The version will run between 2 and 4 nodes, and the scaling targets for both
# CPU usage and GPU duty cycle are set to 80.

CONFIG = '''autoScaling:
  minNodes: 2
  maxNodes: 4
  metrics:
    - name: CPU_USAGE
      target: 80
    - name: GPU_DUTY_CYCLE
      target: 80'''

# Write the file with Python rather than `!echo '{CONFIG}' > config.yaml`:
# pushing a multi-line string through shell interpolation breaks as soon as the
# text contains a quote character and does not work on non-POSIX shells.
# The trailing newline matches what `echo` would have produced.
with open('config.yaml', 'w') as config_file:
    config_file.write(CONFIG + '\n')

In [None]:
# Create a new model version. This may take several minutes.
#
# --origin points at the local path of the downloaded TF Hub model and
# --staging-bucket at the user's Cloud Storage bucket. The version is served on
# n1-standard-4 machines with one nvidia-tesla-t4 accelerator, and its
# auto-scaling settings are read from the config.yaml written earlier.

!gcloud ai-platform versions create {MODEL_VERSION} \
  --model={MODEL_NAME} \
  --region={REGION} \
  --origin={model} \
  --staging-bucket={BUCKET} \
  --runtime-version=2.2 \
  --framework='TENSORFLOW' \
  --python-version=3.7 \
  --machine-type=n1-standard-4 \
  --accelerator count=1,type=nvidia-tesla-t4 \
  --config=config.yaml

## Use service to make predictions

In [None]:
# Initialize client

endpoint = f'https://{REGION}-ml.googleapis.com'  # Use regional endpoint
client_options = ClientOptions(api_endpoint=endpoint)

# Only `discovery` is imported (`from googleapiclient import discovery`), so the
# bare name `googleapiclient` is undefined here and would raise NameError.
service = discovery.build(
    'ml',
    'v1',
    client_options=client_options,
    cache_discovery=False,
)

In [None]:
# Helper function to invoke the prediction service from
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/ml_engine/online_prediction/predict.py

def predict_json(project, model, instances, version=None):
    """Request online predictions from a deployed AI Platform model.

    Uses the module-level `service` client to call the `predict` method.

    Args:
        project (str): project where the AI Platform Model is deployed.
        model (str): model name.
        instances ([Mapping[str: Any]]): Keys should be the names of Tensors
            your deployed model expects as inputs. Values should be datatypes
            convertible to Tensors, or (potentially nested) lists of datatypes
            convertible to tensors.
        version (str): optional version of the model to target. When omitted,
            the request is addressed to the model resource itself.
    Returns:
        Mapping[str: any]: the value stored under 'predictions' in the
            service response.
    Raises:
        RuntimeError: if the response contains an 'error' entry.
    """
    # Resource path: projects/<project>/models/<model>[/versions/<version>]
    resource_path = f'projects/{project}/models/{model}'
    if version is not None:
        resource_path = f'{resource_path}/versions/{version}'

    request = service.projects().predict(
        name=resource_path,
        body={'instances': instances},
    )
    result = request.execute()

    if 'error' not in result:
        return result['predictions']

    raise RuntimeError(result['error'])

In [None]:
def embed(input):
    """Return the deployed model's predictions (embeddings) for `input`.

    `input` is sent as the prediction instances to the model MODEL_NAME in
    PROJECT. No version is passed, so `predict_json` addresses the model
    resource rather than a specific version.
    """
    return predict_json(project=PROJECT, model=MODEL_NAME, instances=input)

In [None]:
# Helper functions for plotting

def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise inner products of `features`.

    Args:
        labels: tick labels used on both axes of the heatmap.
        features: vectors whose pairwise inner products are plotted.
        rotation: rotation applied to the x-axis tick labels.
    """
    similarity = np.inner(features, features)
    sns.set(font_scale=1.2)

    heatmap_options = {
        'xticklabels': labels,
        'yticklabels': labels,
        'vmin': 0,
        'vmax': 1,
        'cmap': "YlOrRd",
    }
    ax = sns.heatmap(similarity, **heatmap_options)
    ax.set_xticklabels(labels, rotation=rotation)
    ax.set_title("Semantic Textual Similarity")


def run_and_plot(messages_):
    """Embed `messages_` via the prediction service and plot their similarity.

    The x-axis labels are rotated by 90 so the full messages remain readable.
    """
    plot_similarity(messages_, embed(messages_), 90)

In [None]:
# Plot the textual similarity between various messages.
# The sentences are grouped by topic; each one is embedded through the deployed
# model and the pairwise similarities are shown as a heatmap.

messages = [
    # Smartphones
    "I like my phone",
    "My phone is not good.",
    "Your cellphone looks great.",

    # Weather
    "Will it snow tomorrow?",
    "Recently a lot of hurricanes have hit the US",
    "Global warming is real",

    # Food and health
    "An apple a day, keeps the doctors away",
    "Eating strawberries is healthy",
    "Is paleo better than keto?",

    # Asking about age
    "How old are you?",
    "what is your age?",
]

run_and_plot(messages)

## Cleanup

In [None]:
# Delete model version resource
!gcloud ai-platform versions delete {MODEL_VERSION} --model {MODEL_NAME} --region {REGION} --quiet 

# Delete model resource
!gcloud ai-platform models delete {MODEL_NAME} --region {REGION} --quiet