In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在 GitHub 上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/tabular_optimized_online_prediction.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在 Vertex AI Workbench 中打开
    </a>
  </td>
</table>

# 使用优化的TensorFlow运行时训练表格型Criteo模型，并将其部署到Vertex AI Predictions

## 概述

在这个示例中，您将学习如何使用TensorFlow Keras或Estimator API使用Criteo Kaggle数据集训练表格模型。
接下来，您将使用基于开源的TensorFlow 2.7容器和经过优化的TensorFlow运行时容器将训练好的模型导出到Vertex AI预测服务，运行这些模型的性能评估并比较预测结果。

有关Vertex AI预测优化的TensorFlow运行时容器的更多信息，请参考 <https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime>。

### 数据集

在这个示例中，您将使用Criteo Kaggle数据集，数据集大小约为4GB。


### 目标

在这个笔记本中，您将学习如何使用优化的TensorFlow运行时将训练好的表格模型部署到Vertex AI预测，并将其性能与基于开源的TensorFlow容器进行比较。

执行的步骤包括：
* 下载并解压Criteo Kaggle数据集
* 使用Keras API构建和训练模型
* 设置私有端点
* 使用TensorFlow 2.7容器将模型部署到Vertex AI预测
* 使用优化的TensorFlow容器将模型部署到Vertex AI预测
* 对两个模型进行基准测试并验证它们的预测结果

您可以在Colab中训练模型并将其上传到Vertex AI预测。由于本教程使用私有端点演示Vertex AI预测，您必须使用Jupyter VM来运行基准测试。

### 成本

本教程使用Google Cloud的以下可计费组件：

* Vertex AI
* 云存储

了解[Vertex AI价格](https://cloud.google.com/vertex-ai/pricing)和[云存储价格](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)根据您的预期使用量估算费用。

### 设置本地开发环境

**如果您正在使用Colab或Vertex AI Workbench笔记本**，您的环境满足运行此笔记本的要求。您可以跳过此步骤。

如果您没有在使用Colab或Vertex AI Workbench笔记本，则您的环境必须具备以下条件才能满足此笔记本的要求。

- Google Cloud SDK
- Git
- Python 3
- virtualenv
- 在使用Python 3的虚拟环境中运行Jupyter笔记本

Google Cloud指南提供了[设置Python开发环境的详细说明](https://cloud.google.com/python/setup)和[Jupyter安装指南](https://jupyter.org/install)来满足这些要求。以下是简要说明：

1. [安装并初始化Cloud SDK。](https://cloud.google.com/sdk/docs/)
2. [安装Python 3。](https://cloud.google.com/python/setup#installing_python)
3. [安装virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)，并创建一个使用Python 3的虚拟环境。激活虚拟环境。
4. 要安装Jupyter，请在终端shell中运行`pip3 install jupyter`。
5. 要启动Jupyter，请在终端shell中运行`jupyter notebook`。
6. 在Jupyter Notebook Dashboard中打开此笔记本。

### 安装额外的软件包

安装额外的软件包依赖项，这些依赖项未在您的笔记本环境中安装，例如TensorFlow Serving API、Vertex AI SDK。

In [None]:
import os

# Vertex AI Workbench is detected by the presence of this metadata file.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# On Vertex AI Workbench, dependencies must be installed with '--user';
# everywhere else no extra pip flag is used.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
! pip3 install {USER_FLAG} --upgrade tensorflow==2.7.0 -q
! pip3 install {USER_FLAG} --upgrade tensorflow-serving-api==2.7.0 -q
! pip3 install {USER_FLAG} --upgrade google-cloud-aiplatform -q
! pip3 install {USER_FLAG} --upgrade google-cloud-storage -q

### 重启内核

在安装了额外的包之后，您必须重新启动笔记本内核，以便它可以找到这些包。

In [None]:
# Restart the kernel so the packages installed above become importable.
import os

if not os.getenv("IS_TESTING"):
    # Skipped when the IS_TESTING environment variable is set, so automated
    # runs are not interrupted by the kernel shutdown.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## 在你开始之前

## 设置你的 Google Cloud 项目

**无论你使用哪种笔记本环境，以下步骤都是必须的。**

1. [选择或创建一个 Google Cloud 项目](https://console.cloud.google.com/cloud-resource-manager)。当你第一次创建账号时，你将获得 300 美元的赠金，可用于抵扣计算和存储费用。

1. [确保你的项目启用了结算功能](https://cloud.google.com/billing/docs/how-to/modify-project)。

1. [启用 Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

1. [启用 Service Networking API](https://console.cloud.google.com/flows/enableapi?apiid=servicenetworking.googleapis.com)。

1. [启用 Cloud DNS API](https://console.cloud.google.com/flows/enableapi?apiid=dns.googleapis.com)。

1. 如果你在本地运行这个笔记本，你必须安装[Cloud SDK](https://cloud.google.com/sdk)。

1. 在下面的单元格中输入你的项目 ID。然后运行这个单元格，确保
Cloud SDK 在本笔记本中的所有命令中使用正确的项目。

**注意**: Jupyter 运行以 `!` 开头的命令作为 shell 命令，并且它会将以 `$` 开头的 Python 变量插入到这些命令中。

### 设置您的项目ID

**如果您不知道您的项目ID**，您可以尝试使用`gcloud`获取您的项目ID。

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

将你的项目ID设置在这里。

In [None]:
# Use the manually entered project ID when auto-detection left it unset.
if PROJECT_ID in ("", None):
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

### 时间戳

如果您正在进行实时教程会话，可能会使用共享的测试账户或项目。为了避免用户之间在创建的资源上发生名称冲突，请为每个实例会话创建一个时间戳，然后将其附加到您在本教程中创建的资源名称中。

In [None]:
from datetime import datetime

# Local time formatted as YYYYMMDDHHMMSS (second resolution); used as a suffix
# so that resource names created by this session do not collide.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### 验证您的 Google Cloud 帐户

**如果您正在使用 Vertex AI Workbench 笔记本**，您的环境已经进行了验证。请跳过此步骤。

如果您正在使用Colab，请运行下面的单元格，并按提示进行身份验证以通过oAuth验证您的帐户。

否则，请按照以下步骤操作：

1. 在Cloud Console中，转到[**创建服务帐号密钥**页面](https://console.cloud.google.com/apis/credentials/serviceaccountkey)。

2. 点击**创建服务帐号**。

3. 在**服务帐号名称**字段中输入名称，然后点击**创建**。

4. 在**授予此服务帐号对项目的访问权限**部分，点击**角色**下拉列表。在过滤框中输入"Vertex AI"，并选择**Vertex AI管理员**。在过滤框中输入"存储对象管理员"，并选择**存储对象管理员**。

5. 点击**创建**。包含您密钥的JSON文件将下载到本地环境。

6. 将您的服务帐号密钥路径输入为下一个单元格中的`GOOGLE_APPLICATION_CREDENTIALS`变量，然后运行该单元格。

In [None]:
import os
import sys

# Authenticate this session against Google Cloud. Which branch runs depends on
# the environment:
#   * Vertex AI Workbench: nothing is executed.
#   * Colab: an interactive user-authentication flow is started.
#   * Anywhere else (unless IS_TESTING is set): GOOGLE_APPLICATION_CREDENTIALS
#     is set through the %env magic.
# Authentication provides access to your Cloud Storage bucket and lets you
# submit training jobs and prediction requests.

# Vertex AI Workbench is detected by the presence of this metadata file.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Vertex AI Workbench Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the '' below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # NOTE(review): as written, the variable is set to the placeholder value
    # after the name; confirm a real key path has been filled in before running.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### 创建云存储桶

**以下步骤适用于所有笔记本环境。**

要使Vertex AI Prediction为您的模型提供服务，必须首先将其上传到云存储桶。

在下面的单元格中设置您的云存储桶的名称。它必须在所有的云存储桶中是唯一的。

您可以更改 `REGION` 变量，该变量用于本笔记本其余部分的操作。我们建议您[选择一个Vertex AI服务可用的地区](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions)。

In [None]:
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}
REGION = "[your-region]"  # @param {type:"string"}

In [None]:
# When no bucket was entered (empty, None, or the untouched placeholder),
# derive a bucket URI from the project ID and the session timestamp.
if BUCKET_URI in ("", None, "gs://[your-bucket-name]"):
    BUCKET_URI = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

# Default region when the placeholder was left untouched.
if REGION == "[your-region]":
    REGION = "us-central1"

只有当您的存储桶尚不存在时：运行以下单元格以创建您的云存储存储桶。

In [None]:
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

你的云存储桶的最后一步是通过检查其内容来验证对云存储桶的访问权限。

In [None]:
! gsutil ls -al $BUCKET_URI

### 导入库并定义常量

In [None]:
import json
import math
import os
import re
import sys
from urllib.parse import urlparse

import grpc
import numpy as np
import requests as r
import tensorflow as tf
from tensorflow_serving.apis import (predict_pb2, prediction_log_pb2,
                                     prediction_service_pb2_grpc)

# NOTE: `logging` is bound to TensorFlow's logger object here, not to the
# standard-library `logging` module; later cells call `logging.info(...)` on it.
logging = tf.get_logger()
# Do not forward records to ancestor loggers.
logging.propagate = False
logging.setLevel("INFO")

In [None]:
LOCAL_DIRECTORY = "~/criteo"  # @param {type:"string"}
HIDDEN_LAYERS_STR = "2048,2048,1024,512,256"  # @param {type:"string"}

# Hidden layer widths, parsed from the comma-separated form value.
HIDDEN_LAYERS = [int(width) for width in HIDDEN_LAYERS_STR.split(",")]
# Working directory with a leading "~" expanded to the user's home directory.
LOCAL_DIRECTORY_FULL = os.path.expanduser(LOCAL_DIRECTORY)

## 下载数据集

请按照[Criteo网站](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/)上的说明下载数据。

如果数据不可用，您可以使用以下网址下载。

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/data

In [None]:
!cd $LOCAL_DIRECTORY_FULL/data && curl -O https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10082655/dac.tar.gz

In [None]:
!cd $LOCAL_DIRECTORY_FULL/data && tar xvzf dac.tar.gz

In [None]:
!head -n 3 $LOCAL_DIRECTORY_FULL/data/train.txt

## 读取和转换数据集

在模型可以训练之前，变量必须进行预处理。

数值通过减去它们的平均值并除以它们的标准差进行归一化处理。
每个数值特征的平均值和标准差都是预先计算的。每个分类特征的词汇大小也是预先计算的。

In [None]:
# Column names in file order: the label, then the integer features
# int1..int13, then the categorical features cat1..cat26.
COLUMN_NAMES = (
    ["label"]
    + [f"int{i}" for i in range(1, 14)]
    + [f"cat{i}" for i in range(1, 27)]
)

# Precalculated dataset statistics, see
# https://github.com/vlasenkoalexey/criteo_nbdev/blob/master/04_data_reader.ipynb

# Mean of each integer feature; subtracted from the raw value during
# normalization (see make_norm_fn).
NUM_AVERAGE = {
    "int1": 3.5024133170753995,
    "int2": 105.8484197976657,
    "int3": 26.91304102061112,
    "int4": 7.322680248873331,
    "int5": 18538.99166487135,
    "int6": 116.06185085211605,
    "int7": 16.333130032135013,
    "int8": 12.517042137556762,
    "int9": 106.10982343805145,
    "int10": 0.6175294977722183,
    "int11": 2.7328343170173173,
    "int12": 0.9910356287721245,
    "int13": 8.21746116117401,
}
# Standard deviation of each integer feature; the centered value is divided
# by it during normalization (see make_norm_fn).
NUM_STDDEV = {
    "int1": 9.429076407105086,
    "int2": 391.4578226870704,
    "int3": 397.97258302273474,
    "int4": 8.793230712645805,
    "int5": 69394.60184622335,
    "int6": 382.5664493712363,
    "int7": 66.0497552451171,
    "int8": 16.688884567787586,
    "int9": 220.28309398647906,
    "int10": 0.6840505553977025,
    "int11": 5.199070884811354,
    "int12": 5.597723872237179,
    "int13": 16.211932558173785,
}
# Vocabulary size of each categorical feature; used to size the hash buckets
# and the embedding dimension (see create_feature_columns).
VOCABULARY_SIZE = {
    "cat1": 1460,
    "cat2": 583,
    "cat3": 10131226,
    "cat4": 2202607,
    "cat5": 305,
    "cat6": 23,
    "cat7": 12517,
    "cat8": 633,
    "cat9": 3,
    "cat10": 93145,
    "cat11": 5683,
    "cat12": 8351592,
    "cat13": 3194,
    "cat14": 27,
    "cat15": 14992,
    "cat16": 5461305,
    "cat17": 10,
    "cat18": 5652,
    "cat19": 2172,
    "cat20": 3,
    "cat21": 7046546,
    "cat22": 17,
    "cat23": 15,
    "cat24": 286180,
    "cat25": 104,
    "cat26": 142571,
}

In [None]:
@tf.function
def transform_row(*row_tuple):
    """Split one parsed record into a ``(features, label)`` pair.

    Args:
        *row_tuple: Column values, positionally aligned with ``COLUMN_NAMES``.

    Returns:
        A tuple ``(features, label)``: ``features`` maps every column name
        except ``"label"`` to its value, and ``label`` is the value of the
        ``"label"`` column.
    """
    features = dict(zip(COLUMN_NAMES, row_tuple))
    label = features.pop("label")
    return features, label


def read_gcs(batch_size=64):
    """Build the batched input pipeline over the Criteo ``train.txt`` file.

    Despite the name, the file is read from the local
    ``LOCAL_DIRECTORY_FULL/data`` directory.

    Args:
        batch_size: Number of records per batch.

    Returns:
        A dataset whose elements are the ``(features, label)`` pairs produced
        by ``transform_row``.
    """

    def default_for(name):
        # The label gets a bare dtype (no default value); integer columns
        # default to 0 and categorical columns to the empty string.
        if name == "label":
            return tf.int64
        if name.startswith("int"):
            return tf.constant(0, dtype=tf.int64)
        return tf.constant("", dtype=tf.string)

    train_path = os.path.join(LOCAL_DIRECTORY_FULL, "data", "train.txt")
    csv_records = tf.data.experimental.CsvDataset(
        train_path,
        [default_for(name) for name in COLUMN_NAMES],
        field_delim="\t",
        header=False,
    )

    # Records are batched before shuffling, so shuffle(500) reorders whole
    # batches rather than individual rows.
    batched = csv_records.batch(batch_size)
    shuffled = batched.shuffle(500)
    return shuffled.map(transform_row).prefetch(50)

In [None]:
for row in read_gcs(batch_size=3).take(2):
    print(row)

## 训练和保存Keras模型

有关如何使用TensorFlow Keras API训练表格模型的概述，请参阅https://github.com/tensorflow/docs/blob/r2.4/site/en/tutorials/structured_data/feature_columns.ipynb

In [None]:
def make_norm_fn(column_name):
    """Return a normalizer for one integer feature.

    Args:
        column_name: Key into ``NUM_AVERAGE`` and ``NUM_STDDEV``.

    Returns:
        A callable that casts its input to float32, subtracts the column's
        precomputed mean and divides by its precomputed standard deviation.
    """
    mean, std = NUM_AVERAGE[column_name], NUM_STDDEV[column_name]
    return lambda v: (tf.cast(v, tf.float32) - mean) / std


def create_feature_columns():
    """Build the feature columns shared by the Keras and Estimator models.

    Returns:
        A list containing one normalized numeric column per ``int*`` feature,
        followed by one hashed embedding column per ``cat*`` feature.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(
            name, dtype=tf.dtypes.int64, normalizer_fn=make_norm_fn(name)
        )
        for name in COLUMN_NAMES
        if name.startswith("int")
    ]

    embedding_columns = []
    for name in COLUMN_NAMES:
        if not name.startswith("cat"):
            continue
        vocabulary_size = VOCABULARY_SIZE[name]
        # At most 100000 hash buckets; the embedding width is
        # floor(6 * vocabulary_size ** 0.25), capped at 50.
        bucket_count = min(vocabulary_size, 100000)
        embedding_width = int(min(50, math.floor(6 * vocabulary_size**0.25)))
        hashed = tf.feature_column.categorical_column_with_hash_bucket(
            name, bucket_count, dtype=tf.dtypes.string
        )
        embedding_columns.append(
            tf.feature_column.embedding_column(hashed, embedding_width)
        )

    return numeric_columns + embedding_columns

In [None]:
def create_keras_model_sequential():
    """Build and compile the sequential Keras click-prediction model.

    The network is a ``DenseFeatures`` input layer, then for every width in
    ``HIDDEN_LAYERS`` a BatchNormalization -> Dense(relu) -> Dropout(0.05)
    group, and finally a single sigmoid output unit.

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam with learning rate
        0.01, binary cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers

    stack = [layers.DenseFeatures(create_feature_columns(), name="feature_layer")]
    for units in HIDDEN_LAYERS:
        stack.extend(
            [
                layers.BatchNormalization(),
                layers.Dense(units, activation=tf.nn.relu),
                layers.Dropout(0.05),
            ]
        )
    stack.append(layers.Dense(1, activation=tf.nn.sigmoid))
    model = tf.keras.Sequential(stack)

    logging.info("compiling sequential keras model")
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )
    return model


model = create_keras_model_sequential()

训练模型。预期损失约为0.35。

In [None]:
model.fit(read_gcs(batch_size=256).take(1000), epochs=3)

验证模型。预期损失约为0.45。

In [None]:
model.evaluate(read_gcs(batch_size=256).skip(1000).take(1000))

In [None]:
model.summary()

In [None]:
model.save(os.path.join(LOCAL_DIRECTORY_FULL, "keras"), include_optimizer=False)

请检查模型签名以查看预测请求应具有哪些字段。

In [None]:
!saved_model_cli show --dir $LOCAL_DIRECTORY_FULL/keras --all

## （可选）训练和保存Estimator模型

训练模型的另一个选择是使用TensorFlow的Estimator API。更多信息请参见https://github.com/tensorflow/docs/blob/r2.4/site/en/tutorials/estimator/premade.ipynb

以下代码仅供示例目的。您可以使用Keras模型进行部署。

In [None]:
# Premade DNN classifier using the same feature columns and HIDDEN_LAYERS
# widths as the Keras model, with dropout 0.05, batch normalization enabled
# and Adam at learning rate 0.01. Two output classes.
feature_columns = create_feature_columns()
estimator = tf.estimator.DNNClassifier(
    optimizer=tf.optimizers.Adam(learning_rate=0.01),
    feature_columns=feature_columns,
    hidden_units=HIDDEN_LAYERS,
    dropout=0.05,
    batch_norm=True,
    n_classes=2,
)

In [None]:
# Train on the first 2000 batches of 256 rows and evaluate on 100 batches
# taken after that range. Evaluation uses the same batch size and skips past
# the training batches so that it is not run on rows the model was trained on
# (batch-level shuffling inside read_gcs can still blur the boundary by up to
# one shuffle buffer).
tf.estimator.train_and_evaluate(
    estimator,
    train_spec=tf.estimator.TrainSpec(
        input_fn=lambda: read_gcs(batch_size=256).take(2000)
    ),
    eval_spec=tf.estimator.EvalSpec(
        input_fn=lambda: read_gcs(batch_size=256).skip(2000).take(100)
    ),
)

In [None]:
!rm -r -f $LOCAL_DIRECTORY_FULL/estimator

In [None]:
tf.compat.v1.disable_eager_execution()  # You'll have to restart Runtime after running this
# Declare one placeholder per input feature for the raw serving signature:
# integer features are int64 with shape (1,), categorical features are scalar
# strings. The label column is not part of the serving inputs.
spec_dict = {}
for column_name in COLUMN_NAMES:
    if column_name.startswith("int"):
        spec_dict[column_name] = tf.compat.v1.placeholder(
            name=column_name, shape=(1,), dtype=tf.int64
        )
    if column_name.startswith("cat"):
        spec_dict[column_name] = tf.compat.v1.placeholder(
            name=column_name, shape=(), dtype=tf.string
        )

serving_input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(spec_dict)
estimator_base_path = os.path.join(LOCAL_DIRECTORY_FULL, "estimator")
# The export path is decoded from bytes to str so it can be interpolated into
# the shell command in the next cell; the trailing expression displays it.
estimator_path = estimator.export_saved_model(estimator_base_path, serving_input_fn)
estimator_path = estimator_path.decode("ascii")
estimator_path

In [None]:
!saved_model_cli show --dir $estimator_path --all

## 生成预测请求

现在我们可以生成发送到我们的模型进行推断的请求。
请求是以JSON Lines格式生成的，每行一个请求。

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/requests

In [None]:
def wrap_value(value, wrap_value):
    """Return ``[value]`` when the flag is truthy, otherwise ``value`` itself.

    Args:
        value: The value to return, optionally wrapped.
        wrap_value: Flag selecting the wrapped form. Note that inside the body
            this parameter shadows the function's own name.
    """
    return [value] if wrap_value else value


def row_to_dict(row, wrap_values):
    """Convert one dataset element into a JSON-serializable feature dict.

    Args:
        row: A ``(features, label)`` pair; only ``row[0]`` (the features
            mapping) is used.
        wrap_values: Passed to ``wrap_value``; when truthy each value is
            wrapped in a single-element list.

    Returns:
        A dict mapping each feature name containing ``"int"`` or ``"cat"`` to
        a list of values; values of ``"cat"`` features are decoded from bytes
        to ``str``.
    """
    request = {}
    for name, tensor in row[0].items():
        if "int" in name:
            request[name] = [
                wrap_value(item, wrap_values) for item in tensor.numpy().tolist()
            ]
        if "cat" in name:
            request[name] = [
                wrap_value(raw.decode(), wrap_values)
                for raw in tensor.numpy().tolist()
            ]
    return request


def export_requests_jsonl(file_name, rows=100, batch_size=64, wrap_values=True):
    """Write prediction requests to ``file_name`` in JSON Lines format.

    Each output line is one request holding ``batch_size`` instances.

    Args:
        file_name: Destination path, opened through ``tf.io.gfile.GFile``.
        rows: Maximum number of request lines to write. ``0`` writes an empty
            file; ``-1`` writes one line per batch of the whole dataset.
        batch_size: Number of instances in every request.
        wrap_values: Forwarded to ``row_to_dict``; set to ``False`` for the
            Estimator model.
    """
    with tf.io.gfile.GFile(file_name, mode="w") as f:
        # Bound the iteration with Dataset.take instead of a manual countdown:
        # a countdown that only stops on `rows == 0` never terminates early
        # when called with rows=0 and would export the entire dataset.
        for row in read_gcs(batch_size).take(rows):
            f.write(json.dumps(row_to_dict(row, wrap_values)))
            f.write("\n")

In [None]:
# Export one request file per (rows, batch_size) combination; the file name
# encodes both values as requests_<rows>_<batch_size>.jsonl.
for request_count, request_batch_size in [
    (1, 1),
    (1, 512),
    (10, 1),
    (10, 512),
    (10, 1024),
    (100, 1),
    (100, 512),
    (100, 1024),
]:
    export_requests_jsonl(
        os.path.join(
            LOCAL_DIRECTORY_FULL,
            "requests",
            f"requests_{request_count}_{request_batch_size}.jsonl",
        ),
        rows=request_count,
        batch_size=request_batch_size,
    )

如果您想要导出Estimator模型的请求，您必须将`wrap_values`设置为`False`。

In [None]:
export_requests_jsonl(
    os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_estimator_10_1.jsonl"),
    rows=10,
    batch_size=1,
    wrap_values=False,
)

## （可选）生成预热请求

TensorFlow运行时具有延迟初始化的组件。懒惰初始化可能会导致在加载模型后发送给模型的第一个请求的延迟很高。这种延迟可能比单个推理请求的延迟高出几个数量级。

有关SavedModel预热的更多信息，请参阅 <https://www.tensorflow.org/tfx/serving/saved_model_warmup>。

对于使用优化的TensorFlow运行时的Vertex AI预测，当模型预编译时，针对每个新批量大小的第一个请求会有较高的延迟。当`allow_precompilation`标志设置为true时启用预编译。

为了减少高延迟，提供一个热身请求，让运行时在启动时加载。热身文件应包含你预期在生产中接收的各种批量大小。

请注意，使用多个批量大小提供热身请求会增加每个节点启动的时间。

如果您预计模型会收到多种批量大小的请求，可以启用服务器端自动请求批处理，并通过 `allowed_batch_sizes` 指定一组允许的批量大小。有关更多信息，请参阅 <https://www.tensorflow.org/tfx/serving/serving_config#batching_configuration>。

要为在Vertex AI预测上运行的模型启用自动批处理，请将批处理配置放入与 saved_model.pb 相同的GCS目录下的[config/batching_parameters_config](https://cloud.google.com/vertex-ai/docs/training/exporting-model-artifacts#enable_server-side_request_batching_for_tensorflow) 文件中。

In [None]:
!mkdir -p $LOCAL_DIRECTORY_FULL/keras/assets.extra

In [None]:
def build_grpc_request(
    row_dict, model_name="default", signature_name="serving_default"
):
    """Generate gRPC inference request with payload.

    Args:
        row_dict: Mapping of feature name to values, as read from a request
            JSON line.
        model_name: Value stored in the request's ``model_spec.name``.
        signature_name: Value stored in ``model_spec.signature_name``.

    Returns:
        A ``PredictRequest`` with one input tensor per key of ``row_dict``:
        string tensors for keys containing ``"cat"``, int64 tensors otherwise.
    """
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = signature_name
    for key, value in row_dict.items():
        dtype = tf.string if "cat" in key else tf.int64
        request.inputs[key].CopyFrom(tf.make_tensor_proto(value, dtype=dtype))
    return request


def export_warmup_file(
    request_files, export_path, model_name="default", signature_name="serving_default"
):
    """Write a TFRecord warmup file built from existing request files.

    Only the first line of every file in ``request_files`` is used; it is
    converted to a gRPC predict request and stored as one ``PredictionLog``
    record.

    Args:
        request_files: Paths of JSON Lines request files.
        export_path: Destination path of the TFRecord file.
        model_name: Forwarded to ``build_grpc_request``.
        signature_name: Forwarded to ``build_grpc_request``.
    """
    with tf.io.TFRecordWriter(export_path) as writer:
        for request_file_path in request_files:
            with open(request_file_path) as request_file:
                first_line = request_file.readline()
            request = build_grpc_request(
                json.loads(first_line), model_name, signature_name
            )
            record = prediction_log_pb2.PredictionLog(
                predict_log=prediction_log_pb2.PredictLog(request=request)
            )
            writer.write(record.SerializeToString())


export_warmup_file(
    [
        os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_1_1.jsonl"),
        os.path.join(LOCAL_DIRECTORY_FULL, "requests", "requests_1_512.jsonl"),
    ],
    os.path.join(
        LOCAL_DIRECTORY_FULL, "keras", "assets.extra", "tf_serving_warmup_requests"
    ),
)

## 将模型部署到Vertex AI预测

要将模型部署到Vertex AI预测服务，您必须将其放入一个GCS存储桶中。

In [None]:
# WARNING: recursively removes everything currently stored under BUCKET_URI.
# Do not run this against a bucket that holds data you want to keep.
!gsutil rm -r $BUCKET_URI/*

In [None]:
!gsutil cp -r $LOCAL_DIRECTORY_FULL/keras/* $BUCKET_URI

将Vertex AI Python客户端库导入到您的笔记本环境中。

In [None]:
from google.cloud.aiplatform import gapic as aip

定义节点类型以用于部署。有关 Vertex AI 预测选项的更多信息，请查看 [配置计算资源](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute)。

In [None]:
DEPLOY_COMPUTE = "n1-standard-16"
DEPLOY_GPU = aip.AcceleratorType.NVIDIA_TESLA_T4

AI 平台 Python 客户端库以客户端/服务器模型运行。

在此示例中使用以下客户端:
- 模型服务用于管理模型。
- 端点服务用于部署。

In [None]:
# Regional API host and the parent resource path under which models and
# endpoints are created.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

# Both clients are pointed at the regional API host.
client_options = {"api_endpoint": API_ENDPOINT}
model_service_client = aip.ModelServiceClient(client_options=client_options)
endpoint_service_client = aip.EndpointServiceClient(client_options=client_options)

### 设置私有端点用于在线预测

您训练的Criteo模型的吞吐量和延迟对网络性能敏感。

请注意，批量大小为512的单个请求占用约200Kb的空间。

In [None]:
!ls -alh $LOCAL_DIRECTORY_FULL/requests/requests_1_512.jsonl

为了获得最佳性能，请使用Vertex AI预测私有端点。

要使用私有端点，请在您的项目和托管虚拟机运行您的模型的Vertex AI预测服务项目之间设置VPC对等网络。这样可以消除网络流量中的额外跳跃，并允许使用高效的gRPC协议。

有关私有端点的更多信息，请参阅 <https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints>。

有关Vertex AI中VPC对等连接的更多信息，请参阅 <https://cloud.google.com/vertex-ai/docs/general/vpc-peering>。

**重要提示：每个VPC网络只能设置一个到servicenetworking.googleapis.com的VPC对等连接。**

为简单起见，您可以将VPC互连设置为默认网络。您可以为您的项目创建不同的网络。

如果您要与任何其他网络建立VPC互连，请确保该网络已经存在，并且您的虚拟机正在该网络上运行。

In [None]:
# This is for display only; you can name the range anything.
PEERING_RANGE_NAME = "vertex-ai-prediction-peering-range"
NETWORK = "default"

In [None]:
# NOTE: `prefix-length=16` means a CIDR block with mask /16 will be
# reserved for use by Google services, such as Vertex AI.
!gcloud compute addresses create $PEERING_RANGE_NAME \
  --global \
  --prefix-length=16 \
  --description="peering range for Google service" \
  --network=$NETWORK \
  --purpose=VPC_PEERING

创建 VPC 连接。

In [None]:
!gcloud services vpc-peerings connect \
  --service=servicenetworking.googleapis.com \
  --network=$NETWORK \
  --ranges=$PEERING_RANGE_NAME \
  --project=$PROJECT_ID

如果在运行此命令时收到权限错误，请尝试使用您的用户帐户运行它。

要使用您的用户帐户运行此命令，请执行以下操作：
- 在上面的单元格中的命令之前添加 `echo` (`echo gcloud services vpc-peerings ...`)。
- 运行该单元格并复制其输出
- 打开新的终端窗口，并运行 `gcloud auth login` 以使用您的用户帐户进行身份验证。
- 粘贴并运行在终端中复制的命令。

检查您对等连接的状态。

In [None]:
!gcloud compute networks peerings list --network $NETWORK

### 上传模型到Vertex AI预测

了解有关[model_service.upload_model](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.model_service.ModelServiceClient#google_cloud_aiplatform_v1_services_model_service_ModelServiceClient_upload_model)的更多信息。

`artifact_uri`参数应指向保存您的模型的`saved_model.pb`文件所在的GCS路径。

`image_uri`指定要使用的docker镜像。在此处，我们使用TF2.7 GPU和Vertex AI Prediction优化的TensorFlow运行时镜像上传相同的模型。

为了能够通过gRPC发送请求到您的模型，您需要设置`model_name`参数，并相应地更新`predict_route`和`health_route`。

请注意，Vertex AI Prediction中的gRPC支持仍处于实验阶段。

In [None]:
# Model definition served by the TensorFlow 2.7 CPU container. The container
# is started with --port=8500 and --rest_api_port=8080; only port 8080 is
# listed under `ports`, and the predict/health routes use model name "default".
tf27_cpu_model_dict = {
    "display_name": "Criteo Kaggle TF2.7 CPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-7:latest",
        "args": [
            "--port=8500",
            "--rest_api_port=8080",
            "--model_name=default",
            "--model_base_path=$(AIP_STORAGE_URI)",
        ],
        "ports": [{"container_port": 8080}],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}
# Wait up to 180 seconds for the upload operation and keep the `model` field
# of the response (the resource name, "projects/<number>/...").
tf27_cpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf27_cpu_model_dict)
    .result(timeout=180)
    .model
)
tf27_cpu_model

In [None]:
# Same artifacts and serving arguments as the CPU model, but served by the
# TensorFlow 2.7 GPU container image.
tf27_gpu_model_dict = {
    "display_name": "Criteo Kaggle TF2.7 GPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-7:latest",
        "args": [
            "--port=8500",
            "--rest_api_port=8080",
            "--model_name=default",
            "--model_base_path=$(AIP_STORAGE_URI)",
        ],
        "ports": [{"container_port": 8080}],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}
# Wait up to 180 seconds for the upload operation and keep the `model` field
# of the response.
tf27_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf27_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf27_gpu_model

要使用Vertex AI Prediction优化的TensorFlow运行时部署模型，请使用`us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest`容器。

模型应用了两种优化选项。
- *allow_precompilation* - 启用模型预编译以提高性能。请注意，当第一个具有新批量大小的请求到达时，模型预编译会发生，并且在预编译完成后发送该请求的响应。为了减轻这一问题，请指定一个热身文件（请参阅此colab中的之前部分）。模型预编译适用于不同类型的模型，在大多数情况下对性能有积极影响。但我们建议您在生产环境中启用之前先为您的模型尝试一下。
- *allow_precision_affecting_optimizations* - 启用影响精度的优化。在某些情况下，这会使模型运行速度明显加快，但损失对模型预测能力的影响非常小。使用此优化时，您应该评估对模型的精度影响。

有关可用优化的TensorFlow运行时容器和选项列表，请参阅 <https://cloud.google.com/vertex-ai/docs/predictions/optimized-tensorflow-runtime>。

In [None]:
# Model definition served by the optimized TensorFlow runtime GPU container
# (nightly image). Precompilation is enabled; precision-affecting
# optimizations are disabled.
tf_opt_gpu_model_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest",
        "args": [
            "--model_name=default",
            "--allow_precompilation=true",
            "--allow_precision_affecting_optimizations=false",
        ],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}

# Wait up to 180 seconds for the upload operation and keep the `model` field
# of the response.
tf_opt_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf_opt_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf_opt_gpu_model

In [None]:
# Same optimized-runtime container as the previous model, but with
# precision-affecting ("lossy") optimizations enabled in addition to
# precompilation.
tf_opt_lossy_gpu_model_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model with lossy optimizations",
    "artifact_uri": BUCKET_URI,
    "container_spec": {
        "image_uri": "us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-gpu.nightly:latest",
        "args": [
            "--model_name=default",
            "--allow_precompilation=true",
            "--allow_precision_affecting_optimizations=true",
        ],
        "predict_route": "/v1/models/default:predict",
        "health_route": "/v1/models/default",
    },
}

# Wait up to 180 seconds for the upload operation and keep the `model` field
# of the response.
tf_opt_lossy_gpu_model = (
    model_service_client.upload_model(parent=PARENT, model=tf_opt_lossy_gpu_model_dict)
    .result(timeout=180)
    .model
)
tf_opt_lossy_gpu_model

列出所有模型。

In [None]:
model_service_client.list_models(parent=PARENT)

### 创建端点

了解更多关于[endpoint_service.create_endpoint](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_create_endpoint)。

In [None]:
# Extract the numeric project number from the uploaded model's resource name
# ("projects/<number>/...") and build the fully qualified name of the VPC
# network that the private endpoints are attached to.
# NOTE: re.match returns None when the name does not have that form, in which
# case the subscript raises TypeError.
project_number = re.match(r"projects/(\d+)/.+", tf27_cpu_model)[1]
full_network_name = f"projects/{project_number}/global/networks/{NETWORK}"
full_network_name

In [None]:
# Private endpoint for the TF2.7 CPU model; setting `network` attaches it to
# the peered VPC network.
tf27_cpu_endpoint_dict = {
    "display_name": "Criteo Kaggle TF2.7 CPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf27_cpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf27_cpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf27_cpu_endpoint

In [None]:
# Private endpoint for the TF2.7 GPU model; setting `network` attaches it to
# the peered VPC network.
tf27_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle TF2.7 GPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf27_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf27_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf27_gpu_endpoint

In [None]:
# Private endpoint for the optimized TensorFlow runtime GPU model; setting
# `network` attaches it to the peered VPC network.
tf_opt_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf_opt_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf_opt_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf_opt_gpu_endpoint

In [None]:
# Private endpoint for the optimized-runtime GPU model with lossy
# optimizations; setting `network` attaches it to the peered VPC network.
tf_opt_lossy_gpu_endpoint_dict = {
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU with lossy optimizations private endpoint",
    "network": full_network_name,
}
# Wait up to 300 seconds for creation and keep the endpoint's resource name.
tf_opt_lossy_gpu_endpoint = (
    endpoint_service_client.create_endpoint(
        parent=PARENT, endpoint=tf_opt_lossy_gpu_endpoint_dict
    )
    .result(timeout=300)
    .name
)
tf_opt_lossy_gpu_endpoint

### 部署模型到端点

了解有关[endpoint_service.deploy_model](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_deploy_model)的更多信息。

In [None]:
# Deploy the TF2.7 CPU model on exactly one replica of DEPLOY_COMPUTE with no
# accelerator attached.
tf27_cpu_deployed_model_dict = {
    "model": tf27_cpu_model,
    "display_name": "Criteo Kaggle TF2.7 CPU deployed model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_count": 0,
        },
    },
}

# .result() is called without a timeout, so this cell blocks until the
# deployment operation completes.
tf27_cpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf27_cpu_endpoint, deployed_model=tf27_cpu_deployed_model_dict
).result()
tf27_cpu_deployed_model

In [None]:
# Deploy the TF2.7 GPU model on exactly one replica of DEPLOY_COMPUTE with a
# single DEPLOY_GPU accelerator.
tf27_gpu_deployed_model_dict = {
    "model": tf27_gpu_model,
    "display_name": "Criteo Kaggle TF2.7 GPU deployed model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# .result() is called without a timeout, so this cell blocks until the
# deployment operation completes.
tf27_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf27_gpu_endpoint, deployed_model=tf27_gpu_deployed_model_dict
).result()
tf27_gpu_deployed_model

In [None]:
# Deploy the optimized-runtime GPU model on exactly one replica of
# DEPLOY_COMPUTE with a single DEPLOY_GPU accelerator (same hardware as the
# TF2.7 GPU deployment).
tf_opt_gpu_deployed_model_dict = {
    "model": tf_opt_gpu_model,
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# .result() is called without a timeout, so this cell blocks until the
# deployment operation completes.
tf_opt_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf_opt_gpu_endpoint, deployed_model=tf_opt_gpu_deployed_model_dict
).result()
tf_opt_gpu_deployed_model

In [None]:
# Deploy the optimized-runtime GPU model with lossy optimizations on exactly
# one replica of DEPLOY_COMPUTE with a single DEPLOY_GPU accelerator.
tf_opt_lossy_gpu_deployed_model_dict = {
    "model": tf_opt_lossy_gpu_model,
    "display_name": "Criteo Kaggle optimized TensorFlow runtime GPU model with lossy optimizations",
    "dedicated_resources": {
        "min_replica_count": 1,
        "max_replica_count": 1,
        "machine_spec": {
            "machine_type": DEPLOY_COMPUTE,
            "accelerator_type": DEPLOY_GPU,
            "accelerator_count": 1,
        },
    },
}

# .result() is called without a timeout, so this cell blocks until the
# deployment operation completes.
tf_opt_lossy_gpu_deployed_model = endpoint_service_client.deploy_model(
    endpoint=tf_opt_lossy_gpu_endpoint,
    deployed_model=tf_opt_lossy_gpu_deployed_model_dict,
).result()
tf_opt_lossy_gpu_deployed_model

## 比较部署模型的性能

要访问私有端点，发送请求的虚拟机必须部署在您设置VPC对等连接的同一网络中。因此，您不能从Colab发送请求到使用私有端点部署的模型。

为了获得最佳性能，请确保虚拟机与您的模型位于同一地区。

导入用于对模型进行基准测试的辅助函数。

In [None]:
!curl https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/vertex_endpoints/optimized_tensorflow_runtime/benchmark.py -o benchmark.py

In [None]:
from benchmark import benchmark

该代码以给定的QPS异步且均匀地发送指定数量的请求，然后记录观察到的延迟。接下来，将对延迟结果进行汇总并计算百分位数。
模型能够处理的`实际QPS`是指已发送的请求数量除以模型处理这些请求所需的时间得出的值。
通过为`send_request`和`build_request`函数提供不同的实现，可以对在本地运行的模型，或通过gRPC和REST协议对在Vertex AI Prediction上运行的模型进行基准测试。

该基准测试的主要目标是测量模型在不同负载下的延迟和模型能够处理的最大吞吐量。为了找到最大吞吐量，逐渐增加QPS直到`实际QPS`停止增加并且延迟急剧增加。

在生产部署中，工作负载并不均匀，因此最大模型吞吐量可能会较低。
我们并不试图在此模拟生产工作负载。该基准测试旨在比较在不同环境中运行的同一模型的延迟和吞吐量。

关于部署模型的详细信息可以使用[endpoint_service_client.get_endpoint](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.endpoint_service.EndpointServiceClient#google_cloud_aiplatform_v1_services_endpoint_service_EndpointServiceClient_get_endpoint) API来访问。

In [None]:
# Fetch the endpoint description, which includes its deployed models.
tf_opt_gpu_endpoint_dict = endpoint_service_client.get_endpoint(name=tf_opt_gpu_endpoint)
tf_opt_gpu_endpoint_dict

首先，请验证您是否可以访问您的模型。

In [None]:
# Health-check URI of the first model deployed on the endpoint.
health_url = (
    tf_opt_gpu_endpoint_dict.deployed_models[0].private_endpoints.health_http_uri
)
health_url

In [None]:
# Request the health URI from this machine to confirm the deployed model is
# reachable before running the benchmarks.
!curl $health_url

定义助手方法，使用REST协议针对私有端点运行基准测试。
应该发送请求的URI可以在`deployed_model.private_endpoints.predict_http_uri`中找到。

In [None]:
def build_rest_request(
    row_dict, model_name="default", signature_name="serving_default"
):
    """Serialize one request row into the JSON body of a REST predict call.

    Args:
        row_dict: JSON-serializable value stored under the "inputs" key.
        model_name: Not read by this function. Presumably kept so the
            signature matches what `benchmark` passes to request builders
            (TODO confirm against benchmark.py).
        signature_name: Value stored under the "signature_name" key.

    Returns:
        The request body as a JSON string.
    """
    body = {"signature_name": signature_name}
    body["inputs"] = row_dict
    return json.dumps(body)


def benchmark_rest_private_endpoint(
    endpoint_name, qps_list, model_name=None, duration_seconds=5
):
    """Benchmark the first model deployed on a private endpoint over REST.

    Args:
        endpoint_name: Endpoint name passed to `get_endpoint`.
        qps_list: Forwarded to `benchmark` as the request rates to run.
        model_name: Forwarded to `benchmark` as the `model_name` keyword.
        duration_seconds: Forwarded to `benchmark`.

    Returns:
        Whatever `benchmark` returns.
    """
    first_deployed_model = endpoint_service_client.get_endpoint(
        name=endpoint_name
    ).deployed_models[0]
    predict_uri = first_deployed_model.private_endpoints.predict_http_uri
    requests_path = f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl"

    def send_rest_request(request):
        # POST the already-serialized body; any status other than 200 fails
        # the call with an AssertionError.
        http_response = r.post(predict_uri, data=request)
        assert http_response.status_code == 200
        return http_response

    return benchmark(
        send_rest_request,
        build_rest_request,
        requests_path,
        qps_list,
        duration_seconds,
        model_name=model_name,
    )

您还可以使用gRPC协议对部署在私有端点上的模型进行基准测试。

gRPC地址使用与`predict_http_uri`或`health_http_uri`相同的主机名，端口为8500。
gRPC目的地的格式为`<endpoint_id>-<deployed_model_id>`，作为`grpc-destination`头部传递。

请注意，在Vertex AI预测中，对gRPC的支持仍处于实验阶段。

In [None]:
def parse_endpoint_dict(endpoint_dict):
    """Derive the gRPC address and routing value for a private endpoint.

    Only the first entry of `endpoint_dict.deployed_models` is used.

    Args:
        endpoint_dict: Endpoint object exposing `name` and `deployed_models`.

    Returns:
        A tuple `(grpc_uri, grpc_destination)`. `grpc_uri` is the host of the
        model's `predict_http_uri` followed by `:8500`; `grpc_destination` is
        `<endpoint_id>-<deployed_model_id>`.

    Raises:
        ValueError: If the endpoint has no deployed model, its name does not
            contain `/endpoints/<digits>`, or `predict_http_uri` has no host.
    """
    if not endpoint_dict.deployed_models:
        raise ValueError(f"Endpoint {endpoint_dict.name!r} has no deployed models")
    deployed_model = endpoint_dict.deployed_models[0]

    id_match = re.match(r".+/endpoints/(\d+)", endpoint_dict.name)
    if id_match is None:
        raise ValueError(
            f"Cannot extract endpoint id from endpoint name {endpoint_dict.name!r}"
        )
    grpc_destination = f"{id_match[1]}-{deployed_model.id}"

    predict_http_uri = deployed_model.private_endpoints.predict_http_uri
    # Use `hostname`, not `netloc`: netloc keeps any explicit port or userinfo,
    # which would produce a malformed "host:port:8500" address.
    host = urlparse(predict_http_uri).hostname
    if not host:
        raise ValueError(f"Cannot extract host from predict URI {predict_http_uri!r}")
    if ":" in host:
        # `hostname` strips the brackets of an IPv6 literal; restore them so
        # the appended port stays unambiguous.
        host = f"[{host}]"
    return (f"{host}:8500", grpc_destination)


def benchmark_grpc_private_endpoint(endpoint_name, qps_list, duration_seconds=5):
    """Benchmark the first model deployed on a private endpoint over gRPC.

    Args:
        endpoint_name: Endpoint name passed to `get_endpoint`.
        qps_list: Forwarded to `benchmark` as the request rates to run.
        duration_seconds: Forwarded to `benchmark`.

    Returns:
        Whatever `benchmark` returns.
    """
    endpoint_dict = endpoint_service_client.get_endpoint(name=endpoint_name)
    grpc_uri, grpc_destination = parse_endpoint_dict(endpoint_dict)

    # Every call carries the routing value in the "grpc-destination" metadata.
    grpc_metadata = [("grpc-destination", grpc_destination)]

    # Close the channel once benchmarking is done so repeated runs do not leak
    # connections.
    # NOTE(review): assumes `benchmark` has finished all requests by the time
    # it returns — confirm against benchmark.py.
    with grpc.insecure_channel(grpc_uri) as grpc_channel:
        grpc_stub = prediction_service_pb2_grpc.PredictionServiceStub(grpc_channel)

        def send_grpc_request(request):
            # Second positional argument is the per-call timeout (seconds).
            return grpc_stub.Predict(request, 60, metadata=grpc_metadata)

        return benchmark(
            send_grpc_request,
            build_grpc_request,
            f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl",
            qps_list,
            duration_seconds,
            model_name="default",
        )

现在我们可以为每个端点运行基准测试，并比较结果。

In [None]:
# gRPC benchmark of the TF 2.7 CPU endpoint across increasing request rates.
tf27_cpu_results = benchmark_grpc_private_endpoint(
    endpoint_name=tf27_cpu_endpoint,
    qps_list=[10, 20, 30, 40, 50, 55],
)
tf27_cpu_results

In [None]:
# gRPC benchmark of the TF 2.7 GPU endpoint across increasing request rates.
tf27_gpu_results = benchmark_grpc_private_endpoint(
    endpoint_name=tf27_gpu_endpoint,
    qps_list=[10, 20, 30, 40, 50, 60, 70, 75],
)
tf27_gpu_results

In [None]:
# gRPC benchmark of the optimized-runtime GPU endpoint across increasing
# request rates.
tf_opt_gpu_results = benchmark_grpc_private_endpoint(
    endpoint_name=tf_opt_gpu_endpoint,
    qps_list=[10, 50, 100, 150, 200, 250, 275, 300, 325, 350],
)
tf_opt_gpu_results

In [None]:
# gRPC benchmark of the optimized-runtime GPU endpoint with lossy
# optimizations across increasing request rates.
tf_opt_lossy_gpu_results = benchmark_grpc_private_endpoint(
    endpoint_name=tf_opt_lossy_gpu_endpoint,
    qps_list=[10, 50, 100, 200, 300, 400, 500, 600, 700, 800],
)
tf_opt_lossy_gpu_results

合并并可视化结果。

In [None]:
import matplotlib
import matplotlib.pyplot as plt


def build_graph(x_key, y_key, results_dict, axis):
    """Plot one line per benchmark result set on a shared figure.

    Args:
        x_key: Key of each result mapping whose values go on the x axis.
        y_key: Key of each result mapping whose values go on the y axis; also
            interpolated into the figure title.
        results_dict: Maps a legend label to a benchmark result mapping.
        axis: Passed unchanged to `Axes.axis` to set the axis limits.

    Returns:
        The created matplotlib figure.
    """
    # Note: this updates the global default figure size, not just this figure.
    matplotlib.rcParams["figure.figsize"] = [10.0, 7.0]

    figure, axes = plt.subplots(facecolor=(1, 1, 1))
    axes.set_xlabel("QPS")
    axes.set_ylabel("Latency(ms)")
    for series_label, series in results_dict.items():
        axes.plot(
            np.array(series[x_key]),
            np.array(series[y_key]),
            label=series_label,
        )
    axes.legend()
    axes.axis(axis)
    axes.set_title(f"Criteo model {y_key} latency, batch size 512")
    return figure

In [None]:
# p50 latency against achieved QPS for all four deployments, saved as a PNG.
fig = build_graph(
    x_key="actual_qps",
    y_key="p50",
    results_dict={
        "TF2.7 CPU": tf27_cpu_results,
        "TF2.7 GPU": tf27_gpu_results,
        "TF opt GPU": tf_opt_gpu_results,
        "TF opt GPU lossy": tf_opt_lossy_gpu_results,
    },
    axis=(0, 800, 0, 60),
)
fig.savefig("criteo_p50_latency_512.png", bbox_inches="tight")

In [None]:
# p99 latency against achieved QPS for all four deployments, saved as a PNG.
fig = build_graph(
    x_key="actual_qps",
    y_key="p99",
    results_dict={
        "TF2.7 CPU": tf27_cpu_results,
        "TF2.7 GPU": tf27_gpu_results,
        "TF opt GPU": tf_opt_gpu_results,
        "TF opt GPU lossy": tf_opt_lossy_gpu_results,
    },
    axis=(0, 800, 0, 100),
)
fig.savefig("criteo_p99_latency_512.png", bbox_inches="tight")

您可以看到，与TensorFlow 2.7相比，Vertex AI Prediction优化的TensorFlow运行时具有显著更高的吞吐量和更低的延迟。

## （可选）使用MLPerf推理loadgen比较部署模型的性能

MLPerf 推断是一个基准套件，用于衡量系统在各种部署场景中运行模型的速度。MLPerf 现在是衡量模型性能的行业标准方式。您可以按照 https://github.com/tensorflow/tpu/tree/master/models/experimental/inference/load_test 上的说明来运行已部署模型的 MLPerf 推断基准测试。

## （可选）比较预测结果

在本示例中，Vertex AI Prediction优化的TensorFlow运行时将`allow_precision_affecting_optimizations`标志设置为`true`，以获得额外的加速。现在让我们检查这些优化如何影响预测结果。

我们将启用有损优化的优化TensorFlow运行时与TF2.7在51200个请求上的模型预测结果进行比较。

In [None]:
def get_predictions(endpoint, requests_file_path):
    """Collect predictions for every request in a JSON-lines file over REST.

    Each non-blank line is parsed as JSON, wrapped by `build_rest_request`,
    and POSTed to the `predict_http_uri` of the endpoint's first deployed
    model. The first element of every entry in the response's "outputs" list
    is collected.

    Args:
        endpoint: Endpoint name passed to `get_endpoint`.
        requests_file_path: Path of the JSON-lines requests file, opened with
            `tf.io.gfile.GFile`.

    Returns:
        A numpy array holding the collected values in request order.

    Raises:
        RuntimeError: If a request is answered with a status other than 200.
    """
    endpoint_dict = endpoint_service_client.get_endpoint(name=endpoint)
    predict_uri = endpoint_dict.deployed_models[0].private_endpoints.predict_http_uri

    predictions = []
    with tf.io.gfile.GFile(requests_file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            request = build_rest_request(json.loads(line))
            response = r.post(predict_uri, data=request)
            if response.status_code != 200:
                # Report the failure here; otherwise an error body surfaces
                # later as an opaque KeyError on "outputs".
                raise RuntimeError(
                    f"Prediction request for line {line_number} of "
                    f"{requests_file_path} failed with status "
                    f"{response.status_code}: {response.text}"
                )
            predictions.extend(
                output[0] for output in json.loads(response.text)["outputs"]
            )

    return np.array(predictions)

In [None]:
# Predictions from the TF 2.7 GPU deployment for the shared request file.
tf27_gpu_predictions = get_predictions(
    endpoint=tf27_gpu_endpoint,
    requests_file_path=f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl",
)

In [None]:
# Predictions from the lossy optimized-runtime deployment for the same file.
tf_opt_lossy_gpu_predictions = get_predictions(
    endpoint=tf_opt_lossy_gpu_endpoint,
    requests_file_path=f"{LOCAL_DIRECTORY_FULL}/requests/requests_100_512.jsonl",
)

In [None]:
# Mean of the signed element-wise differences between the two prediction
# arrays, scaled by 100. Because the differences are signed, positive and
# negative deviations can cancel out in this figure.
np.average(tf_opt_lossy_gpu_predictions - tf27_gpu_predictions) * 100

In [None]:
# Largest absolute element-wise difference between the two prediction arrays,
# scaled by 100 (the worst-case deviation).
np.max(np.abs(tf_opt_lossy_gpu_predictions - tf27_gpu_predictions)) * 100

您可以看到，平均而言，预测结果的差异小于0.0016%；在最坏的情况下，差异为0.05%。

## 清理

完成后，可以安全地移除您创建的端点和部署的模型。

In [None]:
def cleanup(endpoint, model_name, deployed_model_id):
    """Undeploy the model, then delete the endpoint, then delete the model.

    Each step starts an operation, prints its name, and blocks on `result()`
    (printing the result) before the next step is started.

    Args:
        endpoint: Passed as `endpoint=` to `undeploy_model` and as `name=` to
            `delete_endpoint`.
        model_name: Passed as `name=` to `delete_model`.
        deployed_model_id: Passed as `deployed_model_id=` to `undeploy_model`.
    """

    def _report_and_wait(banner, operation):
        # Print which operation is running, then block until it completes.
        print(banner, operation.operation.name)
        print(operation.result())

    _report_and_wait(
        "running undeploy_model operation:",
        endpoint_service_client.undeploy_model(
            endpoint=endpoint, deployed_model_id=deployed_model_id
        ),
    )
    _report_and_wait(
        "running delete_endpoint operation:",
        endpoint_service_client.delete_endpoint(name=endpoint),
    )
    _report_and_wait(
        "running delete_model operation:",
        model_service_client.delete_model(name=model_name),
    )

In [None]:
# Each `*_deployed_model` variable holds the response returned by
# `deploy_model(...).result()`, while `cleanup` expects the deployed model's
# ID, so pass `.deployed_model.id` rather than the whole response.
# NOTE(review): the tf27_cpu deployment cell is assumed to follow the same
# pattern as the GPU deployments — confirm.
cleanup(tf27_cpu_endpoint, tf27_cpu_model, tf27_cpu_deployed_model.deployed_model.id)
cleanup(tf27_gpu_endpoint, tf27_gpu_model, tf27_gpu_deployed_model.deployed_model.id)
cleanup(
    tf_opt_gpu_endpoint, tf_opt_gpu_model, tf_opt_gpu_deployed_model.deployed_model.id
)
cleanup(
    tf_opt_lossy_gpu_endpoint,
    tf_opt_lossy_gpu_model,
    tf_opt_lossy_gpu_deployed_model.deployed_model.id,
)

您现在也可以从GCS存储桶中删除模型。

In [None]:
# Set this to True only if you'd like to delete your bucket.
delete_bucket = False
# The removal also runs whenever the IS_TESTING environment variable is set to
# a non-empty value, regardless of `delete_bucket`.
if delete_bucket or os.getenv("IS_TESTING"):
    # Recursive, irreversible removal of BUCKET_URI.
    !gsutil rm -r $BUCKET_URI