In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Vertex SDK：使用自定义容器（即预构建容器）训练和部署 TensorFlow 模型

## 安装

安装最新（预览）版本的Vertex SDK。

In [None]:
! pip3 install -U google-cloud-aiplatform --user

也要安装Google的*云存储*库。

In [None]:
! pip3 install google-cloud-storage

### 重新启动内核

一旦您安装了Vertex SDK和Google *cloud-storage*，您需要重新启动笔记本内核以便它可以找到这些软件包。

In [None]:
import os

# NOTE(review): the trailing `and False` makes this condition always false, so
# the automatic restart below never runs as written. Restart the kernel
# manually after the installs, or drop `and False` if auto-restart is intended.
if not os.getenv("AUTORUN") and False:
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    # The True argument requests a restart rather than a plain shutdown.
    app.kernel.do_shutdown(True)

## 开始之前

### GPU 运行时

*如果可以选择，请确保在 GPU 运行时下运行此笔记本。在 Colab 中，选择* **运行时 > 更改运行时类型 > GPU**

### 设置您的 GCP 项目

**无论您的笔记本环境如何，都需要执行以下步骤。**

1. [选择或创建一个 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得 $300 的免费信用额度，以支付计算/存储成本。

2. [确保为您的项目启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 Vertex AI API 和 Compute Engine API。](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已经安装在 Google Cloud 笔记本中。

5. 在下面的单元格中输入您的项目 ID。然后运行该单元格，以确保 Cloud SDK 对本笔记本中的所有命令使用正确的项目。

**注意**：Jupyter 以 `!` 为前缀运行的行作为 shell 命令，并将以 `$` 为前缀的 Python 变量插值到这些命令中。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改`REGION`变量，该变量用于本笔记本的其余操作。以下是 Vertex AI 支持的区域。我们建议尽可能选择距离您最近的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能使用多区域存储存储桶来进行 Vertex 的训练。并非所有区域都支持所有 Vertex 服务。有关每个区域的最新支持，请参阅[Vertex AI 服务的区域支持](https://cloud.google.com/vertex-ai/docs/general/locations)。

In [None]:
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您正在进行实时教程会话，您可能正在使用共享测试帐户或项目。为了避免在创建的资源上发生名称冲突，您为每个实例会话创建一个时间戳，并附加到将在此教程中创建的资源名称上。

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### 验证您的GCP帐户

**如果您正在使用Google Cloud笔记本**，您的环境已经通过验证。请跳过此步骤。

*注意：如果您在Vertex笔记本上并运行该单元格，该单元格会知道跳过执行身份验证步骤。*

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on Vertex, then don't execute this code
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud
    ! gcloud auth login

### 创建云存储桶

**无论您的笔记本环境如何，都需要执行以下步骤。**

本教程旨在使用位于公共云存储桶中的训练数据以及本地云存储桶用于批量预测。您也可以使用存储在本地云存储桶中的自有训练数据。

请在下方设置您的云存储桶名称。它必须在所有云存储桶中是唯一的。

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
if BUCKET_NAME == "" or BUCKET_NAME is None or BUCKET_NAME == "[your-bucket-name]":
    BUCKET_NAME = PROJECT_ID + "aip-" + TIMESTAMP

**仅当您的存储桶尚不存在时**：运行以下单元格来创建您的 Cloud Storage 存储桶。

In [None]:
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查Cloud Storage存储桶的内容来验证访问权限：

In [None]:
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在教程中使用的变量。
### 导入库并定义常量

#### 导入Vertex SDK

将Vertex SDK导入到我们的Python环境中。

In [None]:
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson, ParseDict
from google.protobuf.struct_pb2 import Struct, Value

设置以下常量给 Vertex AI：

- `API_ENDPOINT`：用于数据集、模型、作业、管道和端点服务的 Vertex AI API 服务端点。
- `API_PREDICT_ENDPOINT`：预测的 Vertex AI API 服务端点。
- `PARENT`：用于数据集、模型和端点资源的 Vertex AI 位置根路径。

In [None]:
# API Endpoint
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

# Vertex AI location root path for your dataset, model and endpoint resources
PARENT = "projects/" + PROJECT_ID + "/locations/" + REGION

## 客户端

Vertex SDK 以客户端/服务器模型工作。在您的一边（Python脚本）您将创建一个客户端，用来向服务器（Vertex）发送请求并接收响应。

您将在本教程中使用几个客户端，因此请提前设置它们。

- 数据集服务用于管理数据集。
- 模型服务用于管理模型。
- 管道服务用于训练。
- 终端服务用于部署。
- 作业服务用于批处理作业和定制训练。
- 预测服务用于提供结果。*注意*：预测有一个不同的服务端点。

In [None]:
# Every service client below is pointed at the same regional API endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_model_client():
    """Return a Model service client bound to the shared client options."""
    return aip.ModelServiceClient(client_options=client_options)


def create_endpoint_client():
    """Return an Endpoint service client bound to the shared client options."""
    return aip.EndpointServiceClient(client_options=client_options)


def create_prediction_client():
    """Return a Prediction service client bound to the shared client options."""
    return aip.PredictionServiceClient(client_options=client_options)


def create_job_client():
    """Return a Job service client bound to the shared client options."""
    return aip.JobServiceClient(client_options=client_options)


# Build all clients up front; later cells look them up by service name.
clients = {
    "model": create_model_client(),
    "endpoint": create_endpoint_client(),
    "prediction": create_prediction_client(),
    "job": create_job_client(),
}

# Show each (name, client) pair as a quick sanity check.
for client in clients.items():
    print(client)

## 准备训练脚本

### 软件包组装

In [None]:
! rm -rf cifar
! mkdir cifar
! touch cifar/README.md

setup_cfg = "[egg_info]\n\
tag_build =\n\
tag_date = 0"
! echo "$setup_cfg" > cifar/setup.cfg

setup_py = "import setuptools\n\
# Requires TensorFlow Datasets\n\
setuptools.setup(\n\
    install_requires=[\n\
        'tensorflow_datasets==1.3.0',\n\
    ],\n\
    packages=setuptools.find_packages())"
! echo "$setup_py" > cifar/setup.py

pkg_info = "Metadata-Version: 1.0\n\
Name: Custom Training CIFAR-10\n\
Version: 0.0.0\n\
Summary: Demonstration training script\n\
Home-page: www.google.com\n\
Author: Google\n\
Author-email: aferlitsch@google.com\n\
License: Public\n\
Description: Demo\n\
Platform: Vertex AI"
! echo "$pkg_info" > cifar/PKG-INFO

! mkdir cifar/trainer
! touch cifar/trainer/__init__.py

### 编写 Dockerfile 内容

In [None]:
%%writefile cifar/Dockerfile

FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-1
WORKDIR /root

WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.task"]


### task.py 内容

In [None]:
%%writefile cifar/trainer/task.py
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
import argparse
import os
import sys

tfds.disable_progress_bar()

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', dest='model_dir',
                    default='/tmp/saved_model', type=str, help='Model dir.')
parser.add_argument('--lr', dest='lr',
                    default=0.01, type=float,
                    help='Learning rate.')
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--steps', dest='steps',
                    default=200, type=int,
                    help='Number of steps per epoch.')
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    help='distributed training strategy')
args = parser.parse_args()

print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
print('DEVICES', device_lib.list_local_devices())

# Pick the tf.distribute strategy requested via --distribute.
if args.distribute == 'single':
    # tf.test.is_gpu_available() is deprecated; ask for the visible GPUs instead.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
elif args.distribute == 'multi':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
else:
    # Fail fast with a clear message instead of a NameError on `strategy` below.
    raise ValueError(
        "Unsupported --distribute value {!r}; expected 'single', 'mirror' or "
        "'multi'.".format(args.distribute)
    )

print('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))

BUFFER_SIZE = 10000
BATCH_SIZE = 64

def make_datasets_unbatched():
  """Return the CIFAR-10 train split as a scaled, cached, shuffled, repeating dataset.

  Images are cast to float32 and divided by 255.0. The dataset is left
  unbatched so the caller can choose the batch size.
  """
  def scale(image, label):
    # Cast to float32 and divide by 255.0; the label passes through unchanged.
    return tf.cast(image, tf.float32) / 255.0, label

  splits, _ = tfds.load(name='cifar10', with_info=True, as_supervised=True)
  scaled_train = splits['train'].map(scale)
  return scaled_train.cache().shuffle(BUFFER_SIZE).repeat()

def build_and_compile_cnn_model():
  """Build and compile the CNN classifier trained by this script.

  Two Conv2D + MaxPooling2D stages feed a flattened 10-way softmax layer. The
  model is compiled with SGD at the --lr learning rate, sparse categorical
  cross-entropy loss and an accuracy metric.
  """
  keras_layers = tf.keras.layers
  layer_stack = [
      keras_layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3)),
      keras_layers.MaxPooling2D(),
      keras_layers.Conv2D(32, 3, activation='relu'),
      keras_layers.MaxPooling2D(),
      keras_layers.Flatten(),
      keras_layers.Dense(10, activation='softmax'),
  ]
  cnn = tf.keras.Sequential(layer_stack)

  sgd = tf.keras.optimizers.SGD(learning_rate=args.lr)
  cnn.compile(
      optimizer=sgd,
      loss=tf.keras.losses.sparse_categorical_crossentropy,
      metrics=['accuracy'])
  return cnn

NUM_WORKERS = strategy.num_replicas_in_sync
GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS
train_dataset = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE)

with strategy.scope():
  model = build_and_compile_cnn_model()

model.fit(x=train_dataset, epochs=args.epochs, steps_per_epoch=args.steps)
model.save(args.model_dir)


### 本地构建容器

In [None]:
TRAIN_IMAGE = f"gcr.io/{PROJECT_ID}/cifar_migration:v1"

! docker build cifar -t $TRAIN_IMAGE

### 注册您的自定义容器

In [None]:
! docker push $TRAIN_IMAGE

## 训练模型

### [projects.locations.customJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.customJobs/create)

### [项目.位置.自定义作业.创建](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.customJobs/create)

#### 请求

In [None]:
JOB_NAME = "custom_container_" + TIMESTAMP

WORKER_POOL_SPEC = [
    {
        "replica_count": 1,
        "machine_spec": {"machine_type": "n1-standard-4", "accelerator_count": 0},
        "container_spec": {
            "image_uri": TRAIN_IMAGE,
            "args": [
                "--model-dir=" + "gs://" + BUCKET_NAME + "/" + JOB_NAME,
                "--epochs=" + str(20),
                "--steps=" + str(100),
            ],
        },
    }
]

CUSTOM_JOB = {
    "display_name": JOB_NAME,
    "job_spec": {"worker_pool_specs": WORKER_POOL_SPEC},
}

training_job = aip.CustomJob(**CUSTOM_JOB)

print(
    MessageToJson(
        aip.CreateCustomJobRequest(parent=PARENT, custom_job=training_job).__dict__[
            "_pb"
        ]
    )
)

```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "customJob": {
    "displayName": "custom_container_20210226022223",
    "jobSpec": {
      "workerPoolSpecs": [
        {
          "machineSpec": {
            "machineType": "n1-standard-4"
          },
          "replicaCount": "1",
          "containerSpec": {
            "imageUri": "gcr.io/migration-ucaip-training/cifar_migration:v1",
            "args": [
              "--model-dir=gs://migration-ucaip-trainingaip-20210226022223/custom_container_20210226022223",
              "--epochs=20",
              "--steps=100"
            ]
          }
        }
      ]
    }
  }
}
```

#### 调用

In [None]:
request = clients["job"].create_custom_job(parent=PARENT, custom_job=training_job)

### 回应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/customJobs/957560278583607296",
  "displayName": "custom_container_20210226022223",
  "jobSpec": {
    "workerPoolSpecs": [
      {
        "machineSpec": {
          "machineType": "n1-standard-4"
        },
        "replicaCount": "1",
        "diskSpec": {
          "bootDiskType": "pd-ssd",
          "bootDiskSizeGb": 100
        },
        "containerSpec": {
          "imageUri": "gcr.io/migration-ucaip-training/cifar_migration:v1",
          "args": [
            "--model-dir=gs://migration-ucaip-trainingaip-20210226022223/custom_container_20210226022223",
            "--epochs=20",
            "--steps=100"
          ]
        }
      }
    ]
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-02-26T02:27:53.406955Z",
  "updateTime": "2021-02-26T02:27:53.406955Z"
}
```

In [None]:
# The full unique ID for the custom training job
custom_training_id = request.name
# The short numeric ID for the custom training job
custom_training_short_id = custom_training_id.split("/")[-1]

print(custom_training_id)

### [projects.locations.customJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.customJobs/get)

### [projects.locations.customJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.customJobs/get)

#### 调用

In [None]:
request = clients["job"].get_custom_job(name=custom_training_id)

#### 回复

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

{
  "name": "projects/116273516712/locations/us-central1/customJobs/957560278583607296",
  "displayName": "custom_container_20210226022223",
  "jobSpec": {
    "workerPoolSpecs": [
      {
        "machineSpec": {
          "machineType": "n1-standard-4"
        },
        "replicaCount": "1",
        "diskSpec": {
          "bootDiskType": "pd-ssd",
          "bootDiskSizeGb": 100
        },
        "containerSpec": {
          "imageUri": "gcr.io/migration-ucaip-training/cifar_migration:v1",
          "args": [
            "--model-dir=gs://migration-ucaip-trainingaip-20210226022223/custom_container_20210226022223",
            "--epochs=20",
            "--steps=100"
          ]
        }
      }
    ]
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-02-26T02:27:53.406955Z",
  "updateTime": "2021-02-26T02:27:53.406955Z"
}

In [None]:
# Poll the custom training job once a minute until it reaches a terminal state.
# A CustomJob reports a JobState, so compare against JobState members rather
# than PipelineState (which belongs to training pipelines).
while True:
    response = clients["job"].get_custom_job(name=custom_training_id)
    if response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print("Training Time:", response.end_time - response.start_time)
        break
    print("Training job has not completed:", response.state)
    # Stop waiting on any unsuccessful terminal state; a cancelled job would
    # otherwise keep this loop polling forever.
    if response.state in (
        aip.JobState.JOB_STATE_FAILED,
        aip.JobState.JOB_STATE_CANCELLED,
    ):
        break
    time.sleep(60)

# model artifact output directory on Google Cloud Storage
# (the value of the first container argument, "--model-dir=<gcs path>")
model_artifact_dir = (
    response.job_spec.worker_pool_specs[0].container_spec.args[0].split("=")[-1]
)
print("artifact location  " + model_artifact_dir)

## 部署模型

### 加载保存的模型

In [None]:
import tensorflow as tf

model = tf.keras.models.load_model(model_artifact_dir)

### 图像数据的服务功能

In [None]:
CONCRETE_INPUT = "numpy_inputs"


def _preprocess(bytes_input):
    """Decode one JPEG byte string into a 32x32 RGB float32 image in [0, 1].

    Args:
        bytes_input: scalar string tensor holding the encoded JPEG bytes.

    Returns:
        A float32 tensor of shape (32, 32, 3).
    """
    decoded = tf.io.decode_jpeg(bytes_input, channels=3)
    # convert_image_dtype already rescales uint8 [0, 255] to float32 [0, 1].
    # Dividing by 255 again would feed the model inputs 255x smaller than the
    # ones it was trained on (the trainer scales by 255 exactly once).
    decoded = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize(decoded, size=(32, 32))
    return resized


@tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
def preprocess_fn(bytes_inputs):
    """Apply `_preprocess` to each element of a 1-D string tensor.

    Returns a dict with a single entry, keyed by CONCRETE_INPUT, holding the
    float32 results produced by `tf.map_fn`.
    """
    decoded_images = tf.map_fn(
        _preprocess, bytes_inputs, dtype=tf.float32, back_prop=False
    )
    return {
        CONCRETE_INPUT: decoded_images
    }  # User needs to make sure the key matches model's input


m_call = tf.function(model.call).get_concrete_function(
    [tf.TensorSpec(shape=[None, 32, 32, 3], dtype=tf.float32, name=CONCRETE_INPUT)]
)


@tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
def serving_fn(bytes_inputs):
    """Serving entry point: preprocess the string inputs, then call the model.

    The dict returned by `preprocess_fn` is unpacked as keyword arguments into
    the concrete model function `m_call`, and its result is returned.
    """
    images = preprocess_fn(bytes_inputs)
    prob = m_call(**images)
    return prob


tf.saved_model.save(
    model,
    model_artifact_dir,
    signatures={
        "serving_default": serving_fn,
    },
)

获取服务函数签名

In [None]:
loaded = tf.saved_model.load(model_artifact_dir)

input_name = list(
    loaded.signatures["serving_default"].structured_input_signature[1].keys()
)[0]

print("Serving function input:", input_name)

*示例输出*：
```
Serving function input: bytes_inputs
```

### [projects.locations.models.upload](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models/upload)

### [projects.locations.models.upload](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models/upload)

#### 请求

In [None]:
# Serving container settings for the uploaded model.
container_spec = {
    "image_uri": "gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-1:latest",
    # Placeholder environment variable passed to the serving container.
    "env": [{"name": "example_env_name", "value": "example_env_value"}],
    "ports": [{"container_port": 8080}],
}

model = {
    "display_name": "custom_container_TF" + TIMESTAMP,
    "metadata_schema_uri": "",
    "artifact_uri": model_artifact_dir,
    "container_spec": container_spec,
}

print(MessageToJson(aip.UploadModelRequest(parent=PARENT, model=model).__dict__["_pb"]))

{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "model": {
    "displayName": "custom_container_TF20210226022223",
    "containerSpec": {
      "imageUri": "gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-1:latest",
      "env": [
        {
          "name": "example_env_name",
          "value": "example_env_value"
        }
      ],
      "ports": [
        {
          "containerPort": 8080
        }
      ]
    },
    "artifactUri": "gs://migration-ucaip-trainingaip-20210226022223/custom_container_20210226022223"
  }
}

#### 调用

In [None]:
request = clients["model"].upload_model(parent=PARENT, model=model)

#### 回复

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

{
  "model": "projects/116273516712/locations/us-central1/models/394223297069318144"
}

In [None]:
model_id = result.model

批量进行预测

生成批量预测文件

In [None]:
import cv2
import numpy as np
from tensorflow.keras.datasets import cifar10

# Only the test split is needed; scale its pixels to float32 in [0, 1].
(_, _), (x_test, y_test) = cifar10.load_data()
x_test = (x_test / 255.0).astype(np.float32)

print(x_test.shape, y_test.shape)

test_image_1, test_label_1 = x_test[0], y_test[0]
test_image_2, test_label_2 = x_test[1], y_test[1]

# Restore 0-255 uint8 pixels before JPEG encoding. The CIFAR-10 arrays are RGB,
# while cv2.imwrite expects BGR channel order, so convert first; otherwise the
# saved images have red and blue swapped relative to the training data.
cv2.imwrite(
    "tmp1.jpg",
    cv2.cvtColor((test_image_1 * 255).astype(np.uint8), cv2.COLOR_RGB2BGR),
)
cv2.imwrite(
    "tmp2.jpg",
    cv2.cvtColor((test_image_2 * 255).astype(np.uint8), cv2.COLOR_RGB2BGR),
)

! gsutil cp tmp1.jpg gs://$BUCKET_NAME/tmp1.jpg
! gsutil cp tmp2.jpg gs://$BUCKET_NAME/tmp2.jpg

test_item_1 = "gs://" + BUCKET_NAME + "/" + "tmp1.jpg"
test_item_2 = "gs://" + BUCKET_NAME + "/" + "tmp2.jpg"

创建批量输入文件

现在创建一个批量输入文件，并将其存储在您的 Cloud Storage 存储桶中。批量输入文件可以是 CSV 或 JSONL 格式；本教程使用 JSONL 格式。JSONL 文件的每一行是一个字典，对应一个数据项（实例）。该字典包含以下键/值对：

- 键：服务函数的输入名称（即上面获取的 `input_name`，本例中为 `bytes_inputs`）。
- 值：形如 `{"b64": ...}` 的字典，其中 `b64` 对应图像文件（本例中为 JPEG）内容经 base64 编码后的字符串。

In [None]:
import base64
import json

gcs_input_uri = "gs://" + BUCKET_NAME + "/" + "test.jsonl"
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    # Write one JSONL line per test image: the serving input name mapped to the
    # base64-encoded file contents.
    for test_item in (test_item_1, test_item_2):
        # Named image_bytes rather than `bytes` so the builtin is not shadowed.
        image_bytes = tf.io.read_file(test_item)
        b64str = base64.b64encode(image_bytes.numpy()).decode("utf-8")
        data = {input_name: {"b64": b64str}}
        f.write(json.dumps(data) + "\n")

! gsutil cat $gcs_input_uri

{"bytes_inputs": {"b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD9qIntrti9vhg3KkLwR69Kbc3FrYskd1LGjOjsqNjJCjLH8Mj8xXw3+yr+3v8ABbUZL2/8L/G/4ja2L0raac/xAvEbTmndtyLFKOd5AwcZwCSccV6X8Xv22/jD4K+L2n+BPA/7H+qeP4v7LSb/AISLQNYjW0ieTmWLfIoUBQiksxA6VxwxtN0VOWn4nTPC1Y1XBHpuqftI6BZ+MrDw/FZSw2dyzRyXl3p8g/eblCgbcjBG/k8dPevU1tCWIKj/AL5r5+8aftTfCqx+H9leeM/i1pXw51aWJvtWkWF1b6ldQnkqnmRqyg9c7fXGag/Zm/aY+HL69d6MPjvr/jVNWm32M19pcgSwREyVZygAJO7PbAFZ08TUjNqpt32/AdSiuVOK2PyC/Zs/4LOfs7/s+fAbQvgz4K/Ywu7rw94Bd4op9WsbfUZ1u5CGlupHBBLSMCd2MYAA4Fe0eGf+Dm/4deO9EuvDvhvSLjSWt7MpPaw+DfNiihYgNvRWK4/hyRjn3r8WvjN8MviF4C+LPiPTvhtZ6lDo8l86W6QswDID0IHUA5x7Ve/ZF1f9pX4C/Gq1+Ifw90PV7e6mgms71o7QP58EowyMrgqwJCnB9K3w+UQxleFF4hw52lzSb5Y3aXM7Juy3dtbHRRzrCu0qlKEl17/fc/W6f/gsjpGtX40z4Zadp1280IVYYPAdsv70nO8ZQnPPToK7z4a/tKftD/ETU7TQPEur6nbpdgMmnrFHak5PUwwquPq3Wvk34QwftUfE/GtfE3xmnhm0LAiy0SwhiupgezSxouzPfb+dfdv7DPwl0rQtcivhZx4Ub1eWQtJu6lmZslmPqfWnmXD+DyjESgsSq1usYyjF+a5tWvkh18+w+IXJQpJeZ//Z"}}
{"bytes_inputs": {"b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEB...

### [projects.locations.batchPredictionJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/create)

### [projects.locations.batchPredictionJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/create)

#### 请求

In [None]:
# Batch prediction job: read JSONL instances from Cloud Storage, run them
# through the uploaded model, and write JSONL predictions back to the bucket.
batch_prediction_job = {
    "display_name": "custom_container_TF" + TIMESTAMP,
    "model": model_id,
    "input_config": {
        "instances_format": "jsonl",
        "gcs_source": {"uris": [gcs_input_uri]},
    },
    "model_parameters": ParseDict(
        {"confidenceThreshold": 0.5, "maxPredictions": 2}, Value()
    ),
    "output_config": {
        "predictions_format": "jsonl",
        "gcs_destination": {
            "output_uri_prefix": "gs://" + f"{BUCKET_NAME}/batch_output/"
        },
    },
    "dedicated_resources": {
        # CPU-only machine: say so with accelerator_count, matching the training
        # and deployment machine specs (this previously set accelerator_type to 0).
        "machine_spec": {"machine_type": "n1-standard-2", "accelerator_count": 0},
        "starting_replica_count": 1,
        "max_replica_count": 1,
    },
}

print(
    MessageToJson(
        aip.CreateBatchPredictionJobRequest(
            parent=PARENT, batch_prediction_job=batch_prediction_job
        ).__dict__["_pb"]
    )
)

{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "batchPredictionJob": {
    "displayName": "custom_container_TF20210226022223",
    "model": "projects/116273516712/locations/us-central1/models/394223297069318144",
    "inputConfig": {
      "instancesFormat": "jsonl",
      "gcsSource": {
        "uris": [
          "gs://migration-ucaip-trainingaip-20210226022223/test.jsonl"
        ]
      }
    },
    "modelParameters": {
      "confidenceThreshold": 0.5,
      "maxPredictions": 2.0
    },
    "outputConfig": {
      "predictionsFormat": "jsonl",
      "gcsDestination": {
        "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226022223/batch_output/"
      }
    },
    "dedicatedResources": {
      "machineSpec": {
        "machineType": "n1-standard-2"
      },
      "startingReplicaCount": 1,
      "maxReplicaCount": 1
    }
  }
}

#### 调用

In [None]:
request = clients["job"].create_batch_prediction_job(
    parent=PARENT, batch_prediction_job=batch_prediction_job
)

回应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

```
{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/2465140253845880832",
  "displayName": "custom_container_TF20210226022223",
  "model": "projects/116273516712/locations/us-central1/models/394223297069318144",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210226022223/test.jsonl"
      ]
    }
  },
  "modelParameters": {
    "maxPredictions": 2.0,
    "confidenceThreshold": 0.5
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226022223/batch_output/"
    }
  },
  "dedicatedResources": {
    "machineSpec": {
      "machineType": "n1-standard-2"
    },
    "startingReplicaCount": 1,
    "maxReplicaCount": 1
  },
  "manualBatchTuningParameters": {},
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-02-26T09:39:46.357554Z",
  "updateTime": "2021-02-26T09:39:46.357554Z"
}
```

In [None]:
# The fully qualified ID for the batch job
batch_job_id = request.name
# The short numeric ID for the batch job
batch_job_short_id = batch_job_id.split("/")[-1]

print(batch_job_id)

### [projects.locations.batchPredictionJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/get)

### [projects.locations.batchPredictionJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/get)

#### 调用

In [None]:
request = clients["job"].get_batch_prediction_job(name=batch_job_id)

#### 响应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/2465140253845880832",
  "displayName": "custom_container_TF20210226022223",
  "model": "projects/116273516712/locations/us-central1/models/394223297069318144",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210226022223/test.jsonl"
      ]
    }
  },
  "modelParameters": {
    "confidenceThreshold": 0.5,
    "maxPredictions": 2.0
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226022223/batch_output/"
    }
  },
  "dedicatedResources": {
    "machineSpec": {
      "machineType": "n1-standard-2"
    },
    "startingReplicaCount": 1,
    "maxReplicaCount": 1
  },
  "manualBatchTuningParameters": {},
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-02-26T09:39:46.357554Z",
  "updateTime": "2021-02-26T09:39:46.357554Z"
}

In [None]:
def get_latest_predictions(gcs_out_dir):
    """ Get the latest prediction subfolder using the timestamp in the subfolder name"""
    folders = !gsutil ls $gcs_out_dir
    latest = ""
    for folder in folders:
        subfolder = folder.split("/")[-2]
        if subfolder.startswith("prediction-"):
            if subfolder > latest:
                latest = folder[:-1]
    return latest


while True:
    response = clients["job"].get_batch_prediction_job(name=batch_job_id)
    if response.state != aip.JobState.JOB_STATE_SUCCEEDED:
        print("The job has not completed:", response.state)
        if response.state == aip.JobState.JOB_STATE_FAILED:
            break
    else:
        folder = get_latest_predictions(
            response.output_config.gcs_destination.output_uri_prefix
        )
        ! gsutil ls $folder/prediction*

        ! gsutil cat $folder/prediction*
        break
    time.sleep(60)

*示例输出*：
```
gs://migration-ucaip-trainingaip-20210226022223/batch_output/prediction-custom_container_TF20210226022223-2021_02_26T01_39_46_305Z/prediction.errors_stats-00000-of-00001
gs://migration-ucaip-trainingaip-20210226022223/batch_output/prediction-custom_container_TF20210226022223-2021_02_26T01_39_46_305Z/prediction.results-00000-of-00001
{"instance": {"bytes_inputs": {"b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3...
{"instance": {"bytes_inputs": {"b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3...
```

## 进行在线预测

### [projects.locations.endpoints.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/create)

### [projects.locations.endpoints.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/create)

### 请求

In [None]:
endpoint = {"display_name": "custom_container_TF" + TIMESTAMP}

print(
    MessageToJson(
        aip.CreateEndpointRequest(parent=PARENT, endpoint=endpoint).__dict__["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "endpoint": {
    "displayName": "custom_container_TF20210226022223"
  }
}
```
*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "endpoint": {
    "displayName": "custom_container_TF20210226022223"
  }
}
```

#### 调用

In [None]:
request = clients["endpoint"].create_endpoint(parent=PARENT, endpoint=endpoint)

#### 回复

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*范例输出*：
```
{
    "name": "projects/116273516712/locations/us-central1/endpoints/2977125644296519680"
}
```

In [None]:
# The full unique ID for the endpoint
endpoint_id = result.name
# The short numeric ID for the endpoint
endpoint_short_id = endpoint_id.split("/")[-1]

print(endpoint_id)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

#### 请求

In [None]:
deployed_model = {
    "model": model_id,
    "display_name": "custom_container_TF" + TIMESTAMP,
    "dedicated_resources": {
        "min_replica_count": 1,
        "machine_spec": {"machine_type": "n1-standard-4", "accelerator_count": 0},
    },
}

print(
    MessageToJson(
        aip.DeployModelRequest(
            endpoint=endpoint_id,
            deployed_model=deployed_model,
            traffic_split={"0": 100},
        ).__dict__["_pb"]
    )
)

```json
{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/2977125644296519680",
  "deployedModel": {
    "model": "projects/116273516712/locations/us-central1/models/394223297069318144",
    "displayName": "custom_container_TF20210226022223",
    "dedicatedResources": {
      "machineSpec": {
        "machineType": "n1-standard-4"
      },
      "minReplicaCount": 1
    }
  },
  "trafficSplit": {
    "0": 100
  }
}
```

#### 调用

In [None]:
request = clients["endpoint"].deploy_model(
    endpoint=endpoint_id, deployed_model=deployed_model, traffic_split={"0": 100}
)

#### 回应

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

{
  "deployedModel": {
    "id": "1297564458264035328"
  }
}

In [None]:
# The unique ID for the deployed model
deployed_model_id = result.deployed_model.id

print(deployed_model_id)

### [projects.locations.endpoints.predict](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

### [项目.位置.端点.预测](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

### 为在线预测准备文件

In [None]:
import base64

import cv2

# Use the first test sample as the example input for online prediction.
test_image = x_test[0]
test_label = y_test[0]

print(test_image.shape)

# Write the sample to disk as a JPEG. The multiplication by 255 assumes the
# pixel values were previously scaled to [0, 1] -- TODO confirm against the
# preprocessing cell.
# NOTE(review): cv2.imwrite treats 3-channel arrays as BGR; if test_image is
# RGB the saved file has red/blue swapped -- confirm this is acceptable.
cv2.imwrite("tmp.jpg", (test_image * 255).astype(np.uint8))

# Read the JPEG back as raw bytes and base64-encode it for the JSON request.
# The intermediate is named `image_bytes` rather than `bytes` so the built-in
# `bytes` type is not shadowed for the rest of the kernel session.
image_bytes = tf.io.read_file("tmp.jpg")
b64str = base64.b64encode(image_bytes.numpy()).decode("utf-8")

#### 请求

In [None]:
# One prediction instance: the base64-encoded JPEG under the "bytes_inputs" key.
image_instance = {"bytes_inputs": {"b64": b64str}}
instances_list = [image_instance]

# Build a request object only to preview it as JSON; the real call is made in
# a later cell.
# NOTE(review): append() adds the whole list as ONE element, so the preview
# shows a nested list, whereas the predict call in a later cell passes
# instances_list directly -- confirm which shape is intended.
prediction_request = aip.PredictRequest(endpoint=endpoint_id)
prediction_request.instances.append(instances_list)

request_preview_pb = prediction_request.__dict__["_pb"]
print(MessageToJson(request_preview_pb))

{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/2977125644296519680",
  "instances": [
    [
      {
        "bytes_inputs": {
          "b64": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCAAgACADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD570PxBpmp6nfaEl48lzpUqpewPCU8lpEDqMsOeD26Z55Fa+s3HhnR/Aj6xZjV7rWrW4ke/wBMtLRGRLTaux1cuPnLlhtIAAUEE5490/ao8E6F4b8P3NxZeGksNW1z4h62Iby2t1/eC3ZoozJxwSiKQOhEZJ5JrqZtI8MftFfs56j8YI/hvo/gq1u9C0ywlbTbFoLa+1SOFWlgPGRmNiQzNkiPOflyf1WHFdark0K8UlUbkvJWel1vqmn5n5MuD6MM7qUJzbpxUXazvJSWtmuzTR8iaBoXirx54H1Hxo10mhx2V/8AZltpEE7ByAV8w8YLdRjAHAz1NcSNcXUtev8AwVrE0DajaQ+YZLY4jnXPJXrkjPPTPXGDXvXwi+F3hvwh8Ffip4i1a7GqX7a1b6fp0c84SKO3Wz3FiCdpHnSHDZ2/KAOtfP8A4v8Ah1qOoWul/Efwu4sL+wk8u2IkUi7JRhtwM5RgBkHpz0xXy+F4gzNY6Mqs3NTfvR6a6adj6bGcPZX/AGfKFKEYcqupemurufqP8c9Il/aA8BeHNS+HHh/7Ze634p0rUtMhsFWUJNdsFlR8HAAWWRXBPrmvGvi5+y/B+z1+0ZqHwW+PXx08LaL4VtJI75dOtPEksgfe8krskKIDCZWdCUkyU2MRuVga5X9lr9qAfsk/tCWPjTW9Ol1XwzpurtdXei27gBJTEyJcxBsDcu/OOAwBHBwa8S+JXxltPi3431/x34y8TT/2tqmpy3V1d6h8/mOzFiN46LkgDpgcdOK/HcPxo/qMalONqkn70ei816307I/Xa/C0XjXTrO8EtJdfR/cUfiz4m8aaBJefD/4NXcd4CJ7f/hI7bVXitZ4HkPzSQMvMxRUUTAEqFGCM4EPw/wDAsnhjwZEmrzte6ipKmWeYSbAV+bYTjAJBPTgNjNbOk+HYdL0qPxPcWsN5BK2FaO43q3fHUH8eld34kku/hP4LsvHPiPRtPvZNSkU6fYSFStvED8zsqjLsq5IBwOB1Jri/4iFn2BxSq0Yxulyq8eZLp1f4ms+BMkx2FlRquVm7u0uVvrbRH//Z"
        }
      }
    ]
  ]
}

#### 调用

In [None]:
# Send the online prediction call. Note that `request` is rebound here to the
# value returned by predict(), which the next cell prints.
request = clients["prediction"].predict(
    endpoint=endpoint_id,
    instances=instances_list,
)

#### 响应

In [None]:
# `request` holds what predict() returned in the previous cell; show it as JSON.
prediction_response_pb = request.__dict__["_pb"]
print(MessageToJson(prediction_response_pb))

*示例输出*：
```
{
   "predictions": [
     [
       0.0441863947,
       0.0965465382,
       0.131534964,
       0.111121736,
       0.133242667,
       0.0896093696,
       0.160808861,
       0.116257407,
       0.0309255011,
       0.0857665
     ]
   ],
   "deployedModelId": "1297564458264035328"
}
```

### [projects.locations.endpoints.undeployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/undeployModel)

### [projects.locations.endpoints.undeployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/undeployModel)

#### 调用

In [None]:
# Ask the endpoint client to undeploy the model identified by
# `deployed_model_id` from the endpoint, passing an empty traffic split.
request = clients["endpoint"].undeploy_model(
    endpoint=endpoint_id,
    deployed_model_id=deployed_model_id,
    traffic_split={},
)

#### 回应

In [None]:
# Obtain the outcome of the undeploy_model call and show it as JSON.
result = request.result()

undeploy_response_pb = result.__dict__["_pb"]
print(MessageToJson(undeploy_response_pb))

示例输出：
```
{}
```

## 清理工作

要清理此项目中使用的所有GCP资源，您可以[删除用于本教程的GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
delete_model = True
delete_endpoint = True
delete_custom_job = True
delete_batchjob = True
delete_bucket = True

# Delete the model using the Vertex AI fully qualified identifier for the model
try:
    if delete_model:
        clients["model"].delete_model(name=model_id)
except Exception as e:
    print(e)

# Delete the endpoint using the Vertex AI fully qualified identifier for the endpoint
try:
    if delete_endpoint:
        clients["endpoint"].delete_endpoint(name=endpoint_id)
except Exception as e:
    print(e)

# Delete the custom training using the Vertex AI fully qualified identifier for the custom training
try:
    if delete_custom_job:
        clients["job"].delete_custom_job(name=custom_training_id)
except Exception as e:
    print(e)

# Delete the batch job using the Vertex AI fully qualified identifier for the batch job
try:
    if delete_batchjob:
        clients["job"].delete_batch_prediction_job(name=batch_job_id)
except Exception as e:
    print(e)

if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME