In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用自定义容器的 Vertex AI TensorBoard 自定义训练

<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tensorboard/tensorboard_custom_training_with_custom_container.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> <br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Ftensorboard%2Ftensorboard_custom_training_with_custom_container.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"> <br> 在 Colab Enterprise 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/tensorboard/tensorboard_custom_training_with_custom_container.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> <br>
      在 Workbench 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/tensorboard/tensorboard_custom_training_with_custom_container.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"> <br>
      在 GitHub 上查看
    </a>
  </td>                                                                                               
</table>

## 概述

### 什么是Vertex AI TensorBoard

Vertex AI TensorBoard是[开源TensorBoard](https://www.tensorflow.org/tensorboard/get_started)的企业级托管版本，TensorBoard是谷歌的一个用于机器学习实验可视化的开源项目。

Vertex AI TensorBoard提供各种详细的可视化功能，包括：

* 跟踪和可视化诸如损失和准确度随时间变化的指标，
* 可视化模型计算图（操作和层），
* 查看权重、偏差或其他张量随时间变化的直方图，
* 投影嵌入到一个较低维度空间，
* 显示图像、文本和音频样本。

除了来自TensorBoard的强大可视化功能之外，Vertex AI TensorBoard还提供：

* 对实验的仪表板的持久、可共享链接，
* 项目中所有实验的可搜索列表，
* 与Vertex AI服务进行模型训练的紧密集成，
* 企业级安全、隐私和合规性。

通过Vertex AI TensorBoard，您可以跟踪、可视化和比较机器学习实验，并与团队共享。

了解更多关于[Vertex AI TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-introduction)和[自定义训练](https://cloud.google.com/vertex-ai/docs/training/custom-training)的信息。

### 目标

在本教程中，您将学习如何使用自定义容器创建自定义训练作业，并在 Vertex AI TensorBoard 上近乎实时地监控训练过程。

本教程使用以下 Google Cloud ML 服务和资源：

- Vertex AI 训练
- Vertex AI TensorBoard

执行的步骤包括：

* 创建 Docker 存储库和配置。
* 使用您定制的训练代码创建自定义容器镜像。
* 设置服务帐户和 Google Cloud 存储桶。
* 使用您的自定义容器创建和启动您的自定义训练任务。

### 数据集

本教程中使用的数据集是由TensorFlow提供的[花卉数据集](https://www.tensorflow.org/datasets/catalog/tf_flowers)。不需要其他数据集。

### 成本

本教程使用的谷歌云计费组件有：

* Vertex AI
* 云存储
* 谷歌Artifact Registry

了解[Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing), [云存储 定价](https://cloud.google.com/storage/pricing), 以及 [Google Artifact Registry 定价](https://cloud.google.com/artifact-registry/pricing)。

请使用[Pricing Calculator](https://cloud.google.com/products/calculator/)根据您的预期使用情况生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK和其他所需的软件包

In [None]:
# Install (or upgrade) the Vertex AI SDK for Python; --quiet reduces pip's output.
! pip3 install --upgrade --quiet google-cloud-aiplatform 

### 重启运行时（仅限 Colab）

为了使用新安装的包，您必须重启Google Colab上的运行时。

In [None]:
import sys

# Restart the kernel only when running inside Google Colab (detected by the
# "google.colab" module being loaded) so newly installed packages are used.
if "google.colab" in sys.modules:

    import IPython

    app = IPython.Application.instance()
    # Passing True asks the kernel to restart after shutting down.
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ 内核将重新启动。在继续下一步之前等待它完成。⚠️</b>
</div>

### 对您的笔记本环境进行身份验证（仅限 Colab）

在 Google Colab 上对您的环境进行身份验证。

In [None]:
import sys

# Authentication is only triggered when running inside Google Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### 设置Google Cloud项目信息并初始化Python的Vertex AI SDK

要开始使用Vertex AI，您必须拥有一个现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
# Google Cloud project used by every gcloud/gsutil/SDK call in this notebook.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

# Region used for the bucket, the Docker repository, the image build and the
# Vertex AI SDK calls in the cells that follow.
LOCATION = "us-central1"  # @param {type: "string"}

### UUID

为了避免在创建资源时用户之间的名称冲突，为每个会话实例创建一个UUID。将这些UUID附加到本教程中创建的资源的相应名称中。

In [None]:
import random
import string


# Generate a uuid of a specified length (default=8)
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Despite the name, this is not an RFC 4122 UUID; it is a short random
    suffix used to keep resource names distinct between sessions.

    Args:
        length: Number of characters to generate.

    Returns:
        A string of `length` characters drawn (with replacement) from
        `a-z` and `0-9`.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Session-wide suffix appended to the names of resources created later on.
UUID = generate_uuid()

### 创建云存储桶

创建一个存储桶来存储中间产物，如数据集。

In [None]:
# Cloud Storage bucket URI; the project ID is embedded in the default name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储存储桶。

In [None]:
# Make the bucket in LOCATION under PROJECT_ID.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### 导入 aiplatform 库

In [None]:
import os

import google.cloud.aiplatform as aiplatform

### 初始化 Python 的 Vertex AI SDK
为您的项目和对应的存储桶初始化 Python 的 Vertex AI SDK。

In [None]:
# Set the SDK-wide defaults: project, region and staging bucket.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### 启用Artifact Registry API
首先，您必须为您的项目启用Artifact Registry API服务。

查看更多关于[启用服务页面](https://cloud.google.com/artifact-registry/docs/enable-service)的信息。

In [None]:
# Enable the Artifact Registry API for the currently configured project.
! gcloud services enable artifactregistry.googleapis.com

# Only when the IS_TESTING environment variable is set: upgrade the installed
# Google Cloud SDK apt packages and gcloud components before continuing.
if os.getenv("IS_TESTING"):
    ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
    ! gcloud components update --quiet


### 创建Docker存储库

在您的 `LOCATION` 中创建一个名为 `DOCKER_REPOSITORY` 的 Docker 存储库。
此Docker存储库将在清理部分末尾被删除。

In [None]:
# Name of the Artifact Registry Docker repository created in the next cell
# and referenced again when building the image URI and during cleanup.
DOCKER_REPOSITORY = "my-docker-repo-unique"  # @param {type:"string"}

print("Docker repository to create:", DOCKER_REPOSITORY)

In [None]:
# Create a Docker-format Artifact Registry repository in LOCATION.
! gcloud  artifacts repositories create  $DOCKER_REPOSITORY --project={PROJECT_ID} \
--repository-format=docker \
--location={LOCATION} --description="Repository for TensorBoard Custom Training Job" 

In [None]:
# List the project's repositories to confirm the new one exists.
! gcloud artifacts repositories list --project={PROJECT_ID}

## 创建自定义容器镜像并推送到 Artifact Registry

In [None]:
# Create a folder for the image.
! mkdir tb-custom-container
# %cd changes the kernel's working directory, so the task.py and Dockerfile
# written by the following cells (relative paths) land inside this folder.
%cd tb-custom-container

### 创建训练代码
在task.py文件中编写您自己的训练代码。您可以使用以下代码作为示例。

In [None]:
%%writefile task.py

import logging
import os

import tensorflow as tf
import tensorflow_datasets as tfds

IMG_WIDTH = 128

def normalize_img(image):
    """Resize an image to a fixed square and rescale its pixel values.

    The image is resized with padding (`tf.image.resize_with_pad`) to
    IMG_WIDTH x IMG_WIDTH pixels and the result is divided by 255.

    Args:
      image: Image tensor to normalize. Presumably `uint8` pixels in
        [0, 255] with 3 color channels, as produced by the dataset --
        confirm against the caller.

    Returns:
      The resized image divided by 255. For a 3-channel input in [0, 255]
      this is a tensor of shape (IMG_WIDTH, IMG_WIDTH, 3) with values in
      [0, 1].
    """
    side = IMG_WIDTH
    resized = tf.image.resize_with_pad(image, side, side)
    scaled = resized / 255.
    return scaled


def normalize_img_and_label(image, label):
    """Normalize the image of an (image, label) pair; keep the label as is.

    Args:
      image: Image tensor, forwarded to `normalize_img`.
      label: Label associated with the image (an integer in [0, 4]
        identifying the flower type); returned untouched.

    Returns:
      Tuple `(normalized_image, label)` where `normalized_image` is the
      result of `normalize_img(image)`.
    """
    normalized_image = normalize_img(image)
    return normalized_image, label

# The root logger defaults to WARNING, which silently drops every
# logging.info() call in this script. Raise it to INFO so the progress
# messages reach the training job's logs. setLevel() is called explicitly
# because basicConfig() does nothing when a handler is already installed.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

logging.info('Loading and preprocessing data ...')
# Load the tf_flowers training split as (image, label) pairs.
dataset = tfds.load('tf_flowers:3.*.*',
                    split='train',
                    try_gcs=True,
                    shuffle_files=True,
                    as_supervised=True)
# Resize/rescale every image; labels pass through unchanged.
dataset = dataset.map(normalize_img_and_label,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
# Cache the preprocessed examples, then shuffle, batch and prefetch.
dataset = dataset.cache()
dataset = dataset.shuffle(1000)
dataset = dataset.batch(128)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

logging.info('Creating and training model ...')

# Small CNN: three Conv2D + MaxPooling2D stages followed by a dense head.
model = tf.keras.Sequential([
    # The first layer declares the input: IMG_WIDTH x IMG_WIDTH, 3 channels.
    tf.keras.layers.Conv2D(16,
                           3,
                           padding='same',
                           activation='relu',
                           input_shape=(IMG_WIDTH, IMG_WIDTH, 3)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(5)  # 5 classes
])

logging.info('Compiling model ...')
# The final Dense layer has no activation, so the loss is configured with
# from_logits=True to consume raw logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

# Use the directory given in AIP_TENSORBOARD_LOG_DIR when that environment
# variable is present; otherwise write TensorBoard logs to a local "logs" dir.
log_dir = os.environ.get('AIP_TENSORBOARD_LOG_DIR', "logs")

# histogram_freq=1 records weight histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                      histogram_freq=1)

logging.info('Training model ...')
model.fit(dataset, epochs=13, callbacks=[tensorboard_callback])

logging.info('Model training done')

创建您自己的 `Dockerfile` 来指定构建容器所需的所有指令。您可以使用以下 `Dockerfile` 作为示例。

In [None]:
%%writefile Dockerfile

# Specifies base image and tag
FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest
# task.py imports tensorflow_datasets, so it is installed on top of the base image.
RUN pip install tensorflow-datasets
# Working directory for the instructions below; the ENTRYPOINT resolves
# "task.py" relative to it.
WORKDIR /root


# Installs additional packages as you need.

# Copies the trainer code to the docker image.
COPY task.py /root/task.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "task.py"]

使用 `gcloud builds` 和您的训练代码以及 `Dockerfile` 构建您的容器镜像。

*请注意*，这一步骤可能需要几分钟时间。

In [None]:
IMAGE_NAME = "tensorboard-custom-container"
IMAGE_TAG = "v1"
IMAGE_URI = "{}-docker.pkg.dev/{}/{}/{}:{}".format(
    LOCATION, PROJECT_ID, DOCKER_REPOSITORY, IMAGE_NAME, IMAGE_TAG
)

! gcloud builds submit --project {PROJECT_ID} --region={LOCATION} --tag {IMAGE_URI} --timeout=20m

### 设置服务账号和权限

服务账号用于创建自定义训练作业。如果您不想使用您项目的Compute Engine服务账号，请将SERVICE_ACCOUNT设置为另一个服务账号ID。您可以按照此[创建服务账号](https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating)中的说明创建一个服务账号。

In [None]:
# Leave the placeholder (or an empty value) to have the next cell pick a
# service account automatically.
SERVICE_ACCOUNT = "[your-service-account]"

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = ! gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

In [None]:
# Grant the service account permission to create objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Grant the service account permission to read objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

## 使用您的容器创建自定义训练作业
创建一个 TensorBoard 实例供自定义训练作业使用。

In [None]:
TENSORBOARD_NAME = "[your-tensorboard-name]"  # @param {type:"string"}

# When no name was supplied (empty, None, or the untouched placeholder),
# derive one from the project ID and the session UUID.
if not TENSORBOARD_NAME or TENSORBOARD_NAME == "[your-tensorboard-name]":
    TENSORBOARD_NAME = f"{PROJECT_ID}-tb-{UUID}"

# Create the TensorBoard instance and keep its resource name for the
# training job and the cleanup cell.
tensorboard = aiplatform.Tensorboard.create(
    project=PROJECT_ID,
    location=LOCATION,
    display_name=TENSORBOARD_NAME,
)
TENSORBOARD_RESOURCE_NAME = tensorboard.gca_resource.name
print("TensorBoard resource name:", TENSORBOARD_RESOURCE_NAME)

运行以下示例请求，使用您刚构建并上传到Artifact Registry的容器来创建自己的自定义训练工作，并将训练结果流式传输到TensorBoard。

In [None]:
# Per-session job name and the Cloud Storage directory for this job's output.
JOB_NAME = f"tensorboard-example-job-{UUID}"
BASE_OUTPUT_DIR = f"{BUCKET_URI}/{JOB_NAME}"

# Define the training job around the custom container image built earlier.
job = aiplatform.CustomContainerTrainingJob(
    display_name=JOB_NAME,
    container_uri=IMAGE_URI,
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

# Launch a single-replica run and attach it to the TensorBoard instance.
job.run(
    machine_type="n1-standard-8",
    replica_count=1,
    service_account=SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME,
    # Store job output under the per-job directory computed above instead of
    # an SDK-generated default location in the staging bucket.
    base_output_dir=BASE_OUTPUT_DIR,
)

在 Google Cloud 控制台上，您可以在 Vertex AI > 训练 > 定制作业 中监视您的训练作业。在每个定制训练作业中，可以通过`OPEN TENSORBOARD`按钮查看近实时更新的 TensorBoard。
了解更多，请查看[查看 Vertex AI TensorBoard 数据](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-view)。

## 清理

要清理本项目中使用的所有Google Cloud资源，您可以删除用于教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，**如果您在笔记本中创建了单独的资源**，可以按照以下步骤删除它们：

In [None]:
import os

# Delete docker repository.
! gcloud artifacts repositories delete $DOCKER_REPOSITORY --project {PROJECT_ID} --location {LOCATION} --quiet

# Delete TensorBoard instance.
! gcloud ai tensorboards delete {TENSORBOARD_RESOURCE_NAME}

# Delete custom job.
job.delete()

# Delete GCS bucket.
# Disabled by default: set delete_bucket to True to remove the bucket and
# everything stored in it.
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI