In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI：跟踪自定义训练作业的参数和指标

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在Vertex AI工作台中打开
    </a>
  </td>  
</table>

## 概述

本笔记本展示了如何跟踪 Vertex AI 自定义训练作业的指标和参数，以及如何使用这些数据进行详细分析。

了解更多关于[Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata)、[自定义训练](https://cloud.google.com/vertex-ai/docs/training/custom-training)和[Vertex AI 实验](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)的信息。

### 目标

在本笔记本中，您将学习如何使用 Vertex AI SDK for Python 跟踪自定义训练作业的参数和指标，并对实验中记录的数据进行分析。

这个教程使用以下 Google Cloud ML 服务和资源：
- Vertex AI Dataset
- Vertex AI Model
- Vertex AI Endpoint
- Vertex AI Custom Training Job
- Vertex AI Experiments

执行的步骤包括：
- 跟踪自定义训练任务的训练参数和预测指标。
- 提取并分析实验中的所有参数和指标。

### 数据集

这个示例使用了“鲍鱼数据集”。有关此数据集的更多信息，请访问：https://archive.ics.uci.edu/ml/datasets/abalone

### 成本

本教程使用Google Cloud的可计费组件：

- Vertex AI
- Cloud Storage

了解[Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing)和[Cloud Storage 定价](https://cloud.google.com/storage/pricing)，并使用[价格计算器](https://cloud.google.com/products/calculator/)根据您的预期使用量生成成本估算。

### 安装

安装执行此笔记本所需的软件包。

In [None]:
! pip3 install --upgrade tensorflow \
                         google-cloud-aiplatform \
                         scikit-learn -q

仅限 Colab：取消以下单元格的注释以重新启动内核。

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 在开始之前

### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 参考支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Google Cloud project used for the gcloud configuration below and passed to
# aiplatform.init; replace the placeholder with your own project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the default for subsequent gcloud commands
! gcloud config set project {PROJECT_ID}

#### 区域

您也可以更改Vertex AI使用的`REGION`变量。了解更多关于[Vertex AI区域](https://cloud.google.com/vertex-ai/docs/general/locations)。

In [None]:
# Location passed to aiplatform.init and used as the bucket location in `gsutil mb`.
REGION = "us-central1"  # @param {type: "string"}

### 验证您的Google云账户

根据您的Jupyter环境，您可能需要手动验证。请按照以下相关说明进行操作。

1. Vertex AI Workbench
* 由于您已经通过身份验证，无需进行任何操作。

2. 本地JupyterLab实例，取消注释并运行:

In [None]:
# ! gcloud auth login

3. Colab，取消注释并运行:

In [None]:
# from google.colab import auth
# auth.authenticate_user()

请参阅 https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples ，了解如何为您的服务账号授予 Cloud Storage 权限。

### 创建一个云存储桶

创建一个存储桶，用来存储中间产物，比如数据集。

In [None]:
# Cloud Storage URI used as the SDK staging bucket and as the upload target for
# the training CSV; the project ID is embedded to help keep the name unique.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**仅当您的存储桶尚不存在时**：运行以下单元格以创建您的 Cloud Storage 存储桶。

In [None]:
# Create the bucket at BUCKET_URI in REGION, owned by PROJECT_ID.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### 导入库并定义常量

导入所需的库。

In [None]:
import os
from urllib.parse import urlparse

import pandas as pd
import tensorflow as tf
from google.cloud import aiplatform
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.python.keras.utils import data_utils

### 初始化Vertex AI并设置一个实验

定义实验名称。

In [None]:
# Experiment name passed to aiplatform.init and used again during cleanup to
# look the experiment up for deletion.
EXPERIMENT_NAME = "my-experiment-unique"

初始化*客户端*以用于Vertex AI。

In [None]:
# Configure the SDK once with the project, region, staging bucket and experiment
# defined in the cells above.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

## 在 Vertex AI 自定义训练作业中跟踪参数和指标

### 下载数据集到云存储

In [None]:
# Fetch the abalone training CSV into the working directory, then copy it to
# the data/ prefix of the bucket.
!wget https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv
!gsutil cp abalone_train.csv {BUCKET_URI}/data/

# Cloud Storage path of the uploaded CSV; used as the dataset source.
gcs_csv_path = f"{BUCKET_URI}/data/abalone_train.csv"

### 从CSV数据创建一个 Vertex AI 表格数据集

Vertex AI 数据集可用于创建 AutoML 模型或自定义模型。

In [None]:
# Create a tabular dataset named "abalone" from the CSV uploaded to Cloud Storage.
ds = aiplatform.TabularDataset.create(display_name="abalone", gcs_source=[gcs_csv_path])

# Show the dataset's resource name as the cell output.
ds.resource_name

### 编写训练脚本

接下来，您需要编写用于示例自定义训练任务的训练脚本。

In [None]:
%%writefile training_script.py

import pandas as pd
import argparse
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--num_units', dest='num_units',
                    default=64, type=int,
                    help='Number of unit for first layer.')
args = parser.parse_args()

# Column names applied when reading the CSV files; `target` names the label
# column, which is the last entry of `col_names`.
target = "Age"
col_names = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    target,
]

def aip_data_to_dataframe(wild_card_path):
    """Read every CSV file matching ``wild_card_path`` into one DataFrame.

    Matching files are listed with ``tf.data.Dataset.list_files``; each file is
    parsed with ``col_names`` as its column names and the per-file frames are
    concatenated in listing order.
    """
    frames = []
    for file_tensor in tf.data.Dataset.list_files([wild_card_path]):
        # list_files yields byte-string tensors; convert to a plain str path.
        csv_path = file_tensor.numpy().decode()
        frames.append(pd.read_csv(csv_path, names=col_names))
    return pd.concat(frames)

def get_features_and_labels(df):
    """Split ``df`` into ``(features, labels)``.

    ``features`` holds the values of every column except ``target``;
    ``labels`` holds the values of the ``target`` column.
    """
    features = df.drop(columns=target).values
    labels = df[target].values
    return features, labels

def data_prep(wild_card_path):
    """Load the CSV files matching ``wild_card_path`` and return ``(features, labels)``."""
    frame = aip_data_to_dataframe(wild_card_path)
    return get_features_and_labels(frame)


# Two stacked Dense layers: the first sized by --num_units, the second a single output.
network_layers = [layers.Dense(args.num_units), layers.Dense(1)]
model = tf.keras.Sequential(network_layers)
model.compile(optimizer='adam', loss='mse')

# The data split locations are read from AIP_* environment variables
# (expected to be provided by the training service that runs this script).
train_features, train_labels = data_prep(os.environ["AIP_TRAINING_DATA_URI"])
validation_split = data_prep(os.environ["AIP_VALIDATION_DATA_URI"])
model.fit(
    train_features,
    train_labels,
    epochs=args.epochs,
    validation_data=validation_split,
)

# Print the evaluation result on the test split.
test_features, test_labels = data_prep(os.environ["AIP_TEST_DATA_URI"])
print(model.evaluate(test_features, test_labels))

# save as Vertex AI Managed model
tf.saved_model.save(model, os.environ["AIP_MODEL_DIR"])

### 启动一个自定义训练作业并在Vertex ML Metadata上跟踪其训练参数

In [None]:
# Define (but do not yet run) a custom training job that executes
# training_script.py in the TF 2.8 CPU training image and registers the result
# with the TF 2.8 CPU prediction image for serving.
job = aiplatform.CustomTrainingJob(
    display_name="train-abalone-dist-1-replica",
    script_path="training_script.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest",
    # Extra package installed for the script; presumably needed so pandas can
    # read gs:// paths — TODO confirm.
    requirements=["gcsfs==0.7.1"],
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
)

开始一个新的实验运行来追踪训练参数并开始训练任务。请注意，此操作将耗时约10分钟。

In [None]:
# Change this to your desired run name
run_name = "custom-training-run-unique"
aiplatform.start_run(run_name)

# Record the hyperparameters on the experiment run before training starts.
parameters = {"epochs": 10, "num_units": 64}
aiplatform.log_params(parameters)

# Forward every logged parameter to the training script as --<name>=<value>,
# so the values tracked in the experiment are the ones the script receives.
script_args = [f"--{name}={value}" for name, value in parameters.items()]

model = job.run(
    ds,
    replica_count=1,
    model_display_name="abalone-model",
    args=script_args,
)

### 部署模型并计算预测指标

接下来，将您的Vertex AI模型资源部署到Vertex AI终端节点资源上。这个操作将需要10-20分钟。

In [None]:
# Deploy the trained model on n1-standard-4 machines; the returned endpoint is
# what the online prediction cell calls.
endpoint = model.deploy(machine_type="n1-standard-4")

### 预测数据集准备和在线预测

一旦模型部署完成，使用“鲍鱼测试”数据集进行在线预测，并计算预测指标。

准备预测数据集。

In [None]:
def read_data(uri):
    """Download the CSV at ``uri`` and load it with the abalone column names.

    Args:
        uri: URL of a comma-separated file. Column names are supplied here, so
            every row of the file is read as data.

    Returns:
        A ``pandas.DataFrame`` with the seven feature columns followed by ``Age``.
        Cells equal to ``"?"`` are read as missing values.
    """
    # Name the cached download after the remote file. With a fixed cache name,
    # get_file would return the file cached by the first call for any other
    # ``uri`` instead of downloading the new one.
    cache_name = os.path.basename(urlparse(uri).path) or "abalone_test.data"
    # Use the public Keras helper rather than the private tensorflow.python path.
    dataset_path = tf.keras.utils.get_file(cache_name, uri)
    col_names = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]
    dataset = pd.read_csv(
        dataset_path,
        names=col_names,
        na_values="?",
        comment="\t",
        sep=",",
        skipinitialspace=True,
    )
    return dataset


def get_features_and_labels(df):
    """Split ``df`` into ``(features, labels)`` using ``Age`` as the label column.

    Returns:
        A tuple of two arrays: the values of every column except ``Age``, and
        the values of the ``Age`` column.
    """
    label_column = "Age"
    features = df.drop(columns=label_column).values
    labels = df[label_column].values
    return features, labels


# Load the abalone test CSV and split it into feature rows and Age labels.
test_data_uri = (
    "https://storage.googleapis.com/download.tensorflow.org/data/abalone_test.csv"
)
test_dataset, test_labels = get_features_and_labels(read_data(test_data_uri))

进行在线预测。

In [None]:
# Send the test feature rows to the endpoint as nested Python lists.
prediction = endpoint.predict(test_dataset.tolist())
# Show the prediction response as the cell output.
prediction

计算并跟踪预测评估指标。

In [None]:
# Score the online predictions against the held-out labels.
mse = mean_squared_error(test_labels, prediction.predictions)
mae = mean_absolute_error(test_labels, prediction.predictions)

# Record the metrics on the experiment run opened by aiplatform.start_run.
aiplatform.log_metrics({"mse": mse, "mae": mae})

# These are the last values logged for this run, so close it; otherwise the
# run is never ended and stays open in Vertex AI Experiments.
aiplatform.end_run()

提取在这次实验中创建的所有参数和度量标准。

In [None]:
# Display the parameters and metrics tracked in the current experiment.
aiplatform.get_experiment_df()

### 在云控制台中查看数据

参数和指标也可以在云控制台上查看。

In [None]:
# Build the Cloud Console link to this project's experiments and print it
# under a short heading, one item per line.
experiments_url = f"https://console.cloud.google.com/ai/platform/experiments/experiments?folder=&organizationId=&project={PROJECT_ID}"
print("Vertex AI Experiments:", experiments_url, sep="\n")

## 清理

要清理此项目中使用的所有 Google Cloud 资源，您可以删除用于本教程的[Google Cloud 项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源：

* Vertex AI 数据集
* Vertex AI 实验
* 训练作业
* 模型
* 终端节点
* 云存储桶

In [None]:
# Tear down the resources created by this notebook. The bucket is removed only
# when explicitly requested below or when IS_TESTING is set in the environment.

# Warning: Setting this to true will delete everything in your bucket
delete_bucket = False

# Delete dataset
ds.delete()

# Delete experiment (looked up by name in the configured project and region)
experiment = aiplatform.Experiment(
    experiment_name=EXPERIMENT_NAME, project=PROJECT_ID, location=REGION
)
experiment.delete()

# Delete the training job
job.delete()

# Undeploy model from endpoint (done before the endpoint itself is deleted)
endpoint.undeploy_all()

# Delete the endpoint
endpoint.delete()

# Delete the model
model.delete()


# Recursively remove the bucket and all of its contents
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI