In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoML 表格工作流流水线

<a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl_tabular_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中运行
</a>

<a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl_tabular_on_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"> 在 GitHub 中查看
</a>

<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/automl/automl_tabular_on_vertex_pipelines.ipynb">
        <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> 在 Vertex AI Workbench 中打开
</a>

## 概述

在本教程中，您将使用两个 Vertex AI Tabular Workflows 管道，以不同的配置训练 AutoML 模型。您将看到如何使用 `get_automl_tabular_pipeline_and_parameters` 来自定义默认的 AutoML Tabular 管道，以及如何使用 `get_skip_architecture_search_pipeline_and_parameters` 复用上一次管道运行的调优结果，从而减少 AutoML 模型的训练时间和成本。

了解更多关于[端到端AutoML的Tabular工作流程](https://cloud.google.com/vertex-ai/docs/tabular-data/tabular-workflows/e2e-automl)。

### 目标

在本教程中，您将学习如何使用从[Google Cloud流水线组件](https://cloud.google.com/vertex-ai/docs/pipelines/introduction)（GCPC）下载的[Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction)创建两个分类模型。这些流水线是由Google维护的Vertex AI表格工作流流水线，展示了定制Vertex AI表格训练过程的不同方式。

本教程使用以下Google Cloud ML服务：

- `AutoML Training`
- `Vertex AI数据集`

执行的步骤包括：

- 创建一个训练流水线，将搜索空间从默认值减少以节省时间。
- 创建一个训练流水线，重用先前流水线的架构搜索结果以节省时间。

### 数据集

您将使用的数据集是[银行营销](https://archive.ics.uci.edu/ml/datasets/bank+marketing)。这些数据是葡萄牙银行机构直接营销活动（电话呼叫）的数据。二元分类的目标是预测客户是否会订阅定期存款。对于此笔记本，我们随机选择了原始数据集中90％的行，并将它们保存在托管在云存储上的train.csv文件中。要下载该文件，请点击[这里](https://storage.googleapis.com/cloud-samples-data/vertex-ai/tabular-workflows/datasets/bank-marketing/train.csv)。

### 成本

本教程使用 Google Cloud 的计费组件：

* Vertex AI
* Cloud Storage

了解 [Vertex AI
价格](https://cloud.google.com/vertex-ai/pricing) 和 [Cloud Storage
价格](https://cloud.google.com/storage/pricing)，使用 [价格
计算器](https://cloud.google.com/products/calculator/)
根据您预计的使用情况生成成本估算。

## 安装

安装最新版本的用于Python的Vertex AI SDK。

In [None]:
# Install Google Cloud Pipeline Components pinned to 1.0.25 and upgrade the
# Vertex AI SDK (google-cloud-aiplatform), which is left unpinned.
!pip3 install --upgrade --quiet google-cloud-pipeline-components==1.0.25 \
                                google-cloud-aiplatform

### 仅限 Colab：取消注释以下单元格以重新启动内核

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 开始之前

### 设置您的项目 ID

**如果您不知道您的项目ID**，请尝试以下步骤：
- 运行 `gcloud config list`。
- 运行 `gcloud projects list`。
- 参考支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with the ID of the Google Cloud project to use.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one for the gcloud commands run later.
! gcloud config set project {PROJECT_ID}

## 关于服务账号和权限的注意事项

**默认情况下不需要任何配置**，如果遇到任何与权限相关的问题，请确保服务账号具有[端到端AutoML文档](https://cloud.google.com/vertex-ai/docs/tabular-data/tabular-workflows/service-accounts#e2e-automl)中列出的所需角色。

#### 区域

您还可以更改 Vertex AI 使用的 `REGION` 变量。了解有关 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations) 的更多信息。

In [None]:
# Region used for bucket creation, aiplatform.init and both pipeline jobs below.
REGION = "us-central1"  # @param {type: "string"}

### 认证您的Google Cloud帐户

根据您的Jupyter环境，您可能需要手动进行认证。请按照以下相关说明进行操作。

**1. Vertex AI Workbench**
* 不需要做任何操作，因为您已经认证通过。

**2. 本地JupyterLab实例，取消注释并运行：**

In [None]:
# ! gcloud auth login

**3. Colab，取消注释并运行：**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. 服务账号或其他**
* 请参考如何为您的服务账户授予云存储权限，链接地址：https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples。

### 创建一个云存储桶

创建一个存储桶来存储中间产物，如数据集。

In [None]:
# Cloud Storage URI of the bucket that holds intermediate artifacts; the
# default name embeds PROJECT_ID. Replace it with your own bucket name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

只有在您的存储桶不存在时：运行以下单元格以创建您的云存储桶。

In [None]:
# Create the bucket in REGION; run this only if the bucket does not exist yet.
! gsutil mb -l $REGION $BUCKET_URI

#### 服务账号

您可以使用服务账号来创建Vertex AI管道作业。如果您不想使用项目的Compute Engine服务账号，请将`SERVICE_ACCOUNT`设置为另一个服务账号ID。

In [None]:
# Leave the placeholder to have the next cell look up an account via gcloud,
# or set an explicit service account ID here.
SERVICE_ACCOUNT = "[your-service-account]"

In [None]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### 设置 Vertex AI Pipelines 的服务账号访问权限

运行以下命令，授予您的服务账号对上一步创建的存储桶中管道工件的读取和写入权限。每个服务账号只需运行一次此步骤。

In [None]:
# Grant the service account the storage.objectCreator role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Grant the service account the storage.objectViewer role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### 导入库并定义常量

In [None]:
import json
# Import required modules
import os
import uuid
from typing import Any, Dict, List

from google.cloud import aiplatform, storage
from google_cloud_pipeline_components.experimental.automl.tabular import \
    utils as automl_tabular_utils

初始化 Python 的 Vertex AI SDK，用于您的项目。

In [None]:
# Configure the Vertex AI SDK with the project and region chosen above.
aiplatform.init(project=PROJECT_ID, location=REGION)

### 定义辅助函数

In [None]:
def get_bucket_name_and_path(uri):
    """Split a Cloud Storage URI into ``(bucket_name, object_path)``.

    The first ``len("gs://")`` characters are dropped without checking that
    they really are the ``gs://`` scheme. Everything up to the first "/" of
    the remainder is the bucket name; the rest (possibly empty) is the path.
    """
    remainder = uri[len("gs://") :]
    bucket_name, _, object_path = remainder.partition("/")
    return bucket_name, object_path


def download_from_gcs(uri):
    """Download the object at a ``gs://`` URI and return its raw contents.

    Args:
        uri: Cloud Storage URI of the object, e.g. ``gs://bucket/path/file``.

    Returns:
        The object's contents as ``bytes``.
    """
    bucket_name, path = get_bucket_name_and_path(uri)
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    # download_as_string is a deprecated alias of download_as_bytes; both
    # return bytes, so callers are unaffected.
    return blob.download_as_bytes()


def write_to_gcs(uri: str, content: str):
    """Upload ``content`` to the object at a ``gs://`` URI.

    Args:
        uri: Destination Cloud Storage URI, e.g. ``gs://bucket/path/file``.
        content: Text to store in the object.
    """
    bucket_name, path = get_bucket_name_and_path(uri)
    # Pass the project explicitly, as download_from_gcs does, so the client
    # does not depend on a default project being discoverable from the
    # environment.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    blob.upload_from_string(content)


def generate_auto_transformation(column_names: List[str]) -> List[Dict[str, Any]]:
    """Build one ``{"auto": {"column_name": ...}}`` entry per column name.

    The entries are returned in the same order as ``column_names``.
    """
    return [{"auto": {"column_name": name}} for name in column_names]


def write_auto_transformations(uri: str, column_names: List[str]):
    """Serialize "auto" transformations for ``column_names`` as JSON and upload them to ``uri``."""
    serialized_config = json.dumps(generate_auto_transformation(column_names))
    write_to_gcs(uri, serialized_config)


def get_task_detail(task_details: List[Any], task_name: str) -> Any:
    """Return the first task detail whose ``task_name`` attribute matches.

    The previous annotations described a list of dicts in and a list out,
    but the items are read through attribute access and a single item is
    returned.

    Args:
        task_details: Objects exposing a ``task_name`` attribute.
        task_name: Name of the pipeline task to look up.

    Returns:
        The first matching item, or ``None`` when no item matches.
    """
    return next(
        (detail for detail in task_details if detail.task_name == task_name),
        None,
    )


def get_deployed_model_uri(
    task_details,
):
    """Return the URI of the first "model" artifact output by the "model-upload" task."""
    upload_task = get_task_detail(task_details, "model-upload")
    model_artifacts = upload_task.outputs["model"].artifacts
    return model_artifacts[0].uri


def get_no_custom_ops_model_uri(task_details):
    """Return the URI of the model exported without custom TF ops.

    The URI is taken from the first "model_without_custom_ops" artifact of
    the "automl-tabular-ensemble" task. The previous version passed this URI
    to ``download_from_gcs`` and returned the downloaded payload, which
    contradicts the function name and the caller's message.

    Args:
        task_details: Task details of a finished pipeline run.

    Returns:
        The artifact URI as recorded on the task output.
    """
    ensemble_task = get_task_detail(task_details, "automl-tabular-ensemble")
    return ensemble_task.outputs["model_without_custom_ops"].artifacts[0].uri


def get_feature_attributions(
    task_details,
):
    """Download the feature-attribution payload of the "model-evaluation-2" task.

    The location is read from the ``explanation_gcs_path`` metadata entry of
    the task's first "evaluation_metrics" artifact.
    """
    attribution_task = get_task_detail(task_details, "model-evaluation-2")
    metrics_artifact = attribution_task.outputs["evaluation_metrics"].artifacts[0]
    explanation_path = metrics_artifact.metadata["explanation_gcs_path"]
    return download_from_gcs(explanation_path)


def get_evaluation_metrics(
    task_details,
):
    """Download the evaluation metrics written by the "model-evaluation" task.

    The payload is fetched from the URI of the task's first
    "evaluation_metrics" artifact.
    """
    evaluation_task = get_task_detail(task_details, "model-evaluation")
    metrics_uri = evaluation_task.outputs["evaluation_metrics"].artifacts[0].uri
    return download_from_gcs(metrics_uri)


def load_and_print_json(s):
    """Parse JSON text and print it re-serialized with sorted keys and a 2-space indent."""
    pretty = json.dumps(json.loads(s), sort_keys=True, indent=2)
    print(pretty)

### 定义训练规范

In [None]:
# Flags forwarded to the pipeline builders below as run_evaluation /
# run_distillation.
run_evaluation = True  # @param {type:"boolean"}
run_distillation = False  # @param {type:"boolean"}
# Folder under the bucket used for the transform config and as pipeline_root.
root_dir = os.path.join(BUCKET_URI, "automl_tabular_pipeline")
# Training task: classification on the "deposit" column, optimizing log loss.
prediction_type = "classification"
optimization_objective = "minimize-log-loss"
target_column = "deposit"
# Training data comes from a CSV in Cloud Storage; the BigQuery source is unset.
data_source_csv_filenames = "gs://cloud-samples-data/vertex-ai/tabular-workflows/datasets/bank-marketing/train.csv"
data_source_bigquery_table_path = None  # format: bq://bq_project.bq_dataset.bq_table

# Data split: no timestamp/stratified key, so the 80/10/10 fractions apply.
timestamp_split_key = None  # timestamp column name when using timestamp split
stratified_split_key = None  # target column name when using stratified split
training_fraction = 0.8
validation_fraction = 0.1
test_fraction = 0.1

# When a predefined split column is configured, the fractions are cleared.
predefined_split_key = None
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None

# No per-row weight column is used.
weight_column = None

# Input feature columns; each one gets an "auto" transformation below.
features = [
    "age",
    "job",
    "marital",
    "education",
    "default",
    "balance",
    "housing",
    "loan",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
]
# Write the transformation config as JSON to a uniquely named file under
# root_dir; its path is what the pipeline builders receive.
transformations = generate_auto_transformation(features)
transform_config_path = os.path.join(root_dir, f"transform_config_{uuid.uuid4()}.json")
write_to_gcs(transform_config_path, json.dumps(transformations))

## 与VPC相关的配置

如果需要使用自定义的Dataflow子网络，您可以通过`dataflow_subnetwork`参数进行设置。要求如下：
1. `dataflow_subnetwork`必须是完全限定的子网络名称。
   [[参考链接](https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications)]
1. 以下服务账号必须在指定的Dataflow子网络上分配[Compute Network User角色](https://cloud.google.com/compute/docs/access/iam#compute.networkUser)：
    1. Compute Engine默认服务账号：PROJECT_NUMBER-compute@developer.gserviceaccount.com
    2. Dataflow服务账号：service-PROJECT_NUMBER@dataflow-service-producer-prod.iam.gserviceaccount.com

如果您的项目已启用VPC-SC，请确保：

1. 用于VPC-SC的Dataflow子网络已正确配置用于Dataflow。
   [[参考链接](https://cloud.google.com/dataflow/docs/guides/routes-firewall)]
2. `dataflow_use_public_ips`设置为False。

In [None]:
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
# Fully qualified subnetwork name is in the form of
# https://www.googleapis.com/compute/v1/projects/HOST_PROJECT_ID/regions/REGION_NAME/subnetworks/SUBNETWORK_NAME
# reference: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
# Dataflow networking options; both are forwarded to the two pipeline builder
# calls later in the notebook.
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
# Fully qualified subnetwork name is in the form of
# https://www.googleapis.com/compute/v1/projects/HOST_PROJECT_ID/regions/REGION_NAME/subnetworks/SUBNETWORK_NAME
# reference: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
dataflow_subnetwork = None  # @param {type:"string"}
# Specifies whether Dataflow workers use public IP addresses.
dataflow_use_public_ips = True  # @param {type:"boolean"}

## 自定义搜索空间并更改训练配置

我们将创建一个 AutoML 表格流水线（是否运行评估由前面设置的 `run_evaluation` 决定），其中包括以下自定义设置：
- 限制超参数搜索空间
- 更改机器类型和调整/训练并行性

In [None]:
# Restrict the model_type search parameter to neural networks only.
study_spec_parameters_override = [
    {
        "parameter_id": "model_type",
        "categorical_value_spec": {
            "values": [
                "nn"
            ]  # The default value is ["nn", "boosted_trees"], this reduces the search space
        },
    }
]

# Machine-type overrides, one entry per worker pool. The same list is passed
# below for both the stage-1 tuner and the CV trainer.
worker_pool_specs_override = [
    {"machine_spec": {"machine_type": "n1-standard-8"}},  # override for TF chief node
    {},  # override for TF worker node, since it's not used, leave it empty
    {},  # override for TF ps node, since it's not used, leave it empty
    {
        "machine_spec": {
            "machine_type": "n1-standard-4"  # override for TF evaluator node
        }
    },
]

# Number of weak models in the final ensemble model is
# stage_2_num_selected_trials * 5. If unspecified, 5 is the default value for
# stage_2_num_selected_trials.
# NOTE(review): this variable is not referenced anywhere else in the notebook,
# so the value set here never reaches the pipeline. Confirm whether it should
# be passed to the builder call.
stage_2_num_selected_trials = 5

# The pipeline outputs a TF saved model that contains the following TF custom op:
# - https://github.com/google/struct2tensor
#
# There are a few ways to run the model:
# - Official prediction server docker image
#   Please follow the "Run the model server" section in
#   https://cloud.google.com/vertex-ai/docs/export/export-model-tabular#run-server
# - Python or cpp runtimes like TF serving
#   Please set export_additional_model_without_custom_ops so the pipeline
#   outputs an additional model that does not depend on struct2tensor.
#   - `get_no_custom_ops_model_uri` shows how to get the model artifact URI.
#   - The input to the model is a dictionary of feature name to tensor. Use
#     `saved_model_cli show --dir {saved_model.pb's path} --signature_def serving_default --tag serve`
#     to find out more details.
export_additional_model_without_custom_ops = False

# Training budget in milli node hours (1000 = 1 node hour).
train_budget_milli_node_hours = 1000  # 1 hour

# Build the pipeline template path and parameter values for the customized
# AutoML Tabular pipeline from the settings defined in the earlier cells.
(
    template_path,
    parameter_values,
) = automl_tabular_utils.get_automl_tabular_pipeline_and_parameters(
    PROJECT_ID,
    REGION,
    root_dir,
    target_column,
    prediction_type,
    optimization_objective,
    transform_config_path,
    train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    timestamp_split_key=timestamp_split_key,
    stratified_split_key=stratified_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    study_spec_parameters_override=study_spec_parameters_override,
    stage_1_tuner_worker_pool_specs_override=worker_pool_specs_override,
    cv_trainer_worker_pool_specs_override=worker_pool_specs_override,
    run_evaluation=run_evaluation,
    run_distillation=run_distillation,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
    export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
)

# A random UUID suffix gives each run a distinct job ID; the same string is
# used as the display name. Caching is disabled.
job_id = "automl-tabular-{}".format(uuid.uuid4())
job = aiplatform.PipelineJob(
    display_name=job_id,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=job_id,
    pipeline_root=root_dir,
    parameter_values=parameter_values,
    enable_caching=False,
)

# Start the pipeline run. The following statements read task details right
# away, so this call is presumably expected to block until the run finishes.
# TODO confirm.
job.run()


# Task details of the first pipeline run; the skip-architecture-search
# section later in the notebook reads this variable as well.
pipeline_task_details = job.gca_resource.job_detail.task_details

# Only report the extra model when its export was requested above.
if export_additional_model_without_custom_ops:
    print(
        "trained model without custom TF ops:",
        get_no_custom_ops_model_uri(pipeline_task_details),
    )

# Evaluation outputs are only looked up when evaluation was requested.
if run_evaluation:
    print("evaluation metrics:")
    load_and_print_json(get_evaluation_metrics(pipeline_task_details))

    print("feature attributions:")
    load_and_print_json(get_feature_attributions(pipeline_task_details))

## 跳过架构搜索

不必每次都进行架构搜索，我们可以复用已有的架构搜索结果。这有助于：
1. 减少输出模型的变化
2. 减少训练成本

现有的架构搜索结果存储在`automl-tabular-stage-1-tuner`组件的`tuning_result_output`输出中。我们可以手动输入它或通过程序获取它。

In [None]:
# Find the stage-1 tuner task among the first pipeline run's task details.
stage_1_tuner_task = get_task_detail(
    pipeline_task_details, "automl-tabular-stage-1-tuner"
)

# URI of the tuner's first "tuning_result_output" artifact; the next pipeline
# receives it as stage_1_tuning_result_artifact_uri.
stage_1_tuning_result_artifact_uri = (
    stage_1_tuner_task.outputs["tuning_result_output"].artifacts[0].uri
)

### 运行跳过架构搜索的流水线

In [None]:
# Build the template path and parameter values for the pipeline that reuses
# the stage-1 tuning result instead of running architecture search again.
# template_path, parameter_values, job_id and job from the first run are
# overwritten here.
(
    template_path,
    parameter_values,
) = automl_tabular_utils.get_skip_architecture_search_pipeline_and_parameters(
    PROJECT_ID,
    REGION,
    root_dir,
    target_column,
    prediction_type,
    optimization_objective,
    transform_config_path,
    train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    timestamp_split_key=timestamp_split_key,
    stratified_split_key=stratified_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
    run_evaluation=run_evaluation,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
)

# A random UUID suffix gives each run a distinct job ID; the same string is
# used as the display name. Caching is disabled.
job_id = "automl-tabular-skip-architecture-search-{}".format(uuid.uuid4())
job = aiplatform.PipelineJob(
    display_name=job_id,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=job_id,
    pipeline_root=root_dir,
    parameter_values=parameter_values,
    enable_caching=False,
)

# Start the second pipeline run; its task details are read right afterwards.
job.run()

# Task details of the skip-architecture-search run that just finished.
skip_architecture_search_pipeline_task_details = (
    job.gca_resource.job_detail.task_details
)

if export_additional_model_without_custom_ops:
    # Report the model produced by THIS run. The previous version passed the
    # first pipeline's `pipeline_task_details`, leaving the variable above
    # unused and printing the wrong run's model.
    # NOTE(review): the builder call for this pipeline is not given
    # export_additional_model_without_custom_ops. Confirm that this run
    # produces the additional model before enabling the flag.
    print(
        "trained model without custom TF ops:",
        get_no_custom_ops_model_uri(skip_architecture_search_pipeline_task_details),
    )

## 清理 Vertex AI 和 BigQuery 资源

要清理此项目中使用的所有Google Cloud资源，您可以[删除用于教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在此教程中创建的各个资源：

- 云存储存储桶

In [None]:
# Remove the bucket recursively, but only when the IS_TESTING environment
# variable is set to a non-empty value; a normal run leaves it in place.
if os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI