In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# GCP上的端到端机器学习: MLOps阶段2: 使用Vertex AI实验开始自动记录XGBoost模型

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage2/get_started_with_vertex_experiments_autologging_xgboost.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage2/get_started_with_vertex_experiments_autologging_xgboost.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/ml_ops/stage2/get_started_with_vertex_experiments_autologging_xgboost.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在Vertex AI工作台中打开
    </a>
  </td>
</table>
<br/><br/><br/>

## 概述

本教程演示如何借助自行编写（DIY）的代码，通过“Vertex AI实验”（Vertex AI Experiments）自动记录实验的参数和指标。

### 目标

在本教程中，您将学习如何创建一个用于训练XGBoost模型的实验，并使用附带的自助（DIY）代码自动记录参数和指标。

该教程使用以下Google Cloud ML服务和资源：

- `Vertex AI Experiments`

执行的步骤包括：

- 构建自助记录代码。
- 构建包含自动记录调用的训练包。
- 训练模型。
- 查看实验。
- 删除实验。

### 数据集

本教程使用的数据集是来自[TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview)的[Iris数据集](https://www.tensorflow.org/datasets/catalog/iris)。该数据集不需要任何特征工程。本教程中数据集的版本存储在一个公共的云存储桶中。训练好的模型可以预测鸢尾花属于以下三个品种中的哪一种：山鸢尾(setosa)、维吉尼亚鸢尾(virginica)或变色鸢尾(versicolor)。

### 成本

本教程使用 Google Cloud 的收费组件：

* Vertex AI
* Cloud Storage

了解[Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing)和[Cloud Storage 定价](https://cloud.google.com/storage/pricing)，并使用[价格计算器](https://cloud.google.com/products/calculator/)根据您预期的使用情况生成成本估算。

### 设置本地开发环境

如果您正在使用Colab或Vertex AI Workbench笔记本，您的环境已经满足运行此笔记本的所有要求。您可以跳过此步骤。

否则，请确保您的环境符合此笔记本的要求。您需要以下内容：

- Google Cloud SDK
- Git
- Python 3
- virtualenv
- 在使用Python 3的虚拟环境中运行的Jupyter笔记本

[设置Python开发环境](https://cloud.google.com/python/setup)和[Jupyter安装指南](https://jupyter.org/install)提供了满足这些要求的详细说明。以下步骤提供了一组简化的说明：

1. [安装并初始化SDK](https://cloud.google.com/sdk/docs/)。

2. [安装Python 3](https://cloud.google.com/python/setup#installing_python)。

3. [安装virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)并创建一个使用Python 3的虚拟环境。激活虚拟环境。

4. 要安装Jupyter，请在终端窗口中运行`pip3 install jupyter`。

5. 要启动Jupyter，请在终端窗口中运行`jupyter notebook`。

6. 在Jupyter Notebook仪表板中打开此笔记本。

## 安装

安装以下包以执行此笔记本。

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade --quiet {USER_FLAG} google-cloud-aiplatform \
                                             xgboost \
                                             scikit-learn \
                                             numpy

### 重新启动内核

安装额外的包后，您需要重新启动笔记本内核，以便它可以找到这些包。

In [None]:
import os

# Skip the restart when the IS_TESTING environment variable is set to a
# non-empty value, so automated runs are not interrupted.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs so that the freshly
    # installed packages can be imported by the cells below.
    import IPython

    app = IPython.Application.instance()
    # Passing True asks for a restart rather than a plain shutdown
    # (per the intent stated above) — TODO confirm against the IPython docs.
    app.kernel.do_shutdown(True)

## 开始之前

### GPU 运行时

本教程不需要 GPU 运行时。

### 设置您的 Google 云项目

**无论您的笔记本环境如何，都需要执行以下步骤。**

1. [选择或创建 Google 云项目](https://console.cloud.google.com/cloud-resource-manager)。您第一次创建账户时，您将获得 300 美元的免费信用额度用于您的计算/存储成本。

2. [确保您的项目已启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用以下 API：Vertex AI API、Compute Engine API 和 Cloud Storage。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component,storage-component.googleapis.com)

4. 如果您在本地运行此笔记本，您需要安装 [Cloud SDK](https://cloud.google.com/sdk)。

5. 在下面的单元格中输入您的项目 ID。然后运行该单元格，以确保 Cloud SDK 在此笔记本中的所有命令中使用正确的项目。

**注意**：Jupyter 会将以 `!` 为前缀的行作为 shell 命令运行，并会把其中以 `$` 为前缀的 Python 变量替换为变量的值。

#### 设置您的项目ID

**如果您不知道您的项目ID**，您可以使用`gcloud`来获取您的项目ID。

In [None]:
# Google Cloud project ID used by every later cell. Leave the placeholder
# unchanged to have the next cell look the ID up through gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
# Fall back to the gcloud default project when PROJECT_ID is empty, None, or
# still the untouched placeholder.
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    # NOTE(review): the first output line is taken as the project ID; this
    # presumably fails with IndexError when gcloud prints nothing — confirm.
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make gcloud use the selected project for the shell commands in this notebook.
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改 `REGION` 变量，该变量用于本笔记本的其余部分操作。以下是 Vertex AI 支持的区域。我们建议您选择最靠近您的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能使用多区域存储桶进行 Vertex AI 训练。并非所有区域都支持所有 Vertex AI 服务。

了解更多有关 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Default to us-central1 when the placeholder above was left unchanged.
REGION = "us-central1" if REGION == "[your-region]" else REGION

#### UUID

如果您正在进行实时教程会话，您可能正在使用共享测试帐户或项目。为了避免在创建的资源上的用户之间发生名称冲突，您可以为每个实例会话创建一个uuid，并将其附加到您在本教程中创建的资源的名称上。

In [None]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of exactly `length` characters.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Per-session suffix appended to resource names created later in the notebook.
UUID = generate_uuid()

### 认证您的Google Cloud帐户

**如果您正在使用Vertex AI Workbench笔记本**，则您的环境已经经过身份验证。

**如果您正在使用Colab**，运行下面的单元格，并按照提示进行身份验证。

**否则**，请按照以下步骤进行操作：

在Cloud Console中，转到[创建服务帐户密钥](https://console.cloud.google.com/apis/credentials/serviceaccountkey)页面。

**单击创建服务帐户**。

在**服务帐户名称**字段中输入名称，然后单击**创建**。

在**将此服务帐户授予对项目的访问权限**部分，单击角色下拉列表。在筛选框中输入“Vertex”，然后选择**Vertex管理员**。在筛选框中输入“存储对象管理员”，然后选择**存储对象管理员**。

单击创建。包含您的密钥的JSON文件将下载到本地环境。

在下面的单元格中将服务帐户密钥的路径输入为GOOGLE_APPLICATION_CREDENTIALS变量，然后运行该单元格。

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
# IS_COLAB records whether the Colab runtime module is loaded. The branch
# below repeats the same membership test instead of reading this flag.
IS_COLAB = "google.colab" in sys.modules
# Authenticate only when neither Workbench marker is present: the metadata
# file on disk and the DL_ANACONDA_HOME environment variable.
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        # Starts Colab's interactive sign-in flow.
        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # Skipped when IS_TESTING is set.
    elif not os.getenv("IS_TESTING"):
        # NOTE(review): the value is still the empty placeholder '' until it is
        # replaced with the path to the downloaded JSON key file.
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### 导入库并定义常量

In [None]:
import google.cloud.aiplatform as aiplatform
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score

# to suppress lint message (unused)
precision_score, recall_score

### 初始化Vertex AI SDK for Python

为您的项目和所选区域初始化Vertex AI SDK for Python。

In [None]:
# Configure the SDK with the project and region selected in the cells above.
aiplatform.init(project=PROJECT_ID, location=REGION)

## 用于自动记录XGBoost模型的DIY代码

以下代码实现了XGBoost模型的自动记录。

- `autolog()`: 初始化实验，并通过堆注入将堆上的`xgboost.train()`符号替换为重定向包装函数`VertexXGBtrain`。

- `VertexXGBtrain`: 用于XGBoost train()函数的包装函数。自动记录超参数并调用底层函数。

- `VertexSKLaccuracy_score`: 用于scikit-learn accuracy_score()函数的包装函数。自动调用底层函数并记录指标结果。

In [None]:
def autolog(
    project: str = None,
    location: str = None,
    staging_bucket: str = None,
    experiment: str = None,
    run: str = None,
    framework: str = "tf",
):
    """
    Enable automatic logging of parameters and metrics in Vertex AI Experiments
    for the given framework by replacing the framework's entry points with the
    logging wrappers defined in this module.

    The function is safe to call repeatedly (for example when the notebook cell
    is re-run): an entry point that is already wrapped is left alone, so the
    saved "real" function is never overwritten with the wrapper itself, which
    would otherwise make the wrapper call itself forever.

    Args:
        project: The project ID. When set, the SDK is (re)initialized with
            `project`, `location` and `staging_bucket`.
        location: The region.
        staging_bucket: Temporary bucket.
        experiment: The name of the experiment. When set, it becomes the
            active experiment.
        run: The name of the run within the experiment. When set, the run is
            started.
        framework: The ML framework for which a model is being trained:
            "tf" or "xgb". Any other value skips the patching step.

    Raises:
        ImportError: if `framework == "xgb"` and scikit-learn is not installed.
    """
    # All patched names live in this module's namespace. Looking wrappers up by
    # name keeps the patching best-effort: a wrapper or module alias that does
    # not exist is skipped instead of raising.
    namespace = globals()

    if framework == "tf":
        for attribute, wrapper_name in (
            ("Sequential", "VertexTFSequential"),
            ("Model", "VertexTFModel"),
        ):
            wrapper = namespace.get(wrapper_name)
            if wrapper is None:
                # The TensorFlow wrapper is not defined here; nothing to patch.
                continue
            namespace[attribute] = wrapper
            for alias in ("tf", "tensorflow"):
                keras = getattr(namespace.get(alias), "keras", None)
                if keras is not None:
                    setattr(keras, attribute, wrapper)

    elif framework == "xgb":
        # Import the submodule explicitly so `sklearn.metrics` is guaranteed
        # to be available as an attribute of the package.
        import sklearn.metrics

        train_wrapper = namespace.get("VertexXGBtrain")
        if train_wrapper is not None:
            # `xgboost` and `xgb` may be two names for the same module.
            for alias in ("xgboost", "xgb"):
                module = namespace.get(alias)
                current = getattr(module, "train", None)
                if current is None or current is train_wrapper:
                    # Alias missing, or already patched: keep the saved original.
                    continue
                namespace["real_xgb_train"] = current
                module.train = train_wrapper

        for metric_name, wrapper_name in (
            ("accuracy_score", "VertexSKLaccuracy_score"),
            ("precision_score", "VertexSKLprecision_score"),
            ("recall_score", "VertexSKLrecall_score"),
        ):
            wrapper = namespace.get(wrapper_name)
            # Only redirect metrics that the notebook imported by name.
            if wrapper is None or metric_name not in namespace:
                continue
            current = getattr(sklearn.metrics, metric_name, None)
            if current is None:
                continue
            if current is not wrapper:
                # First patch: remember the genuine sklearn implementation.
                namespace["real_" + metric_name] = current
                setattr(sklearn.metrics, metric_name, wrapper)
            namespace[metric_name] = wrapper

    if project:
        aiplatform.init(
            project=project, location=location, staging_bucket=staging_bucket
        )

    if experiment:
        aiplatform.init(experiment=experiment)
    if run:
        aiplatform.start_run(run)


def VertexXGBtrain(
    params,
    dtrain,
    num_boost_round=10,
    evals=None,
    obj=None,
    maximize=None,
    early_stopping_rounds=None,
    evals_result=None,
    verbose_eval=True,
    callbacks=None,
    custom_metric=None,
):
    """
    Wrapper function for autologging training parameters with Vertex AI Experiments.

    Logs `num_boost_round` and, when present in `params`, the `booster`, `eta`,
    `max_depth`, `max_leaf_nodes`, `gamma` and `alpha` settings, then delegates
    to the original `xgb.train()` saved in the module-level `real_xgb_train`.

    Each hyperparameter is logged with its natural type: `booster` is a name
    such as "gbtree" and is logged as a string; `eta`, `gamma` and `alpha` are
    real-valued and are logged as floats so that e.g. eta=0.3 is not truncated
    to 0.

    Args:
        same as underlying xgb.train() method. `params` may be a dict or a
        list of (name, value) pairs.

    Returns:
        Whatever the underlying xgb.train() returns.
    """
    logged = {"train.num_boost_round": int(num_boost_round)}

    if params:
        # dict() accepts both a mapping and a list of (name, value) pairs;
        # the original `params` object is still what gets passed on below.
        lookup = dict(params)
        for key, cast in (
            ("booster", str),
            # booster parameters
            ("eta", float),
            ("max_depth", int),
            ("max_leaf_nodes", int),
            ("gamma", float),
            ("alpha", float),
        ):
            if key in lookup:
                logged["train." + key] = cast(lookup[key])

    aiplatform.log_params(logged)

    return real_xgb_train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        obj=obj,
        maximize=maximize,
        early_stopping_rounds=early_stopping_rounds,
        evals_result=evals_result,
        verbose_eval=verbose_eval,
        callbacks=callbacks,
        custom_metric=custom_metric,
    )


def VertexSKLaccuracy_score(labels, predictions, *, normalize=True, sample_weight=None):
    """
    Wrapper function for autologging training metrics with Vertex AI Experiments.

    Calls the original scikit-learn `accuracy_score` saved in the module-level
    `real_accuracy_score`, logs the result under the metric name "accuracy",
    and returns it.

    Because this wrapper replaces `sklearn.metrics.accuracy_score` itself, it
    also accepts and forwards the keyword-only `normalize` and `sample_weight`
    options, so existing calls that use them keep working after patching.

    Args:
        labels: Ground-truth labels (`y_true` of the underlying function).
        predictions: Predicted labels (`y_pred` of the underlying function).
        normalize: Forwarded unchanged to the underlying function.
        sample_weight: Forwarded unchanged to the underlying function.

    Returns:
        The value returned by the underlying accuracy_score function.
    """
    accuracy = real_accuracy_score(
        labels,
        predictions,
        normalize=normalize,
        sample_weight=sample_weight,
    )
    aiplatform.log_metrics({"accuracy": accuracy})
    return accuracy


def VertexSKLprecision_score(
    y_true,
    y_pred,
    *,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn",
):
    """
    Autologging stand-in for scikit-learn's precision_score.

    Delegates to the original function saved in the module-level
    `real_precision_score`, records the result in Vertex AI Experiments under
    the metric name "precision", and returns it.

    Args:
        same as underlying precision_score function
    """
    # Every keyword option is passed through untouched.
    forwarded = dict(
        labels=labels,
        pos_label=pos_label,
        average=average,
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    score = real_precision_score(y_true, y_pred, **forwarded)
    aiplatform.log_metrics({"precision": score})
    return score


def VertexSKLrecall_score(
    y_true,
    y_pred,
    *,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn",
):
    """
    Autologging stand-in for scikit-learn's recall_score.

    Delegates to the original function saved in the module-level
    `real_recall_score`, records the result in Vertex AI Experiments under the
    metric name "recall", and returns it.

    Args:
        same as underlying recall_score function
    """
    # Every keyword option is passed through untouched.
    forwarded = dict(
        labels=labels,
        pos_label=pos_label,
        average=average,
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    score = real_recall_score(y_true, y_pred, **forwarded)
    aiplatform.log_metrics({"recall": score})
    return score


class VertexXGBBooster(xgb.Booster):
    """
    Work-in-progress subclass of `xgb.Booster`.

    Every method below forwards its arguments positionally and unchanged to
    the parent class; no Vertex AI logging is performed here yet.

    NOTE(review): the positional forwarding assumes the parent signatures
    match the ones declared here — confirm against the installed xgboost
    version.
    """

    def __init__(self, params=None, cache=None, model_file=None):
        """Forward the constructor arguments to `xgb.Booster`."""
        super().__init__(params, cache, model_file)

    def boost(
        self, dtrain: xgb.core.DMatrix, grad: np.ndarray, hess: np.ndarray
    ) -> None:
        """Forward to the parent `boost()` and return its result."""
        return super().boost(dtrain, grad, hess)

    def eval(
        self, data: xgb.core.DMatrix, name: str = "eval", iteration: int = 0
    ) -> str:
        """Forward to the parent `eval()` and return its result."""
        return super().eval(data, name, iteration)

    def update(self, dtrain: xgb.core.DMatrix, iteration: int, fobj=None) -> None:
        """Forward to the parent `update()` and return its result."""
        return super().update(dtrain, iteration, fobj)

### 使用Vertex AI实验训练XGBoost模型

在下面的代码中，您将构建、训练和评估一个XGBoost表格模型。该Python脚本包括以下调用以集成`Vertex AI实验`：

- 命令行参数：参数`experiment`和`run`用于传递实验和运行名称。
- `autolog()`: 初始化实验并进行堆注入。
- `aiplatform.start_execution()`: 初始化用于链接工件的上下文。
- `aiplatform.end_run()`: 结束实验。

*注意:* 函数`xgb.train`和`accuracy_score`将通过堆注入分别重定向到`VertexXGBtrain`和`VertexSKLaccuracy_score`。此后再调用`xgb.train()`和`accuracy_score()`时，实际执行的是对应的`VertexXGBtrain`和`VertexSKLaccuracy_score`函数。

In [None]:
# Experiment name carries the session UUID as a suffix; the run name is fixed.
EXPERIMENT_NAME = f"myexperiment{UUID}"
RUN_NAME = "run-1"

# Cloud Storage locations of the Iris feature and label CSV files.
DATASET_DIR = "gs://cloud-samples-data/ai-platform/iris"
DATASET_DATA_URL = DATASET_DIR + "/iris_data.csv"
DATASET_LABELS_URL = DATASET_DIR + "/iris_target.csv"

# Number of boosting rounds passed to xgb.train() by train_model().
BOOSTED_ROUNDS = 20

import logging
import os
import subprocess
import sys

import hypertune
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split


def get_data():
    """Download the Iris CSV files and split them into train and test sets.

    The feature and label files are copied from Cloud Storage into the current
    working directory as `data.csv` and `labels.csv` using `gsutil`.

    Returns:
        A tuple `(dtrain, test_data, test_labels)`: the training split wrapped
        in an `xgb.DMatrix`, and the held-out 20% as NumPy arrays.

    Raises:
        subprocess.CalledProcessError: if a `gsutil cp` command fails.
    """
    downloads = (
        (DATASET_DATA_URL, "data.csv"),
        (DATASET_LABELS_URL, "labels.csv"),
    )
    for source_url, local_path in downloads:
        # gsutil outputs everything to stderr so we need to divert it to stdout.
        # subprocess.STDOUT does this at the OS level; passing sys.stdout would
        # require it to expose a real file descriptor, which notebook kernels
        # do not always provide.
        subprocess.check_call(
            ["gsutil", "cp", source_url, local_path], stderr=subprocess.STDOUT
        )

    # Load data into pandas, then use `.values` to get NumPy arrays
    data = pd.read_csv("data.csv").values
    labels = pd.read_csv("labels.csv").values

    # Convert one-column 2D array into 1D array for use with XGBoost
    labels = labels.reshape((labels.size,))

    # Fixed random_state keeps the split reproducible between runs.
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.2, random_state=7
    )

    # Load data into DMatrix object
    dtrain = xgb.DMatrix(train_data, label=train_labels)
    return dtrain, test_data, test_labels


def train_model(dtrain):
    """Train a three-class XGBoost model on `dtrain` and return it.

    Uses `BOOSTED_ROUNDS` boosting rounds and logs the start and end of
    training through the `logging` module.
    """
    logging.info("Start training ...")

    # Train XGBoost model
    booster_params = dict(max_depth=3, objective="multi:softmax", num_class=3)
    booster = xgb.train(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=BOOSTED_ROUNDS,
    )

    logging.info("Training completed")
    return booster


def evaluate_model(model, test_data, test_labels):
    """Score `model` on the held-out data and report its accuracy.

    The test features are wrapped in an `xgb.DMatrix`, the model's outputs are
    rounded to class ids, and accuracy is computed with `accuracy_score`. The
    accuracy is logged and also reported through `hypertune` under the tag
    "accuracy".

    Returns:
        The accuracy value returned by `accuracy_score`.
    """
    raw_outputs = model.predict(xgb.DMatrix(test_data))
    predicted_classes = list(map(round, raw_outputs))

    # evaluate predictions
    accuracy = accuracy_score(test_labels, predicted_classes)
    logging.info(f"Evaluation completed with model accuracy: {accuracy}")

    # report metric for hyperparameter tuning
    hypertune.HyperTune().report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag="accuracy", metric_value=accuracy
    )
    return accuracy


# autologging
# Sets the active experiment, starts the run, and redirects xgb.train and
# accuracy_score to their logging wrappers.
autolog(experiment=EXPERIMENT_NAME, run=RUN_NAME, framework="xgb")

# Download the data, train, and evaluate inside an execution context.
with aiplatform.start_execution(
    schema_title="system.ContainerExecution", display_name="example_training"
) as execution:
    dtrain, test_data, test_labels = get_data()
    model = train_model(dtrain)
    accuracy = evaluate_model(model, test_data, test_labels)

# Close the run started by autolog().
# NOTE(review): this line is not reached if any step above raises.
aiplatform.end_run()

#### 获取实验结果

接下来，您可以将实验名称作为参数传递给方法`get_experiment_df()`来将实验结果获取为一个 pandas 数据帧。

In [None]:
# Ask for this tutorial's experiment by name instead of relying on whichever
# experiment happens to be active in the SDK's global state.
experiment_df = aiplatform.get_experiment_df(EXPERIMENT_NAME)
# Transposed so that each parameter and metric is shown on its own row.
experiment_df.T

#### 删除实验

由于实验是在训练脚本中创建的，要删除实验，您可以使用`list()`方法获取项目中的所有实验，然后根据实验名称进行筛选。

In [None]:
# List every experiment visible to the SDK and delete the one whose name
# matches the experiment created by this tutorial.
experiments = aiplatform.Experiment.list()
for experiment in experiments:
    if experiment.name == EXPERIMENT_NAME:
        experiment.delete()

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以删除用于本教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源。

In [None]:
# There are no resources to cleanup