In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI 管道: 评估自定义表格分类模型的 BatchPrediction 结果

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/custom_tabular_classification_model_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在Colab中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fmodel_evaluation%2Fcustom_tabular_classification_model_evaluation.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在Colab企业版中打开
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/model_evaluation/custom_tabular_classification_model_evaluation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在Workbench中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/custom_tabular_classification_model_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在GitHub上查看
    </a>
  </td>
</table>

## 概述

本笔记本演示了如何使用Vertex AI分类模型评估组件来评估在Vertex AI模型注册表中保存的自定义训练的表格分类模型。模型评估可以帮助您根据评估指标确定模型的性能，并在必要时改进模型。

了解更多关于[Vertex AI自定义训练](https://cloud.google.com/vertex-ai/docs/training/custom-training)和[Vertex AI模型评估](https://cloud.google.com/vertex-ai/docs/evaluation/introduction)。

### 目标

在本教程中，您将训练一个scikit-learn RandomForest模型，将模型保存到Vertex AI模型注册表中，并学习如何通过使用Google Cloud Pipeline Components Python SDK的Vertex AI管道作业来评估模型。

本教程使用以下Vertex AI服务和资源：

- Vertex AI模型注册表
- Vertex AI管道
- Vertex AI批量预测

执行的步骤包括：

- 从公共来源获取数据集。
- 在本地预处理数据，并将测试数据保存在BigQuery中。
- 使用scikit-learn Python包在本地训练一个RandomForest分类模型。
- 在Artifact Registry中为预测创建自定义容器。
- 将模型上传到Vertex AI模型注册表中。
- 创建并运行一个Vertex AI管道，其中：
    - 将训练好的模型导入到管道中。
    - 在BigQuery中的测试数据上运行“批量预测”作业。
    - 使用google-cloud-pipeline-components Python SDK中的评估组件评估模型。
    - 将分类指标导入到Vertex AI模型注册表中的模型资源中。
- 打印和可视化分类评估指标。
- 清理此笔记本中创建的资源。

### 数据集

本笔记本使用**人口普查收入数据集**，该数据集存储在BigQuery的公共数据集中，最初由[UC Irvine机器学习仓库](https://archive.ics.uci.edu/ml/datasets.php)提供。与该数据集相关的基本任务是判断一个人的年收入是否超过5万美元。更多信息，请查看[其UCI网页上的详细信息](https://archive.ics.uci.edu/ml/datasets/Census+Income)。

### 成本

本教程使用Google Cloud的以下计费组件:
* Artifact Registry
* BigQuery
* Cloud Build
* Cloud Storage
* Vertex AI

了解有关[Artifact Registry 价格](https://cloud.google.com/artifact-registry/pricing), [BigQuery 价格](https://cloud.google.com/bigquery/pricing), [Cloud Build 价格](https://cloud.google.com/build/pricing), [Cloud Storage 价格](https://cloud.google.com/storage/pricing), [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing), 并使用[定价计算器](https://cloud.google.com/products/calculator/) 根据您的使用情况生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK和其他所需的软件包

In [None]:
# Install/upgrade the Vertex AI SDK and supporting packages. Note that
# google-cloud-pipeline-components is pinned to 1.0.26; the other packages in
# this command are upgraded to their latest release.
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-pipeline-components==1.0.26 \
                                 matplotlib \
                                 pyarrow 
# Install the training-side packages. scikit-learn, joblib and numpy use the
# same pinned versions as the serving container's requirements.txt written
# later in this notebook; pandas and db-dtypes are left unpinned.
! pip3 install --quiet scikit-learn==1.0 \
                       pandas \
                       joblib==1.2.0 \
                       numpy==1.23.3 \
                       db-dtypes

### 重新启动运行时（仅适用于Colab）

为了使用新安装的软件包，您必须在Google Colab上重新启动运行时。

In [None]:
import sys

# Only Colab needs the restart so that the packages installed above are
# picked up by the runtime; other environments skip this cell's body.
if "google.colab" in sys.modules:
    import IPython

    # do_shutdown(True) shuts the kernel down with the restart flag set.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️内核即将重新启动。在继续进行下一步之前，请等待它完成。⚠️</b>
</div>

### 对您的笔记本环境进行身份验证（仅限Colab）

在Google Colab上对您的环境进行身份验证。

In [None]:
import sys

# The authentication step only runs when the notebook is executed on Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the current Colab user so later cells can call Google Cloud.
    auth.authenticate_user()

### 设置Google Cloud项目信息

了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)。

In [None]:
# Google Cloud project ID; replace the placeholder with your own project.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region used by the gsutil/gcloud commands and by aiplatform.init() below.
LOCATION = "us-central1"  # @param {type:"string"}

### 创建一个云存储桶

创建一个存储桶来存储中间产物，例如数据集。

In [None]:
# Cloud Storage bucket URI used as the Vertex AI staging bucket, for the model
# artifacts and for the pipeline root; the project ID is embedded in the name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储存储桶。

In [None]:
# Create the bucket in the configured location (-l) and project (-p).
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

### 初始化用于Python的Vertex AI SDK

要开始使用Vertex AI，您必须拥有现有的Google Cloud项目，并启用[Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。使用您的项目和创建的存储桶初始化Python的Vertex AI SDK。

In [None]:
from google.cloud import aiplatform

# Configure the Vertex AI SDK with the project, location and staging bucket
# defined above.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### 服务账户

您可以使用服务账户来创建Vertex AI Pipeline作业。如果您不想使用项目的计算引擎服务账户，将`SERVICE_ACCOUNT`设置为另一个服务账户ID。

In [None]:
# Service account used to run the pipeline job. Leave the placeholder as-is to
# have the next cell look one up through gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### 设置 Vertex AI 管道的服务帐户访问权限

运行以下命令，授予您的服务帐户在上一步创建的存储桶中读取和写入管道工件的权限。每个服务帐户只需运行此步骤一次。

In [None]:
# Allow the service account to create objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read objects from the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### 导入库

导入 Vertex AI Python SDK 和其他必需的 Python 库。

In [None]:
import joblib
import kfp
import matplotlib.pyplot as plt
from google.cloud import aiplatform_v1, bigquery
from kfp.v2 import compiler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelBinarizer

### 初始化Python的BigQuery SDK

使用您的项目以及 Vertex AI SDK 已配置的凭据来初始化Python的BigQuery SDK。

In [None]:
# BigQuery client bound to the same project, reusing the credentials held by
# the Vertex AI SDK's global configuration.
bq_client = bigquery.Client(
    project=PROJECT_ID,
    credentials=aiplatform.initializer.global_config.credentials,
)

### 定义常量

在下一个单元格中，定义您在本次会话中使用的常量。

In [None]:
# ---- Data locations -------------------------------------------------------

# Public BigQuery table that the census income data is read from
DATA_SOURCE = "bigquery-public-data.ml_datasets.census_adult_income"

# BigQuery dataset (created in this project) that stores the test data
PREDICTION_INPUT_DATASET_ID = "adult_income_prediction"

# BigQuery table inside that dataset holding the test data for batch prediction
PREDICTION_INPUT_TABLE_ID = "adult_income_test_data"

# Folder inside the Cloud Storage bucket where the model artifacts are stored
MODEL_ARTIFACT_DIR = "sklearn-income-pred-model"

# Local folder that holds the prediction (serving) application
SRC_DIR = "src"

# ---- Dataset description --------------------------------------------------

# Feature columns used from the dataset. The preprocessing cell builds
# positional column masks from this tuple, so the order matters.
COLUMNS = (
    "age",
    "workclass",
    "functional_weight",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
)

# Categorical columns hold string values and must be converted to numerical
# values before they can be used for training
CATEGORICAL_COLUMNS = (
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
)

# Target (label) column in the dataset
TARGET = "income_bracket"

# Individual class labels of the target column.
# NOTE(review): both labels start with a space, presumably to match the raw
# values stored in the source table -- confirm before normalizing them.
CLASS_LABELS = [" <=50K", " >50K"]

# ---- Splitting / sampling -------------------------------------------------

# Fraction of the data held out as the test set
TEST_SIZE = 0.25

# Random state used for the train/test split
RANDOM_STATE = 36

# Sample size passed to the pipeline for batch prediction
BATCH_SAMPLE_SIZE = 3000

# ---- Resource names -------------------------------------------------------

# Name of the repository in Artifact Registry
REPOSITORY = "sklearn-income-prediction-repo-unique"  # @param {type:"string"}

# Name of the prediction image in Artifact Registry
IMAGE = "sklearn-fastapi-server"

# Display name of the Vertex AI Model
MODEL_DISPLAY_NAME = "skl_inc_pred_model-unique"  # @param {type:"string"}

# Display name of the Vertex AI Pipeline
PIPELINE_DISPLAY_NAME = (
    "income_classification_multiclass-unique"  # @param {type:"string"}
)

# Local path the compiled pipeline definition is written to
PIPELINE_PACKAGE_PATH = "custom_tabular_classify_pipeline_config.json"

# Cloud Storage path used as the root directory for Vertex AI pipelines
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/income_classification_task"

## 获取数据集

从BigQuery公共数据集中下载人口普查数据。在本教程中，您只使用数据集中的2万条数据进行训练和测试。

In [None]:
# SQL query that reads at most 20000 rows (all columns) from the public table
query = f"""
SELECT * FROM `{DATA_SOURCE}` LIMIT 20000
"""
# Run the query and download the result into a pandas dataframe
df = bq_client.query(query).to_dataframe()
# Preview the first rows
df.head()

## 分割数据

将数据分成训练集和测试集。您在训练集上训练随机森林分类模型，并使用测试数据进行评估。

In [None]:
# Split the full dataframe into train and test parts. Both parts still contain
# the target column; it is separated from the features later in the notebook.
X_train, X_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Print the shapes of train and test sets
print(X_train.shape, X_test.shape)

## 在BigQuery中保存测试数据

在BigQuery中创建一个数据集，并将测试数据集保存在数据集内的一个表中。这个数据集在运行评估管道时被进一步使用，用于创建数据样本和存储预测结果。

### 在BigQuery中创建数据集

In [None]:
# Create the BigQuery dataset for the test data; exists_ok=True means an
# already existing dataset is accepted instead of raising an error.
bq_dataset = bigquery.Dataset(f"{PROJECT_ID}.{PREDICTION_INPUT_DATASET_ID}")
bq_dataset = bq_client.create_dataset(bq_dataset, exists_ok=True)
print(f"Created dataset {bq_client.project}.{bq_dataset.dataset_id}")

### 配置存储测试数据的架构

In [None]:
# Build the BigQuery schema for the test table: int64 feature columns map to
# INTEGER, object/category columns map to STRING, and feature columns of any
# other dtype are left out of the explicit schema.
schema_config = []
for column_name in COLUMNS:
    column_dtype = X_test[column_name].dtype
    if column_dtype == "int64":
        field_type = "INTEGER"
    elif column_dtype in ["object", "category"]:
        field_type = "STRING"
    else:
        continue
    schema_config.append(bigquery.SchemaField(column_name, field_type))

# The target column is always stored as a string
schema_config.append(bigquery.SchemaField(TARGET, "STRING"))

### 将测试数据加载到表中

In [None]:
# Reference to the destination table inside the dataset created above
table_ref = bq_dataset.table(PREDICTION_INPUT_TABLE_ID)

# Use the schema built in the previous cell and overwrite the table on every
# run (WRITE_TRUNCATE), so re-running this cell does not append duplicate rows.
job_config = bigquery.LoadJobConfig(
    schema=schema_config, write_disposition="WRITE_TRUNCATE"
)

# job_config must be passed explicitly; otherwise the schema and the write
# disposition configured above are silently ignored.
job = bq_client.load_table_from_dataframe(X_test, table_ref, job_config=job_config)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

## 创建用于训练的预处理流水线

由于数据集同时包含分类数据和数值数据，因此需要进行一些预处理。在用于训练分类模型之前，数据集必须只包含数值数据。因此，您可以使用LabelBinarizer将分类数据编码为数值数据。

为了简化预测服务，以下代码将这些步骤封装在一个scikit-learn Pipeline中。您可以使用scikit-learn中包含的`joblib`版本或`pickle`导出Pipeline对象，类似于导出scikit-learn估计器的方法。

了解更多关于[scikit-learn Pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)。

In [None]:
# Split the training dataframe into features and labels.
# The features are every column except the target column (TARGET), converted
# to a list of rows (a list of lists).
train_features = X_train.drop(TARGET, axis=1).to_numpy().tolist()
# The labels are the values of the target column, as a flat list.
train_labels = X_train[TARGET].to_numpy().tolist()

# The census data set has categorical features that must be converted to
# numerical values. Each categorical column gets its own small pipeline, and
# FeatureUnion combines all of them (plus the numerical columns) before the
# RandomForestClassifier is applied.
categorical_pipelines = []

# Every categorical column is extracted individually: SelectKBest(k=1) picks
# the single column whose score is 1, and LabelBinarizer() converts its
# categorical values to numerical ones.
for column_index, column_name in enumerate(COLUMNS):
    if column_name not in CATEGORICAL_COLUMNS:
        continue

    # Scores array with a 1 at the position of this column and 0 elsewhere.
    # Example:
    #  data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
    #         'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
    #  scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    #
    # Returns: [['State-gov']]
    column_mask = [
        1 if position == column_index else 0 for position in range(len(COLUMNS))
    ]
    column_selector = SelectKBest(k=1)
    # Assign the scores by hand instead of fitting the selector
    column_selector.scores_ = column_mask

    # Fit the binarizer on the values of the extracted column
    column_binarizer = LabelBinarizer()
    column_binarizer.fit(column_selector.transform(train_features))

    # Pipeline that extracts and encodes this categorical feature
    categorical_pipelines.append(
        (
            "categorical-{}".format(column_index),
            Pipeline(
                [
                    ("SKB-{}".format(column_index), column_selector),
                    ("LBN-{}".format(column_index), column_binarizer),
                ]
            ),
        )
    )

# Selector for the six numerical features: the 1s mark the positions in
# COLUMNS that are not categorical columns.
numerical_selector = SelectKBest(k=6)
numerical_selector.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(("numerical", numerical_selector))

# Combine all the features using FeatureUnion
preprocess = FeatureUnion(categorical_pipelines)

## 训练一个随机森林分类模型

接下来，在预处理过的数据上拟合一个随机森林分类模型。

训练完毕后，将估计器添加到管道对象中，并将管道保存到磁盘。

In [None]:
# Create the classifier. Seeding it with the notebook-wide RANDOM_STATE (also
# used for the train/test split) makes the trained model reproducible across
# kernel restarts.
classifier = RandomForestClassifier(random_state=RANDOM_STATE)

# Transform the features with the preprocessing union and fit the classifier
classifier.fit(preprocess.transform(train_features), train_labels)

# Bundle preprocessing and classifier into one pipeline so the serving
# application can pass raw instances straight to it
pipeline = Pipeline([("union", preprocess), ("classifier", classifier)])

# Save the pipeline to a local file
joblib.dump(pipeline, "model.joblib")

## 为提供预测创建一个容器映像

要使用模型进行预测服务，您需要将模型上传到 Vertex AI 模型注册表，使用一个自定义容器来进行预测。
要创建容器映像，您需要执行以下步骤：
- 将模型保存到您的 Cloud 存储桶中。
- 本地创建一个使用 [`FastAPI`](https://fastapi.tiangolo.com/tutorial/first-steps/) Python 包的服务应用程序。
- 使用 Docker 将应用程序制作成镜像，并上传到 [Artifact Registry 使用 Cloud Build](https://cloud.google.com/build/docs/build-push-docker-image)。

了解更多关于在 Vertex AI 上使用[自定义容器进行预测](https://cloud.google.com/vertex-ai/docs/predictions/use-custom-container)的信息。

### 将模型上传到 Cloud 存储桶

In [None]:
# Copy the saved pipeline into the model artifact folder of the bucket.
!gsutil cp "model.joblib" {BUCKET_URI}/{MODEL_ARTIFACT_DIR}/

### 创建服务应用程序

创建一个源目录，将您的服务应用程序打包在其中。

In [None]:
# Create the source directory
! mkdir $SRC_DIR

# Create the app folder
! mkdir $SRC_DIR/app

# Move your model to the app folder
! mv model.joblib $SRC_DIR/app/

创建包含使用 `FastAPI` 为预测提供服务的代码的 `main.py` 文件。

In [None]:
%%writefile $SRC_DIR/app/main.py
# Import the required libraries
from fastapi import FastAPI, Request
import joblib
import json
import os
from google.cloud import storage
import logging

app = FastAPI()
# Define the Cloud Storage client
gcs_client = storage.Client()

# Download the model file from Cloud Storage bucket
with open("model.joblib", 'wb') as model_f:
    gcs_client.download_blob_to_file(
            f"{os.environ['AIP_STORAGE_URI']}/model.joblib", model_f
        )
    
# Load the scikit-learn model/pipeline file
_model = joblib.load("model.joblib")

# Define a function for health route
@app.get(os.environ['AIP_HEALTH_ROUTE'], status_code=200)
def health():
    return {}

# Define a function for prediction route
@app.post(os.environ['AIP_PREDICT_ROUTE'])
async def predict(request: Request):
    # await the request
    body = await request.json()
    # parse the request instances
    instances = body["instances"]
    # pass it to the model/pipeline for prediction scores
    predictions = _model.predict_proba(instances).tolist()
    # return the batch prediction scores
    return {"predictions": predictions}

### 创建需求文件
创建一个名为 `requirements.txt` 的文件，指定应用程序所需的软件包版本。

In [None]:
%%writefile $SRC_DIR/requirements.txt
# joblib, numpy and scikit-learn use the same versions that the notebook
# installs for training the model.
joblib==1.2.0
numpy==1.23.3
scikit-learn==1.0
# Client library imported by app/main.py to download the model file.
google-cloud-storage>=1.44.0,<2.0.0dev

### 创建一个用于设置环境变量的bash脚本

创建名为`prestart.sh`的bash脚本，将端口设置为`AIP_HTTP_PORT`。您容器中的HTTP服务器将监听此端口上的请求。

In [None]:
%%writefile $SRC_DIR/app/prestart.sh
#!/bin/bash
# Export the value of AIP_HTTP_PORT as PORT.
export PORT=$AIP_HTTP_PORT

### 容器化服务应用程序

为将服务应用程序放入容器中创建一个Dockerfile。

In [None]:
%%writefile $SRC_DIR/Dockerfile

# Base image with uvicorn, gunicorn and FastAPI on Python 3.9
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9

# Copy the serving application and the requirements file into the image
COPY ./app /app
COPY requirements.txt requirements.txt

# Install the packages listed in requirements.txt
RUN pip install -r requirements.txt

### 创建私有Docker仓库

您的第一步是在Google Artifact Registry中创建您自己的Docker仓库。

In [None]:
import os

# Enable the Artifact Registry API for the active gcloud project
! gcloud services enable artifactregistry.googleapis.com

# Only when the IS_TESTING environment variable is set: upgrade the installed
# Google Cloud SDK packages and update the gcloud components.
if os.getenv("IS_TESTING"):
    ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
    ! gcloud components update --quiet

#### 创建存储库

要存储您的容器镜像，请在 Artifact Registry 中创建一个存储库。

In [None]:
# Create a Docker-format repository in Artifact Registry in the configured
# location.
!gcloud artifacts repositories create {REPOSITORY} \
    --repository-format=docker \
    --location=$LOCATION

#### 推送容器镜像
使用Cloud Build将您的serving应用程序容器化，并将其推送到您的存储库中。

In [None]:
# Run the build from inside the source directory so that the Dockerfile and
# the app folder are part of the submitted build context.
%cd $SRC_DIR/
# Build the image with Cloud Build and tag it in the Artifact Registry repository
!gcloud builds submit --region={LOCATION} --tag={LOCATION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE} --suppress-logs
# Return to the notebook's working directory
%cd ..

## 将模型上传到Vertex AI Registry

现在，使用容器镜像和模型上传到的Cloud Storage存储桶中的工件目录路径创建一个Vertex AI模型。

要上传您的模型，您可以使用Vertex AI SDK中的`Model.upload()`方法，通过传递以下参数：

- `display_name`：模型资源的显示名称。
- `artifact_uri`：存放模型文件/工件的Cloud Storage路径。
- `serving_container_image_uri`：提供服务的容器镜像路径。
- `serving_container_predict_route`：提供服务应用程序的预测路由。
- `serving_container_health_route`：提供服务应用程序的健康检查路由。

In [None]:
# Cloud Storage folder that the saved model.joblib was copied to
model_artifact_uri = f"{BUCKET_URI}/{MODEL_ARTIFACT_DIR}"
# Serving image that was built and pushed to Artifact Registry
serving_image_uri = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}"

# Register the model in Vertex AI Model Registry with the custom serving
# container and its prediction / health-check routes.
aip_model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=model_artifact_uri,
    serving_container_image_uri=serving_image_uri,
    serving_container_predict_route="/predict",
    serving_container_health_route="/health",
)

## 运行模型评估管道

在这一部分，您将运行一个 Vertex AI 流水线，其中包括以下步骤：
1. 从 Vertex AI 模型注册表中导入您的模型。
2. 对测试数据进行批量预测采样。
3. 从采样的测试数据中删除目标字段。
4. 运行批处理预测作业。
5. 使用真实标签（ground truth）/目标信息评估批处理预测作业的结果。
6. 将生成的评估指标导入至 Vertex AI 模型。

### 启用 Dataflow API

谷歌云管道组件的模型评估组件会内部创建数据流作业来执行基础任务。在运行评估管道之前，启用 Dataflow API 是必要的。

In [None]:
# Enable the Dataflow API for the active gcloud project.
!gcloud services enable dataflow.googleapis.com

### 定义管道
要为评估您的模型定义 Vertex AI 管道，您可以使用 `google-cloud-pipeline-components` Python 包。Google Cloud Pipeline Components 提供了一个 SDK，其中包含一组管道组件，用户可以与 Google Cloud 服务（如 Vertex AI、Dataflow 和 BigQuery）进行交互。

了解更多关于[Google Cloud 管道组件](https://cloud.google.com/vertex-ai/docs/pipelines/components-introduction)。

评估管道使用以下组件：
- `GetVertexModelOp`：获取 Vertex AI 模型工件。
- `EvaluationDataSamplerOp`：将输入数据集随机下采样到指定大小，用于为 AutoML 表格模型和自定义模型计算 Vertex Explainable AI 特征归因。使用 Apache Beam 创建 Dataflow 作业来对数据集进行下采样。
- `TargetFieldDataRemoverOp`：从输入数据集中删除目标字段，以支持结构化 AutoML 模型和 Vertex AI 批量预测的定制模型。
- `ModelBatchPredictOp`：创建 Vertex AI 批量预测作业并等待完成。
- `ModelEvaluationClassificationOp`：在训练模型的批量预测结果上计算评估指标。使用 Apache Beam 和 TFMA 创建 Dataflow 作业来计算评估指标。支持表格、图像、视频和文本数据的多类分类评估。
- `ModelImportEvaluationOp`：将模型评估工件导入到现有的 Vertex AI 模型中，使用 ModelService.ImportModelEvaluation。

In [None]:
# define the evaluation pipeline


@kfp.dsl.pipeline(name="custom-tabular-classification-evaluation-pipeline")
def evaluation_custom_tabular_feature_attribution_pipeline(
    project: str,
    location: str,
    root_dir: str,
    model_name: str,
    target_field_name: str,
    bigquery_source_input_uri: str,
    bigquery_destination_output_uri: str,
    batch_predict_instances_format: str,
    evaluation_class_names: list,
    batch_predict_predictions_format: str = "bigquery",
    evaluation_prediction_label_column: str = "",
    evaluation_prediction_score_column: str = "prediction",
    batch_predict_machine_type: str = "n1-standard-4",
    batch_predict_starting_replica_count: int = 5,
    batch_predict_max_replica_count: int = 10,
    batch_predict_data_sample_size: int = 10000,
):
    # Evaluation pipeline for a custom tabular classification model:
    #   get model -> sample test data -> remove target field -> batch predict
    #   -> compute classification metrics -> import metrics into the model.
    #
    # Parameters:
    #   project / location: passed to every Google Cloud component below.
    #   root_dir: Cloud Storage directory passed to the sampler, target-field
    #       remover and evaluation components.
    #   model_name: resource name of the Vertex AI model to evaluate.
    #   target_field_name: column removed before prediction and used as the
    #       ground truth during evaluation.
    #   bigquery_source_input_uri: BigQuery table URI of the test data.
    #   bigquery_destination_output_uri: BigQuery URI that the batch
    #       prediction output is written to.
    #   batch_predict_instances_format / batch_predict_predictions_format:
    #       input and output formats shared by batch prediction and evaluation.
    #   evaluation_class_names: class labels of the target field.
    #   evaluation_prediction_label_column / evaluation_prediction_score_column:
    #       column names forwarded to the classification evaluation component.
    #   batch_predict_machine_type, batch_predict_starting_replica_count,
    #   batch_predict_max_replica_count: compute settings of the batch
    #       prediction job.
    #   batch_predict_data_sample_size: sample size given to the data sampler.

    # Import the components
    from google_cloud_pipeline_components.aiplatform import ModelBatchPredictOp
    from google_cloud_pipeline_components.experimental.evaluation import (
        EvaluationDataSamplerOp, GetVertexModelOp,
        ModelEvaluationClassificationOp, ModelImportEvaluationOp,
        TargetFieldDataRemoverOp)

    # Get the Vertex AI model resource
    get_model_task = GetVertexModelOp(model_resource_name=model_name)

    # Run the data sampling task
    data_sampler_task = EvaluationDataSamplerOp(
        project=project,
        location=location,
        root_dir=root_dir,
        bigquery_source_uri=bigquery_source_input_uri,
        instances_format=batch_predict_instances_format,
        sample_size=batch_predict_data_sample_size,
    )

    # Run the task to remove the target field from data for batch prediction.
    # It consumes the sampled table produced by the sampler task.
    data_splitter_task = TargetFieldDataRemoverOp(
        project=project,
        location=location,
        root_dir=root_dir,
        bigquery_source_uri=data_sampler_task.outputs["bigquery_output_table"],
        instances_format=batch_predict_instances_format,
        target_field_name=target_field_name,
    )

    # Run the batch prediction task on the table without the target field
    batch_predict_task = ModelBatchPredictOp(
        project=project,
        location=location,
        model=get_model_task.outputs["model"],
        job_display_name="model-registry-batch-prediction",
        bigquery_source_input_uri=data_splitter_task.outputs["bigquery_output_table"],
        instances_format=batch_predict_instances_format,
        predictions_format=batch_predict_predictions_format,
        bigquery_destination_output_uri=bigquery_destination_output_uri,
        machine_type=batch_predict_machine_type,
        starting_replica_count=batch_predict_starting_replica_count,
        max_replica_count=batch_predict_max_replica_count,
    )

    # Run the evaluation based on prediction type. The ground truth comes from
    # the sampled table (which still has the target field); the predictions
    # come from the batch prediction output table.
    eval_task = ModelEvaluationClassificationOp(
        project=project,
        location=location,
        root_dir=root_dir,
        class_labels=evaluation_class_names,
        prediction_label_column=evaluation_prediction_label_column,
        prediction_score_column=evaluation_prediction_score_column,
        target_field_name=target_field_name,
        ground_truth_format=batch_predict_instances_format,
        ground_truth_bigquery_source=data_sampler_task.outputs["bigquery_output_table"],
        predictions_format=batch_predict_predictions_format,
        predictions_bigquery_source=batch_predict_task.outputs["bigquery_output_table"],
    )

    # Import the model evaluations to the Vertex AI model
    ModelImportEvaluationOp(
        classification_metrics=eval_task.outputs["evaluation_metrics"],
        model=get_model_task.outputs["model"],
        dataset_type=batch_predict_instances_format,
    )

### 可选：为 `predictions_bigquery_source` 导入 BigQuery 表的变通方法

In [None]:
# This cell is intentionally inert: the snippet below is wrapped in a string
# literal, so nothing in it is executed. It sketches how an existing BigQuery
# predictions table could be imported as an artifact and passed to the
# evaluation component inside the pipeline definition.
# NOTE(review): the snippet references `artifact_types`, which is not imported
# in any cell visible in this notebook -- confirm the required import before
# copying the snippet into the pipeline function.
"""
# Set constants for BigQuery Table
BIGQUERY_PROJECT_ID = "your-project-id"
BIGQUERY_DATASET_ID = "your-dataset-id"
BIGQUERY_PREDICTION_RESULTS_TABLE_ID = "your-table-id"

# Import the BigQuery table using the importer to obtain a BQTable artifact
bq_table_uri = f"bq://{BIGQUERY_PROJECT_ID}.{BIGQUERY_DATASET_ID}.{BIGQUERY_PREDICTION_RESULTS_TABLE_ID}"
bq_table = kfp.v2.dsl.importer(
    artifact_uri=bq_table_uri,
    artifact_class=artifact_types.BQTable,
    metadata={
        "projectId": BIGQUERY_PROJECT_ID,
        "datasetId": BIGQUERY_DATASET_ID,
        "tableId": BIGQUERY_PREDICTION_RESULTS_TABLE_ID,
    },
).output

# Run the evaluation based on prediction type
eval_task = ModelEvaluationClassificationOp(
    project=project,
    location=location,
    root_dir=root_dir,
    class_labels=evaluation_class_names,
    prediction_label_column=evaluation_prediction_label_column,
    prediction_score_column=evaluation_prediction_score_column,
    target_field_name=target_field_name,
    ground_truth_format=batch_predict_instances_format,
    ground_truth_bigquery_source=bq_table_uri,
    predictions_format=batch_predict_predictions_format,
    predictions_bigquery_source=bq_table,
)
"""

### 编译评估流程

将定义好的流程编译成一个（json/yaml）文件。

In [None]:
# Compile the pipeline function into the definition file at
# PIPELINE_PACKAGE_PATH.
compiler.Compiler().compile(
    pipeline_func=evaluation_custom_tabular_feature_attribution_pipeline,
    package_path=PIPELINE_PACKAGE_PATH,
)

### 定义参数

在运行您的流水线之前，请设置以下参数：

- `project`：Google Cloud 项目的项目 ID。
- `location`：需要运行流水线的位置。如果未设置，流水线将默认使用 Vertex AI SDK 配置的位置。
- `root_dir`：用于存储分阶段文件和工件的 Cloud Storage 目录。在目录下创建一个随机子目录，用于存储作业信息，以便在失败时恢复作业。
- `model_name`：训练的自定义表格分类模型的资源名称。
- `target_field_name`：用作评估基准的列的名称。
- `bigquery_source_input_uri`：存储测试输入的 BigQuery 表 URI。
- `bigquery_destination_output_uri`：用于在测试集上导出预测的 BigQuery 数据集 URI。
- `batch_predict_instances_format`：用于批量预测和评估的输入格式。
- `batch_predict_predictions_format`：用于批量预测和评估的输出格式。
- `evaluation_class_names`：数据集中目标字段的所有类名称列表。
- `batch_predict_data_sample_size`：批量预测作业和评估所需的输入测试数据样本大小。

In [None]:
# Runtime parameter values for the pipeline; the keys match the argument names
# of the pipeline function. Pipeline arguments that are not listed here keep
# their defaults from the function signature.
parameters = {
    "project": PROJECT_ID,
    "location": LOCATION,
    "root_dir": PIPELINE_ROOT,
    "model_name": aip_model.resource_name,
    "target_field_name": TARGET,
    # Test table written earlier in this notebook
    "bigquery_source_input_uri": f"bq://{PROJECT_ID}.{table_ref.dataset_id}.{table_ref.table_id}",
    # Predictions are written back into the same BigQuery dataset
    "bigquery_destination_output_uri": f"bq://{PROJECT_ID}.{table_ref.dataset_id}",
    "batch_predict_instances_format": "bigquery",
    "batch_predict_predictions_format": "bigquery",
    "evaluation_class_names": CLASS_LABELS,
    "batch_predict_data_sample_size": BATCH_SAMPLE_SIZE,
}

### 运行管道

使用以下参数创建一个 Vertex AI 管道作业并运行它：

- `display_name`：应显示在 Google Cloud 控制台中的管道名称。
- `template_path`：编译后的 PipelineSpec JSON 或 YAML 文件的路径。可以是本地路径、Google Cloud 存储 URI 或 Artifact Registry URI。
- `parameter_values`：运行时参数名称与控制管道运行的值的映射。
- `enable_caching`：布尔值，指定是否打开缓存或不打开缓存。

了解更多关于 Vertex AI SDK 的 [PipelineJob 类](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.PipelineJob)。

创建管道作业后，使用配置的 `SERVICE_ACCOUNT` 来运行它。

In [None]:
# Create the pipeline job
job = aiplatform.PipelineJob(
    display_name=PIPELINE_DISPLAY_NAME,
    template_path=PIPELINE_PACKAGE_PATH,
    parameter_values=parameters,
    enable_caching=True,
)
# Run the pipeline job
job.run(service_account=SERVICE_ACCOUNT)

在从上一步获取的结果中，单击生成的链接以在云控制台中查看您的运行情况。

在用户界面中，单击管道的有向无环图（DAG）中的节点可展开或折叠。以下是DAG的部分展开视图（单击图像以查看更大版本）。

## 打印指标

在管道顺利运行后，从评估任务中获取评估指标并打印出来。

In [None]:
# Reset the results first so that a value left over from an earlier run of
# this cell can never be reported for the current pipeline job.
evaluation_metrics = None
evaluation_metrics_gcs_uri = None

# Task states whose outputs are read
finished_states = (
    aiplatform_v1.types.PipelineTaskDetail.State.SUCCEEDED,
    aiplatform_v1.types.PipelineTaskDetail.State.SKIPPED,
)

# Iterate over the pipeline tasks
for task in job._gca_resource.job_detail.task_details:
    # Select the evaluation task itself, not the "model-evaluation-import" task
    is_evaluation_task = (
        "model-evaluation" in task.task_name
        and "model-evaluation-import" not in task.task_name
    )
    if is_evaluation_task and task.state in finished_states:
        # Obtain the metrics artifact from the evaluation task
        evaluation_metrics = task.outputs.get("evaluation_metrics").artifacts[0]
        evaluation_metrics_gcs_uri = evaluation_metrics.uri

# Fail with a clear message instead of a NameError when nothing matched
if evaluation_metrics is None:
    raise RuntimeError(
        "No succeeded or skipped model-evaluation task was found in the "
        "pipeline job; check the pipeline run before reading the metrics."
    )

print(evaluation_metrics)
print(evaluation_metrics_gcs_uri)

## 可视化指标

使用条形图可视化生成的评估指标。

In [None]:
# Split the metadata of the metrics artifact into metric names and values
metric_items = list(evaluation_metrics.metadata.items())
metrics = [metric_name for metric_name, metric_value in metric_items]
values = [metric_value for metric_name, metric_value in metric_items]

# Draw one bar per metric
plt.figure(figsize=(5, 3))
plt.bar(x=metrics, height=values)
plt.title("Evaluation Metrics")
plt.ylabel("Value")
plt.show()

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以[删除用于本教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源。

- Vertex AI 模型
- Vertex AI 管道作业
- Artifact Registry 中的存储库
- BigQuery 数据集
- 云存储桶（将 `delete_bucket` 设置为 **True** 可删除在此笔记本中创建的云存储桶）。

In [None]:
# Delete model resource
aip_model.delete()

# Delete the evaluation pipeline
job.delete()

# Delete the repository in Artifact Registry
! gcloud artifacts repositories delete --location=us-central1 {REPOSITORY} --quiet

# Delete the BigQuery dataset
! bq rm -r -f $PROJECT_ID:$PREDICTION_INPUT_DATASET_ID

delete_bucket = False
# Delete Cloud Storage objects
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI

# Delete the locally generated files and folders
! rm $PIPELINE_PACKAGE_PATH
! rm -rf $SRC_DIR