In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI管道：使用`google-cloud-pipeline-components`和Spark ML进行贷款资格预测

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_dataproc_tabular.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_dataproc_tabular.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      查看GitHub
    </a>                                                                                            
  </td>
  <td>
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/pipelines/google_cloud_pipeline_components_dataproc_tabular.ipynb" target='_blank'>
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在Vertex AI Workbench中打开
    </a>
  </td>
</table>

请注意：此笔记本已在以下环境中测试：

* Python版本= 3.9

## 概述

本笔记本展示如何使用Spark MLlib和DataprocPySparkBatchOp组件构建Spark ML管道，以确定银行公司客户是否有资格获得贷款。具体而言，该管道涵盖了一个Spark MLlib管道，从数据预处理到随机森林分类器的超参数调整，该分类器预测客户有资格获得贷款的概率。

了解更多关于[Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) 和 [Dataproc components](https://cloud.google.com/vertex-ai/docs/pipelines/dataproc-component)。

### 目标

在这个笔记本中，您将学习如何构建一个 Vertex AI 管道，并使用Spark ML训练一个随机森林模型，用于贷款资格分类问题。

本教程使用以下Google Cloud ML服务和资源：

- Vertex AI 数据集
- Vertex AI 管道
- Vertex AI 训练

执行的步骤包括：

* 使用`DataprocPySparkBatchOp`来预处理数据。
* 在训练数据上创建一个 Vertex AI 数据集资源。
* 使用 PySpark 训练一个随机森林模型。
* 构建一个 Vertex AI 管道并运行训练作业。
* 使用Spark serving镜像在 Vertex AI 端点上部署一个Spark模型。

### 数据集

该数据集是[贷款资格数据集](https://datasetsearch.research.google.com/search?src=2&query=Loan%20Eligible%20Dataset&docid=L2cvMTFsajJrM3EzcA%3D%3D)的预处理版本。

### 费用

本教程使用 Google Cloud 的计费组件：

* Vertex AI
* Cloud Storage
* Dataproc Serverless

了解 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)、[Cloud Storage 价格](https://cloud.google.com/storage/pricing)、[Dataproc 价格](https://cloud.google.com/dataproc/pricing) 并使用 [定价计算器](https://cloud.google.com/products/calculator/)，根据您的预期使用量生成费用估算。

## 安装

安装执行此笔记本所需的软件包。

In [None]:
import os

# (optional) update gcloud if needed; this only runs when the IS_TESTING
# environment variable is set to a non-empty value
if os.getenv("IS_TESTING"):
    ! gcloud components update --quiet


# Install pinned versions of the Vertex AI SDK, the Kubeflow Pipelines SDK and
# Google Cloud Pipeline Components; --no-warn-conflicts silences pip's
# dependency-conflict warnings.
! pip3 install --upgrade --quiet google-cloud-aiplatform==1.30.1 \
                                 kfp==1.8.14 \
                                 google-cloud-pipeline-components==1.0.33 --no-warn-conflicts

仅限Colab：取消注释以下单元格以重新启动内核。

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 开始之前

### 设置您的谷歌云项目

**无论您使用哪种笔记本环境，以下步骤都是必需的。**

1. [选择或创建谷歌云项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建帐户时，您将获得$300的免费信用额度用于计算/存储成本。

2. [确保为您的项目启用了计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Artifact Registry、Cloud Build、Container Registry、Dataproc和Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,containerregistry.googleapis.com,dataproc.googleapis.com,aiplatform.googleapis.com)。

4. 如果您正在本地运行此笔记本，您需要安装[Cloud SDK](https://cloud.google.com/sdk)。

#### 设置您的项目 ID

**如果您不知道您的项目 ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 查看支持页面：[查找项目 ID](https://support.google.com/googleapi/answer/7014113)。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

#### 区域

您还可以更改 Vertex AI 使用的“REGION”变量。了解有关 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)的更多信息。

In [None]:
REGION = "us-central1"  # @param {type: "string"}

#### UUID

如果您处于现场教程会话中，您可能会使用共享测试帐户或项目。为了避免在创建资源时发生用户之间的名称冲突，您为每个实例会话创建一个UUID，并将其附加到您在本教程中创建的资源的名称上。

In [None]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier of lowercase ASCII letters and digits.

    Args:
        length: number of characters to generate (default 8).

    Returns:
        A string of exactly ``length`` characters drawn, with replacement,
        from ``a-z`` and ``0-9``.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked_characters = random.choices(alphabet, k=length)
    return "".join(picked_characters)


UUID = generate_uuid()

### 验证您的谷歌云帐户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关说明操作。

**1. Vertex AI Workbench**
* 不需要做任何操作，因为您已经进行了身份验证。

**2. 本地JupyterLab实例，请取消注释并运行：**

In [None]:
# ! gcloud auth login

**3. Colab，请取消注释并运行：**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. 服务账户或其他**
* 请查看如何向您的服务账户授予云存储权限：https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples

### 启用Google云服务

如果尚未完成，请在您的项目中启用以下服务：

* Artifact Registry
* Cloud Build
* Container Registry
* Dataproc
* Vertex AI

In [None]:
! gcloud services enable \
    artifactregistry.googleapis.com \
    cloudbuild.googleapis.com \
    containerregistry.googleapis.com \
    dataproc.googleapis.com \
    aiplatform.googleapis.com

### 创建一个云存储桶

创建一个存储桶以存储中间产物，例如数据集。

In [None]:
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

只有在您的存储桶尚不存在时：运行以下单元格以创建您的云存储桶。

In [None]:
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

#### 服务账号

您可以使用服务账号来创建 Vertex AI 管道作业。如果您不想使用项目的计算引擎服务账号，请将`SERVICE_ACCOUNT`设置为另一个服务账号ID。

In [None]:
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import os
import sys

IS_COLAB = "google.colab" in sys.modules

if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### 设置服务帐户访问 Vertex AI Pipelines

运行以下命令，将您的服务帐户授予读取和写入管道数据的访问权限，该数据存储在您在上一步中创建的存储桶中。您只需要针对每个服务帐户运行此步骤一次。

In [None]:
# Grant the service account the objectCreator role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Grant the service account the objectViewer role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### 加载预处理数据

该笔记本使用您从Vertex AI Feature Store中读取的预处理数据集。

In [None]:
# Public sample CSV, and the per-session (UUID-suffixed) destination in your bucket.
PUBLIC_DATA_URI = "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/loan_eligibilty/data.csv"
FEATURES_TRAIN_URI = f"{BUCKET_URI}/data/features/snapshots/{UUID}"

# Copy the sample data into your bucket.
!gsutil cp -r $PUBLIC_DATA_URI $FEATURES_TRAIN_URI

### 为Dataproc Serverless启用私有Google访问

In [None]:
SUBNETWORK = "default"  # @param {type:"string"}

# List the subnets in REGION, filtered by the chosen subnet name.
!gcloud compute networks subnets list --regions=$REGION --filter=$SUBNETWORK

# Turn on Private Google Access for the subnet.
!gcloud compute networks subnets update $SUBNETWORK \
--region=$REGION \
--enable-private-ip-google-access

# Print the privateIpGoogleAccess field to confirm the setting.
!gcloud compute networks subnets describe $SUBNETWORK \
--region=$REGION \
--format="get(privateIpGoogleAccess)"

### 创建Docker存储库

您可以在 Artifact Registry 中创建一个 Docker 存储库，用于存放即将创建的自定义 Dataproc 镜像。

In [None]:
# set repo name
REPO_NAME = "loan-eligibility-spark-demo"

# create the repository: a Docker-format Artifact Registry repository in REGION;
# --quiet disables interactive prompts
!gcloud artifacts repositories create $REPO_NAME \
    --repository-format=docker \
    --location=$REGION \
    --quiet \
    --description="loan eligibility spark docker repository"

### 导入库并定义常量

In [1]:
# General
from pathlib import Path as path
from typing import NamedTuple

from google.cloud import aiplatform as vertex_ai
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import (ClassificationMetrics, Condition, Metrics, Output,
                        component)

In [None]:
# Setup
# Local working directories (relative to the notebook) and the name/tag of the
# custom Dataproc Serverless runtime image.
DATAPROC_RUNTIME_VERSION = "1.1.20"
SRC = path("src")
BUILD_PATH = path("build")
DELIVERABLES = path("deliverables")
DATA_PATH = path("data")
RUNTIME_IMAGE = "dataproc_serverless_custom_runtime"
IMAGE_TAG = "1.0.0"

# Pipeline
# Everything the pipeline writes lives under BUCKET_URI; UUID keeps the
# artifacts of different sessions apart.
PIPELINE_NAME = "pyspark-loan-eligibility-pipeline"
PIPELINE_ROOT = f"{BUCKET_URI}/pipelines"
PIPELINE_PACKAGE_PATH = str(BUILD_PATH / f"pipeline_{UUID}.json")
RUNTIME_CONTAINER_IMAGE = f"gcr.io/{PROJECT_ID}/{RUNTIME_IMAGE}:{IMAGE_TAG}"
SUBNETWORK_URI = f"projects/{PROJECT_ID}/regions/{REGION}/subnetworks/{SUBNETWORK}"
ML_APPLICATION = "loan-eligibility"
TASK = "sparkml"
MODEL_TYPE = "rfor"
VERSION = "1.0.0"
MODEL_NAME = f"{ML_APPLICATION}-{TASK}-{MODEL_TYPE}-{VERSION}"
ARTIFACT_URI = f"{BUCKET_URI}/deliverables/bundle/{UUID}"

# Preprocessing
# Command-line arguments for src/data_preprocessing.py.
PREPROCESSING_PYTHON_FILE_URI = f"{BUCKET_URI}/src/data_preprocessing.py"
PROCESSED_DATA_URI = f"{BUCKET_URI}/data/processed"
PREPROCESSING_ARGS = [
    "--train-data-path",
    FEATURES_TRAIN_URI,
    "--out-process-path",
    PROCESSED_DATA_URI,
]

# Training
# Command-line arguments for src/model_training.py.
TRAINING_PYTHON_FILE_URI = f"{BUCKET_URI}/src/model_training.py"
MODEL_URI = f"{BUCKET_URI}/deliverables/model/rfor/{UUID}/train_model"
METRICS_URI = f"{BUCKET_URI}/deliverables/metrics/rfor/{UUID}/train_metrics.json"
TRAINING_ARGS = [
    "--train-path",
    PROCESSED_DATA_URI,
    "--model-path",
    MODEL_URI,
    "--metrics-path",
    METRICS_URI,
]

# Condition
# NOTE(review): presumably the area-under-PR threshold that gates the
# hypertuning branch of the pipeline; confirm against the pipeline definition.
AUPR_THRESHOLD = 0.5
AUPR_HYPERTUNE_CONDITION = "hypertune"

# Hypertuning
# Command-line arguments for src/hp_tuning.py. HPT_ARGS is defined once, with
# all four arguments the script declares (including the MLeap bundle path).
HPT_PYTHON_FILE_URI = f"{BUCKET_URI}/src/hp_tuning.py"
HPT_MODEL_URI = f"{BUCKET_URI}/deliverables/model/rfor/{UUID}/model"
HPT_METRICS_URI = f"{BUCKET_URI}/deliverables/metrics/rfor/{UUID}/metrics.json"
HPT_BUNDLE_URI = f"{ARTIFACT_URI}/model.zip"
HPT_ARGS = [
    "--train-path",
    PROCESSED_DATA_URI,
    "--model-path",
    HPT_MODEL_URI,
    "--metrics-path",
    HPT_METRICS_URI,
    "--bundle-path",
    HPT_BUNDLE_URI,
]
# MLeap Spark packages (Scala 2.12 builds, version 0.21.1) passed as a Spark property.
HPT_RUNTIME_PROPERTIES = {
    "spark.jars.packages": "ml.combust.mleap:mleap-spark-base_2.12:0.21.1,ml.combust.mleap:mleap-spark_2.12:0.21.1"
}

# Experiment
EXPERIMENT_NAME = "loan-eligibility"

# Deploy
# Serving image location inside the Artifact Registry repository REPO_NAME.
SERVING_IMAGE_URI = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPO_NAME}/spark-ml-serving"

### 初始化Vertex AI SDK客户端

In [None]:
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

## 构建 Vertex AI 管道来训练和部署一个Spark模型

在这种情况下，ML管道包括以下步骤：

1. 使用`DataprocPySparkBatchOp`填充分类和数值变量
2. 使用`DataprocPySparkBatchOp`训练`RandomForestClassifier`
3. 运行一个自定义组件以评估模型

如果模型符合性能条件，则：

4. 使用`DataprocPySparkBatchOp`对`RandomForestClassifier`进行超参数调整
5. 将模型序列化为MLeap格式，以在Spark之外使用模型

如果`deploy_model`管道参数设置为`True`：

6. 上传模型到Vertex AI模型注册表
7. 创建一个Vertex AI端点
8. 部署模型到Vertex AI端点，用于提供在线预测请求。

定义PySpark作业的代码。

定义数据预处理、模型训练和超参数调整的代码。

为代码初始化一个源目录。

In [None]:
# make a source directory to save the code
! mkdir $SRC
! echo "" > $SRC/__init__.py

#### 创建用于数据预处理的源代码

创建`data_preprocessing.py`文件，该文件用于加载数据，预处理用于训练的数据，并将处理后的数据上传到云存储中。通过此代码，创建一个带有日志记录功能的Spark会话。预处理是通过此会话处理的，涉及将变量`label`和`loan_amount`的数据类型从字符串转换为双精度型。该代码的参数定义如下：

- `--train-data-path`：训练样本的GCS路径。
- `--out-process-path`：保存处理后数据的路径。

In [None]:
%%writefile $SRC/data_preprocessing.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
data_preprocessing.py is the module for

  - ingest data
  - do simple preprocessing tasks
  - upload processed data to gcs
"""

# Libraries --------------------------------------------------------------------------------
import logging
import argparse
from pathlib import Path
import sys

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as error:
    print('WARN: Something wrong with pyspark library. Please check configuration settings!')
    print(error)

from pyspark.sql.types import StructType, DoubleType, StringType

# Variables --------------------------------------------------------------------------------
DATA_SCHEMA = (StructType()
               .add("label", StringType(), True)
               .add("loan_amount", StringType(), True)
               .add("loan_term", StringType(), True)
               .add("property_area", StringType(), True)
               .add("timestamp", StringType(), True)
               .add("entity_type_customer_id", StringType(), True)
               .add("feature_7", DoubleType(), True)
               .add("feature_3", DoubleType(), True)
               .add("feature_1", DoubleType(), True)
               .add("feature_9", DoubleType(), True)
               .add("feature_5", DoubleType(), True)
               .add("feature_0", DoubleType(), True)
               .add("feature_8", DoubleType(), True)
               .add("feature_4", DoubleType(), True)
               .add("feature_2", DoubleType(), True)
               .add("feature_6", DoubleType(), True)
               )

ENTITY_CUSTOMER_ID = 'entity_type_customer_id'
FEATURE_STORE_IDS = ['timestamp', 'entity_type_customer_id']
CATEGORICAL_VARIABLES = ['loan_term', 'property_area']
IDX_CATEGORICAL_FEATURES = [f'{col}_idx' for col in CATEGORICAL_VARIABLES]
TARGET = 'label'


# Helpers ----------------------------------------------------------------------------------

def set_logger():
    """
    Set logger for the module.

    The logger writes INFO-and-above records to stdout and does not propagate
    to the root logger. Calling this function again returns the same logger
    without attaching another handler, so each record is emitted once.

    Returns:
        logger: logger object
    """
    # adjacent string literals are concatenated verbatim, so the separator
    # after levelname carries its own trailing space
    fmt_pattern = ("%(asctime)s — %(name)s — %(levelname)s — "
                   "%(funcName)s:%(lineno)d — %(message)s")
    main_logger = logging.getLogger(__name__)
    main_logger.setLevel(logging.INFO)
    main_logger.propagate = False
    # attach the stdout handler only once per logger
    if not main_logger.handlers:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(logging.Formatter(fmt_pattern))
        main_logger.addHandler(stream_handler)
    return main_logger


def get_args():
    """
    Get arguments from command line.

    Both options are required; argparse exits with a usage message when one
    of them is missing.

    Returns:
        args: namespace with ``train_data_path`` and ``out_process_path``
    """
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        '--train-data-path',
        help='The GCS path of training sample',
        type=str,
        required=True)
    # the processed data is written to this location, it is not read from it
    args_parser.add_argument(
        '--out-process-path',
        help='''
        The path to write the processed data to.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    return args_parser.parse_args()


# Main -------------------------------------------------------------------------------------

def main(logger, args):
    """
    Read the raw training CSV, cast its numeric string columns and write the result.

    The CSV at ``args.train_data_path`` is read with DATA_SCHEMA, the columns
    listed in FEATURE_STORE_IDS are dropped, ``label`` and ``loan_amount`` are
    cast from string to double, and the frame is written as CSV (with header,
    overwriting existing output) to ``args.out_process_path``.

    Args:
        logger: logger object
        args: arguments from command line
    Returns:
        0 when preprocessing completes
    Raises:
        RuntimeError: logged and then re-raised, so a failed run ends with an
            error instead of looking like a successful job
    """
    # variables
    train_data_path = args.train_data_path
    output_data_path = args.out_process_path

    logger.info('initializing data preprocessing.')
    logger.info('start spark session.')

    # NOTE(review): the master is pinned to local[*]; confirm this is intended
    # when the script runs as a Dataproc Serverless batch.
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("loan eligibility")
             .getOrCreate())
    try:
        logger.info(f'spark version: {spark.sparkContext.version}')
        logger.info('start ingesting data.')

        training_data_raw_df = (spark.read.option("header", True)
                                .option("delimiter", ',')
                                .schema(DATA_SCHEMA)
                                .csv(train_data_path)
                                .drop(*FEATURE_STORE_IDS))

        # both columns are declared as strings in DATA_SCHEMA; cast them to double
        for column_name in ("label", "loan_amount"):
            training_data_raw_df = training_data_raw_df.withColumn(
                column_name, training_data_raw_df[column_name].cast('double'))
        training_data_raw_df.show(truncate=False)

        logger.info(f'load prepared data to {output_data_path}.')
        training_data_raw_df.write.mode('overwrite').csv(str(output_data_path), header=True)
    except RuntimeError as main_error:
        logger.error(main_error)
        # do not swallow the failure: the caller must see that the job failed
        raise

    logger.info('data preprocessing successfully completed!')
    return 0


if __name__ == "__main__":
    # Script entry point: parse the command-line arguments, build the logger,
    # then run the preprocessing job.
    runtime_args = get_args()
    runtime_logger = set_logger()
    main(runtime_logger, runtime_args)

#### 创建用于模型训练的源代码

创建`model_training.py`文件，用于在训练数据上训练一个随机森林分类器模型。训练在Spark会话内使用Spark ML执行。该代码从Cloud Storage存储桶中获取训练数据，对其进行处理并训练随机森林模型。随后，训练好的模型及其评估指标（如AUC-ROC、准确率、精确率等）会保存到所提供的Cloud Storage输出路径中。该代码接受以下参数：

- `--train-path`：训练样本的Cloud存储路径。
- `--model-path`：存储训练好的模型的Cloud存储路径。
- `--metrics-path`：存储模型指标的Cloud存储路径。

In [None]:
%%writefile $SRC/model_training.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
model_training.py is the module for training spark pipeline
"""

# Libraries --------------------------------------------------------------------------------
import logging
import sys
import argparse
from pathlib import Path as path
import tempfile
import json
from urllib.parse import urlparse

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    print('WARN: Something wrong with pyspark library. Please check configuration settings!')
    print(e)

from pyspark.sql.types import StructType, DoubleType, StringType
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import round as spark_round
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from google.cloud import storage

# Variables --------------------------------------------------------------------------------

# Data schema
DATA_SCHEMA = (StructType()
               .add("label", DoubleType(), True)
               .add("loan_amount", DoubleType(), True)
               .add("loan_term", StringType(), True)
               .add("property_area", StringType(), True)
               .add("feature_7", DoubleType(), True)
               .add("feature_3", DoubleType(), True)
               .add("feature_1", DoubleType(), True)
               .add("feature_9", DoubleType(), True)
               .add("feature_5", DoubleType(), True)
               .add("feature_0", DoubleType(), True)
               .add("feature_8", DoubleType(), True)
               .add("feature_4", DoubleType(), True)
               .add("feature_2", DoubleType(), True)
               .add("feature_6", DoubleType(), True)
               )

# Training
TARGET = 'label'
CATEGORICAL_VARIABLES = ['loan_term', 'property_area']
IDX_CATEGORICAL_FEATURES = [f'{col}_idx' for col in CATEGORICAL_VARIABLES]
REAL_TIME_FEATURES_VECTOR = 'real_time_features_vector'
REAL_TIME_FEATURES = 'real_time_features'
FEATURES_SELECTED = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
                     'feature_6', 'feature_7', 'feature_8', 'feature_9', 'real_time_features']
FEATURES = 'features'
RANDOM_SEED = 8
RANDOM_QUOTAS = [0.8, 0.2]


# Helpers ----------------------------------------------------------------------------------
def set_logger():
    """
    Set logger.

    The logger writes INFO-and-above records to stdout and does not propagate
    to the root logger. Repeated calls return the same logger and never
    attach a second handler, so records are not duplicated.

    Returns:
        logger: logger
    """
    # the two literals are joined as-is, so the first one must end with the
    # space that follows the separator
    fmt_pattern = ("%(asctime)s — %(name)s — %(levelname)s — "
                   "%(funcName)s:%(lineno)d — %(message)s")
    main_logger = logging.getLogger(__name__)
    main_logger.setLevel(logging.INFO)
    main_logger.propagate = False
    # only the first call wires up the stdout handler
    if not main_logger.handlers:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(logging.Formatter(fmt_pattern))
        main_logger.addHandler(stream_handler)
    return main_logger


def get_args():
    """
    Get arguments.

    All three options are required; argparse exits with a usage message when
    one of them is missing.

    Returns:
        args: namespace with ``train_path``, ``model_path`` and ``metrics_path``
    """
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        '--train-path',
        help='''
        The GCS path of training data.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    args_parser.add_argument(
        '--model-path',
        help='''
        The GCS path to store the trained model.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    args_parser.add_argument(
        '--metrics-path',
        help='''
        The GCS path to store the metrics of model.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    return args_parser.parse_args()


def build_preprocessing_components():
    """
    Build preprocessing components.

    One StringIndexer is created per entry of CATEGORICAL_VARIABLES, writing
    to the matching name in IDX_CATEGORICAL_FEATURES. Deriving the indexers
    from the constants keeps them aligned with the indexed columns that the
    feature-engineering stage assembles.

    Returns:
        data_preprocessing_stages: list of StringIndexer stages, in the order
            of CATEGORICAL_VARIABLES
    """
    data_preprocessing_stages = [
        # most frequent category gets index 0; unseen/invalid values are kept
        # in an extra bucket instead of failing the transform
        StringIndexer(inputCol=input_column, outputCol=indexed_column,
                      stringOrderType='frequencyDesc', handleInvalid='keep')
        for input_column, indexed_column in zip(CATEGORICAL_VARIABLES, IDX_CATEGORICAL_FEATURES)
    ]
    return data_preprocessing_stages


def build_feature_engineering_components():
    """
    Assemble the feature engineering stages.

    Returns:
        feature_engineering_stages: three stages, in order: an assembler that
            packs the indexed categorical columns into one vector, a
            StandardScaler over that vector, and an assembler that builds the
            final ``features`` column from FEATURES_SELECTED
    """
    return [
        VectorAssembler(inputCols=IDX_CATEGORICAL_FEATURES, outputCol=REAL_TIME_FEATURES_VECTOR),
        StandardScaler(inputCol=REAL_TIME_FEATURES_VECTOR, outputCol=REAL_TIME_FEATURES),
        VectorAssembler(inputCols=FEATURES_SELECTED, outputCol=FEATURES),
    ]


def build_training_model_component():
    """
    Create the model training stage.

    Returns:
        model_training_stage: single-element list holding a
            RandomForestClassifier that reads FEATURES, predicts TARGET and
            is seeded with RANDOM_SEED
    """
    classifier = RandomForestClassifier(featuresCol=FEATURES, labelCol=TARGET, seed=RANDOM_SEED)
    return [classifier]


def build_pipeline(data_preprocessing_stages, feature_engineering_stages, model_training_stage):
    """
    Chain the three stage lists into a single Spark ML Pipeline.

    Args:
        data_preprocessing_stages: list of preprocessing stages (run first)
        feature_engineering_stages: list of feature engineering stages
        model_training_stage: list with the model training stage (run last)
    Returns:
        pipeline: Pipeline whose stages are the three lists concatenated
    """
    ordered_stages = data_preprocessing_stages + feature_engineering_stages
    ordered_stages = ordered_stages + model_training_stage
    return Pipeline(stages=ordered_stages)


def get_true_score_prediction(predictions, target):
    """
    Collect label, score and prediction columns as plain Python lists.

    Args:
        predictions: DataFrame with the ``target``, ``probability`` and
            ``prediction`` columns
        target: name of the label column
    Returns:
        roc_dict: dict with keys ``true`` (the label), ``score`` (element at
            index 1 of the probability vector, rounded to 5 decimals) and
            ``prediction``, each mapped to a list of values
    """
    # pull element 1 out of the probability vector as a Python float
    second_probability = udf(lambda probability: probability[1].item(), DoubleType())
    true_column = col(target).alias('true')
    score_column = spark_round(second_probability('probability'), 5).alias('score')
    return (predictions
            .select(true_column, score_column, 'prediction')
            .toPandas()
            .to_dict(orient='list'))


def get_metrics(predictions, target, mode):
    """
    Evaluate predictions and gather all metrics in one dictionary.

    Args:
        predictions: DataFrame produced by the fitted pipeline
        target: target column name
        mode: prefix for the scalar metric keys (e.g. ``train`` or ``test``)
    Returns:
        metrics: dict with ``{mode}_area_roc``, ``{mode}_area_prc``,
            ``{mode}_accuracy``, ``{mode}_f1``, ``{mode}_precision`` and
            ``{mode}_recall`` (each rounded to 5 decimals), followed by the
            ``true``, ``score`` and ``prediction`` lists
    """
    binary_evaluator = BinaryClassificationEvaluator(labelCol=target)
    multiclass_evaluator = MulticlassClassificationEvaluator(labelCol=target)

    def _evaluate(evaluator, metric_name):
        # run one evaluation with the metric name overridden, rounded to 5 decimals
        return round(evaluator.evaluate(predictions, {evaluator.metricName: metric_name}), 5)

    # areas, acc, f1, prec, rec (precision and recall are the weighted variants)
    metrics = {
        f'{mode}_area_roc': _evaluate(binary_evaluator, 'areaUnderROC'),
        f'{mode}_area_prc': _evaluate(binary_evaluator, 'areaUnderPR'),
        f'{mode}_accuracy': _evaluate(multiclass_evaluator, "accuracy"),
        f'{mode}_f1': _evaluate(multiclass_evaluator, "f1"),
        f'{mode}_precision': _evaluate(multiclass_evaluator, "weightedPrecision"),
        f'{mode}_recall': _evaluate(multiclass_evaluator, "weightedRecall"),
    }

    # true, score, prediction
    roc_dict = get_true_score_prediction(predictions, target)
    for column_key in ('true', 'score', 'prediction'):
        metrics[column_key] = roc_dict[column_key]

    return metrics


def upload_file(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Cloud Storage bucket.

    Args:
        bucket_name: name of the target bucket
        source_file_name: path of the local file to upload
        destination_blob_name: object name to create inside the bucket
    Returns:
        None
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)


def write_metrics(bucket_name, metrics, destination, dir='/tmp'):
    """
    Write metrics to a temporary JSON file and upload it to the bucket.

    Args:
        bucket_name: bucket name
        metrics: JSON-serializable metrics
        destination: object name inside the bucket; its last path component
            is also used as the temporary file name
        dir: directory in which the temporary directory is created
    Returns:
        None
    """
    # the context manager removes the scratch directory even when dumping or
    # uploading raises, so no temporary files are left behind on failure
    with tempfile.TemporaryDirectory(dir=dir) as temp_dir_name:
        temp_metrics_file_path = str(path(temp_dir_name) / path(destination).name)
        with open(temp_metrics_file_path, 'w') as temp_file:
            json.dump(metrics, temp_file)
        upload_file(bucket_name, temp_metrics_file_path, destination)


# Main -------------------------------------------------------------------------------------

def main(logger, args):
    """
    Train the random forest pipeline, evaluate it and persist the results.

    The processed CSV at ``args.train_path`` is split with RANDOM_QUOTAS; the
    pipeline is fitted on the first part and evaluated on the second. The
    fitted pipeline model is saved to ``args.model_path`` and the metrics are
    uploaded as JSON to ``args.metrics_path``.

    Args:
        logger: logger
        args: args
    Returns:
        0 when training completes
    Raises:
        RuntimeError: logged and then re-raised, so a failed run ends with an
            error instead of looking like a successful job
    """
    train_path = args.train_path
    model_path = args.model_path
    metrics_path = args.metrics_path

    try:
        logger.info('initializing pipeline training.')
        logger.info('start spark session.')
        # NOTE(review): the master is pinned to local[*]; confirm this is
        # intended when the script runs as a Dataproc Serverless batch.
        spark = (SparkSession.builder
                 .master("local[*]")
                 .appName("loan eligibility")
                 .getOrCreate())
        logger.info(f'spark version: {spark.sparkContext.version}')
        logger.info('start bulding pipeline.')
        preprocessing_stages = build_preprocessing_components()
        feature_engineering_stages = build_feature_engineering_components()
        model_training_stage = build_training_model_component()
        pipeline = build_pipeline(preprocessing_stages, feature_engineering_stages, model_training_stage)

        logger.info(f'load train data from {train_path}.')
        raw_data = (spark.read.format('csv')
                    .option("header", "true")
                    .schema(DATA_SCHEMA)
                    .load(train_path))

        logger.info(f'fit model pipeline.')
        train, test = raw_data.randomSplit(RANDOM_QUOTAS, seed=RANDOM_SEED)
        pipeline_model = pipeline.fit(train)
        predictions = pipeline_model.transform(test)
        metrics = get_metrics(predictions, TARGET, 'test')
        for m, v in metrics.items():
            print(f'{m}: {v}')

        logger.info(f'load model pipeline in {model_path}.')
        # persist the fitted model, not the unfitted pipeline definition
        pipeline_model.write().overwrite().save(model_path)

        logger.info(f'Upload metrics under {metrics_path}.')
        # bucket and object name both come from the metrics URI, so metrics
        # land where requested even if the model lives in another bucket
        parsed_metrics_uri = urlparse(metrics_path)
        bucket = parsed_metrics_uri.netloc
        metrics_file_path = parsed_metrics_uri.path.strip('/')
        write_metrics(bucket, metrics, metrics_file_path)

    except RuntimeError as main_error:
        logger.error(main_error)
        # do not swallow the failure: the caller must see that the job failed
        raise

    logger.info('model pipeline training successfully completed!')
    return 0


if __name__ == "__main__":
    # Script entry point: parse the command-line arguments, build the logger,
    # then run the training job.
    runtime_args = get_args()
    runtime_logger = set_logger()
    main(runtime_logger, runtime_args)

#### 为超参数调整创建源代码

创建 `hp_tuning.py` 文件，用于通过交叉验证对随机森林分类器模型的超参数进行调整。此代码接受以下参数：

- `--train-path`：训练样本的 GCS 路径。
- `--model-path`：存储训练模型的 GCS 路径。
- `--metrics-path`：存储模型指标的 GCS 路径。
- `--bundle-path`：存储导出的 MLeap bundle 的 GCS 路径。

超参数调整作业还将最佳表现模型序列化为 MLeap bundle，该模型可以作为用于提供预测的模型导入到 Vertex AI - 请查看下面的 *在 Vertex AI 中服务您的模型* 部分。

In [None]:
%%writefile $SRC/hp_tuning.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
hp_model_tuning.py is the module for hypertune the spark pipeline
"""

# Libraries --------------------------------------------------------------------------------
import logging
import sys
import argparse
from os import environ
from datetime import datetime
from pathlib import Path as path
import tempfile
from urllib.parse import urlparse, urljoin
import json

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    print('WARN: Something wrong with pyspark library. Please check configuration settings!')
    print(e)
    
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

from pyspark.sql.types import StructType, DoubleType, StringType
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import round as spark_round
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline

from google.cloud import storage

# Variables --------------------------------------------------------------------------------

# Data schema
DATA_SCHEMA = (StructType()
               .add("label", DoubleType(), True)
               .add("loan_amount", DoubleType(), True)
               .add("loan_term", StringType(), True)
               .add("property_area", StringType(), True)
               .add("feature_7", DoubleType(), True)
               .add("feature_3", DoubleType(), True)
               .add("feature_1", DoubleType(), True)
               .add("feature_9", DoubleType(), True)
               .add("feature_5", DoubleType(), True)
               .add("feature_0", DoubleType(), True)
               .add("feature_8", DoubleType(), True)
               .add("feature_4", DoubleType(), True)
               .add("feature_2", DoubleType(), True)
               .add("feature_6", DoubleType(), True)
               )

# Training
TARGET = 'label'
CATEGORICAL_VARIABLES = ['loan_term', 'property_area']
IDX_CATEGORICAL_FEATURES = [f'{col}_idx' for col in CATEGORICAL_VARIABLES]
REAL_TIME_FEATURES_VECTOR = 'real_time_features_vector'
REAL_TIME_FEATURES = 'real_time_features'
FEATURES_SELECTED = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
                     'feature_6', 'feature_7', 'feature_8', 'feature_9', 'real_time_features']
FEATURES = 'features'
RANDOM_SEED = 8
RANDOM_QUOTAS = [0.8, 0.2]
MAX_DEPTH = [5, 10, 15]
MAX_BINS = [24, 32, 40]
N_TREES = [25, 30, 35]
N_FOLDS = 5


# Helpers ----------------------------------------------------------------------------------
def set_logger():
    """
    Configure and return the logger of this module.

    The logger emits INFO-and-above records to stdout and does not propagate
    them to the root logger. The stream handler is attached only once, so
    calling this function repeatedly does not duplicate log lines.

    Returns:
        logger: the configured logger object
    """
    fmt_pattern = "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s"
    main_logger = logging.getLogger(__name__)
    main_logger.setLevel(logging.INFO)
    main_logger.propagate = False
    # logging.getLogger returns the same object for the same name, so a second
    # call would otherwise stack another handler and print every record twice
    if not main_logger.handlers:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(logging.Formatter(fmt_pattern))
        main_logger.addHandler(stream_handler)
    return main_logger


def get_args():
    """
    Parse the command-line arguments of the job.

    All four paths are mandatory: main() reads the training data from
    --train-path and writes to the other three locations unconditionally,
    so a missing value is rejected here instead of failing later.

    Returns:
        args: namespace with train_path, model_path, metrics_path and bundle_path
    """
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        '--train-path',
        help='''
        The GCS path of training data.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    args_parser.add_argument(
        '--model-path',
        help='''
        The GCS path to store the trained model.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    args_parser.add_argument(
        '--metrics-path',
        help='''
        The GCS path to store the metrics of model.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    args_parser.add_argument(
        '--bundle-path',
        help='''
        The GCS path to store the exported MLeap bundle.
        Format:
        - locally: /path/to/dir
        - cloud: gs://bucket/path
        ''',
        type=str,
        required=True)
    return args_parser.parse_args()


def build_preprocessing_components():
    """
    Create the string indexers for the categorical columns.

    One StringIndexer is built for each of the first two entries of
    CATEGORICAL_VARIABLES, writing to the matching IDX_CATEGORICAL_FEATURES
    column, with labels ordered by descending frequency and
    handleInvalid='keep'.

    Returns:
        preprocessing_components: list holding the loan-term and the property-area indexer
    """
    return [
        StringIndexer(inputCol=CATEGORICAL_VARIABLES[position],
                      outputCol=IDX_CATEGORICAL_FEATURES[position],
                      stringOrderType='frequencyDesc',
                      handleInvalid='keep')
        for position in (0, 1)  # 0 -> loan_term, 1 -> property_area
    ]


def build_feature_engineering_components():
    """
    Create the feature engineering stages, in execution order.

    The indexed categorical columns are assembled into one vector, that vector
    is standard-scaled, and finally every column of FEATURES_SELECTED is
    assembled into the FEATURES vector consumed by the classifier.

    Returns:
        feature_engineering_components: list of the three stages
    """
    categorical_assembler = VectorAssembler(inputCols=IDX_CATEGORICAL_FEATURES,
                                            outputCol=REAL_TIME_FEATURES_VECTOR)
    categorical_scaler = StandardScaler(inputCol=REAL_TIME_FEATURES_VECTOR,
                                        outputCol=REAL_TIME_FEATURES)
    model_input_assembler = VectorAssembler(inputCols=FEATURES_SELECTED, outputCol=FEATURES)
    return [categorical_assembler, categorical_scaler, model_input_assembler]


def build_training_model_component():
    """
    Create the model training stage.

    Returns:
        training_model_component: single-element list with a seeded random
        forest classifier reading FEATURES and predicting TARGET
    """
    return [RandomForestClassifier(featuresCol=FEATURES, labelCol=TARGET, seed=RANDOM_SEED)]


def build_hp_pipeline(data_preprocessing_stages, feature_engineering_stages, model_training_stage):
    """
    Build the cross-validated hyperparameter search over the whole pipeline.

    The three stage lists are concatenated into one Pipeline and wrapped in a
    CrossValidator that explores every combination of MAX_DEPTH, MAX_BINS and
    N_TREES for the classifier using N_FOLDS folds.

    Args:
        data_preprocessing_stages: preprocessing components
        feature_engineering_stages: feature engineering components
        model_training_stage: list whose first element is the classifier to tune
    Returns:
        hp_pipeline: the (unfitted) CrossValidator
    """
    classifier = model_training_stage[0]
    pipeline = Pipeline(stages=data_preprocessing_stages + feature_engineering_stages + model_training_stage)
    params_grid = (ParamGridBuilder()
                   .addGrid(classifier.maxDepth, MAX_DEPTH)
                   .addGrid(classifier.maxBins, MAX_BINS)
                   .addGrid(classifier.numTrees, N_TREES)
                   .build())
    # no metricName is set, so the evaluator's default metric (areaUnderROC) drives model selection
    evaluator = BinaryClassificationEvaluator(labelCol=TARGET)
    # seed the fold assignment like the split and the classifier, so repeated runs are comparable
    cross_validator = CrossValidator(estimator=pipeline,
                                     estimatorParamMaps=params_grid,
                                     evaluator=evaluator,
                                     numFolds=N_FOLDS,
                                     seed=RANDOM_SEED)
    return cross_validator


def get_true_score_prediction(predictions, target):
    """
    Collect labels, positive-class scores and predicted labels as plain lists.

    The whole predictions frame is converted to pandas, i.e. collected on the driver.

    Args:
        predictions: DataFrame with the target, 'probability' and 'prediction' columns
        target: name of the label column
    Returns:
        roc_dict: dict with the keys 'true', 'score' and 'prediction', each a list of values;
        'score' is element 1 of the probability vector rounded to 5 decimals
    """
    # element 1 of the probability vector; .item() unwraps the numpy scalar to a Python float
    positive_probability = udf(lambda value: value[1].item(), DoubleType())
    selected = predictions.select(
        col(target).alias('true'),
        spark_round(positive_probability('probability'), 5).alias('score'),
        'prediction',
    )
    return selected.toPandas().to_dict(orient='list')


def get_metrics(predictions, target, mode):
    """
    Compute evaluation metrics and collect the raw values needed for plots.

    Args:
        predictions: DataFrame produced by the fitted model
        target: name of the label column
        mode: prefix of the scalar metric keys, e.g. 'train' or 'test'
    Returns:
        metrics: dict with '<mode>_area_roc', '<mode>_area_prc', '<mode>_accuracy',
        '<mode>_f1', '<mode>_precision' and '<mode>_recall' (each rounded to 5 decimals;
        precision and recall are the weighted variants) followed by the
        'true', 'score' and 'prediction' lists
    """
    binary_evaluator = BinaryClassificationEvaluator(labelCol=target)
    multiclass_evaluator = MulticlassClassificationEvaluator(labelCol=target)

    def _score(evaluator, metric_name):
        # evaluate one named metric and round it like every other scalar
        return round(evaluator.evaluate(predictions, {evaluator.metricName: metric_name}), 5)

    metrics = {
        f'{mode}_area_roc': _score(binary_evaluator, 'areaUnderROC'),
        f'{mode}_area_prc': _score(binary_evaluator, 'areaUnderPR'),
        f'{mode}_accuracy': _score(multiclass_evaluator, "accuracy"),
        f'{mode}_f1': _score(multiclass_evaluator, "f1"),
        f'{mode}_precision': _score(multiclass_evaluator, "weightedPrecision"),
        f'{mode}_recall': _score(multiclass_evaluator, "weightedRecall"),
    }

    # raw labels / scores / predictions, kept under their unprefixed names
    roc_dict = get_true_score_prediction(predictions, target)
    for column in ('true', 'score', 'prediction'):
        metrics[column] = roc_dict[column]

    return metrics


def upload_file(bucket_name, source_file_name, destination_blob_name):
    """
    Upload a local file to a Cloud Storage object.

    Args:
        bucket_name: name of the destination bucket (without the gs:// prefix)
        source_file_name: path of the local file to upload
        destination_blob_name: object name inside the bucket
    Returns:
        None
    """
    destination_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    destination_blob.upload_from_filename(source_file_name)


def write_metrics(bucket_name, metrics, destination, dir='/tmp'):
    """
    Serialize the metrics to JSON and upload them to Cloud Storage.

    The JSON file is first written to a scratch directory created under `dir`
    and named after the last path component of `destination`.

    Args:
        bucket_name: name of the destination bucket
        metrics: JSON-serializable metrics dict
        destination: object name of the metrics file inside the bucket
        dir: parent directory of the temporary scratch directory
    Returns:
        None
    """
    # the context manager removes the scratch directory even if dumping or uploading raises
    with tempfile.TemporaryDirectory(dir=dir) as temp_dir:
        temp_metrics_file_path = str(path(temp_dir) / path(destination).name)
        with open(temp_metrics_file_path, 'w') as temp_file:
            json.dump(metrics, temp_file)
        upload_file(bucket_name, temp_metrics_file_path, destination)


# Main -------------------------------------------------------------------------------------

def main(logger, args):
    """
    Tune, evaluate and export the loan eligibility pipeline.

    Steps: start a Spark session, build the cross-validated pipeline, read the
    training CSV with DATA_SCHEMA, split it into train and test parts, fit on
    the train part, compute and print the test metrics, save the fitted model,
    upload the metrics JSON, and export the best model as an MLeap bundle that
    is then uploaded to Cloud Storage.

    Args:
        logger: logger used for progress and error messages
        args: parsed arguments exposing train_path, model_path, metrics_path and bundle_path
    Returns:
        0 when the run completes, 1 when a RuntimeError was caught and logged
    """
    train_path = args.train_path
    model_path = args.model_path
    metrics_path = args.metrics_path
    bundle_path = args.bundle_path
    # local scratch file the MLeap bundle is written to before being uploaded
    local_bundle_path = '/tmp/bundle.zip'

    try:
        logger.info('initializing pipeline training.')
        logger.info('start spark session.')
        # NOTE(review): the master is pinned to local[*]; confirm this is intended
        # for the environment the script is submitted to
        spark = (SparkSession.builder
                 .master("local[*]")
                 .appName("loan eligibility")
                 .getOrCreate())
        logger.info(f'spark version: {spark.sparkContext.version}')
        logger.info('start building pipeline.')
        preprocessing_stages = build_preprocessing_components()
        feature_engineering_stages = build_feature_engineering_components()
        model_training_stage = build_training_model_component()
        pipeline_cross_validator = build_hp_pipeline(preprocessing_stages, feature_engineering_stages,
                                                     model_training_stage)
        logger.info(f'load train data from {train_path}.')
        raw_data = (spark.read.format('csv')
                        .option("header", "true")
                        .schema(DATA_SCHEMA)
                        .load(train_path))
        logger.info('fit model pipeline.')
        train, test = raw_data.randomSplit(RANDOM_QUOTAS, seed=RANDOM_SEED)
        pipeline_model = pipeline_cross_validator.fit(train)
        predictions = pipeline_model.transform(test)
        metrics = get_metrics(predictions, TARGET, 'test')
        for m, v in metrics.items():
            print(f'{m}: {v}')

        logger.info(f'save model pipeline to {model_path}.')
        pipeline_model.write().overwrite().save(model_path)

        logger.info(f'upload metrics under {metrics_path}.')
        # the bucket has to come from the metrics URI itself; deriving it from the
        # model URI breaks as soon as the two live in different buckets
        metrics_bucket = urlparse(metrics_path).netloc
        metrics_file_path = urlparse(metrics_path).path.strip('/')
        write_metrics(metrics_bucket, metrics, metrics_file_path)

        logger.info('export MLeap bundle to temporary location')
        pipeline_model.bestModel.serializeToBundle(f'jar:file:{local_bundle_path}', predictions)

        logger.info(f'upload MLeap bundle to {bundle_path}')
        bundle_file_path = urlparse(bundle_path).path.strip('/')
        bundle_bucket = urlparse(bundle_path).netloc
        logger.info(f'Copying {local_bundle_path} to bucket {bundle_bucket} using object name {bundle_file_path} ...')
        upload_file(bundle_bucket, local_bundle_path, bundle_file_path)

    except RuntimeError as main_error:
        # keep the traceback in the job log and report the failure to the caller
        logger.exception(main_error)
        return 1
    else:
        logger.info('model pipeline training successfully completed!')
        return 0


if __name__ == "__main__":
    runtime_args = get_args()
    runtime_logger = set_logger()
    # main() returns 0 only when the run completed; anything else becomes a
    # non-zero exit code so a logged failure is not reported as success
    exit_status = main(runtime_logger, runtime_args)
    sys.exit(0 if exit_status == 0 else 1)

### 上传源代码

为了使用 google-cloud-pipeline-components 中的 `DataprocPySparkBatchOp`，您需要将代码上传到云存储桶中。

In [None]:
# Copy each Python source file from the local $SRC folder to the src/ prefix of the bucket
! gsutil cp $SRC/__init__.py $BUCKET_URI/src/__init__.py
! gsutil cp $SRC/data_preprocessing.py $BUCKET_URI/src/data_preprocessing.py
! gsutil cp $SRC/model_training.py $BUCKET_URI/src/model_training.py
! gsutil cp $SRC/hp_tuning.py $BUCKET_URI/src/hp_tuning.py

### 构建一个自定义的Dataproc无服务器容器镜像

Dataproc无服务器提供默认的运行时镜像。了解更多关于[Dataproc无服务器Spark运行时发布版本](https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-versions)。

您也可以为您的Dataproc无服务器工作负载使用自定义容器镜像。本部分的步骤构建一个包含额外依赖项的自定义容器镜像。可以在使用`DataprocPySparkBatchOp`组件启动流水线中指定自定义容器镜像。

#### 定义Dataproc Serverless自定义运行时镜像

In [None]:
# Create the Docker build context folder: -p makes re-runs a no-op, -m 777 makes it world-writable
!mkdir -m 777 -p $BUILD_PATH

In [None]:
%%writefile $BUILD_PATH/Dockerfile

# Custom runtime image: Debian base + extra Spark jar + Miniconda with the Python packages listed below.
# Debian 11 is recommended.
FROM debian:11-slim

# Suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# (Required) Install utilities required by Spark scripts.
RUN apt update && apt install -y procps tini

# (Optional) Add extra jars.
# The jar is copied from the build context, so it must be present next to this Dockerfile at build time.
ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"

# (Optional) Install and configure Miniconda3.
# The installer script is also copied from the build context; PYSPARK_PYTHON points Spark at the conda interpreter.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
  && ${CONDA_HOME}/bin/conda config --system --set always_yes True \
  && ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
  && ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
  && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict

# (Optional) Install Conda packages.
#
# The following packages are installed in the default image, it is strongly
# recommended to include all of them.
# mleap is the extra dependency added on top of that list.
#
# Use mamba solver to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install -n base conda-libmamba-solver
RUN ${CONDA_HOME}/bin/conda install \
      cython \
      fastavro \
      fastparquet \
      gcsfs \
      google-cloud-bigquery-storage \
      google-cloud-bigquery[pandas] \
      google-cloud-dataproc \
      numpy \
      pandas \
      python \
      scikit-image \
      scikit-learn \
      scipy \
      mleap --solver=libmamba

# (Required) Create the 'spark' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 spark
RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
USER spark

#### 下载 `spark-bigquery-with-dependencies` jar 文件和 Miniconda 安装程序

In [None]:
# Both files are COPY'd by the Dockerfile, so they are downloaded into the build context folder
!gsutil cp gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar $BUILD_PATH
!wget -P $BUILD_PATH https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh

#### 使用Cloud Build构建Dataproc无服务器自定义运行时

**注意：**此步骤可能需要大约20分钟完成。

In [None]:
# Build the image from $BUILD_PATH with Cloud Build and tag it as $RUNTIME_CONTAINER_IMAGE (build timeout: 1 hour)
!gcloud builds submit --tag $RUNTIME_CONTAINER_IMAGE $BUILD_PATH --machine-type=N1_HIGHCPU_32 --timeout=3600s --verbosity=info

### 为管道参数构建自定义组件

为了传递作业参数，您需要为管道的每个步骤创建一些自定义组件。

#### 创建用于向预处理组件传递参数的组件

以下组件以先前定义的预处理函数所需的格式传递参数`--train-data-path`和`--out-process-path`。

In [None]:
@component(base_image="python:3.8-slim")
def build_preprocessing_args(train_data_path: str, processed_data_path: str) -> list:
    """Return the command-line argument list for the preprocessing job.

    Args:
        train_data_path: value passed after --train-data-path
        processed_data_path: value passed after --out-process-path
    Returns:
        flat list of flag/value pairs
    """
    preprocessing_args = ["--train-data-path", train_data_path]
    preprocessing_args += ["--out-process-path", processed_data_path]
    return preprocessing_args

#### 创建组件以传递参数给训练组件

以下组件以先前定义的模型训练函数所需的格式传递参数 `--train-path`、`--model-path` 和 `--metrics-path`。

In [None]:
@component(base_image="python:3.8-slim")
def build_training_args(train_path: str, model_path: str, metrics_path: str) -> list:
    """Return the command-line argument list for the model training job.

    Args:
        train_path: value passed after --train-path
        model_path: value passed after --model-path
        metrics_path: value passed after --metrics-path
    Returns:
        flat list of flag/value pairs
    """
    training_args = ["--train-path", train_path]
    training_args += ["--model-path", model_path]
    training_args += ["--metrics-path", metrics_path]
    return training_args

#### 创建模型评估自定义组件

定义用于处理模型评估指标的组件。该组件从 `metrics_uri` 读取模型训练步骤写入的指标文件，将各项标量指标记录到 `metrics` 输出中，将 ROC 曲线和混淆矩阵记录到 `plots` 输出中，并返回 PR 曲线下面积作为 `threshold_metric`。

In [None]:
@component(
    base_image="python:3.8",
    packages_to_install=["numpy==1.21.2", "pandas==1.3.3", "scikit-learn==0.24.2"],
)
def evaluate_model(
    metrics_uri: str,
    metrics: Output[Metrics],
    plots: Output[ClassificationMetrics],
) -> NamedTuple("Outputs", [("threshold_metric", float)]):
    """Log the metrics stored in a JSON file and return the area under the PR curve.

    Args:
        metrics_uri: gs:// URI of the metrics JSON; it is opened through the
            corresponding /gcs/ path (presumably a FUSE mount of the bucket)
        metrics: output artifact receiving the scalar test metrics
        plots: output artifact receiving the ROC curve and the confusion matrix
    Returns:
        Outputs tuple whose threshold_metric is the file's test_area_prc value
    """
    # Libraries --------------------------------------------------------------------------------------------------------------------------
    import json

    import numpy as np
    from sklearn.metrics import confusion_matrix, roc_curve

    # Variables --------------------------------------------------------------------------------------------------------------------------
    class_names = ["not eligible", "eligible"]
    # (name the metric is logged under, key in the metrics file)
    scalar_keys = (
        ("Test_areaUnderROC", "test_area_roc"),
        ("Test_areaUnderPRC", "test_area_prc"),
        ("Test_Accuracy", "test_accuracy"),
        ("Test_f1-score", "test_f1"),
        ("Test_Precision", "test_precision"),
        ("Test_Recall", "test_recall"),
    )

    # Main -------------------------------------------------------------------------------------------------------------------------------
    with open(metrics_uri.replace("gs://", "/gcs/")) as json_file:
        metrics_dict = json.load(json_file)

    # read every scalar before logging anything, so a missing key fails up front
    scalar_values = {logged_name: metrics_dict[key] for logged_name, key in scalar_keys}
    for logged_name, value in scalar_values.items():
        metrics.log_metric(logged_name, value)

    # ROC curve from the true labels and the positive-class scores
    fpr, tpr, thresholds = roc_curve(
        y_true=np.array(metrics_dict["true"]),
        y_score=np.array(metrics_dict["score"]),
        pos_label=True,
    )
    # confusion matrix from the true labels and the predicted labels
    c_matrix = confusion_matrix(
        np.array(metrics_dict["true"]), np.array(metrics_dict["prediction"])
    )
    plots.log_roc_curve(fpr.tolist(), tpr.tolist(), thresholds.tolist())
    plots.log_confusion_matrix(class_names, c_matrix.tolist())

    component_outputs = NamedTuple("Outputs", [("threshold_metric", float)])
    return component_outputs(scalar_values["Test_areaUnderPRC"])

#### 为超参数调整组件创建传递参数的组件

以下组件以先前定义的超参数调整函数所要求的格式传递参数 `--train-path`，`--model-path`，`--metrics-path` 和 `--bundle-path`。

In [None]:
@component(base_image="python:3.8-slim")
def build_hpt_args(
    train_path: str,
    model_path: str,
    metrics_path: str,
    bundle_path: str,
) -> list:
    """Return the command-line argument list for the hyperparameter tuning job.

    Args:
        train_path: value passed after --train-path
        model_path: value passed after --model-path
        metrics_path: value passed after --metrics-path
        bundle_path: value passed after --bundle-path
    Returns:
        flat list of flag/value pairs
    """
    hpt_args = ["--train-path", train_path]
    hpt_args += ["--model-path", model_path]
    hpt_args += ["--metrics-path", metrics_path]
    hpt_args += ["--bundle-path", bundle_path]
    return hpt_args

### （可选）使用Vertex AI为您的模型提供服务

超参数调整任务将最佳执行模型导出为一个MLeap包。这个MLeap包可以导入到Vertex AI模型注册表中，并用于预测服务。有关更多信息，请参见[Serving Spark ML model using Vertex AI](https://cloud.google.com/architecture/spark-ml-model-with-vertexai)。

启用将MLeap包导入到Vertex AI模型注册表和在线预测服务中。

In [None]:
# Set DEPLOY_MODEL to True to build the serving image and let the pipeline upload and deploy the model;
# set it to False to skip those steps
DEPLOY_MODEL = True

#### 构建模型服务容器镜像

在将您的模型导入到模型注册表中时，需要一个*服务容器镜像*。服务容器镜像为模型提供了模型服务实现。了解更多关于[使用Vertex AI为Spark ML模型提供服务](https://cloud.google.com/architecture/spark-ml-model-with-vertexai)。

**注意：**此步骤可能需要大约5到10分钟才能完成。

In [None]:
# Name given to the pipeline condition that guards the model upload / deployment steps
DEPLOY_MODEL_CONDITION = 'deploy'

if DEPLOY_MODEL:

    import os

    # Every `!` line runs in its own shell, so each command starts with an explicit `cd` to an absolute path
    CWD = os.getcwd()

    # Clone and build the scala-sbt cloud builder
    # NOTE: `git clone` fails if the target folder already exists, e.g. when this cell is re-run
    ! git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git
    ! cd {CWD}/cloud-builders-community/scala-sbt && \
        gcloud builds submit .

    # Clone and build the serving container code
    ! cd {CWD} && git clone https://github.com/GoogleCloudPlatform/vertex-ai-spark-ml-serving.git
    ! cd {CWD}/vertex-ai-spark-ml-serving && \
        gcloud builds submit --config=cloudbuild.yaml \
            --substitutions="_LOCATION={REGION},_REPOSITORY={REPO_NAME},_IMAGE=spark-ml-serving" .

### 为将模型工件导入到管道中创建组件

该管道使用`ModelImportOp`组件将模型导入（上传）到Vertex AI模型注册表。

`import_model_artifact` python组件创建一个模型工件，可以将其传递给`ModelImportOp`组件。

In [None]:
@dsl.component(
    base_image="python:3.8-slim",
    packages_to_install=["google-cloud-aiplatform"],
)
def import_model_artifact(
    model: dsl.Output[dsl.Artifact], artifact_uri: str, serving_image_uri: str
):
    """Populate the output artifact so it describes a model served by a custom container.

    Args:
        model: output artifact to fill in
        artifact_uri: URI stored as the artifact's uri
        serving_image_uri: serving container image recorded in the container spec
    """
    serving_container_spec = dict(
        imageUri=serving_image_uri,
        healthRoute="/health",
        predictRoute="/predict",
    )
    model.metadata["containerSpec"] = serving_container_spec
    model.uri = artifact_uri

### 定义用于模型服务的模式

服务容器需要以JSON格式提供的模型模式，该模式在容器启动期间读取。了解更多关于[提供模型模式](https://cloud.google.com/architecture/spark-ml-model-with-vertexai#provide_the_model_schema)的信息。

编写模型模式文件：

In [None]:
%%writefile $SRC/schema.json
{
  "input": [
    {
      "name": "loan_amount",
      "type": "DOUBLE"
    },
    {
      "name": "loan_term",
      "type": "STRING"
    },
    {
      "name": "property_area",
      "type": "STRING"
    },
    {
      "name": "feature_7",
      "type": "DOUBLE"
    },
    {
      "name": "feature_3",
      "type": "DOUBLE"
    },
    {
      "name": "feature_1",
      "type": "DOUBLE"
    },
    {
      "name": "feature_9",
      "type": "DOUBLE"
    },
    {
      "name": "feature_5",
      "type": "DOUBLE"
    },
    {
      "name": "feature_0",
      "type": "DOUBLE"
    },
    {
      "name": "feature_8",
      "type": "DOUBLE"
    },
    {
      "name": "feature_4",
      "type": "DOUBLE"
    },
    {
      "name": "feature_2",
      "type": "DOUBLE"
    },
    {
      "name": "feature_6",
      "type": "DOUBLE"
    }
  ],
  "output": [
    {
      "name": "prediction",
      "type": "DOUBLE"
    }
  ]
}

将模型模式配置文件复制到 GCS。

在启动时，Serving 容器会从 `AIP_STORAGE_URI` 环境变量中读取模型模式文件的位置。有关更多信息，请参阅[将模型导入到 Vertex AI](https://cloud.google.com/architecture/spark-ml-model-with-vertexai#import-the-model-into-vertex-ai)。

In [None]:
# Place schema.json under $ARTIFACT_URI, the same URI the pipeline passes on as the model artifact location
! gsutil cp $SRC/schema.json $ARTIFACT_URI/schema.json

### 将您的工作流定义为 Vertex AI 管道

使用 Kubeflow Pipelines SDK 将您的工作流定义为机器学习管道。该管道使用了先前定义的自定义组件，以及来自 `google-cloud-pipeline-components` 软件包的组件。

In [None]:
@dsl.pipeline(name=PIPELINE_NAME, description="A pipeline to train a PySpark model.")
def pipeline(
    preprocessing_main_python_file_uri: str = PREPROCESSING_PYTHON_FILE_URI,
    train_data_path: str = FEATURES_TRAIN_URI,
    preprocessed_data_path: str = PROCESSED_DATA_URI,
    training_main_python_file_uri: str = TRAINING_PYTHON_FILE_URI,
    train_path: str = PROCESSED_DATA_URI,
    model_path: str = MODEL_URI,
    metrics_path: str = METRICS_URI,
    threshold: float = AUPR_THRESHOLD,
    hpt_main_python_file_uri: str = HPT_PYTHON_FILE_URI,
    hpt_model_path: str = HPT_MODEL_URI,
    hpt_metrics_path: str = HPT_METRICS_URI,
    hpt_bundle_path: str = HPT_BUNDLE_URI,
    custom_container_image: str = RUNTIME_CONTAINER_IMAGE,
    model_name: str = MODEL_NAME,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    subnetwork_uri: str = SUBNETWORK_URI,
    deploy_model: bool = DEPLOY_MODEL,
    artifact_uri: str = ARTIFACT_URI,
    serving_image_uri: str = SERVING_IMAGE_URI,
    dataproc_runtime_version: str = DATAPROC_RUNTIME_VERSION,
):
    """Preprocess the data, train and evaluate a model, then conditionally tune and deploy it.

    The preprocessing, training and tuning steps each run as a Dataproc PySpark
    batch. Steps hand data to each other through the Cloud Storage paths given
    as parameters rather than through task outputs, which is why the execution
    order is enforced with explicit ``.after()`` calls.
    """
    from google_cloud_pipeline_components.v1.dataproc import \
        DataprocPySparkBatchOp
    from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,
                                                              ModelDeployOp)
    from google_cloud_pipeline_components.v1.model import ModelUploadOp

    # build preprocessed data args
    build_preprocessing_args_op = build_preprocessing_args(
        train_data_path=train_data_path, processed_data_path=preprocessed_data_path
    )

    # preprocess data
    data_preprocessing_op = DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        container_image=custom_container_image,
        main_python_file_uri=preprocessing_main_python_file_uri,
        args=build_preprocessing_args_op.output,
        subnetwork_uri=subnetwork_uri,
        runtime_config_version=dataproc_runtime_version,
    ).after(build_preprocessing_args_op)

    # build training data args
    build_training_args_op = build_training_args(
        train_path=train_path,
        model_path=model_path,
        metrics_path=metrics_path,
    ).after(data_preprocessing_op)

    # training model
    model_training_op = DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        container_image=custom_container_image,
        main_python_file_uri=training_main_python_file_uri,
        args=build_training_args_op.output,
        subnetwork_uri=subnetwork_uri,
        runtime_config_version=dataproc_runtime_version,
    ).after(build_training_args_op)

    # log the metrics written by the training step; the step's output is the area under the PR curve
    evaluate_model_op = evaluate_model(metrics_uri=metrics_path).after(
        model_training_op
    )

    # evaluate condition
    # hyperparameter tuning only runs when the area under the PR curve reaches the threshold
    with Condition(
        evaluate_model_op.outputs["threshold_metric"] >= threshold,
        name=AUPR_HYPERTUNE_CONDITION,
    ):
        build_hpt_args_op = build_hpt_args(
            train_path=train_path,
            model_path=hpt_model_path,
            metrics_path=hpt_metrics_path,
            bundle_path=hpt_bundle_path,
        ).after(evaluate_model_op)

        # hyperparameter tuning
        # NOTE: this step pins its own runtime version and does not use dataproc_runtime_version
        hyperparameter_tuning_op = DataprocPySparkBatchOp(
            project=project_id,
            location=location,
            container_image=custom_container_image,
            main_python_file_uri=hpt_main_python_file_uri,
            args=build_hpt_args_op.output,
            runtime_config_properties=HPT_RUNTIME_PROPERTIES,
            subnetwork_uri=subnetwork_uri,
            # TODO: change to Dataproc Serverless Runtime 1.1.x image when MLeap supports Spark 3.3
            runtime_config_version="1.0.29",
        ).after(model_training_op)

        # evaluate condition to upload and deploy model to Vertex AI
        with Condition(
            # kfp casts `bool` parameter to `str`
            deploy_model == "True",
            name=DEPLOY_MODEL_CONDITION,
        ):
            # import the model into the pipeline as a kfp model artifact
            # (no ordering constraint here; the upload below is what waits for the tuning step)
            import_model_artifact_op = import_model_artifact(
                artifact_uri=artifact_uri,
                serving_image_uri=serving_image_uri,
            )

            # upload model to Vertex AI
            model_upload_op = ModelUploadOp(
                project=project_id,
                location=location,
                display_name=model_name,
                unmanaged_container_model=import_model_artifact_op.outputs["model"],
            ).after(hyperparameter_tuning_op)

            # create a serving endpoint
            endpoint_op = EndpointCreateOp(
                project=project_id,
                location=location,
                display_name=model_name,
            ).after(model_upload_op)

            # deploy  model to the serving endpoint
            _ = ModelDeployOp(
                model=model_upload_op.outputs["model"],
                endpoint=endpoint_op.outputs["endpoint"],
                dedicated_resources_machine_type="n1-standard-2",
                dedicated_resources_min_replica_count=1,
                dedicated_resources_max_replica_count=1,
            ).after(endpoint_op)

### 将您的管道编译成一个 JSON 文件

现在您已经定义了管道的工作流程，您可以将管道编译成 JSON 格式。

In [None]:
# Compile the `pipeline` function into the package file at PIPELINE_PACKAGE_PATH
compiler.Compiler().compile(pipeline_func=pipeline, package_path=PIPELINE_PACKAGE_PATH)

### 提交您的流水线运行

接下来，您可以使用Vertex AI Python SDK通过Vertex AI Pipelines提交和运行您的流水线。

从流水线运行产生的参数、工件和指标将自动记录到 Vertex AI 实验中作为一个实验运行。

In [None]:
# NOTE(review): this rebinds `pipeline`, which until now referred to the @dsl.pipeline function.
# Once this cell has run, re-running the compile cell passes a PipelineJob as `pipeline_func`;
# re-run the pipeline definition cell first in that case.
pipeline = vertex_ai.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=PIPELINE_PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

# Submit without blocking; the run is recorded under the given experiment
pipeline.submit(service_account=SERVICE_ACCOUNT, experiment=EXPERIMENT_NAME)

#### 检查您的流水线运行状态

最后，您可以通过早期单元格输出中提供的链接来检查您的流水线状态。或者，您可以使用下面单元格中的`wait()`方法来等待流水线完全执行并检查流水线执行状态。

In [None]:
# Block until the submitted pipeline job has finished (`pipeline` is the PipelineJob here)
pipeline.wait()

### (可选) 查看实验运行

您可以将所有实验运行的参数、工件和度量值作为 pandas DataFrame 检索出来。有关更多信息，请参阅[比较和分析运行](https://cloud.google.com/vertex-ai/docs/experiments/compare-analyze-runs)。

In [None]:
# get the experiment by name
experiment = vertex_ai.Experiment(experiment_name=EXPERIMENT_NAME)

# export the data as a dataframe
experiment_df = experiment.get_data_frame()

# Show successfully completed experiment runs, sorted by F1 score
# (the column name comes from the "Test_f1-score" metric logged by the evaluate_model component)
experiment_df.query('state == "COMPLETE"').sort_values(
    "metric.Test_f1-score", ascending=False
)

### （可选）从部署的模型获取在线预测

如果模型被部署到 Vertex AI 端点，您可以请求在线预测。使用 `google-cloud-aiplatform` 客户端库来请求预测，或者您也可以使用 `curl`。

对于这个模型，预测的响应会包含发送到端点的每个预测实例的预测标签（`0 == 不符合条件`，`1 == 符合条件`）。

#### 使用 `google-cloud-aiplatform` 请求在线预测

下面的代码示例演示了如何使用 `google-cloud-aiplatform` 客户端库来请求一个或多个实例的预测。

In [None]:
# Each instance is a positional list of 13 values — presumably in the order of the
# "input" fields of schema.json (loan_amount, loan_term, property_area, then the features)
instances = [
    [214.0, "360", "Rural", 2.13, 2.21, 0.0, 0.0, 2.31, 2.01, 0.0, 0.0, 0.0, 0.0],
    [213.0, "360", "Semiurban", 2.03, 2.11, 0.0, 0.0, 2.13, 2.02, 0.0, 0.0, 0.0, 0.0],
]

# NOTE(review): takes the last endpoint listed for this display name and raises IndexError
# when none exists — confirm the list order if several endpoints share the name
endpoint = vertex_ai.Endpoint.list(filter=f'display_name="{MODEL_NAME}"')[-1]
endpoint.predict(instances)

#### 使用`curl`请求在线预测

要使用`curl`，首先将预测实例写入文件中：

In [None]:
%%writefile instances.json
{
    "instances": [
        [214.0, "360", "Rural", 2.13, 2.21, 0.0, 0.0, 2.31, 2.01, 0.0, 0.0, 0.0, 0.0],
        [213.0, "360", "Semiurban", 2.03, 2.11, 0.0, 0.0, 2.13, 2.02, 0.0, 0.0, 0.0, 0.0]
    ]
}

使用`curl`命令将预测请求发送到Vertex AI端点。

In [None]:
# POST instances.json to the endpoint's :predict method, authenticating with a gcloud access token;
# `endpoint` must already be defined by the client-library prediction cell
! curl -X POST \
   -H "Authorization: Bearer $(gcloud auth print-access-token)" \
   -H "Content-Type: application/json" \
   https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:predict \
   -d "@instances.json"

## 清理工作

要清理此项目中使用的所有谷歌云资源，您可以删除用于教程的[谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源：

- Vertex AI Pipeline
- Vertex AI Endpoint
- Vertex AI Model
- Vertex AI Experiment
- Artifact Repository
- Cloud Storage 桶
- 本地 src、build 和 cloned repo 文件夹

In [None]:
# Delete pipeline
# (`pipeline` is the PipelineJob created in the submit cell, not the pipeline function)
pipeline.delete()

# Delete endpoints
# Deployed models have to be undeployed before the endpoint itself is deleted
endpoint_list = vertex_ai.Endpoint.list(filter=f'display_name="{MODEL_NAME}"')
for endpoint in endpoint_list:
    endpoint.undeploy_all()
    endpoint.delete()

# Delete model
model_list = vertex_ai.Model.list(filter=f'display_name="{MODEL_NAME}"')
for model in model_list:
    model.delete()

# Delete experiment
# NOTE: `experiment` is defined in the optional "view experiment runs" cell;
# this line raises NameError if that cell was skipped
experiment.delete()

In [None]:
# Delete the Artifact repository (--quiet skips the confirmation prompt)
! gcloud artifacts repositories delete $REPO_NAME --location=$REGION --quiet

将`delete_bucket`设置为**True**，以删除在此笔记本中使用的Cloud Storage存储桶。

In [None]:
# Delete the Cloud Storage bucket
# The bucket is removed only when delete_bucket is set to True or the IS_TESTING environment variable is set
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI

In [None]:
# remove the local src, build and repo folders (-f: no error for folders that do not exist)
!rm -rf $SRC $BUILD_PATH cloud-builders-community vertex-ai-spark-ml-serving