In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用自定义模型批量预测和特征过滤
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/custom_batch_prediction_feature_filter.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在Colab中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fcustom_batch_prediction_feature_filter.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在Colab企业版中打开
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/custom_batch_prediction_feature_filter.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在Workbench中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/custom_batch_prediction_feature_filter.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在GitHub上查看
    </a>
  </td>
</table>

## 概述

本教程演示了如何使用Python的Vertex AI SDK训练自定义表格分类模型，并使用特征过滤执行批量预测。这意味着您可以在一组选定的特征上运行批量预测，或者在预测中排除一组特征。

了解有关[Vertex AI批量预测](https://cloud.google.com/vertex-ai/docs/tabular-data/classification-regression/get-batch-predictions)的更多信息。

### 目标

在这个笔记本中，您将学习如何使用 Vertex AI SDK for Python，通过在 Docker 容器中运行 Python 脚本来创建自定义训练模型，并通过指定要包含或排除的特征列表来运行批量预测作业。

本教程使用以下 Google Cloud ML 服务和资源：

- BigQuery
- Cloud Storage
- Vertex AI 管理的数据集
- Vertex AI 训练
- Vertex AI 批量预测

执行的步骤包括：

- 创建一个Vertex AI自定义`TrainingPipeline`用于训练模型。
- 训练一个TensorFlow模型。
- 发送批量预测作业。

### 数据集

本教程使用的数据集来自[BigQuery公共数据集](https://cloud.google.com/bigquery/public-data)的企鹅数据集。该数据集包含以下字段：`culmen_length_mm`，`culmen_depth_mm`，`flipper_length_mm`，`body_mass_g`，用于预测企鹅的物种(`species`)。

### 成本

本教程使用 Google Cloud 的计费组件：

* Vertex AI
* Cloud Storage
* BigQuery

了解 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)、[Cloud Storage 价格](https://cloud.google.com/storage/pricing)、[BigQuery 价格](https://cloud.google.com/bigquery/pricing)，并使用 [定价计算器](https://cloud.google.com/products/calculator/) 根据您的预期使用量生成成本估算。

开始吧。

### 安装 Vertex AI SDK for Python 和其他所需包

In [None]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 google-cloud-bigquery \
                                 pyarrow \
                                 db-dtypes

### 重新启动运行时（仅限Colab）

为了使用新安装的软件包，您必须重新启动Google Colab上的运行时。

In [None]:
import sys

# Only act when running inside Google Colab (the `google.colab` module is
# already loaded there).
if "google.colab" in sys.modules:

    import IPython

    # Shut the kernel down with restart=True so the newly installed packages
    # are picked up by the next cells.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️内核将重新启动。在继续下一步之前，请等待它完成。⚠️</b>
</div>

### 对您的笔记本环境进行身份验证（仅限Colab）

在 Google Colab 上对您的环境进行身份验证。

In [None]:
import sys

# Only act when running inside Google Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Prompts the user to authenticate so later cells can call Google Cloud APIs.
    auth.authenticate_user()

### 设置Google Cloud项目信息并初始化Python的Vertex AI SDK

要开始使用Vertex AI，您必须拥有现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
# Google Cloud project and region used by every later cell (bucket creation,
# Vertex AI SDK initialization, BigQuery client). Replace the placeholder.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

### 创建一个云存储桶

创建一个存储桶，用于存储诸如数据集之类的中间文件。

当您使用Cloud SDK提交训练作业时，您需要将包含训练代码的Python软件包上传到一个云存储桶中。Vertex AI 将从此软件包中运行代码。在这个教程中，Vertex AI 还会将作业产生的训练模型保存在同一个存储桶中。通过使用这个模型文件，您可以创建Vertex AI模型资源并用于预测。

In [None]:
# Staging bucket URI; it embeds PROJECT_ID, so set PROJECT_ID before running this cell.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

只有当您的存储桶尚不存在时：运行以下单元格来创建您的云存储桶。

In [None]:
# Create the bucket in LOCATION under PROJECT_ID (skip if it already exists).
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### 导入库

In [None]:
import json

import numpy as np
from google.cloud import aiplatform, bigquery

### 初始化 Vertex AI SDK for Python

为您的项目和相应的存储桶初始化 Python 版本的 Vertex SDK。

In [None]:
# Set the default project, region and staging bucket for all later SDK calls.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### 初始化 BigQuery 客户端

为您的项目初始化BigQuery Python客户端。

In [None]:
# Set up the BigQuery client; `download_table` below reads tables through it.
bqclient = bigquery.Client(project=PROJECT_ID)

### 设置预构建容器

Vertex AI提供预构建容器来运行训练和预测。 

有关最新列表，请参见[用于训练的预构建容器](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers)和[用于预测的预构建容器](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers)。

In [None]:
# Version tags of the pre-built training and prediction containers.
TRAIN_VERSION = "tf-cpu.2-8"
DEPLOY_VERSION = "tf2-cpu.2-8"

# Full image URIs: TRAIN_IMAGE is passed to the training job as its container,
# DEPLOY_IMAGE as the serving container of the resulting model.
TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/{}:latest".format(TRAIN_VERSION)
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/{}:latest".format(DEPLOY_VERSION)

print("Training:", TRAIN_IMAGE)
print("Deployment:", DEPLOY_IMAGE)

### 准备数据

为了提高自定义深度学习模型的收敛性，需要对数据进行标准化处理。为此，请计算每个数值列的均值和标准差。

将这些总结统计信息传递给训练脚本，在训练之前对数据进行标准化处理。在预测时，再次使用这些总结统计信息对测试数据进行标准化处理。

In [None]:
# Helpers to calculate the mean and std of each numeric column across all rows.

# Strings treated as missing values; `clean_dataframe` replaces them with NaN
# and then drops the affected rows.
NA_VALUES = ["NA", "."]


# Download a table
def download_table(bq_table_uri: str):
    """Read all rows of a BigQuery table into a pandas DataFrame.

    Args:
        bq_table_uri: Table identifier understood by
            `bigquery.TableReference.from_string`, optionally prefixed
            with "bq://".

    Returns:
        The DataFrame produced by `to_dataframe()` on the listed rows.
    """
    scheme = "bq://"
    # Strip the scheme so that only the table identifier remains.
    table_id = bq_table_uri[len(scheme) :] if bq_table_uri.startswith(scheme) else bq_table_uri

    table_ref = bigquery.TableReference.from_string(table_id)
    return bqclient.list_rows(table_ref).to_dataframe()


# Remove NA values
def clean_dataframe(df):
    """Return a copy of `df` without rows that contain missing values.

    Every entry listed in `NA_VALUES` is first replaced with NaN, then all
    rows holding at least one NaN are dropped. The input frame is not modified.

    Args:
        df: pandas DataFrame with raw data.

    Returns:
        A new DataFrame containing only the complete rows.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
    return df.replace(to_replace=NA_VALUES, value=np.nan).dropna()


def calculate_mean_and_std(df):
    """Compute the mean and standard deviation of every float column.

    Only columns whose dtype prints as "float32" or "float64" are considered.

    Args:
        df: pandas DataFrame.

    Returns:
        Dict mapping column name -> {"mean": ..., "std": ...}.
    """
    float_columns = [
        name for name, dtype in df.dtypes.items() if str(dtype) in ("float32", "float64")
    ]
    return {
        name: {"mean": df[name].mean(), "std": df[name].std()}
        for name in float_columns
    }

In [None]:
# Define the BigQuery source dataset
BQ_SOURCE = "bq://bigquery-public-data.ml_datasets.penguins"

# Download the table, drop rows with missing values, then compute the
# per-column statistics used to standardize the data.
# Note: `dataframe` is rebound to the cleaned frame and reused later when the
# test data is prepared.
dataframe = download_table(BQ_SOURCE)
dataframe = clean_dataframe(dataframe)
mean_and_std = calculate_mean_and_std(dataframe)
print(f"The mean and stds for each column are: {str(mean_and_std)}")

# Write to a file
MEAN_AND_STD_JSON_FILE = "mean_and_std.json"

with open(MEAN_AND_STD_JSON_FILE, "w") as outfile:
    json.dump(mean_and_std, outfile)

# Save to the staging bucket; the training script receives this location
# through the --mean_and_std_json_file argument.
! gsutil cp {MEAN_AND_STD_JSON_FILE} {BUCKET_URI}

### 从 BigQuery 数据集创建 Vertex AI 表格数据集

训练模型的第一步是创建一个 Vertex AI 表格数据集资源。

In [None]:
DATASET_DISPLAY_NAME = "sample-penguins-unique"

# Create a Vertex AI tabular dataset backed by the BigQuery source table.
dataset = aiplatform.TabularDataset.create(
    display_name=DATASET_DISPLAY_NAME, bq_source=BQ_SOURCE
)

### 训练模型

有两种方法可以使用容器镜像来训练模型：

- **使用 Vertex AI 预构建容器**。如果您使用预构建的训练容器，还必须指定要安装到容器镜像中的 Python 包。这个 Python 包包含您的训练代码。

- **使用您自己的自定义容器镜像**。如果您使用自己的容器，容器镜像必须包含您的训练代码。

### 定义训练脚本的命令参数

准备要传递给训练脚本的命令行参数。
* `args`：要传递给相应Python模块的命令行参数。在本例中，它们是：
  * `--epochs`：训练的轮数（epoch 数）。
  * `--batch_size`：训练的批量大小。
  * `--distribute`：用于单设备或分布式训练的训练分布策略。
     * `"single"`：单设备。
     * `"mirror"`：单个计算实例上的所有GPU设备。
     * `"multi"`：所有计算实例上的所有GPU设备。
  * `--mean_and_std_json_file`：在云存储上具有预先计算的均值和标准差的文件。

In [None]:
# Display name of the custom training job (spelling aligned with the other
# "penguins-*" resource names used in this notebook).
JOB_NAME = "penguins-custom-job-unique"
EPOCHS = 20
BATCH_SIZE = 10
TRAIN_STRATEGY = "single"

# Command-line arguments forwarded to task.py; the names must match the
# options declared by its argparse parser.
CMDARGS = [
    f"--epochs={EPOCHS}",
    f"--batch_size={BATCH_SIZE}",
    f"--distribute={TRAIN_STRATEGY}",
    f"--mean_and_std_json_file={BUCKET_URI}/{MEAN_AND_STD_JSON_FILE}",
]

### 训练脚本

在下一个单元格中，编写训练脚本`task.py`的内容。简而言之，该脚本执行以下操作：

- 使用 BigQuery Python 客户端库从 BigQuery 表中加载数据。
- 从 Cloud Storage 存储桶加载预先计算的平均值和标准差。
- 使用 TF.Keras 模型 API 构建模型。
- 调用 `compile()` 编译模型。
- 根据参数 `args.distribute` 设置训练分发策略。
- 根据参数 `args.epochs` 和 `args.batch_size` 调用 `fit()` 进行模型训练。
- 从环境变量 `AIP_MODEL_DIR` 获取保存模型工件的目录。此变量由[训练服务设置](https://cloud.google.com/vertex-ai/docs/training/code-requirements#environment-variables)。
- 将训练后的模型保存到模型目录中。

In [None]:
%%writefile task.py

import argparse
import os
from typing import Tuple, Optional

import pandas as pd
import numpy as np
import tensorflow as tf

from google.cloud import bigquery
from google.cloud import storage

# Read environmental variables.
# These hold the BigQuery URIs of the train/validation/test splits and are
# expected to be set by the Vertex AI training service when the job runs with
# a managed dataset.
training_data_uri = os.getenv("AIP_TRAINING_DATA_URI")
validation_data_uri = os.getenv("AIP_VALIDATION_DATA_URI")
test_data_uri = os.getenv("AIP_TEST_DATA_URI")

# Read args
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--batch_size', dest='batch_size',
                    default=10, type=int,
                    help='Batch size.')
# Restrict to the strategies handled further down so that a typo fails at
# argument parsing instead of leaving `strategy` undefined.
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    choices=['single', 'mirror', 'multi'],
                    help='Distributed training strategy.')
# Required: the script cannot standardize the data without these statistics,
# and a missing value would otherwise only fail later on a None URI.
parser.add_argument('--mean_and_std_json_file', dest='mean_and_std_json_file', type=str,
                    required=True,
                    help='GCS URI to the JSON file with pre-calculated column means and standard deviations.')
args = parser.parse_args()

# Set up BigQuery clients
bqclient = bigquery.Client()


def download_blob(
    bucket_name: str,
    source_blob_name: str,
    destination_file_name: str,
) -> None:
    """Copy one Cloud Storage object to a local file.

    Args:
        bucket_name: Name of the bucket, e.g. "your-bucket-name".
        source_blob_name: Name of the object inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    # `Bucket.blob` only builds a client-side handle and does not fetch
    # anything from Cloud Storage, which is all that is needed before the
    # download call.
    blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    message = "Blob {} downloaded to {}.".format(source_blob_name, destination_file_name)
    print(message)

def extract_bucket_and_prefix_from_gcs_path(gcs_path: str) -> Tuple[str, Optional[str]]:
    """Split a Cloud Storage path into its bucket name and object prefix.

    Example:

        extract_bucket_and_prefix_from_gcs_path("gs://example-bucket/path/to/folder")
        # -> ("example-bucket", "path/to/folder")

    Args:
        gcs_path (str):
            Required. Path to a Cloud Storage folder or object. A leading
            "gs://" and a single trailing "/" are both optional and ignored.

    Returns:
        Tuple[str, Optional[str]]
            (bucket, prefix). The prefix is None when the path has no "/"
            after the bucket name.
    """
    scheme = "gs://"
    path = gcs_path[len(scheme):] if gcs_path.startswith(scheme) else gcs_path
    if path.endswith("/"):
        path = path[:-1]

    # Everything before the first "/" is the bucket, the remainder the prefix.
    bucket, separator, prefix = path.partition("/")
    return (bucket, prefix if separator else None)


# Download means and std
def download_mean_and_std(mean_and_std_json_file):
    """Download the JSON file with per-column means and standard deviations.

    Args:
        mean_and_std_json_file: GCS URI ("gs://bucket/path/file.json") of the
            JSON file.

    Returns:
        The parsed JSON content.

    Raises:
        ValueError: If the URI names a bucket but no object inside it.
    """
    import json

    bucket, blob_name = extract_bucket_and_prefix_from_gcs_path(mean_and_std_json_file)
    if not blob_name:
        raise ValueError(
            f"Expected a GCS URI pointing to a JSON file, got: {mean_and_std_json_file!r}"
        )

    # Save under the file's base name in the working directory: reusing the
    # full object name as the local path fails for nested objects because the
    # local directories do not exist.
    local_path = os.path.basename(blob_name)
    download_blob(bucket_name=bucket, source_blob_name=blob_name, destination_file_name=local_path)

    with open(local_path, 'r') as file:
        return json.load(file)

        
# Download a table
def download_table(bq_table_uri: str):
    """Read all rows of a BigQuery table into a pandas DataFrame.

    Args:
        bq_table_uri: Table identifier understood by
            `bigquery.TableReference.from_string`, optionally prefixed
            with "bq://".

    Returns:
        The DataFrame produced by `to_dataframe(create_bqstorage_client=False)`.
    """
    scheme = "bq://"
    # Strip the scheme so that only the table identifier remains.
    table_id = bq_table_uri[len(scheme):] if bq_table_uri.startswith(scheme) else bq_table_uri

    table_ref = bigquery.TableReference.from_string(table_id)
    row_iterator = bqclient.list_rows(table_ref)
    return row_iterator.to_dataframe(create_bqstorage_client=False)


def standardize(df, mean_and_std):
    """Convert the float32 columns of `df` to z-scores, in place.

    Each float32 column has the supplied mean subtracted and is then divided
    by the supplied standard deviation. Columns of any other dtype are left
    untouched.

    Args:
      df: Pandas df; modified in place.
      mean_and_std: Dict mapping column name -> {"mean": ..., "std": ...}.
        It must contain an entry for every float32 column of `df`.

    Returns:
      The same df object with its float32 columns scaled.
    """
    float32_columns = [
        name for name, dtype in df.dtypes.items() if str(dtype) == "float32"
    ]
    for name in float32_columns:
        stats = mean_and_std[name]
        df[name] -= stats["mean"]
        df[name] /= stats["std"]
    return df


def preprocess(df):
    """Prepare a raw dataframe for training.

    Drops the columns listed in `UNUSED_COLUMNS` and every row containing NaN,
    casts int32/float32/float64 columns to float32, and replaces each
    object-typed column with the integer category codes defined for that
    column in `_CATEGORICAL_TYPES`.

    Args:
      df: Pandas df with raw data

    Returns:
      New df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS).dropna()

    # Bring the selected numeric columns to a single float32 dtype.
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    # Map each categorical (object) column to its integer category codes.
    cat_columns = df.select_dtypes(["object"]).columns
    df[cat_columns] = df[cat_columns].apply(
        lambda series: series.astype(_CATEGORICAL_TYPES[series.name]).cat.codes
    )
    return df


def convert_dataframe_to_dataset(
    df_train,
    df_validation,
    mean_and_std
):
    """Turn the training and validation dataframes into tf.data datasets.

    Both frames are preprocessed, the label column is split off, the features
    are standardized with `mean_and_std`, and the labels are one-hot encoded.

    Args:
      df_train: Pandas df with the raw training rows.
      df_validation: Pandas df with the raw validation rows.
      mean_and_std: Dict mapping column name -> {"mean": ..., "std": ...}.

    Returns:
      Tuple (dataset_train, dataset_validation) of unbatched
      `tf.data.Dataset` objects yielding (features, one-hot label) pairs.
    """
    df_train = preprocess(df_train)
    df_validation = preprocess(df_validation)

    # `pop` removes the label column in place, so the *_x names (same objects
    # as the preprocessed frames) end up holding only the feature columns.
    df_train_x, df_train_y = df_train, df_train.pop(LABEL_COLUMN)
    df_validation_x, df_validation_y = df_validation, df_validation.pop(LABEL_COLUMN)

    # Join train_x and eval_x so both are standardized in one call with the
    # pre-computed means and standard deviations. Then separate them again.
    all_x = pd.concat([df_train_x, df_validation_x], keys=["train", "eval"])
    all_x = standardize(all_x, mean_and_std)
    df_train_x, df_validation_x = all_x.xs("train"), all_x.xs("eval")

    y_train = np.asarray(df_train_y).astype("float32")
    y_validation = np.asarray(df_validation_y).astype("float32")

    # Convert to numpy representation
    # (despite its name, `x_test` holds the validation features)
    x_train = np.asarray(df_train_x)
    x_test = np.asarray(df_validation_x)

    # Convert to one-hot representation
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=len(SPECIES))
    y_validation = tf.keras.utils.to_categorical(y_validation, num_classes=len(SPECIES))

    dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset_validation = tf.data.Dataset.from_tensor_slices((x_test, y_validation))
    return (dataset_train, dataset_validation)


# Remove NA values
def clean_dataframe(df):
    """Return a copy of `df` without rows that contain missing values.

    Every entry listed in `NA_VALUES` is first replaced with NaN, then all
    rows holding at least one NaN are dropped. The input frame is not modified.

    Args:
      df: Pandas df with raw data

    Returns:
      New df containing only the complete rows
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
    return df.replace(to_replace=NA_VALUES, value=np.nan).dropna()


def create_model(num_features):
    """Build and compile the Keras classifier.

    The network is a stack of fully connected ReLU layers (100, 75, 50, 25
    units) followed by a 3-unit softmax output layer. It is compiled with
    categorical cross-entropy loss, RMSprop and the accuracy metric.

    Args:
      num_features: Number of input features per example.

    Returns:
      The compiled `tf.keras.Sequential` model.
    """
    # Create model
    Dense = tf.keras.layers.Dense
    model = tf.keras.Sequential(
        [
            Dense(
                100,
                activation=tf.nn.relu,
                kernel_initializer="uniform",
                input_dim=num_features,
            ),
            Dense(75, activation=tf.nn.relu),
            Dense(50, activation=tf.nn.relu),
            Dense(25, activation=tf.nn.relu),
            Dense(3, activation=tf.nn.softmax),
        ]
    )

    # Compile Keras model
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
    model.compile(
        loss="categorical_crossentropy", metrics=["accuracy"], optimizer=optimizer
    )

    return model


# Load the pre-computed column statistics used to standardize the features.
mean_and_std = download_mean_and_std(args.mean_and_std_json_file)

# Select the tf.distribute strategy requested through --distribute.
# Single Machine, single compute device
if args.distribute == 'single':
    # `tf.config.list_physical_devices` replaces the deprecated
    # `tf.test.is_gpu_available`; an empty list means no GPU is visible.
    if tf.config.list_physical_devices("GPU"):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    # Stable name of the strategy; the `experimental` alias is deprecated.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
else:
    # Fail here with a clear message instead of a NameError on `strategy` later.
    raise ValueError(
        "Unsupported --distribute value: {!r} (expected 'single', 'mirror' or 'multi')".format(
            args.distribute
        )
    )

# Set up training variables
LABEL_COLUMN = "species"  # column the model learns to predict
UNUSED_COLUMNS = []  # columns dropped by `preprocess` (none here)
NA_VALUES = ["NA", "."]  # strings `clean_dataframe` treats as missing

# Possible categorical values
# The position of a value in its list is the integer code it is encoded to.
SPECIES = ['Adelie Penguin (Pygoscelis adeliae)',
           'Chinstrap penguin (Pygoscelis antarctica)',
           'Gentoo penguin (Pygoscelis papua)']
ISLANDS = ['Dream', 'Biscoe', 'Torgersen']
SEXES = ['FEMALE', 'MALE']

# Download the three data splits referenced by the environment variables.
# NOTE(review): `df_test` is downloaded but not used anywhere in this script.
df_train = download_table(training_data_uri)
df_validation = download_table(validation_data_uri)
df_test = download_table(test_data_uri)

df_train = clean_dataframe(df_train)
df_validation = clean_dataframe(df_validation)

# Categorical dtype per object-typed column; `preprocess` looks these up by
# column name.
_CATEGORICAL_TYPES = {
    "island": pd.api.types.CategoricalDtype(categories=ISLANDS),
    "species": pd.api.types.CategoricalDtype(categories=SPECIES),
    "sex": pd.api.types.CategoricalDtype(categories=SEXES),
}

# Create datasets
dataset_train, dataset_validation = convert_dataframe_to_dataset(
  df_train, 
  df_validation, 
  mean_and_std
)

# Shuffle train set
dataset_train = dataset_train.shuffle(len(df_train))

# Create the model
# The feature count is read from the public `element_spec` of the (still
# unbatched) dataset: element 0 is the feature vector, whose only dimension
# is the number of features.
num_features = dataset_train.element_spec[0].shape[0]
with strategy.scope():
    model = create_model(num_features=num_features)

# Set up datasets
NUM_WORKERS = strategy.num_replicas_in_sync
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size.
GLOBAL_BATCH_SIZE = args.batch_size * NUM_WORKERS
dataset_train = dataset_train.batch(GLOBAL_BATCH_SIZE)
dataset_validation = dataset_validation.batch(GLOBAL_BATCH_SIZE)

# Train the model
model.fit(dataset_train, epochs=args.epochs, validation_data=dataset_validation)

# Export the trained model to the directory named by AIP_MODEL_DIR.
tf.saved_model.save(model, os.getenv("AIP_MODEL_DIR"))

### 训练模型

在Vertex AI上定义您的定制`TrainingPipeline`。

使用`CustomTrainingJob`类来定义`TrainingPipeline`。该类接受以下参数：

- `display_name`：此训练管道的用户定义名称。
- `script_path`：训练脚本的本地路径。
- `container_uri`：训练容器镜像的URI。
- `requirements`：脚本的Python包依赖项列表。
- `model_serving_container_image_uri`：可以为您的模型提供预测的容器的URI，可以是预构建的容器或定制容器。

使用`run`函数开始训练。该函数接受以下参数：

- `dataset`：用于针对此训练进行拟合的Vertex AI数据集。
- `model_display_name`：如果脚本生成托管的`Model`，则为`Model`的显示名称。
- `bigquery_destination`：要将训练数据写入的BigQuery项目位置。
- `args`：要传递给Python脚本的命令行参数。

`run`函数创建一个训练管道，训练并创建一个`Model`对象。训练管道完成后，`run`函数会返回`Model`对象。

In [None]:
# Define the custom training job: task.py runs inside the pre-built training
# container with the listed extra Python packages installed, and the
# resulting model is served with DEPLOY_IMAGE.
job = aiplatform.CustomTrainingJob(
    display_name=JOB_NAME,
    script_path="task.py",
    container_uri=TRAIN_IMAGE,
    requirements=["google-cloud-bigquery>=2.20.0", "db-dtypes", "protobuf==3.20.3"],
    model_serving_container_image_uri=DEPLOY_IMAGE,
)

MODEL_DISPLAY_NAME = "penguins-unique"

# Start the training
# `run` returns the trained model, which is used later as the model of the
# batch prediction requests.
model = job.run(
    dataset=dataset,
    model_display_name=MODEL_DISPLAY_NAME,
    bigquery_destination=f"bq://{PROJECT_ID}",
    args=CMDARGS,
)

### 使用特征过滤发送批量预测作业请求（instanceConfig字段）

现在模型已准备就绪，您可以直接从模型资源发送批量预测请求，而无需将模型部署到端点。

有时，您的输入数据与预测器接受的数据格式不匹配。特征过滤使您可以在预测请求中排除某些字段（例如标识符或元数据），这些字段位于输入数据中，或者仅包含输入数据中的部分字段在预测请求中，而无需在预测容器中进行任何自定义预处理或后处理。
您可以对批量输入进行过滤和/或转换。

在此笔记本中，您将学习如何通过在`BatchPredictionJob`请求中指定`instanceConfig`来包含或排除一组特征发送批量预测请求（**仅限v1beta1**）。

了解有关[Vertex AI上的预测](https://cloud.google.com/vertex-ai/docs/predictions/overview)的更多信息<br>
了解有关[特征过滤](https://cloud.google.com/vertex-ai/docs/predictions/get-predictions#filter_and_transform_input_data_preview)的更多信息

### 准备测试数据

通过标准化和将分类值转换为数值来准备测试数据。
您必须以与您的标准化训练数据相同的方式来标准化这些值。

在这个例子中，我们在测试数据集中添加了一个名为 `id` 的额外列，这个列在训练中没有使用。我们演示如何在预测时排除这个特征。
在这里，您使用与训练时相同的数据集进行测试。在实践中，通常希望使用单独的测试数据集来验证您的结果。

In [None]:
import pandas as pd
from google.cloud import bigquery

# These constants repeat the values defined in task.py so that the test data
# is encoded the same way as the training data.
UNUSED_COLUMNS = []  # columns dropped by `preprocess` (none here)
LABEL_COLUMN = "species"  # label column, split off before prediction

# Possible categorical values
# The position of a value in its list is the integer code it is encoded to.
SPECIES = [
    "Adelie Penguin (Pygoscelis adeliae)",
    "Chinstrap penguin (Pygoscelis antarctica)",
    "Gentoo penguin (Pygoscelis papua)",
]
ISLANDS = ["Dream", "Biscoe", "Torgersen"]
SEXES = ["FEMALE", "MALE"]

# Categorical dtype per object-typed column; `preprocess` looks these up by
# column name.
_CATEGORICAL_TYPES = {
    "island": pd.api.types.CategoricalDtype(categories=ISLANDS),
    "species": pd.api.types.CategoricalDtype(categories=SPECIES),
    "sex": pd.api.types.CategoricalDtype(categories=SEXES),
}


def standardize(df, mean_and_std):
    """Convert the float32 columns of `df` to z-scores, in place.

    Each float32 column has the supplied mean subtracted and is then divided
    by the supplied standard deviation. Columns of any other dtype are left
    untouched.

    Args:
      df: Pandas df; modified in place.
      mean_and_std: Dict mapping column name -> {"mean": ..., "std": ...}.
        It must contain an entry for every float32 column of `df`.

    Returns:
      The same df object with its float32 columns scaled.
    """
    float32_columns = [
        name for name, dtype in df.dtypes.items() if str(dtype) == "float32"
    ]
    for name in float32_columns:
        stats = mean_and_std[name]
        df[name] -= stats["mean"]
        df[name] /= stats["std"]
    return df


def preprocess(df, mean_and_std):
    """Prepare a raw dataframe for prediction.

    Drops the columns listed in `UNUSED_COLUMNS` and every row containing NaN,
    casts int32/float32/float64 columns to float32, and replaces each
    object-typed column with the integer category codes defined for that
    column in `_CATEGORICAL_TYPES`.

    Args:
      df: Pandas df with raw data
      mean_and_std: Not used by this function; standardization happens in
        `standardize`.

    Returns:
      New df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS).dropna()

    # Bring the selected numeric columns to a single float32 dtype.
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    # Map each categorical (object) column to its integer category codes.
    cat_columns = df.select_dtypes(["object"]).columns
    df[cat_columns] = df[cat_columns].apply(
        lambda series: series.astype(_CATEGORICAL_TYPES[series.name]).cat.codes
    )
    return df


def convert_dataframe_to_list(df, mean_and_std):
    """Preprocess and standardize a dataframe, then split features and labels.

    Args:
      df: Pandas df with raw data, including the label column.
      mean_and_std: Dict mapping column name -> {"mean": ..., "std": ...}.

    Returns:
      Tuple (x, y, features): the feature rows as nested lists, the label
      codes as a list of floats, and the feature dataframe itself.
    """
    features = preprocess(df, mean_and_std)

    # `pop` removes the label column in place, leaving only feature columns.
    labels = features.pop(LABEL_COLUMN)

    # Standardize with the supplied means and standard deviations; the frame
    # is modified in place, so the return value is not needed.
    standardize(features, mean_and_std)

    label_values = np.asarray(labels).astype("float32")
    feature_values = np.asarray(features)

    return feature_values.tolist(), label_values.tolist(), features


# Build the test inputs from the cleaned penguins dataframe and the statistics
# computed earlier; `df_x` (features only) is uploaded to BigQuery below.
x_test, y_test, df_x = convert_dataframe_to_list(dataframe, mean_and_std)

In [None]:
# Add an id column (0, 1, 2, ...) to a copy of the test dataframe. The model
# was not trained on this column, so it is filtered out at prediction time.
ID_COLUMN_NAME = "id"
df_x_with_id = df_x.copy()
df_x_with_id[ID_COLUMN_NAME] = list(range(len(df_x_with_id)))

# Print columns of the dataframe
print(f"Test dataset columns: {df_x_with_id.columns}")

### 将测试数据框上传到 BigQuery

In [None]:
def save_dataframe_to_bigquery(
    dataframe: pd.DataFrame, dataset_name: str, table_name: str
) -> str:
    """Load a dataframe into a BigQuery table, replacing any existing rows.

    The dataset is created in PROJECT_ID when it does not exist yet.

    Args:
        dataframe (pd.Dataframe): dataframe to be loaded to bigquery
        dataset_name (str): name of the BigQuery dataset for storing the data
        table_name (str): name of the BigQuery table that is being written

    Returns:
        str: table id of the destination bigquery table
    """
    client = bigquery.Client(PROJECT_ID)

    # Make sure the target dataset exists; an existing one is reused.
    client.create_dataset(
        bigquery.Dataset(f"{PROJECT_ID}.{dataset_name}"), exists_ok=True
    )

    # WRITE_TRUNCATE replaces the table content instead of appending to it
    # (appending is BigQuery's default for load jobs).
    load_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")

    # Reference: https://cloud.google.com/bigquery/docs/samples/bigquery-load-table-dataframe
    load_job = client.load_table_from_dataframe(
        dataframe=dataframe,
        destination=f"{PROJECT_ID}.{dataset_name}.{table_name}",
        job_config=load_config,
    )

    # Block until the load job has finished.
    load_job.result()

    return str(load_job.destination)

In [None]:
# Upload the Dataframe to a BigQuery table

DATASET_NAME = "test_dataset"
TABLE_NAME = "test-data-unique"

# TABLE_ID is the destination table id returned by the load job; it is used
# as the input of the batch prediction requests.
TABLE_ID = save_dataframe_to_bigquery(
    dataframe=df_x_with_id, dataset_name=DATASET_NAME, table_name=TABLE_NAME
)

### 使用 REST API 发送 BatchPredictionJob 请求

现在您有了测试数据，您可以使用它来通过 REST API 发送批量预测请求。为此，您需要创建一个包含以下信息的 `JSON` 请求：

- `BATCH_JOB_NAME`：批量预测作业的显示名称。
- `MODEL_URI`：用于进行预测的模型资源的 URI。
- `INPUT_FORMAT`：输入数据的格式：bigquery、jsonl、csv、tf-record、tf-record-gzip 或 file-list。
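- `INPUT_URI`：输入数据的 URI。本示例使用 BigQuery 表的 URI（`bq://项目.数据集.表`）；如果输入位于 Cloud Storage，则为 Cloud Storage URI，可以包含通配符。
- `OUTPUT_URI`：您希望 Vertex AI 保存输出的位置的 URI。本示例使用 BigQuery 项目的 URI（`bq://项目ID`）；如果输出到 Cloud Storage，则为目标目录的 Cloud Storage URI。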
- `MACHINE_TYPE`：用于此批量预测作业的机器资源。

在本示例中，我们创建了同一 JSON 请求的两个版本：一个带有 `excludedFields`，另一个带有 `includedFields`，以展示如何包含或排除某些特征。请注意，在本示例中，这两个请求执行相同的任务！

了解更多关于[请求批量预测](https://cloud.google.com/vertex-ai/docs/predictions/get-predictions#api_1)<br>
了解更多关于[instanceconfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs#instanceconfig)

In [None]:
# Settings shared by both batch prediction requests.
BATCH_JOB_NAME = "penguins-test"
MODEL_URI = model.resource_name
# Input is read from the BigQuery table uploaded above; output is written to
# BigQuery in this project.
INPUT_FORMAT = "bigquery"
INPUT_URI = f"bq://{TABLE_ID}"
OUTPUT_FORMAT = "bigquery"
OUTPUT_URI = f"bq://{PROJECT_ID}"
MACHINE_TYPE = "n1-standard-2"
# Columns removed from each instance before it is sent to the model.
EXCLUDED_FIELDS = [ID_COLUMN_NAME]

# Create a list of columns to be included
# (every column of the test table except the id column)
ALL_COLUMNS = list(df_x_with_id.columns)
INCLUDED_FIELDS = ALL_COLUMNS.copy()
INCLUDED_FIELDS.remove(ID_COLUMN_NAME)

### 创建JSON主体请求

In [None]:
import json

# Request body of a batch prediction job that drops the columns listed in
# EXCLUDED_FIELDS from every instance (`instanceConfig.excludedFields`).
request_with_excluded_fields = {
    "displayName": f"{BATCH_JOB_NAME}-excluded_fields",
    "model": MODEL_URI,
    "inputConfig": {
        "instancesFormat": INPUT_FORMAT,
        "bigquerySource": {"inputUri": INPUT_URI},
    },
    "outputConfig": {
        "predictionsFormat": OUTPUT_FORMAT,
        "bigqueryDestination": {"outputUri": OUTPUT_URI},
    },
    "dedicatedResources": {
        "machineSpec": {
            "machineType": MACHINE_TYPE,
        }
    },
    "instanceConfig": {"excludedFields": EXCLUDED_FIELDS},
}

# Write the body to the file that the curl command sends with `-d @...`.
with open("request_with_excluded_fields.json", "w") as outfile:
    json.dump(request_with_excluded_fields, outfile)

In [None]:
# Request body of a batch prediction job that keeps only the columns listed
# in INCLUDED_FIELDS for every instance (`instanceConfig.includedFields`).
request_with_included_fields = {
    "displayName": f"{BATCH_JOB_NAME}-included_fields",
    "model": MODEL_URI,
    "inputConfig": {
        "instancesFormat": INPUT_FORMAT,
        "bigquerySource": {"inputUri": INPUT_URI},
    },
    "outputConfig": {
        "predictionsFormat": OUTPUT_FORMAT,
        "bigqueryDestination": {"outputUri": OUTPUT_URI},
    },
    "dedicatedResources": {
        "machineSpec": {
            "machineType": MACHINE_TYPE,
        }
    },
    "instanceConfig": {"includedFields": INCLUDED_FIELDS},
}

# Write the body to the file that the curl command sends with `-d @...`.
with open("request_with_included_fields.json", "w") as outfile:
    json.dump(request_with_included_fields, outfile)

### 发送请求

要发送请求，请指定要使用的 API 版本。本例使用 `v1beta1`，因为只有该版本支持 `instanceConfig`。

排除字段

在这里，我们使用 `excludedFields` 发送请求。运行下面的单元格后，您应该会收到一个包含您提供信息的 JSON 响应。然后等待作业完成（您可以在 Vertex AI 批量预测菜单上检查作业状态，也可以使用 Python SDK）。

In [None]:
! curl \
  -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  -d @request_with_excluded_fields.json \
  https://{REGION}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/batchPredictionJobs

包括字段

在这里，我们使用 `includedFields` 发送请求：

In [None]:
! curl \
  -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" \
  -d @request_with_included_fields.json \
  https://{REGION}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/batchPredictionJobs

## 清理工作

要清理本项目中使用的所有谷歌云资源，您可以删除用于教程的[谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在此笔记本中创建的各个资源。

In [None]:
# Warning: Setting this to true deletes everything in your bucket
delete_bucket = True

# Delete the training job
job.delete()

# Delete the dataset
dataset.delete()

# Delete the model
model.delete()

# Remove the staging bucket and everything in it (only when enabled above).
if delete_bucket:
    ! gsutil rm -r $BUCKET_URI

# Delete the created BigQuery dataset
# (the dataset holding the uploaded test table, including all its tables)
# NOTE(review): the batch prediction jobs submitted with curl and their
# BigQuery output are not removed by this cell.
! bq rm -r -f $PROJECT_ID:$DATASET_NAME