In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Builder SDK：AutoML 预测模型训练示例

要使用这个 Colab 笔记本，您需要将笔记本复制到您自己的谷歌云端硬盘，并在 Colaboratory（简称 Colab）中打开。您可以运行每个步骤或单元格，并查看其结果。要运行单元格，请使用 Shift+Enter。Colab 会自动显示每个单元格中最后一行的返回值。有关在 Colab 中运行笔记本的更多信息，请参见 [Colab 欢迎页面](https://colab.research.google.com/notebooks/welcome.ipynb)。

这个笔记本演示了如何基于时间序列数据集创建一个 AutoML 预测模型。还将提及导出和可视化测试集预测的过程。您需要提供一个存储数据集的存储桶。

注意：您可能会因测试此 SDK 而产生训练、预测、存储或使用其他 GCP 产品的费用。

# 安装Vertex AI SDK，进行身份验证，并将数据集上传到您的GCS存储桶

在安装SDK后，内核将会自动重新启动。您可能会看到错误消息`Your session crashed for an unknown reason`，这是正常的。

In [None]:
%%capture
# Cell output is captured (hidden) by the %%capture magic above.
# Remove any existing Vertex AI SDK and install it again, unpinned.
!pip3 uninstall -y google-cloud-aiplatform
!pip3 install google-cloud-aiplatform

import IPython

# Shut the kernel down with restart=True so the next cells import the
# freshly installed package; a "session crashed" message is expected here.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

In [None]:
import sys

# The google.colab package is only loaded inside Colab kernels; when it is
# present, run Colab's interactive user authentication.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### 输入您的项目和GCS桶

在下面的单元格中输入您的项目ID。然后运行该单元格，以确保Cloud SDK在这个笔记本中的所有命令都使用正确的项目。

如果您不知道您的项目 ID，您可以使用 gcloud 命令获取您的项目 ID。

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

否则，请在这里设置您的项目ID。

In [None]:
# Manual fallback: type the project ID into the form field when none was detected.
if PROJECT_ID in ("", None):
    PROJECT_ID = ""  # @param {type:"string"}

如果您正在进行实时教程会话，您可能正在使用共享的测试账户或项目。为了避免在创建的资源上发生名称冲突，您可以为每个实例会话创建一个时间戳，并将其附加到您在本教程中创建的资源的名称上。

In [None]:
from datetime import datetime

# Current local time as a 14-digit YYYYMMDDHHMMSS string, appended to resource
# names to keep them unique per session.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

请在下面设置您的云存储桶名称。它必须在所有云存储桶中是唯一的。

您也可以更改REGION变量，在本笔记本的其余部分中使用。请确保选择一个[Vertex AI 服务可用的地区](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions)。您不可以使用多区域存储桶来进行 Vertex AI 的训练。

In [None]:
# Colab form fields. An empty BUCKET_NAME is replaced by a generated
# "gs://..." name in the following cell.
BUCKET_NAME = ""  # @param {type:"string"}
# Region for this notebook's Vertex AI resources (see the supported-locations
# link in the text above); empty by default.
REGION = ""  # @param {type:"string"}

In [None]:
# No usable bucket name was entered (empty, None, or the untouched placeholder):
# derive one from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "gs://[your-bucket-name]"):
    BUCKET_NAME = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

我们使用的数据集是来自[Iowa Liquor Retail Sales](https://console.cloud.google.com/marketplace/product/iowa-department-of-commerce/iowa-liquor-sales)数据集的样本。训练样本包含了2020年的销售数据，而预测样本（用于批量预测步骤）包含了2021年1月至4月的销售数据。

In [None]:
# BigQuery URI of the public training table; passed as the dataset's
# bq_source when the managed dataset is created.
TRAINING_DATASET_BQ_PATH = (
    "bq://bigquery-public-data:iowa_liquor_sales_forecasting.2020_sales_train"
)

# 初始化Vertex AI SDK

为Vertex AI初始化*客户端*。

In [None]:
from google.cloud import aiplatform

# Set the SDK-wide defaults used by every later aiplatform call.
# REGION is forwarded only when it was filled in; an empty value becomes None,
# which leaves the SDK's own default location in effect, exactly as before.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION or None,
    staging_bucket=BUCKET_NAME,
)

# 从BigQuery中创建一个托管的时间序列数据集

这部分将从一个BigQuery表中创建一个数据集。

In [None]:
# Create a managed time-series dataset backed by the BigQuery training table.
ds = aiplatform.datasets.TimeSeriesDataset.create(
    display_name="iowa_liquor_sales_train_job", bq_source=[TRAINING_DATASET_BQ_PATH]
)

# Only the last expression of a cell is displayed, so the resource name has to
# be printed explicitly to be visible alongside the dataset object.
print(ds.resource_name)
ds

# 启动训练作业以创建模型

定义好训练作业后，我们将创建一个模型，导出测试集的预测结果，并在训练日志中输出测试集预测结果的BigQuery位置。

In [None]:
# Column roles shared by the job definition and the run() call below.
time_column = "date"
time_series_identifier_column = "store_name"
target_column = "sale_dollars"

# Define the AutoML forecasting job: display name, optimization objective, and
# the declared type of each column listed in column_specs.
job = aiplatform.AutoMLForecastingTrainingJob(
    display_name="train-iowa-liquor-sales-automl-1",
    optimization_objective="minimize-rmse",
    column_specs={
        time_column: "timestamp",
        target_column: "numeric",
        "city": "categorical",
        "zip_code": "categorical",
        "county": "categorical",
    },
)

# NOTE(review): dataset_id, bq_table_name and bq_evaluated_examples_uri are
# built here but are not passed to job.run() or read by any other visible
# cell — confirm whether they were meant to choose the export destination.
dataset_id = "iowa_liquor_sales_train_job"
bq_table_name = "iowa_liquor_sales_test_pred"
bq_evaluated_examples_uri = "bq://{}:{}:{}".format(
    PROJECT_ID, dataset_id, bq_table_name
)
# This will take around an hour to run
# Trains on `ds` and binds the returned model to `model`. The time column is the
# only column marked available at forecast time, the target the only one marked
# unavailable, and city/zip_code/county are passed as per-series attributes.
# export_evaluated_data_items=True is what a later cell relies on when it reads
# job.evaluated_data_items_bigquery_uri.
model = job.run(
    dataset=ds,
    target_column=target_column,
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    available_at_forecast_columns=[time_column],
    unavailable_at_forecast_columns=[target_column],
    time_series_attribute_columns=["city", "zip_code", "county"],
    # Horizon and context are both 30 steps; with the granularity below
    # (unit "day", count 1) that is presumably 30 days each — TODO confirm.
    forecast_horizon=30,
    context_window=30,
    data_granularity_unit="day",
    data_granularity_count=1,
    weight_column=None,
    export_evaluated_data_items=True,
    budget_milli_node_hours=500,
    model_display_name="iowa-liquor-sales-forecast-model",
    predefined_split_column_name=None,
)

In [None]:
# @title # Fetch Model Evaluation Metrics
# @markdown Fetch the model evaluation metrics calculated during training on the test set.

import pandas as pd

# Print one two-column (Metric, Value) table for every evaluation the API
# lists under the trained model.
evaluations = model.api_client.list_model_evaluations(parent=model.resource_name)
for evaluation in evaluations:
    metric_rows = [(name, value) for name, value in evaluation.metrics.items()]
    metrics_table = pd.DataFrame(metric_rows, columns=["Metric", "Value"])
    print(metrics_table.to_string(index=False))

In [None]:
# Column names for the visualization; same values as in the training cell.
time_column = "date"
time_series_identifier_column = "store_name"
target_column = "sale_dollars"
# Project ID that get_data_studio_link reads for the data source and billing project.
MY_PROJECT = PROJECT_ID

# BigQuery URI reported by the training job for its evaluated data items.
# NOTE(review): presumably unset if the export did not happen — confirm before use.
eval_ex_uri = job.evaluated_data_items_bigquery_uri

In [None]:
# @title # Visualize the Forecasts
# @markdown The following snippet visualizes the test set predictions from the forecasting training job above to aid in model evaluation.
# @markdown Visit the given link to view the generated forecasts in [Data Studio](https://support.google.com/datastudio/answer/6283323?hl=en).

import urllib


def _sanitize_bq_uri(bq_uri):
    if bq_uri.startswith("bq://"):
        bq_uri = bq_uri[5:]
    return bq_uri.replace(":", ".")


# Convert the job's URI into the dotted table path that is interpolated into
# the FROM clause of the Data Studio query.
eval_ex_uri_clean = _sanitize_bq_uri(eval_ex_uri)


def get_data_studio_link(
    eval_input_uri,
    time_column,
    time_series_identifier_column,
    target_column,
    project_id=None,
):
    """Build a Data Studio report URL that charts predictions against actual values.

    Args:
        eval_input_uri: BigQuery table path placed inside backticks in the FROM
            clause (e.g. the output of `_sanitize_bq_uri`).
        time_column: Name of the time column; the query also reads
            `predicted_on_<time_column>`.
        time_series_identifier_column: Name of the series identifier column.
        target_column: Name of the target column; the query also reads
            `predicted_<target_column>.value`.
        project_id: Project used as both the data-source project and the billing
            project. Defaults to the notebook-level `MY_PROJECT`.

    Returns:
        The report URL, with the data-source configuration serialized as JSON
        and URL-encoded into the `params` query parameter.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule is loaded, and `json` is not imported elsewhere.
    import json
    from urllib.parse import urlencode

    if project_id is None:
        project_id = MY_PROJECT

    base_url = "https://datastudio.google.com/c/u/0/reporting"
    # Real newlines are used here; json.dumps below emits them as the "\n"
    # escape sequence inside the JSON payload.
    query = (
        "SELECT \n"
        " CAST({time} as DATETIME) timestamp_col,\n"
        " CAST({series} as STRING) time_series_identifier_col,\n"
        " CAST({target} as NUMERIC) actual_values,\n"
        " CAST(predicted_{target}.value as NUMERIC) predicted_values,\n"
        " CAST(predicted_on_{time} as DATETIME) predicted_on_Date_col,\n"
        " CAST({target} as NUMERIC) - CAST(predicted_{target}.value as NUMERIC) residuals,\n"
        " * \n"
        "FROM `{table}` input"
    ).format(
        time=time_column,
        series=time_series_identifier_column,
        target=target_column,
        table=eval_input_uri,
    )
    params = {
        "templateId": "5df87696-b427-49d8-aeec-b885f9b7080f",
        "ds0.connector": "BIG_QUERY",
        "ds0.projectId": project_id,
        "ds0.billingProjectId": project_id,
        "ds0.type": "CUSTOM_QUERY",
        "ds0.sql": query,
    }
    # Compact separators keep the payload free of extra spaces, and json.dumps
    # escapes quotes and backslashes that hand-built JSON would leave raw.
    params_str = json.dumps(params, separators=(",", ":"), ensure_ascii=False)
    return "{}?{}".format(base_url, urlencode({"params": params_str}))


# Build the report URL from the cleaned table path and the column names, then print it.
data_studio_url = get_data_studio_link(
    eval_ex_uri_clean, time_column, time_series_identifier_column, target_column
)
print(data_studio_url)

# 清理

要清理此项目中使用的所有谷歌云资源，您可以删除用于教程的[Google Cloud 项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的个别资源：

In [None]:
# Delete model resource
model.delete(sync=True)

# Delete Cloud Storage objects that were created
! gsutil -m rm -r $BUCKET_NAME