In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI SDK for Python：AutoML时间序列预测（Forecasting）模型训练示例

要使用这个Colaboratory笔记本，您可以将笔记本复制到自己的Google Drive中，并使用Colaboratory（或Colab）打开。您可以运行每个步骤或单元，并查看其结果。要运行一个单元，使用Shift+Enter。Colab会自动显示每个单元中最后一行的返回值。有关在Colab中运行笔记本的更多信息，请参见[Colab欢迎页面](https://colab.research.google.com/notebooks/welcome.ipynb)。

这个笔记本演示了如何基于时间序列数据集创建一个AutoML模型。您需要提供一个存储数据集的存储桶。

注意：在测试这个SDK时，您可能会因为训练、预测、存储或使用其他GCP产品而产生费用。

安装Vertex AI SDK、进行身份验证，并将数据集上传到您的GCS存储桶

安装SDK后，内核将自动重新启动。您可能会看到这个错误消息“您的会话因为未知原因而崩溃”，这是正常的。

In [None]:
%%capture
# Reinstall the Vertex AI SDK: remove any copy already present, then install
# the package again. The %%capture cell magic on the first line keeps the pip
# output of this cell from being displayed.
!pip3 uninstall -y google-cloud-aiplatform
!pip3 install google-cloud-aiplatform
 
import IPython
 
# Ask the running kernel to shut down with the restart flag set; the
# accompanying notebook text says the kernel restarts after the SDK install
# (presumably so the newly installed package is the one that gets imported).
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

In [None]:
import sys

# Authenticate only inside Colab, which is detected by the google.colab
# package already being loaded into this interpreter.
_running_in_colab = "google.colab" in sys.modules
if _running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

输入您的项目和GCS存储桶

请在下面的单元格中输入您的项目ID。然后运行该单元格，确保在这个笔记本中的所有命令中，Cloud SDK都使用正确的项目。

如果您不知道您的项目ID，您可以使用gcloud获取您的项目ID。

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

否则，在这里设置您的项目ID。

In [None]:
# No project ID was detected: use the value typed into the form field instead.
if PROJECT_ID is None or PROJECT_ID == "":
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

如果您参加直播教程会话，您可能会使用一个共享的测试账户或项目。为了避免资源创建时用户之间的名称冲突，您可以为每个实例会话创建一个时间戳，并将其附加到您在本教程中创建的资源名称上。

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYMMDDHHMMSS) taken when this cell runs; it is
# appended to resource names so that sessions sharing a project do not collide.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

请在下方设置您的云存储桶名称。它必须在所有云存储桶中是唯一的。

您还可以更改REGION变量，该变量在本笔记本的其余部分中使用。请确保[选择一个支持Vertex AI服务的区域](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions)。您不能在使用Vertex AI进行训练时使用多区域存储桶。

In [None]:
# Cloud Storage bucket URI, including the gs:// scheme. Leaving the placeholder
# untouched makes the next cell derive a name from the project ID and timestamp.
BUCKET_NAME = "gs://[your-bucket-name]"  # @param {type:"string"}
# Region placeholder; replace it with a region that supports Vertex AI services.
REGION = "[your-region]"  # @param {type:"string"}

In [None]:
# When no bucket was supplied (empty, None or the untouched placeholder),
# derive a bucket name from the project ID and the session timestamp.
if BUCKET_NAME is None or BUCKET_NAME in ("", "gs://[your-bucket-name]"):
    BUCKET_NAME = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

我们使用的数据集是[Iowa Liquor Retail Sales](https://console.cloud.google.com/marketplace/product/iowa-department-of-commerce/iowa-liquor-sales)数据集的一个样本。训练样本包含2020年的销售数据，预测样本（用于批量预测步骤）包含2021年1月至4月的销售数据。

In [None]:
# BigQuery URI of the 2020 training sample, written in the dot-separated
# bq://project.dataset.table form that the batch prediction input URI also uses.
TRAINING_DATASET_BQ_PATH = 'bq://bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train'

初始化Vertex AI SDK

为Vertex AI初始化*client*。

In [None]:
from google.cloud import aiplatform

# Pass the REGION chosen earlier on to the SDK. While REGION is empty or still
# holds the "[your-region]" placeholder, pass None instead so the SDK keeps
# using its default location, as it did before REGION was wired in.
_location = REGION if REGION not in ("", None, "[your-region]") else None

aiplatform.init(
    project=PROJECT_ID,
    location=_location,
    staging_bucket=BUCKET_NAME,
)

从BigQuery创建一个托管时间序列数据集

此部分将从BigQuery表中创建一个数据集。

In [None]:
# Create a managed time series dataset backed by the BigQuery training table.
training_sources = [TRAINING_DATASET_BQ_PATH]
ds = aiplatform.datasets.TimeSeriesDataset.create(
    bq_source=training_sources,
    display_name="iowa_liquor_sales_train",
)

# Last expression of the cell: shows the resource name of the new dataset.
ds.resource_name

# 启动一个训练作业来创建一个模型

定义好训练作业的配置（时间列、目标列和各列的转换方式）之后，我们运行该作业来训练并创建模型。

In [None]:
# Column roles shared by the training job definition, the run call and the
# visualization cell further down.
time_column = "date"
time_series_identifier_column = "store_name"
target_column = "sale_dollars"
attribute_columns = ["city", "zip_code", "county"]

# The time column is transformed as a timestamp, the target as numeric and
# each attribute column as categorical.
column_transformations = [
    {"timestamp": {"column_name": time_column}},
    {"numeric": {"column_name": target_column}},
]
column_transformations.extend(
    {"categorical": {"column_name": name}} for name in attribute_columns
)

job = aiplatform.AutoMLForecastingTrainingJob(
    display_name="train-iowa-liquor-sales-automl_1",
    optimization_objective="minimize-rmse",
    column_transformations=column_transformations,
)

# This will take around an hour to run
model = job.run(
    dataset=ds,
    model_display_name="iowa-liquor-sales-forecast-model",
    # Column roles
    target_column=target_column,
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    available_at_forecast_columns=[time_column],
    unavailable_at_forecast_columns=[target_column],
    time_series_attribute_columns=list(attribute_columns),
    weight_column=None,
    predefined_split_column_name=None,
    # Forecast shape: daily granularity, 30-step horizon and context window
    data_granularity_unit="day",
    data_granularity_count=1,
    forecast_horizon=30,
    context_window=30,
    # Training budget in milli node hours
    budget_milli_node_hours=1000,
)

In [None]:
#@title # Fetch Model Evaluation Metrics
#@markdown Fetch the model evaluation metrics calculated during training on the test set.

import pandas as pd

# List every evaluation attached to the trained model and print each one as a
# two-column (Metric, Value) table without the index.
list_evaluation_pager = model.api_client.list_model_evaluations(
    parent=model.resource_name
)
for model_evaluation in list_evaluation_pager:
    metrics_dict = dict(model_evaluation.metrics.items())
    df = pd.DataFrame(metrics_dict.items(), columns=["Metric", "Value"])
    metrics_table = df.to_string(index=False)
    print(metrics_table)

# 运行批量预测

In [None]:
#@markdown ## Create Output BigQuery Dataset
#@markdown First, create a new BigQuery dataset for the batch prediction output in the same region as the batch prediction input dataset. 

import os
from google.cloud import bigquery

# The BigQuery client below is created without arguments; the project is
# supplied through this environment variable.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

# Input table for batch prediction and the dataset that will receive the output.
batch_predict_bq_input_uri = "bq://bigquery-public-data.iowa_liquor_sales_forecasting.2021_sales_predict"
batch_predict_bq_output_dataset_name = "iowa_liquor_sales_predictions"
batch_predict_bq_output_dataset_path = "{}.{}".format(PROJECT_ID, batch_predict_bq_output_dataset_name)
batch_predict_bq_output_uri_prefix = "bq://{}.{}".format(PROJECT_ID, batch_predict_bq_output_dataset_name)

client = bigquery.Client()
dataset = bigquery.Dataset(batch_predict_bq_output_dataset_path)
# Must be the same region as batch_predict_bq_input_uri
dataset_region = "US" # @param {type : "string"}
dataset.location = dataset_region
# exists_ok=True keeps the cell re-runnable: without it a second run fails
# with a Conflict error because the dataset already exists. In that case the
# existing dataset is returned, so report the location it actually has.
dataset = client.create_dataset(dataset, exists_ok=True)
print("BigQuery dataset {} is available in {}".format(batch_predict_bq_output_dataset_path, dataset.location))

运行一个批量预测作业，根据包含历史销售数据的输入数据集为爱荷华州的商店生成酒类销售预测。

In [None]:
# Run batch prediction, reading instances from the BigQuery input table and
# writing predictions under the output dataset prefix. The call is the last
# expression of the cell, so its return value is displayed.
model.batch_predict(
    job_display_name="predict-iowa-liquor-sales-automl_1",
    instances_format="bigquery",
    bigquery_source=batch_predict_bq_input_uri,
    predictions_format="bigquery",
    bigquery_destination_prefix=batch_predict_bq_output_uri_prefix,
)

In [None]:
#@title # Visualize the Forecasts
#@markdown Follow the given link to visualize the generated forecasts in [Data Studio](https://support.google.com/datastudio/answer/6283323?hl=en).

# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` has been loaded. This still binds the name `urllib`.
import urllib.parse

tables = client.list_tables(batch_predict_bq_output_dataset_path)

# Among the tables whose id starts with "predictions_", keep the one with the
# greatest id in string order (presumably the most recent run, if the suffix
# is a sortable timestamp - TODO confirm).
prediction_table_id = max(
    (table.table_id for table in tables
     if table.table_id.startswith("predictions_")),
    default="",
)
if not prediction_table_id:
    # Without this check the output URI would end in a bare "." and the
    # generated report query would reference a table that does not exist.
    raise RuntimeError(
        "No table starting with 'predictions_' was found in {}; "
        "run the batch prediction cell first.".format(
            batch_predict_bq_output_dataset_path))
batch_predict_bq_output_uri = "{}.{}".format(
    batch_predict_bq_output_dataset_path, prediction_table_id)


def _sanitize_bq_uri(bq_uri):
  """Return a BigQuery URI as a dot-separated table path.

  A leading "bq://" scheme is dropped when present, and every ":" in the
  remainder is replaced with ".".
  """
  scheme = "bq://"
  path = bq_uri[len(scheme):] if bq_uri.startswith(scheme) else bq_uri
  return ".".join(path.split(":"))


def get_data_studio_link(batch_prediction_bq_input_uri,
                         batch_prediction_bq_output_uri, time_column,
                         time_series_identifier_column, target_column):
  """Build a Data Studio report URL joining batch prediction input and output.

  Args:
    batch_prediction_bq_input_uri: BigQuery URI of the batch prediction input
      table; a "bq://" prefix and ":" separators are normalized.
    batch_prediction_bq_output_uri: BigQuery URI of the prediction output
      table, normalized the same way.
    time_column: Name of the time column, cast to DATETIME in the query.
    time_series_identifier_column: Name of the series id column, cast to
      STRING in the query.
    target_column: Name of the target column; the query also reads
      `predicted_<target_column>.value`.

  Returns:
    The report URL with a single URL-encoded `params` query argument holding
    a JSON object (template id, BigQuery connector settings, custom SQL).
  """
  # Local imports keep the function self-contained: it no longer relies on
  # `urllib.parse` having been loaded elsewhere in the notebook.
  import json
  from urllib.parse import urlencode

  input_table = _sanitize_bq_uri(batch_prediction_bq_input_uri)
  output_table = _sanitize_bq_uri(batch_prediction_bq_output_uri)
  base_url = "https://datastudio.google.com/c/u/0/reporting"
  # Real newlines are used here; json.dumps below escapes them as "\n".
  query = "\n".join([
      "SELECT ",
      " CAST(input.{time} as DATETIME) timestamp_col,",
      " CAST(input.{series} as STRING) time_series_identifier_col,",
      " CAST(input.{target} as NUMERIC) historical_values,",
      " CAST(predicted_{target}.value as NUMERIC) predicted_values,",
      " * ",
      "FROM `{input_table}` input",
      "LEFT JOIN `{output_table}` output",
      "ON",
      "CAST(input.{time} as DATETIME) = CAST(output.{time} as DATETIME)",
      "AND CAST(input.{series} as STRING) = CAST(output.{series} as STRING)",
  ]).format(time=time_column,
            series=time_series_identifier_column,
            target=target_column,
            input_table=input_table,
            output_table=output_table)
  params = {
      "templateId": "067f70d2-8cd6-4a4c-a099-292acd1053e8",
      "ds0.connector": "BIG_QUERY",
      "ds0.projectId": PROJECT_ID,
      "ds0.billingProjectId": PROJECT_ID,
      "ds0.type": "CUSTOM_QUERY",
      "ds0.sql": query
  }
  # json.dumps escapes quotes and backslashes in the values, which the earlier
  # hand-built string did not. Compact separators and ensure_ascii=False keep
  # the text identical to the hand-built form for ordinary values.
  params_str = json.dumps(params, separators=(",", ":"), ensure_ascii=False)
  return "{}?{}".format(base_url, urlencode({"params": params_str}))


# Build the Data Studio link from the batch prediction input and output tables
# and the column names used for training, then print it.
data_studio_link = get_data_studio_link(
    batch_prediction_bq_input_uri=batch_predict_bq_input_uri,
    batch_prediction_bq_output_uri=batch_predict_bq_output_uri,
    time_column=time_column,
    time_series_identifier_column=time_series_identifier_column,
    target_column=target_column,
)
print(data_studio_link)

清理工作

要清理本项目中使用的所有Google Cloud资源，您可以删除用于本教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以逐个删除在本教程中创建的资源：

In [None]:
# Delete model resource
# NOTE(review): only the model is deleted here. The managed dataset (`ds`),
# the training job (`job`) and the BigQuery output dataset created earlier in
# the notebook are left in place - confirm whether they should be removed too.
model.delete(sync=True)

# Delete Cloud Storage objects that were created
# NOTE(review): this removes BUCKET_NAME recursively. If BUCKET_NAME points at
# a bucket that already existed before the tutorial, its other contents would
# presumably be removed as well - verify before running.
! gsutil -m rm -r $BUCKET_NAME