In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用BigQuery ML和Vertex AI进行异常检测

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/tree/main/notebooks/community/pipelines/google_cloud_pipeline_components_bqml_pipeline_anomaly_detection.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/pipelines/google_cloud_pipeline_components_bqml_pipeline_anomaly_detection.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/pipelines/google_cloud_pipeline_components_bqml_pipeline_anomaly_detection.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在Vertex AI Workbench中打开
    </a>
  </td>
</table>

**_注意_**：此笔记本已在以下环境中进行测试：

* Python版本 = 3.9

## 概述

异常检测是利用机器学习识别明显偏离其余数据的罕见观测值。异常检测可以通过多种方式实现，例如有监督、无监督或基于图的方法。对于某些行业，如电信、制造业和金融服务，异常检测尤为重要。

例如，在制造业场景中，您可能收集一些传感器数据来预测发动机故障前剩余循环次数（TTF）。通过这种方式，您可以采取行动并做出关于维护计划的决策。

### 目标

在没有标记数据的情况下，您可能想知道如何最好地创建异常检测器。

在本笔记本中，您将学习如何使用自动编码器来检测涡轮风扇发动机数据中的异常，并在此基础上构建异常检测管道。

本教程使用以下谷歌云ML服务和资源：

- `Vertex AI Pipelines`
- `BigQuery ML pipeline components`

执行的步骤包括：

- 定义自定义评估和指标可视化组件
- 定义管道：
    - 在BigQuery中构建训练数据集
    - 训练BigQuery自动编码器模型
    - 评估BigQuery自动编码器模型
    - 检查模型性能
    - 在BigQuery中构建测试数据集
    - 检测异常
    - 生成MSE图以评估预测
- 编译管道。
- 执行管道。

### 数据集

[`NASA涡轮风扇喷气发动机数据集`](https://www.kaggle.com/datasets/behrad3d/nasa-cmaps)是一个多变量时间序列数据集，其中每个时间序列描述一个不同的发动机。

该数据集包含26列，其中包含了单个操作循环期间采集的数据。

### 费用

此教程使用Google Cloud的可收费组件：

- Vertex AI
- BigQuery
- Cloud Storage

了解有关[Vertex AI价格](https://cloud.google.com/vertex-ai/pricing)，
[BigQuery价格](https://cloud.google.com/bigquery/pricing)和
[Cloud Storage价格](https://cloud.google.com/storage/pricing)的信息，
并使用[定价计算器](https://cloud.google.com/products/calculator/)
基于您的预期使用量生成费用估算。

安装以下所需的包以执行这个笔记本。

In [None]:
# Install the packages used by this notebook: Jinja2 (SQL templating), the
# BigQuery and Vertex AI clients, the KFP SDK and the Google Cloud pipeline
# components. `-q` keeps the pip output quiet.
# NOTE(review): versions are unpinned, and `tensorflow` (imported later) is not
# installed here — presumably it is preinstalled in Colab/Workbench; confirm.
! pip3 install --user --upgrade jinja2 google-cloud-bigquery kfp google-cloud-aiplatform google_cloud_pipeline_components -q --no-warn-conflicts

### 仅限 Colab 使用：请取消注释以下单元格以重启核心。

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 开始之前

### 设置您的 Google Cloud 项目

**无论您使用什么笔记本环境，都需进行以下步骤。**

1. [选择或创建一个 Google Cloud 项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建一个账号时，您将获得 $300 的免费信用用于计算/存储成本。

2. [确保您的项目已启用计费功能](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用 Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

4. 如果您是在本地运行此笔记本，您需要安装 [Cloud SDK](https://cloud.google.com/sdk)。

#### 设置你的项目ID

**如果你不知道你的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 查看支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)。

In [None]:
# Google Cloud project used by every command and client below; replace the
# placeholder with your own project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the default for the gcloud CLI (IPython expands {PROJECT_ID}).
! gcloud config set project {PROJECT_ID}

#### 地区

您还可以更改 Vertex AI 使用的 `REGION` 变量。了解有关 [Vertex AI 地区](https://cloud.google.com/vertex-ai/docs/general/locations) 的更多信息。

In [None]:
# Vertex AI region passed to `vertex_ai.init`; the BigQuery location is derived
# from it in a later cell.
REGION = "us-central1"  # @param {type: "string"}

### 验证您的Google云账户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关指示进行操作。

1. Vertex AI Workbench
* 无需操作，因为您已经通过身份验证。

2. 本地的JupyterLab实例，请取消注释并运行:

In [None]:
# ! gcloud auth login

3. Colab，请取消注释并运行:

In [None]:
# from google.colab import auth
# auth.authenticate_user()

请参阅 https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples ，了解如何向您的服务账号授予 Cloud Storage 权限。

创建一个云存储桶

创建一个存储桶，用于存储中间产物，例如数据集。

In [None]:
# Cloud Storage bucket URI, used as the Vertex AI staging bucket and as the base
# of the pipeline root. Bucket names must be globally unique.
BUCKET_URI = "gs://your-bucket-name-unique"  # @param {type:"string"}

只有当您的存储桶不存在时：运行以下单元格以创建您的云存储桶。

In [None]:
# Create the bucket in REGION under PROJECT_ID (IPython expands the $VARIABLES).
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

### 设置项目模板

您将创建一组本地目录，用于在本地组织您的项目文件。

In [None]:
import os

# Local folders for generated artifacts: component YAML files and compiled
# pipeline specifications.
KFP_COMPONENTS_PATH = "components"
PIPELINES_PATH = "pipelines"
TRAIN_PIPELINES_PATH = os.path.join(PIPELINES_PATH, "train_pipelines")
TEST_PIPELINES_PATH = os.path.join(PIPELINES_PATH, "test_pipelines")

# Create the folders (and missing parents) with the default permissions instead
# of shelling out to `mkdir -m 777`: world-writable directories are not needed
# here, and `os.makedirs` also works where `mkdir -p` is unavailable.
for _local_dir in (KFP_COMPONENTS_PATH, TRAIN_PIPELINES_PATH, TEST_PIPELINES_PATH):
    os.makedirs(_local_dir, exist_ok=True)

### 准备训练数据

接下来，您将定义公开 Cloud Storage 存储桶中 CSV 训练、测试和标签数据的位置，随后将这些数据加载到 BigQuery 表中。

In [None]:
# Cloud Storage folder that hosts the sample turbofan CSV files.
PUBLIC_DATA_URI = "gs://cloud-samples-data/vertex-ai/pipeline-deployment/datasets/turbofan_anomaly"

# Training readings, test readings and the labels file that is loaded into the
# labels table.
GCS_TRAIN_URI = "/".join((PUBLIC_DATA_URI, "train_FD001.csv"))
GCS_TEST_URI = "/".join((PUBLIC_DATA_URI, "test_FD001.csv"))
GCS_LABELS_URI = "/".join((PUBLIC_DATA_URI, "RUL_FD001.csv"))

### 设置BigQuery数据集

您将在 BigQuery 数据集 `iot_dataset` 中为本教程创建以下表：

- `sensors_train_raw_data_<timestamp>` 包含从传感器收集的训练数据
- `sensors_test_raw_data_<timestamp>` 包含从传感器收集的测试数据
- `sensors_label_data_<timestamp>` 包含用于验证结果的测试标签数据

In [None]:
from datetime import datetime

# Local-time timestamp (YYYYmmddHHMMSS) appended to table and model names so
# that each run of the notebook gets its own set of resources.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
# BigQuery location for the dataset, the load jobs and the pipeline query jobs.
# BigQuery has only two multi-regions, US and EU, so the bare continent prefix
# of a Vertex AI region is a valid location only for "us-*". Map "europe-*" to
# "eu" and, for every other region, use the region itself as the location.
_BQ_MULTI_REGION_BY_PREFIX = {"us": "us", "europe": "eu"}
LOCATION = _BQ_MULTI_REGION_BY_PREFIX.get(REGION.split("-")[0], REGION)

# Dataset and timestamp-suffixed raw tables loaded from the public CSV files.
BQ_DATASET = "iot_dataset"
BQ_TRAIN_RAW_TABLE = f"sensors_train_raw_data_{TIMESTAMP}"
BQ_TEST_RAW_TABLE = f"sensors_test_raw_data_{TIMESTAMP}"
BQ_LABELS_TABLE = f"sensors_label_data_{TIMESTAMP}"

# Create the dataset in the chosen BigQuery location.
# NOTE(review): on a re-run the dataset presumably already exists and `bq mk`
# reports an error — confirm whether `--force` is wanted.
! bq mk --location={LOCATION} --dataset {PROJECT_ID}:{BQ_DATASET}

# Load the training readings, skipping the CSV header row. The schema is
# long-format: one row per (engine id, cycle, sensor) with a single value.
! bq load \
  --location={LOCATION} \
  --source_format=CSV \
  --skip_leading_rows=1 \
  {BQ_DATASET}.{BQ_TRAIN_RAW_TABLE} \
  {GCS_TRAIN_URI} \
  id:INT64,cycle:INT64,setting1:FLOAT64,setting2:FLOAT64,setting3:FLOAT64,sensor:STRING,value:FLOAT64

# Load the test readings with the same schema.
! bq load \
  --location={LOCATION} \
  --source_format=CSV \
  --skip_leading_rows=1 \
  {BQ_DATASET}.{BQ_TEST_RAW_TABLE} \
  {GCS_TEST_URI} \
  id:INT64,cycle:INT64,setting1:FLOAT64,setting2:FLOAT64,setting3:FLOAT64,sensor:STRING,value:FLOAT64

# Load the test labels: one time-to-failure value per engine id.
! bq load \
  --location={LOCATION} \
  --source_format=CSV \
  --skip_leading_rows=1 \
  {BQ_DATASET}.{BQ_LABELS_TABLE} \
  {GCS_LABELS_URI} \
  id:INT64,time_to_failure:FLOAT64

### 导入库

接下来，导入库并设置一些在本教程中使用的变量。

In [None]:
from typing import NamedTuple

import tensorflow as tf
from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery
from google_cloud_pipeline_components.v1.bigquery import (
    BigqueryCreateModelJobOp, BigqueryEvaluateModelJobOp, BigqueryQueryJobOp)
from jinja2 import Template
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import (HTML, Artifact, Condition, Input, Metrics, Output,
                        component)

设定变量

In [None]:
# Parameters shared by the SQL templates.
# Sensor column names s1 ... s21. Kept as a tuple because the templates render
# it directly (as a parenthesised list) inside the PIVOT ... IN clause.
SENSORS = tuple(f"s{sensor_number}" for sensor_number in range(1, 22))
# Number of preceding cycles included in the rolling average / stdev windows.
WINDOW = 5
# Cycle threshold below which a row is labelled as anomalous.
PERIOD = 30
# Name of the label column added to the train and test tables.
TARGET = "is_anomalous_ttf"
# Columns dropped from the training table.
EXCLUDED_VARIABLES = "id, cycle, setting1, setting2, setting3"

### 辅助函数

`print_pipeline_output` 辅助函数允许验证管道运行，检查已执行的作业。

In [None]:
def print_pipeline_output(pipeline_root, job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            pipeline_root
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            pipeline_root
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        EVAL_METRICS = (
            pipeline_root
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/evaluation_metrics"
        )
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            ! gsutil cat $EXECUTE_OUTPUT
            return EXECUTE_OUTPUT
        elif tf.io.gfile.exists(GCP_RESOURCES):
            ! gsutil cat $GCP_RESOURCES
            return GCP_RESOURCES
        elif tf.io.gfile.exists(EVAL_METRICS):
            ! gsutil cat $EVAL_METRICS
            return EVAL_METRICS

    return None

### 初始化用于 Python 的 Vertex AI SDK

为您的项目和相应的存储桶初始化 Python 版的 Vertex AI SDK。

In [None]:
# Set the default project, region and staging bucket for later Vertex AI SDK calls.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### 初始化Python的BigQuery SDK

为您的项目初始化Python的BigQuery SDK。

In [None]:
# Jobs started by this client must run in the location of the dataset they
# touch. The dataset is created in LOCATION (the BigQuery location), not in the
# Vertex AI REGION, so the client has to use LOCATION as well.
bq_client = bigquery.Client(project=PROJECT_ID, location=LOCATION)

## BigQuery ML管道规范化

在接下来的单元格中，您将构建组件和管道，以训练和评估异常检测模型。

### 为运行管道设置变量

在这里，您初始化了一组特定于本教程中要运行的管道的变量。例如，您定义了管道配置，传递了训练表名称、模型配置和性能阈值。

In [None]:
# `urlparse` is needed below and is not imported anywhere else in the notebook.
from urllib.parse import urlparse

# BQML pipeline job configuration
TRAIN_PIPELINE_NAME = "bqml-anomaly-detection-train-pipeline"
# Pipeline root: `<bucket>/pipelines/train_pipelines` (any path already present
# in BUCKET_URI is replaced).
TRAIN_PIPELINE_ROOT = (
    urlparse(BUCKET_URI)._replace(path="pipelines/train_pipelines").geturl()
)
# Local path of the compiled pipeline specification.
TRAIN_PIPELINE_PACKAGE = os.path.join(
    TRAIN_PIPELINES_PATH, f"{TRAIN_PIPELINE_NAME}.json"
)

# BQML pipeline component configuration
BQ_TRAIN_FEATURES_TABLE_PREFIX = "train_features"
BQ_TEST_FEATURES_TABLE_PREFIX = "test_features"
BQ_TRAIN_TABLE_PREFIX = "train_dataset"
BQ_TEST_TABLE_PREFIX = "test_dataset"
BQ_RECOSTRUCTION_MODEL_TABLE_PREFIX = "reconstruction_model"
DETECT_ANOMALIES_TABLE_PREFIX = "detect_anomalies"
# Timestamp-suffixed names of the tables and the model created by the pipeline.
BQ_TRAIN_FEATURES_TABLE = f"{BQ_TRAIN_FEATURES_TABLE_PREFIX}_{TIMESTAMP}"
BQ_TEST_FEATURES_TABLE = f"{BQ_TEST_FEATURES_TABLE_PREFIX}_{TIMESTAMP}"
BQ_TRAIN_TABLE = f"{BQ_TRAIN_TABLE_PREFIX}_{TIMESTAMP}"
BQ_TEST_TABLE = f"{BQ_TEST_TABLE_PREFIX}_{TIMESTAMP}"
BQ_RECOSTRUCTION_MODEL_TABLE = f"{BQ_RECOSTRUCTION_MODEL_TABLE_PREFIX}_{TIMESTAMP}"
DETECT_ANOMALIES_TABLE = f"{DETECT_ANOMALIES_TABLE_PREFIX}_{TIMESTAMP}"
# `contamination` value passed to ML.DETECT_ANOMALIES.
CONTAMINATION_THRESHOLD = 0.1
# The test/anomaly branch of the pipeline runs only when the model MSE is below this.
PERF_THRESHOLD = 10

### 使用模板设置 SQL 查询

在 Vertex AI 上运行 BigQuery 和 BigQuery ML 管道的一种方法是将 SQL 查询定义为 Jinja 模板，并将它们作为 `pipeline components` 的参数传递。

在本教程中，您定义以下模板：

- `CREATE_FEATURES_SQL_TEMPLATE` 用于运行特征工程
- `CREATE_TRAIN_SQL_TEMPLATE` 用于创建训练数据集
- `TRAIN_RECONSTRUCTION_MODEL_TEMPLATE` 用于使用 BigQuery ML AutoEncoder 模型构建重建模型
- `CREATE_TEST_SQL_TEMPLATE` 用于创建测试数据集
- `DETECT_ANOMALIES_TEMPLATE` 用于检测异常
- `VISUALIZE_MSE_TEMPLATE` 用于可视化 MSE 图表

定义SQL查询模板

In [None]:
# Training ---------------------------------------------------------------------
# Feature engineering. Pivots the raw long-format table (one row per sensor
# reading) into one column per sensor, then adds, for every sensor, a rolling
# average and a rolling standard deviation per engine id over the current cycle
# and the `window` preceding cycles (a NULL stdev is replaced by 0).
# NOTE: despite its name, the `get_long_from_wide_table` CTE goes from long to wide.
CREATE_FEATURES_SQL_TEMPLATE = """
CREATE OR REPLACE TABLE
  `{{project_id}}.{{bq_dataset}}.{{features_table}}` AS
WITH
  get_long_from_wide_table AS (
    SELECT *
    FROM `{{project_id}}.{{bq_dataset}}.{{data_table}}`
    PIVOT(MAX(value) FOR sensor IN {{sensors}})
  ),

  get_features_table AS (
    SELECT
    *,
    {%- for sensor in sensors %}
    -- calculate rolling average sensor value
    AVG({{sensor}}) OVER(PARTITION BY id ORDER BY cycle RANGE BETWEEN {{window}} PRECEDING AND CURRENT ROW) AS {{"rolling_avg_" ~ sensor}},
    -- calculate rolling stdev sensor value
    IFNULL(STDDEV({{sensor}}) OVER(PARTITION BY id ORDER BY cycle RANGE BETWEEN {{window}} PRECEDING AND CURRENT ROW), 0) AS {{"rolling_sd_" ~ sensor}}
    {%- if not loop.last -%}
        ,
    {%- endif -%}
    {%- endfor %}
    FROM get_long_from_wide_table
  )

  SELECT * FROM get_features_table ORDER BY id, cycle
"""

# Training table. Labels every row of the features table with `target` = 1 when
# it is fewer than `period` cycles before the last recorded cycle of its engine
# (0 otherwise), then drops the `excluded_variables` columns.
# NOTE(review): the label column is not excluded, so it is part of the table the
# model is trained on — confirm this is intended.
CREATE_TRAIN_SQL_TEMPLATE = """
DECLARE period INT64 DEFAULT {{period}};

CREATE OR REPLACE TABLE
  `{{project_id}}.{{bq_dataset}}.{{train_table}}` AS
WITH
  get_last_cycle AS (
    SELECT id, max(cycle) as last_cycle
    FROM `{{project_id}}.{{bq_dataset}}.{{features_table}}`
    GROUP BY id
  ),

  get_target_train AS (
    SELECT
    a.*,
    CASE WHEN (b.last_cycle - a.cycle) < period THEN 1 ELSE 0 END AS {{target}},
    FROM `{{project_id}}.{{bq_dataset}}.{{features_table}}` as a
    LEFT JOIN get_last_cycle as b on a.id = b.id
  )

  SELECT * EXCEPT({{excluded_variables}}) FROM get_target_train
"""

# Model training. Creates (or replaces) a BigQuery ML AUTOENCODER model from
# every column of the training table and registers it in the Vertex AI Model
# Registry under the fixed id `reconstruction_model` with the `staging` alias.
TRAIN_RECONSTRUCTION_MODEL_TEMPLATE = """
CREATE OR REPLACE MODEL `{{project_id}}.{{bq_dataset}}.{{recostruction_model_name}}`
OPTIONS(MODEL_TYPE='AUTOENCODER',
        ACTIVATION_FN='RELU',
        HIDDEN_UNITS=[32, 16, 4, 16, 32],
        BATCH_SIZE=8,
        DROPOUT=0.2,
        EARLY_STOP=TRUE,
        LEARN_RATE=0.001,
        L1_REG_ACTIVATION=0.0001,
        OPTIMIZER='ADAM',
        MODEL_REGISTRY = 'vertex_ai',
        VERTEX_AI_MODEL_ID = 'reconstruction_model',
        VERTEX_AI_MODEL_VERSION_ALIASES = ['staging']
        )
AS SELECT * FROM `{{project_id}}.{{bq_dataset}}.{{train_table}}`
"""

# Test -------------------------------------------------------------------------
# Test table. Keeps only the last recorded cycle of every engine in the test
# features table and joins the labels table on `id`; `target` is 1 when the
# labelled time to failure is below `period`, 0 otherwise.
CREATE_TEST_SQL_TEMPLATE = """
DECLARE period INT64 DEFAULT {{period}};

CREATE OR REPLACE TABLE
 `{{project_id}}.{{bq_dataset}}.{{test_table}}` AS
WITH
 get_last_cycle AS (
   SELECT id, max(cycle) as last_cycle
   FROM `{{project_id}}.{{bq_dataset}}.{{features_table}}`
   GROUP BY id
 ),

 get_target_test AS (
   SELECT
   a.*
   FROM `{{project_id}}.{{bq_dataset}}.{{features_table}}` as a
   LEFT JOIN get_last_cycle as b ON a.id = b.id
   WHERE a.cycle = b.last_cycle
 )

 SELECT
 a.*,
 CASE WHEN b.time_to_failure < period THEN 1 ELSE 0 END AS {{target}}
 FROM get_target_test as a
 LEFT JOIN `{{project_id}}.{{bq_dataset}}.{{labels_table}}` as b ON a.id = b.id
"""

# Anomaly detection. Runs ML.DETECT_ANOMALIES with the trained model on the test
# table, using `contamination_thr` as the contamination value, and stores the
# anomaly flag, the mean squared error and the label of each row.
DETECT_ANOMALIES_TEMPLATE = """
CREATE OR REPLACE TABLE
  `{{project_id}}.{{bq_dataset}}.{{anomalies_table}}` AS
SELECT
  is_anomaly, mean_squared_error, {{target}}
FROM
  ML.DETECT_ANOMALIES(MODEL `{{project_id}}.{{bq_dataset}}.{{recostruction_model_name}}`,
                      STRUCT({{contamination_thr}} AS contamination),
                      TABLE `{{project_id}}.{{bq_dataset}}.{{test_table}}`)
"""

# Reads back every row of the anomalies table; consumed by the plotting component.
VISUALIZE_MSE_TEMPLATE = """
SELECT
  *
FROM
  `{{project_id}}.{{bq_dataset}}.{{anomalies_table}}`
"""

### 编译SQL查询模板

在定义SQL查询模板之后，您可以编译它们，并传递训练和测试参数。

In [None]:
def _render_sql(sql_template, sql_params):
    """Render a Jinja SQL template string with the given parameter mapping."""
    return Template(sql_template).render(sql_params)


# Values substituted into the training-side templates.
TRAIN_SQL_PARAMS = {
    "project_id": PROJECT_ID,
    "bq_dataset": BQ_DATASET,
    "sensors": SENSORS,
    "period": PERIOD,
    "window": WINDOW,
    "target": TARGET,
    "excluded_variables": EXCLUDED_VARIABLES,
    "contamination_threshold": CONTAMINATION_THRESHOLD,
    "data_table": BQ_TRAIN_RAW_TABLE,
    "features_table": BQ_TRAIN_FEATURES_TABLE,
    "train_table": BQ_TRAIN_TABLE,
    "recostruction_model_name": BQ_RECOSTRUCTION_MODEL_TABLE,
    "anomalies_table": DETECT_ANOMALIES_TABLE,
    "contamination_thr": CONTAMINATION_THRESHOLD,
}

CREATE_TRAIN_FEATURES_QUERY = _render_sql(CREATE_FEATURES_SQL_TEMPLATE, TRAIN_SQL_PARAMS)
CREATE_TRAIN_TABLE_QUERY = _render_sql(CREATE_TRAIN_SQL_TEMPLATE, TRAIN_SQL_PARAMS)
TRAIN_RECOSTRUCTION_MODEL_QUERY = _render_sql(
    TRAIN_RECONSTRUCTION_MODEL_TEMPLATE, TRAIN_SQL_PARAMS
)

# Values substituted into the test-side templates.
TEST_SQL_PARAMS = {
    "project_id": PROJECT_ID,
    "bq_dataset": BQ_DATASET,
    "sensors": SENSORS,
    "period": PERIOD,
    "window": WINDOW,
    "data_table": BQ_TEST_RAW_TABLE,
    "labels_table": BQ_LABELS_TABLE,
    "target": TARGET,
    "features_table": BQ_TEST_FEATURES_TABLE,
    "test_table": BQ_TEST_TABLE,
    "recostruction_model_name": BQ_RECOSTRUCTION_MODEL_TABLE,
    "anomalies_table": DETECT_ANOMALIES_TABLE,
    "contamination_thr": CONTAMINATION_THRESHOLD,
}

CREATE_TEST_FEATURES_QUERY = _render_sql(CREATE_FEATURES_SQL_TEMPLATE, TEST_SQL_PARAMS)
CREATE_TEST_TABLE_QUERY = _render_sql(CREATE_TEST_SQL_TEMPLATE, TEST_SQL_PARAMS)
DETECT_ANOMALIES_QUERY = _render_sql(DETECT_ANOMALIES_TEMPLATE, TEST_SQL_PARAMS)
# The visualisation query only needs the anomalies table name, which is the same
# in both parameter sets; it is rendered from the training parameters.
VISUALIZE_MSE_QUERY = _render_sql(VISUALIZE_MSE_TEMPLATE, TRAIN_SQL_PARAMS)

创建一个自定义组件，用于读取模型评估指标

使用Kubeflow SDK可视化API，在Vertex AI管道UI中构建一个自定义组件，用于消费模型评估指标并进行可视化。

In [None]:
@component(
    base_image="python:3.8-slim",
    output_component_file=f"{KFP_COMPONENTS_PATH}/build_bq_evaluate_metrics.yaml",
)
def get_model_evaluation_metrics(
    metrics_in: Input[Artifact],
    metrics_out: Output[Metrics],
    model_out: Output[Artifact],
) -> NamedTuple("Outputs", [("mean_squared_error", float)]):
    """
    Log the BigQuery ML evaluation metrics and return the mean squared error
    Args:
        metrics_in: evaluation metrics artifact whose metadata holds the result
            table under the "rows" and "schema" keys
        metrics_out: metrics artifact that receives every metric of the first
            result row, rounded to three decimals
        model_out: artifact that receives descriptive model metadata
    Returns:
        mean_squared_error: mean squared error of the model, rounded to three
            decimals
    Raises:
        ValueError: if the evaluation result contains no rows
    """

    # Extract rows and schema from metrics artifact
    rows = metrics_in.metadata["rows"]
    schema = metrics_in.metadata["schema"]
    if not rows:
        raise ValueError("The evaluation metrics artifact contains no rows.")

    # Pair each column name of the schema with the value of the first row
    columns = [field["name"] for field in schema["fields"] if "name" in field]
    values = [cell["v"] for cell in rows[0]["f"]]
    metrics = {name: round(float(value), 3) for name, value in zip(columns, values)}

    # Log metrics
    for name, value in metrics.items():
        metrics_out.log_metric(name, value)

    # Model metadata
    model_out.metadata["framework"] = "BQML"
    model_out.metadata["type"] = "AutoEncoder"
    model_out.metadata["model function"] = "Reconstruction model"
    model_out.metadata["modified by"] = "Author"

    # Return the target metric: the mean squared error
    component_outputs = NamedTuple("Outputs", [("mean_squared_error", float)])
    return component_outputs(metrics["mean_squared_error"])

### 创建一个自定义组件来可视化每个标签的MSE

使用Kubeflow SDK的可视化API，在Vertex AI Pipelines UI中构建一个自定义组件来可视化每个标签的MSE。

In [None]:
@component(
    base_image="python:3.8-slim",
    packages_to_install=["pandas", "google-cloud-bigquery[bqstorage,pandas]", "plotly"],
    output_component_file=f"{KFP_COMPONENTS_PATH}/build_evaluation_plot.yaml",
)
def get_mse_plots(
    query: str,
    project: str,
    location: str,
    metrics_out: Output[HTML],
    model_out: Output[Artifact],
):
    """
    Plot the distribution of the mean squared error per label as an HTML report
    Args:
        query: the query that returns the rows to plot; the result must contain
            the `mean_squared_error` and `is_anomalous_ttf` columns
        project: the project id used to initiate the BQ client
        location: the location used to initiate the BQ client
        metrics_out: HTML artifact the report is written to
        model_out: model artifact (currently not written by this component)
    Returns:
        Nothing; the report is saved to `metrics_out.path`
    """

    import plotly.graph_objects as go
    from google.cloud import bigquery
    from plotly.subplots import make_subplots

    # Initiate client
    client = bigquery.Client(project=project, location=location)

    # Run the query with the project and location given to the client
    table_df = client.query(query).to_dataframe()

    # Create anomalies/no anomalies datasets
    anomalies_df = table_df.query("is_anomalous_ttf == 1")
    no_anomalies_df = table_df.query("is_anomalous_ttf == 0")

    # Create a figure with subplots: one full-width plot on top, two below
    fig = make_subplots(
        rows=2,
        cols=2,
        specs=[[{"colspan": 2}, None], [{}, {}]],
        subplot_titles=(
            "Distribution of mean squared error (MSE) for anomaly and not anomaly sensor data",
            "Distribution of mean squared error (MSE) for anomaly sensor data",
            "Distribution of mean squared error (MSE) for not anomaly sensor data",
        ),
        x_title="Mean squared error (MSE)",
        y_title="Density",
    )

    # Add subplots to figure: (data, trace name, colour, show in legend, row, col).
    # The first two traces share the top plot; the others get one plot each.
    # NOTE(review): no `histnorm` is set, so the bars presumably show counts
    # while the y axis is titled "Density" — confirm which one is intended.
    histogram_specs = (
        (anomalies_df, "Anomaly", "blue", True, 1, 1),
        (no_anomalies_df, "No Anomaly", "orange", True, 1, 1),
        (anomalies_df, "MSE_1", "red", False, 2, 1),
        (no_anomalies_df, "MSE_2", "green", False, 2, 2),
    )
    for frame, trace_name, colour, in_legend, row, col in histogram_specs:
        fig.add_trace(
            go.Histogram(
                x=frame["mean_squared_error"],
                name=trace_name,
                marker_color=colour,
                showlegend=in_legend,
            ),
            row=row,
            col=col,
        )

    # Update figure properties
    fig.update_layout(
        title="Anomaly detection report",
        title_x=0.5,
        bargap=0.2,
        bargroupgap=0.1,
        showlegend=True,
    )

    # Save output to static HTML file
    fig.write_html(metrics_out.path)

### 构建BQML训练流水线

使用Kubeflow Pipelines DSL包定义您的工作流程。

以下是流水线工作流程的步骤：

1. 在BigQuery中构建训练数据集
2. 训练BigQuery AutoEncoder模型
3. 评估BigQuery AutoEncoder模型
4. 检查模型性能
5. 在BigQuery中构建测试数据集
6. 检测异常
7. 生成MSE图表以评估预测结果

In [None]:
@dsl.pipeline(
    name=TRAIN_PIPELINE_NAME,
    description="A batch pipeline to train recostruction model using BQML",
)
def pipeline(
    create_train_features_query: str,
    create_train_table_query: str,
    train_recostruction_model_query: str,
    create_test_features_query: str,
    create_test_table_query: str,
    generate_anomalies_query: str,
    performance_thr: float,
    visualize_mse_query: str,
    project: str,
    location: str,
):
    """Train and evaluate the reconstruction model, then detect anomalies.

    Builds the training features and table, trains and evaluates the model and,
    only when its mean squared error is below `performance_thr`, builds the
    test features and table, detects anomalies and plots the MSE report.

    Args:
        create_train_features_query: SQL that builds the training features table
        create_train_table_query: SQL that builds the training table
        train_recostruction_model_query: SQL that creates the model
        create_test_features_query: SQL that builds the test features table
        create_test_table_query: SQL that builds the test table
        generate_anomalies_query: SQL that writes the anomalies table
        performance_thr: upper bound on the model mean squared error
        visualize_mse_query: SQL that reads the rows to plot
        project: project id the BigQuery jobs run in
        location: location the BigQuery jobs run in
    """

    # Create training features
    create_train_features_op = BigqueryQueryJobOp(
        query=create_train_features_query,
        project=project,
        location=location,
    ).set_display_name("build train features")

    # Create train dataset
    create_train_dataset_op = (
        BigqueryQueryJobOp(
            query=create_train_table_query, project=project, location=location
        )
        .set_display_name("build train table")
        .after(create_train_features_op)
    )

    # Train the reconstruction model
    bq_recostruction_model_op = (
        BigqueryCreateModelJobOp(
            query=train_recostruction_model_query,
            project=project,
            location=location,
        )
        .set_display_name("train reconstruction model")
        .after(create_train_dataset_op)
    )

    # Evaluate reconstruction model
    bq_evaluate_model_op = (
        BigqueryEvaluateModelJobOp(
            model=bq_recostruction_model_op.outputs["model"],
            project=project,
            location=location,
        )
        .set_display_name("evaluate reconstruction model")
        .after(bq_recostruction_model_op)
    )

    # Log the model metrics. The input is passed by keyword because KFP v2
    # components do not accept positional arguments.
    get_evaluation_model_metrics_op = (
        get_model_evaluation_metrics(
            metrics_in=bq_evaluate_model_op.outputs["evaluation_metrics"]
        )
        .after(bq_evaluate_model_op)
        .set_display_name("generate evaluation metrics")
    )

    # Check the model performance: continue only if the AUTOENCODER MSE metric
    # is below the given threshold
    with Condition(
        get_evaluation_model_metrics_op.outputs["mean_squared_error"] < performance_thr,
        name="MSE good",
    ):

        # Create test features dataset
        create_test_features_op = BigqueryQueryJobOp(
            query=create_test_features_query,
            project=project,
            location=location,
        ).set_display_name("build test features")

        # Create test dataset
        create_test_dataset_op = (
            BigqueryQueryJobOp(
                query=create_test_table_query, project=project, location=location
            )
            .set_display_name("build test table")
            .after(create_test_features_op)
        )

        # Generate anomalies
        generate_anomalies_op = (
            BigqueryQueryJobOp(
                query=generate_anomalies_query,
                project=project,
                location=location,
            )
            .after(create_test_dataset_op)
            .set_display_name("generate anomalies")
        )

        # Plot mse graph of anomalies
        _ = (
            get_mse_plots(query=visualize_mse_query, project=project, location=location)
            .after(generate_anomalies_op)
            .set_display_name("plot mse report")
        )

将管道编译成一个 JSON 文件

接下来，您将编译管道，这将为您的管道生成一个 JSON 规范。

In [None]:
# Compile the pipeline function into the JSON specification at TRAIN_PIPELINE_PACKAGE.
compiler.Compiler().compile(pipeline_func=pipeline, package_path=TRAIN_PIPELINE_PACKAGE)

执行您的流水线

接下来，执行流水线。它使用您设置的以下参数作为默认值。

提交管道作业

In [None]:
# Runtime values for the pipeline parameters: the rendered SQL queries, the
# performance threshold and the project/location of the BigQuery jobs.
TRAIN_PIPELINE_RUN_PARAMS = dict(
    create_train_features_query=CREATE_TRAIN_FEATURES_QUERY,
    create_train_table_query=CREATE_TRAIN_TABLE_QUERY,
    train_recostruction_model_query=TRAIN_RECOSTRUCTION_MODEL_QUERY,
    create_test_features_query=CREATE_TEST_FEATURES_QUERY,
    create_test_table_query=CREATE_TEST_TABLE_QUERY,
    generate_anomalies_query=DETECT_ANOMALIES_QUERY,
    performance_thr=PERF_THRESHOLD,
    visualize_mse_query=VISUALIZE_MSE_QUERY,
    project=PROJECT_ID,
    location=LOCATION,
)

# The display name is built from the pipeline name; the previous value used the
# local path of the compiled JSON file, which is not a meaningful job label.
bqml_train_pipeline = vertex_ai.PipelineJob(
    display_name=f"{TRAIN_PIPELINE_NAME}-job",
    template_path=TRAIN_PIPELINE_PACKAGE,
    parameter_values=TRAIN_PIPELINE_RUN_PARAMS,
    pipeline_root=TRAIN_PIPELINE_ROOT,
    enable_caching=True,
)

# Submit the job and wait for it to finish.
bqml_train_pipeline.run()

查看BigQuery ML训练管道结果

最后，您将查看管道中每个任务的工件输出。

In [None]:
# The job resource name has the form `projects/<project number>/...`; the number
# is read as a module-level name by `print_pipeline_output`.
PROJECT_NUMBER = bqml_train_pipeline.gca_resource.name.split("/")[1]
print("PROJECT NUMBER: ", PROJECT_NUMBER)
print("\n\n")

# Print the output of the model-creation and model-evaluation tasks. The label
# that is printed is the same task name that is looked up.
for _task_name in ("bigquery-create-model-job", "bigquery-evaluate-model-job"):
    print(_task_name)
    artifacts = print_pipeline_output(
        TRAIN_PIPELINE_ROOT, bqml_train_pipeline, _task_name
    )
    print("\n\n")

## 结论

在本笔记本中，您使用Vertex AI Pipelines和BigQuery ML构建了一个ML流水线，用于训练自动编码器以检测异常。

现在您知道如何利用预构建的 `google_cloud_pipeline_components` 来训练BigQuery ML模型，以及如何构建自定义组件来评估和可视化性能指标。

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以[删除用于教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
# Every deletion below is opt-in: set the corresponding flag to True to run it.
# NOTE: the BigQuery dataset and the tables created by this notebook are not
# removed by this cell.

# delete pipeline
delete_pipeline = False
if delete_pipeline:
    # Delete every pipeline job whose pipeline name matches the training pipeline.
    vertex_ai_pipeline_jobs = vertex_ai.PipelineJob.list(
        filter=f'pipeline_name="{TRAIN_PIPELINE_NAME}"'
    )
    for pipeline_job in vertex_ai_pipeline_jobs:
        pipeline_job.delete()

# delete model
delete_model = False
if delete_model:
    DELETE_MODEL_SQL = f"DROP MODEL {BQ_DATASET}.{BQ_RECOSTRUCTION_MODEL_TABLE}"
    # Best effort: a failure is printed instead of stopping the cleanup.
    try:
        delete_model_query_job = bq_client.query(DELETE_MODEL_SQL)
        delete_model_query_result = delete_model_query_job.result()
    except Exception as e:
        print(e)

# delete bucket
delete_bucket = False
# The bucket is also removed when the IS_TESTING environment variable is set.
if os.getenv("IS_TESTING") or delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI

# Remove local resources
delete_local_resources = False
if delete_local_resources:
    ! rm -rf {KFP_COMPONENTS_PATH}
    ! rm -rf {TRAIN_PIPELINES_PATH}
    ! rm -rf {TEST_PIPELINES_PATH}