In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI 实验：自动记录

<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/get_started_with_vertex_experiments_autologging.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"><br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fexperiments%2Fget_started_with_vertex_experiments_autologging.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在 Colab Enterprise 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/experiments/get_started_with_vertex_experiments_autologging.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br>
      在 Vertex AI Workbench 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/get_started_with_vertex_experiments_autologging.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br>
      在 GitHub 上查看
    </a>
  </td>
</table>

注意：此笔记本已在以下环境中进行测试：

- Python 版本 = 3.9

## 概述

作为数据科学团队的一部分，您希望在实验阶段尝试不同的建模方法。为了保证可重复性，每种方法都有不同的参数，您需要手动跟踪。这是一项耗时的任务。为了解决这个挑战，Vertex AI SDK引入了自动记录功能，它是一种一行代码的SDK功能，利用MLflow来提供与您的 Vertex AI实验和实验运行相关的自动指标和参数跟踪（请参阅[将数据自动记录到实验运行](https://cloud.google.com/vertex-ai/docs/experiments/autolog-data)）。了解更多关于[Vertex AI实验](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)。

### 目标

在本教程中，您将学习如何使用Vertex AI自动记录功能。

本教程使用以下谷歌云ML服务和资源：

- Vertex AI实验

执行的步骤包括：

- 在Vertex AI SDK中启用自动记录功能。
- 训练scikit-learn模型，并查看结果实验运行的度量和参数，自动记录到Vertex AI实验中，而无需设置实验运行。
- 通过 `aiplatform.start_run()` 和 `aiplatform.end_run()` 手动创建实验运行来训练 TensorFlow 模型，并检查自动记录到 Vertex AI 实验中的指标和参数。
- 在Vertex AI SDK中禁用自动记录功能，训练PyTorch模型，并检查未记录任何参数或度量。

### 数据集

该数据集是[UCI汽车评估数据集](https://archive.ics.uci.edu/dataset/19/car+evaluation)，它源自一个简单的分层决策模型，包含用于预测汽车评估类别的属性。

### 成本

本教程使用 Google Cloud 的可计费组件：

* Vertex AI 实验
* 云存储

了解 [Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing),
以及 [云存储定价](https://cloud.google.com/storage/pricing),
并使用 [定价计算器](https://cloud.google.com/products/calculator/)
根据您的预期使用量生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK和其他必需的包

### 安装 Vertex AI SDK for Python 和其他所需的包

In [None]:
# Install the packages
USER = ""
! pip3 install {USER} --upgrade google-cloud-aiplatform tensorflow
! pip3 install {USER} --upgrade pandas==2.0.* scikit-learn category_encoders torch torchdata torchmetrics mlflow
! pip3 install {USER} --upgrade protobuf==3.20.3


### 重新启动运行时（仅适用于Colab）

为了使用新安装的软件包，您必须重新启动Google Colab上的运行时。

In [None]:
import sys

# Only act when the notebook is running inside Google Colab.
if "google.colab" in sys.modules:

    import IPython

    # Shut down the current kernel, passing True as the restart argument, so
    # the newly installed packages are picked up by a fresh kernel.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ 内核即将重新启动。在继续下一步之前等待它完成。⚠️</b>
</div>

### 认证您的笔记本环境（仅适用于Colab）

在Google Colab上认证您的环境。

In [None]:
import sys

# Only act when the notebook is running inside Google Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the current Colab user for the rest of the notebook.
    auth.authenticate_user()

### 设置Google Cloud项目信息并初始化Python的Vertex AI SDK

要开始使用Vertex AI，您必须拥有一个现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

### 创建一个 Cloud Storage 存储桶

创建一个存储桶来存储中间产物，例如数据集。

In [None]:
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储存储桶。

In [None]:
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

### 初始化 Vertex AI SDK for Python

In [None]:
from google.cloud import aiplatform as vertex_ai

vertex_ai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)


#### UUID

如果您在进行实时教程会话，您可能会使用共享的测试账户或项目。为避免在创建的资源之间发生名称冲突，您可以为每个实例会话创建一个UUID，并将其附加到您在本教程中创建的资源的名称上。

In [None]:
import random
import string


# Build a short random suffix (8 characters) used to make resource names unique
def generate_uuid():
    """Return a random 8-character string of lowercase ASCII letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    suffix_chars = random.choices(alphabet, k=8)
    return "".join(suffix_chars)


UUID = generate_uuid()

### 设置项目模板

设置您在本教程中使用的文件夹。

In [None]:
import os

# Root working folder for the tutorial, created under the current directory.
tutorial_path = os.path.join(os.getcwd(), "sdk_autologging_tutorial")
# Sub-folder that holds the downloaded and processed dataset files.
data_path = os.path.join(tutorial_path, "data")

# exist_ok=True makes this cell safe to re-run.
for path in tutorial_path, data_path:
    os.makedirs(path, exist_ok=True)

### 下载数据集

从公共云存储桶下载汽车评估数据集。

In [None]:
from urllib import request

import pandas as pd

# Download over HTTPS so the dataset cannot be altered in transit.
DATA_URL = "https://cloud-samples-data.storage.googleapis.com/vertex-ai/dataset-management/datasets/uci_car_eval/car_evaluation_preprocessed.csv"
data_filepath = os.path.join(data_path, "car_evaluation_data.csv")
request.urlretrieve(DATA_URL, data_filepath)

# Column names of the dataset. read_csv below is called without `names`, so the
# column labels actually used come from the first line of the CSV file.
COLUMN_NAMES = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
df = pd.read_csv(data_filepath)
# Collapse the four evaluation classes into a binary target:
# unacc/acc -> 0, good/vgood -> 1.
df["class"] = df["class"].replace({"unacc": 0, "acc": 0, "good": 1, "vgood": 1})

# Persist the binarized dataset; the training helpers read this file.
processed_data_filepath = os.path.join(data_path, "car_evaluation_preprocessed.csv")
df.to_csv(processed_data_filepath, index=False)

In [None]:
!head {processed_data_filepath} -n 5

### 辅助函数

运行实验时，通常会为每种待评估的建模方法定义一个辅助函数。本教程定义了以下辅助函数：

* `train_sklearn_model`：一个使用Sklearn训练决策树模型的辅助函数。
* `train_tensorflow_model`：一个使用Tensorflow训练简单模型的辅助函数。
* `train_pytorch_model`：一个使用PyTorch训练简单神经网络的辅助函数。

In [None]:
def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook.
    Args:
        seed: Value passed to the Python, NumPy, TensorFlow and PyTorch seeders
    Returns:
        None
    """
    import random

    import numpy as np
    import tensorflow as tf
    import torch

    # Apply the same seed to each library, in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def train_sklearn_model(data_path: str, test_size: float, max_depth: int):
    """
    A function to train a Decision Tree model using sklearn.

    The data is split into train and test sets, an ordinal-encoder +
    decision-tree pipeline is fitted on the train set, and the accuracy on the
    test set is printed.
    Args:
        data_path: Path to the CSV data
        test_size: Size of the test set, forwarded to train_test_split
            (the notebook passes a fraction such as 0.2)
        max_depth: Maximum depth of the Decision Tree
    Returns:
        None
    """

    # Libraries
    import pandas as pd
    from category_encoders import OrdinalEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.tree import DecisionTreeClassifier

    # Read data
    print("Reading data...")
    df = pd.read_csv(data_path)

    # Train, test split
    print("Generating train and test data...")
    x = df[["buying", "maint", "doors", "persons", "lug_boot", "safety"]]
    y = df[["class"]]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True
    )

    # Build pipeline: encode the categorical features, then fit the tree
    print("Building pipeline...")
    pipe = Pipeline(
        [
            ("encoder", OrdinalEncoder()),
            ("model", DecisionTreeClassifier(criterion="gini", max_depth=max_depth)),
        ]
    )

    # Train model
    print("Training model...")
    pipe.fit(x_train, y_train)

    # Evaluate model
    print("Evaluating model...")
    y_pred = pipe.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy", round(accuracy, 3))


def train_tensorflow_model(
    data_path: str, test_size: float, batch_size: int, epochs: int
):
    """
    A function to train a TF model.

    The CSV rows are shuffled once with a fixed seed and split into disjoint
    train and test sets. A small Keras binary classifier over the encoded
    categorical features is then fitted, with the test set used as validation
    data.
    Args:
        data_path: Path to the CSV data (with a header line)
        test_size: Fraction of the rows held out as the test set
        batch_size: Batch size
        epochs: Number of epochs
    Returns:
        None
    """
    # Libraries
    import tensorflow as tf

    # Variables
    # NOTE(review): the row count is hard-coded; confirm it matches the CSV.
    dataset_size = 1729
    # Fixed seed so the train/test split is the same on every pass over the data.
    split_seed = 8
    features_values = {
        "buying": ["vhigh", "high", "med", "low"],
        "maint": ["vhigh", "high", "med", "low"],
        "doors": ["2", "3", "4", "5more"],
        "persons": ["2", "4", "more"],
        "lug_boot": ["small", "med", "big"],
        "safety": ["low", "med", "high"],
    }

    # Helpers
    def get_input_layer(features_vocabulary):
        # One string input per categorical feature, keyed by feature name.
        return {
            cat_name: tf.keras.Input(shape=(1,), name=cat_name, dtype="string")
            for cat_name in features_vocabulary
        }

    def get_features_layer(inputs_map, features_vocabulary):
        features_map = {}
        for cat_name, cat_values in features_vocabulary.items():
            # Map each category string to an integer index
            cat_index = tf.keras.layers.StringLookup(
                vocabulary=cat_values, max_tokens=5
            )(inputs_map[cat_name])
            # Encode the index as a fixed-width (5) vector
            cat_layer = tf.keras.layers.CategoryEncoding(num_tokens=5)(cat_index)
            features_map[cat_name] = cat_layer
        return features_map

    # Read data
    print("Reading data...")
    # Read the file exactly once and in order (num_epochs=1, shuffle=False),
    # then unbatch so the split below is expressed in rows, not batches.
    car_rows = tf.data.experimental.make_csv_dataset(
        data_path,
        column_names=[
            "buying",
            "maint",
            "doors",
            "persons",
            "lug_boot",
            "safety",
            "class",
        ],
        label_name="class",
        batch_size=batch_size,
        num_epochs=1,
        shuffle=False,
    ).unbatch()

    # Generating Train, test split
    print("Generating train and test data...")
    train_size = int((1 - test_size) * dataset_size)
    # reshuffle_each_iteration=False keeps the order identical on every pass,
    # so take()/skip() always select the same, disjoint rows.
    shuffled_rows = car_rows.shuffle(
        dataset_size, seed=split_seed, reshuffle_each_iteration=False
    )
    # Training rows are reshuffled among themselves on every epoch.
    train_dataset = shuffled_rows.take(train_size).shuffle(train_size).batch(batch_size)
    test_dataset = shuffled_rows.skip(train_size).batch(batch_size)

    # Build model
    print("Building model...")
    inputs_layer = get_input_layer(features_values)
    features_layer = get_features_layer(inputs_layer, features_values)
    x = tf.keras.layers.Concatenate()(list(features_layer.values()))
    x = tf.keras.layers.Dense(10, activation="relu")(x)
    x = tf.keras.layers.Dense(5, activation="relu")(x)
    # Sigmoid output: BinaryCrossentropy() and the "accuracy" metric below both
    # expect probabilities, not raw logits.
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=list(inputs_layer.values()), outputs=output_layer)

    # Compile model
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )

    # Fit the model
    print("Training model...")
    model.fit(
        train_dataset,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=test_dataset,
    )


def train_pytorch_model(
    data_path: str, test_size: float, batch_size: int, lr: float, epochs: int, seed: int
):
    """
    A function to train a simple neural network using PyTorch.

    The CSV rows are randomly split into train and test sets, a small fully
    connected network is trained with SGD, and the binary accuracy on the test
    set is printed.
    Args:
        data_path: Path to the CSV data (the first line is skipped as a header)
        test_size: Fraction of the rows assigned to the test set
        batch_size: Batch size
        lr: Learning rate of the SGD optimizer
        epochs: Number of training epochs
        seed: Seed of the random train/test split
    Returns:
        None
    """

    # Libraries
    import numpy as np
    import torch
    import torch.nn as nn
    import torchmetrics
    from torch.utils.data import DataLoader
    from torchdata import datapipes

    # Variables
    # Integer code for every category, keyed by column position
    features_map = {
        0: {"low": 0, "med": 1, "high": 2, "vhigh": 3},
        1: {"low": 0, "med": 1, "high": 2, "vhigh": 3},
        2: {"2": 0, "3": 1, "4": 2, "5more": 3},
        3: {"2": 0, "4": 1, "more": 2},
        4: {"small": 0, "med": 1, "big": 2},
        5: {"low": 0, "med": 1, "high": 2},
    }
    # NOTE(review): the row count is hard-coded; confirm it matches the CSV.
    dataset_length = 1729

    # Helpers
    def row_processor(r):
        # All columns but the last are features; the last one is the label.
        encoded = [features_map[i][value] for i, value in enumerate(r[:-1])]
        return {
            "data": np.array(encoded, dtype=np.float64),
            "labels": np.array(r[-1], dtype=np.float64),
        }

    # Model definition
    class SimpleNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear_relu = nn.Sequential(
                nn.Linear(6, 12, dtype=torch.float64),
                nn.ReLU(),
                nn.Linear(12, 6, dtype=torch.float64),
                nn.ReLU(),
                nn.Linear(6, 3, dtype=torch.float64),
                nn.ReLU(),
                nn.Linear(3, 1, dtype=torch.float64),
            )

        def forward(self, x):
            logits = self.linear_relu(x)
            return logits

    # Read data
    print("Reading and preparing data...")
    read_dp = datapipes.iter.FileLister(data_path)
    open_dp = datapipes.iter.FileOpener(read_dp)
    parse_dp = datapipes.iter.CSVParser(open_dp, delimiter=",", skip_lines=1)
    # The caller-provided seed drives the split (it is no longer overridden).
    train_dp, test_dp = datapipes.iter.RandomSplitter(
        parse_dp,
        weights={"train": 1 - test_size, "test": test_size},
        total_length=dataset_length,
        seed=seed,
    )
    map_train_dp = datapipes.iter.Mapper(train_dp, row_processor)
    map_test_dp = datapipes.iter.Mapper(test_dp, row_processor)
    train_dataloader = DataLoader(map_train_dp, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(map_test_dp, batch_size=batch_size, shuffle=False)

    # Build model
    print("Building model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SimpleNetwork().to(device)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Train model
    print("Training model...")
    model.train()
    for t in range(epochs):
        for batch, row in enumerate(train_dataloader, start=1):
            features, labels = row["data"].to(device), row["labels"].to(device)
            # The network emits one logit per row; drop the trailing dimension
            # so the predictions have the same shape as the labels.
            train_prediction = model(features).squeeze(1)
            train_loss = loss_fn(train_prediction, labels)

            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            print(f"Epoch {t + 1} - Batch {batch} - Loss {train_loss.item():.4f}")

    # Test model
    print("Evaluating model...")
    # The metric keeps internal state tensors, so it must live on the same
    # device as the predictions and labels.
    metric = torchmetrics.classification.BinaryAccuracy().to(device)
    model.eval()
    # The trained model no longer changes, so one pass over the test set is enough.
    with torch.no_grad():
        for row in test_dataloader:
            features, labels = row["data"].to(device), row["labels"].to(device)
            val_prediction = model(features).squeeze(1)
            metric.update(val_prediction, labels)
    accuracy = metric.compute()
    print(f"Accuracy {accuracy:.4f}")

设置种子以确保可复制性。

In [None]:
set_seed(8)

## 使用 Vertex AI 实验中的自动记录进行模型实验

Vertex AI Experiments 自动记录功能允许您运行实验并自动记录不同 ML 框架的参数和指标。

在启动 Vertex AI 实验后，使用 `vertex_ai.autolog()` 启用自动记录功能。

有两种使用自动记录的方式：

1. *使用自动实验运行创建*
2. *使用用户实验运行创建*

使用*自动实验运行创建*，您运行一个实验。Vertex AI SDK 会自动创建一个实验运行，记录在 Vertex AI 实验中的所有参数和指标。

使用*用户实验运行创建*，您可以使用 `vertex_ai.start_run(您的实验运行名称)` 创建一个实验并运行。然后，在结束实验运行时使用 `vertex_ai.end_run()` 可以访问到生成的参数和指标。

创建一个用于跟踪训练参数和指标的实验。

首先，使用`init()`方法初始化一个实验。

由于一些模型类型，如TensorFlow，会自动记录时间序列指标，所以你需要创建一个TensorBoard实例。

要创建一个TensorBoard实例，你可以使用`vertex_ai.Tensorboard.create()`。

In [None]:
autologged_experiment_name = f"autologging-experiment-{UUID}"

In [None]:
# Create the TensorBoard instance that is attached to the experiment below.
experiment_tensorboard = vertex_ai.Tensorboard.create()
# Initialize the SDK again, this time with the experiment name, its
# TensorBoard instance and a description.
vertex_ai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
    experiment=autologged_experiment_name,
    experiment_tensorboard=experiment_tensorboard,
    experiment_description="autolog-experiment-with-automatic-run",
)

在这个部分，Vertex AI SDK 会自动创建一个实验运行，并在 Vertex AI 实验中记录所有参数、训练指标和训练后指标。

#### 启用自动记录

首先，使用`vertex_ai.autolog()`方法启用自动记录。

调用`vertex_ai.autolog()`后，来自支持的ML框架的模型训练调用的任何指标和参数将自动记录到Vertex Experiments。

In [None]:
vertex_ai.autolog()

#### 运行基准实验

接下来，通过运行Sklearn模型实验来定义您的基准模型。

In [None]:
sklearn_config = dict(data_path=processed_data_filepath, test_size=0.2, max_depth=5)
train_sklearn_model(**sklearn_config)

#### 获取实验结果

然后，使用方法 `get_experiment_df()` 将实验结果作为 pandas 数据框获取。

请注意所有参数和指标都已记录在 Vertex AI 实验中。

特别是，`run_name` 已经自动分配，您定义的 `accuracy_score` 指标也已记录。

In [None]:
experiment_df = vertex_ai.get_experiment_df()
experiment_df = experiment_df.T
experiment_df

### 使用用户创建的实验运行进行自动记录

如上所述，自动记录会自动将运行的实验分配给一个实验运行。

但是，您也可以随时使用 `start_run()` 在实验中手动初始化一个运行。

#### 初始化一个新的实验运行

在实验中跟踪特定的运行。

In [None]:
autologged_manual_run_experiment_name = f"autologging-tf-experiment-{UUID}"
vertex_ai.start_run(autologged_manual_run_experiment_name)

接下来，用TensorFlow模型运行一个新实验。

In [None]:
tf_config = dict(
    data_path=processed_data_filepath, test_size=0.2, batch_size=5, epochs=3
)
train_tensorflow_model(**tf_config)

#### 比较实验结果

实验运行完成后，调用`end_run()`方法来完成该运行的日志记录，然后您可以再次使用`get_experiment_df()`来获取实验的结果。

In [None]:
vertex_ai.end_run()

In [None]:
experiment_df = vertex_ai.get_experiment_df()
experiment_df.T

### 在不使用自动记录的情况下运行实验

启用自动记录是可选的。您可以随时使用 `vertex_ai.autolog(disable=True)`来禁用自动记录。

#### 禁用自动记录

In [None]:
vertex_ai.autolog(disable=True)

使用Pytorch运行最终实验。

In [None]:
pt_config = dict(
    data_path=processed_data_filepath,
    test_size=0.2,
    batch_size=64,
    epochs=2,
    lr=0.01,
    seed=8,
)
train_pytorch_model(**pt_config)

#### 检查实验结果

使用`vertex_ai.get_experiment_df()`来检查PyTorch实验运行是否已被记录。

In [None]:
experiment_df = vertex_ai.get_experiment_df()
experiment_df.T

请注意，Vertex AI SDK 的自动记录功能基于 MLflow 的自动记录实现，而该实现对 PyTorch 仅支持 PyTorch Lightning。因此，您需要手动记录指标和模型来跟踪这个 PyTorch 实验。查看[Vertex AI实验文档](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)以了解更多信息。

## 清理

要清理这个项目中使用的所有Google Cloud资源，您可以删除用于教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
import os

# Delete experiment
experiment_list = vertex_ai.Experiment.list()
for experiment in experiment_list:
    try:
        experiment.delete()
    except Exception as e:
        print(e)

# Delete Cloud Storage objects that were created
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI