In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

使用Vertex AI上的预构建容器来训练和部署PyTorch模型

# 使用预构建容器在Vertex AI上训练和部署PyTorch模型

<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/pytorch_train_deploy_models_with_prebuilt_containers.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"><br> 在Colab中运行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fpytorch_train_deploy_models_with_prebuilt_containers.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在Colab企业版中打开
    </a>
  </td> 
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/pytorch_train_deploy_models_with_prebuilt_containers.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br>
      在Vertex AI工作台中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/pytorch_train_deploy_models_with_prebuilt_containers.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br>
      在GitHub上查看
    </a>
  </td>
</table>

## 概述

Vertex AI 提供了 Docker 容器映像，您可以将其作为预构建容器运行，用于自定义训练和预测。这些容器按机器学习（ML）框架和框架版本组织，包括您可能想在训练代码和提供预测中使用的常见依赖项。使用预构建容器通常比创建自定义容器更简单。

本教程演示如何使用 Vertex AI 上预构建容器训练和部署一个 PyTorch 图像模型。

了解更多关于[预构建容器用于自定义训练](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers)和[预构建容器用于预测和解释](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers)。

### 目标

在本教程中，您将学习如何构建、训练和部署一个 PyTorch 图像分类模型，使用预构建的容器进行自定义训练和预测。

本教程使用以下 Vertex AI 服务和资源：

- Vertex AI 训练服务
- Vertex AI 预测服务
- Vertex AI 模型注册表
- Vertex AI 模型资源
- Vertex AI 端点资源

执行的步骤包括：

- 将训练应用程序打包为 Python 源代码分发包
- 配置并在预构建容器中运行训练作业
- 将模型工件打包成模型归档文件
- 上传模型以进行部署
- 使用预构建容器部署模型进行预测
- 进行在线预测

### 数据集/模型

在本教程中，您将使用 PyTorch torchvision 模块中的 [MNIST](https://pytorch.org/vision/main/generated/torchvision.datasets.MNIST.html) 手写数字识别数据集。您将在MNIST数据集上训练一个简单的卷积神经网络，用于识别手写数字。

### 费用

本教程使用谷歌云的计费组件：

* Vertex AI
* 云存储

了解[Vertex AI价格](https://cloud.google.com/vertex-ai/pricing)，
以及[云存储价格](https://cloud.google.com/storage/pricing)，
并使用[定价计算器](https://cloud.google.com/products/calculator/)
根据您的预期使用量生成成本估算。

## 开始吧

### 安装用于Python的Vertex AI SDK和其他所需的软件包

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 torch \
                                 torchvision \
                                 torch-model-archiver

### 重新启动运行时（仅适用于Colab）

要使用新安装的包，您必须在Google Colab上重新启动运行时。

In [None]:
import sys

# Only Colab needs the restart; anywhere else this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # Ask the kernel of the running IPython application to shut down, passing
    # True so that it comes back up with the freshly installed packages.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ 内核即将重新启动。在继续下一步之前，请等待完成。⚠️</b>
</div>

### 在 Google Colab 上对您的笔记本环境进行身份验证

在 Google Colab 上对您的环境进行身份验证。

In [None]:
import sys

# Run the Colab sign-in flow only when the google.colab module is loaded;
# on other platforms this cell is a no-op.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### 设置Google Cloud项目信息并为Python初始化Vertex AI SDK

要开始使用Vertex AI，您必须拥有现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
# Google Cloud project used for every resource in this notebook; replace the
# placeholder with your own project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region passed to `gsutil mb -l` and to `aiplatform.init(location=...)` below.
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
# Cloud Storage bucket that holds the training package, the trained model
# artifacts and the SDK staging files. The placeholder embeds PROJECT_ID; edit
# it to the bucket you want to use.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

### 创建一个云存储桶

创建一个存储桶来存储诸如数据集等中间产物。

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储存储桶。

In [None]:
# Create the bucket in LOCATION under PROJECT_ID. Skip this cell if the bucket
# already exists.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### 为Python初始化Vertex AI SDK

In [None]:
from google.cloud import aiplatform

# Configure the SDK once with the project, region and staging bucket chosen
# above; the later aiplatform calls in this notebook do not pass them again.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### 导入库

In [None]:
import base64
import io
import os

from matplotlib import pyplot as plt
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## 使用预构建的容器训练 PyTorch 模型

在本教程中，您将使用预构建容器训练一个自定义 PyTorch 模型。

### 打包训练应用程序

#### 包布局

在开始训练之前，让我们看看如何为自定义训练作业组装 Python 包。解压后，该包包含以下内容：

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py

文件 `setup.cfg` 和 `setup.py` 是将包安装到 Docker 镜像的操作环境中的指令。

In [None]:
APP_NAME = "pytorch_101_trainer"
PYTHON_PACKAGE_APPLICATION_DIR = "python_package"

# Local path of the source distribution archive. The "-0.1" suffix has to match
# the version declared in setup.py.
source_package_file_name = "/".join(
    [PYTHON_PACKAGE_APPLICATION_DIR, "dist", APP_NAME + "-0.1.tar.gz"]
)
# Cloud Storage location the archive is copied to before the job is created.
python_package_gcs_uri = "/".join(
    [BUCKET_URI, "pytorch", "training", APP_NAME + "-0.1.tar.gz"]
)

In [None]:
! mkdir {PYTHON_PACKAGE_APPLICATION_DIR}
! mkdir {PYTHON_PACKAGE_APPLICATION_DIR}/trainer

! touch {PYTHON_PACKAGE_APPLICATION_DIR}/README.md
! touch {PYTHON_PACKAGE_APPLICATION_DIR}/trainer/__init__.py

In [None]:
%%writefile ./{PYTHON_PACKAGE_APPLICATION_DIR}/setup.py

import os
from setuptools import find_packages
from setuptools import setup
import setuptools

from distutils.command.build import build as _build
import subprocess


REQUIRED_PACKAGES = [
]

setup(
    name='pytorch_101_trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Vertex AI | Training | PyTorch | Image Classification | Python Package'
)

#### 准备训练脚本

文件`trainer/task.py`是执行自定义训练作业的Python脚本。

In [None]:
%%writefile ./{PYTHON_PACKAGE_APPLICATION_DIR}/trainer/task.py

import os
import argparse

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from google.cloud import storage


def load_data(batch_size):
  """Download MNIST and wrap the train and test splits in data loaders.

  Args:
    batch_size: Number of samples per batch, used for both loaders.

  Returns:
    A (train_dataloader, test_dataloader) tuple.
  """

  def _mnist_loader(is_train):
    # Each split is downloaded under ./data with the ToTensor transform applied.
    split = datasets.MNIST(
        root="data",
        train=is_train,
        download=True,
        transform=ToTensor(),
    )
    return DataLoader(split, batch_size=batch_size)

  # Training split first, then the held-out test split.
  return _mnist_loader(True), _mnist_loader(False)

def create_model(device):
  """Build the MNIST classifier and move it to `device`.

  Args:
    device: Target handed to `Module.to` (main() passes "cuda", "mps" or "cpu").

  Returns:
    A NeuralNetwork instance placed on `device`.
  """

  class NeuralNetwork(nn.Module):
    """Fully connected 784 -> 512 -> 512 -> 10 network with ReLU activations."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden_units = 512
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        )

    def forward(self, x):
        # Flatten each sample, then return the raw logits of the linear stack.
        return self.linear_relu_stack(self.flatten(x))

  return NeuralNetwork().to(device)

def train(dataloader, model, loss_fn, optimizer, device):
    """Run one training pass over `dataloader`, updating `model` in place.

    Args:
        dataloader: Iterable of (inputs, targets) batches exposing `.dataset`.
        model: Module being optimized.
        loss_fn: Callable taking (predictions, targets) and returning the loss.
        optimizer: Optimizer that steps the model's parameters.
        device: Device each batch is moved to before the forward pass.
    """
    total_samples = len(dataloader.dataset)
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Clear old gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Progress is reported on every 100th batch only.
        if step % 100 != 0:
            continue
        seen = step * len(inputs)
        print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{total_samples:>5d}]")

def test(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


def main():
  """Train the MNIST classifier and upload it to Cloud Storage as TorchScript.

  Command-line flags:
    --epochs: Number of training epochs (default 2).
    --batch_size: Batch size used for both data loaders (default 32).
    --model_dir: gs:// directory the exported model is uploaded to. Defaults
      to the AIP_MODEL_DIR environment variable when it is set.

  Exits through `parser.error` when the model directory is not a gs:// URI.
  """
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--epochs", type=int, help="Number of training epochs.", default=2)
  parser.add_argument("--batch_size", type=int, help="Training batch size for one process.", default=32)
  parser.add_argument("--model_dir", type=str, help="Directory for saving models.", default=os.environ.get("AIP_MODEL_DIR", ""))
  argv = parser.parse_args()

  # The export step uploads through Blob.from_string, which rejects anything
  # that is not a gs:// URI. Fail now rather than after the whole training run.
  if not argv.model_dir.startswith("gs://"):
    parser.error(f"--model_dir (or AIP_MODEL_DIR) must be a gs:// URI, got {argv.model_dir!r}")

  train_dataloader, test_dataloader = load_data(argv.batch_size)

  # Get cpu or gpu device for training: CUDA first, then Apple MPS, else CPU.
  if torch.cuda.is_available():
    device = "cuda"
  elif torch.backends.mps.is_available():
    device = "mps"
  else:
    device = "cpu"
  print(f"Using {device} device")

  model = create_model(device)

  # Define a loss function and an optimizer.
  loss_fn = nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

  # Each epoch is one full training pass followed by an evaluation pass.
  for t in range(argv.epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer, device)
    test(test_dataloader, model, loss_fn, device)
  print("Done!")

  # Export the model to TorchScript in a local scratch file first.
  model_filename = "pytorch-mnist.pt"
  local_path = os.path.join("/tmp", model_filename)
  torch.jit.script(model).save(local_path)

  # Upload the trained model to Cloud Storage. A failed export raises above,
  # so the file is known to exist at this point.
  storage_path = os.path.join(argv.model_dir, model_filename)
  blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
  blob.upload_from_filename(local_path)
  print(f"Saved model files in {argv.model_dir}")

if __name__ == "__main__":
    main()

#### 创建一个Python源码分发包

您可以将您的训练应用程序与Python源码分发包一起上传到您的云存储桶。

In [None]:
# Build the gzip'd source distribution inside the package directory (the
# archive is expected under dist/, see source_package_file_name), then copy it
# to the Cloud Storage location the training job reads from.
! cd {PYTHON_PACKAGE_APPLICATION_DIR} && python3 setup.py sdist --formats=gztar
! gsutil cp {source_package_file_name} {python_package_gcs_uri}

print(f"Python source distribution package location: {python_package_gcs_uri}")

### 配置自定义训练作业

使用[预构建容器](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers)映像以及打包为 Python 源代码分发包的训练代码，配置一个[自定义作业](https://cloud.google.com/vertex-ai/docs/training/create-custom-job)。

In [None]:
# Display name of the job; also reused below to build MODEL_DIR.
JOB_DISPLAY_NAME = "pytorch-custom-job-unique"
# Module inside the uploaded package that the job executes (trainer/task.py).
python_module_name = "trainer.task"
# Prebuilt training image (PyTorch 1.13, GPU variant, per the image name).
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13:latest"
)

# Describe the job only; training starts when job.run() is called in a later
# cell.
job = aiplatform.CustomPythonPackageTrainingJob(
    display_name=JOB_DISPLAY_NAME,
    python_package_gcs_uri=python_package_gcs_uri,
    python_module_name=python_module_name,
    container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,
)

### 运行自定义训练作业

接下来，调用 `run` 方法运行自定义作业，开始训练。

**注意：** 当使用 Python 的 Vertex AI SDK 提交训练作业时，它会创建一个[训练流水线](https://cloud.google.com/vertex-ai/docs/training/create-training-pipeline)，并在`Vertex AI Training` 服务上启动自定义作业。

In [None]:
# Compute resources requested for the single training replica.
MACHINE_TYPE = "n1-standard-4"
ACCELERATOR_TYPE = "NVIDIA_TESLA_V100"
ACCELERATOR_COUNT = 1

# Hyperparameters and output location forwarded to trainer/task.py.
EPOCHS = 1
BATCH_SIZE = 32
MODEL_DIR = f"{BUCKET_URI}/{JOB_DISPLAY_NAME}"

# Command-line arguments for trainer/task.py; each flag matches one of the
# argparse options defined in its main(). Values are passed as strings.
training_args = [
    "--epochs",
    str(EPOCHS),
    "--batch_size",
    str(BATCH_SIZE),
    "--model_dir",
    MODEL_DIR,
]

In [None]:
# Launch the training job. training_args passes --model_dir explicitly, so
# trainer/task.py uploads the model to MODEL_DIR instead of falling back to its
# AIP_MODEL_DIR default.
# NOTE(review): sync=True presumably blocks until the job finishes, which the
# next cell relies on when it lists the artifacts — confirm.
model = job.run(
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    base_output_dir=MODEL_DIR,
    replica_count=1,
    args=training_args,
    sync=True,
)

### 检查模型工件

当自定义训练作业完成后，您可以在云存储位置检查模型工件。

In [None]:
# List what the training job wrote under MODEL_DIR (sizes and timestamps via -l).
print(f"Model artifacts are available at {MODEL_DIR}")
! gsutil ls -l {MODEL_DIR}

## 将PyTorch模型部署到预构建的容器进行预测

您可以创建一个本地目录，然后将模型的构件从云存储复制到这个本地目录中。

In [None]:
# Create a local directory for model artifacts
model_path = "model-unique"

if not os.path.exists(model_path):
    ! mkdir {model_path}

In [None]:
# File name under which trainer/task.py uploaded the TorchScript model.
model_name = "pytorch-mnist.pt"
# Local destination inside the model artifact directory.
model_file = f"{model_path}/{model_name}"

# Copy the trained model from Cloud Storage to the local directory.
! gsutil cp {MODEL_DIR}/{model_name} {model_file}

### 创建自定义模型处理程序

自定义模型处理程序是一个Python脚本，在使用模型归档程序时，您将其与模型打包在一起。该脚本通常定义如何预处理输入数据，调用模型以及后处理输出。TorchServe为`image_classifier`、`image_segmenter`、`object_detector`和`text_classifier`提供了[默认处理程序](https://pytorch.org/serve/default_handlers.html)。在本教程中，您将创建一个自定义处理程序，扩展默认的[`image_classifier`](https://github.com/pytorch/serve/blob/master/ts/torch_handler/image_classifier.py)处理程序。这个自定义处理程序扩展了默认处理程序image_classifier.py中的ImageClassifier类，接受一个图像并返回该图像中的数字。

In [None]:
# Path of the custom handler script written by the next cell. The variable is
# spelled "hander" (sic); the %%writefile and torch-model-archiver cells below
# reference it by this exact name, so it must not be renamed in isolation.
hander_file = f"{model_path}/custom_handler.py"

In [None]:
%%writefile {hander_file}

from torchvision import transforms
from ts.torch_handler.image_classifier import ImageClassifier
from torch.profiler import ProfilerActivity


class MNISTDigitClassifier(ImageClassifier):
    """
    Handler for the MNIST digit model. It extends ImageClassifier, the default
    handler from image_classifier.py: it takes an image and returns the digit
    predicted for it.
    """

    # Serving-time preprocessing. This has to mirror the transform used for
    # training: trainer/task.py loads MNIST with ToTensor() only, so adding a
    # Normalize step here would feed the model inputs in a value range it was
    # never trained on.
    image_processing = transforms.Compose([
        transforms.ToTensor(),
    ])

    def __init__(self):
        super().__init__()
        # Profiler options: CPU activity only, with tensor shapes recorded.
        # NOTE(review): presumably read by the base handler when profiling is
        # enabled — confirm.
        self.profiler_args = {
            "activities": [ProfilerActivity.CPU],
            "record_shapes": True,
        }

    def postprocess(self, data):
        """Convert the model output into predicted digit labels.

        Args:
            data: Output of the inference step. ``argmax(1)`` is taken along its
                second dimension — assumed to be (batch, num_classes) scores;
                TODO confirm against the base handler.
        Returns:
            list: One predicted class index (int) per input, not dictionaries.
        """
        return data.argmax(1).tolist()

### 将模型构件打包到模型存档文件中

您可以使用[`Torch model archiver`](https://github.com/pytorch/serve/tree/master/model-archiver)将所有模型构件打包到一个模型存档文件中。

请注意，预构建的 PyTorch 服务容器需要模型存档文件命名为`model.mar`，因此您需要在`torch-model-archiver`命令中将模型名称设置为`model`。

In [None]:
# Add torch-model-archiver to the PATH
# Add torch-model-archiver to the PATH.
# "~" is expanded here because a literal "~" inside PATH is not resolved by
# most shells; the entry is appended only once so re-running the cell does not
# grow PATH, and an unset PATH no longer becomes the string "None".
_local_bin = os.path.expanduser("~/.local/bin")
_current_path = os.environ.get("PATH", "")
if _local_bin not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{_current_path}{os.pathsep}{_local_bin}" if _current_path else _local_bin
    )

In [None]:
# Package the TorchScript file and the custom handler into a model archive
# inside model_path. The model name is set to "model" so the archive is called
# model.mar; -f overwrites an archive left over from an earlier run.
! torch-model-archiver -f \
  --model-name model \
  --version 1.0  \
  --serialized-file $model_file \
  --handler $hander_file \
  --export-path $model_path

### 将模型工件复制到云存储

接下来，使用 `gsutil` 将模型工件复制到您的云存储存储桶。

In [None]:
# Cloud Storage prefix that Model.upload reads the artifacts from.
MODEL_URI = f"{BUCKET_URI}/model"

# Remove anything left from a previous run (-f: no error when nothing exists),
# copy the local artifact directory up, then list the result.
! gsutil -m rm -r -f $MODEL_URI
! gsutil -m cp -r $model_path $MODEL_URI
! gsutil ls $MODEL_URI

### 上传模型以进行部署

接下来，您将模型工件上传到 `Vertex AI Model Registry`，这将为您的模型创建一个 Vertex AI 模型资源。本教程使用 PyTorch v1.11 容器，但针对您自己的用例，您可以从[PyTorch预构建容器列表](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers#pytorch)中进行选择。

In [None]:
# Prebuilt CPU serving image (PyTorch 1.11, per the image name).
# NOTE(review): the training job above uses the PyTorch 1.13 image while this
# serving image is 1.11 — confirm the exported TorchScript file loads there.
DEPLOY_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-11:latest"

# Register the artifacts under MODEL_URI as a Vertex AI Model resource.
uploaded_model = aiplatform.Model.upload(
    display_name=model_name,
    serving_container_image_uri=DEPLOY_IMAGE_URI,
    artifact_uri=MODEL_URI,
)

### 部署模型进行预测

接下来，部署您的模型进行在线预测。设置变量`DEPLOY_COMPUTE`以配置用于预测的[计算资源](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute)的机器类型。

In [None]:
# Machine type of the serving nodes.
DEPLOY_COMPUTE = "n1-standard-4"

# Deploy the uploaded model for online prediction on CPU only (no accelerator
# is requested); the returned endpoint is used for predict() and cleanup below.
endpoint = uploaded_model.deploy(
    deployed_model_display_name=model_name,
    machine_type=DEPLOY_COMPUTE,
    accelerator_type=None,
    accelerator_count=0,
)

## 进行在线预测

您可以使用MNIST数据集作为在线预测的输入。

In [None]:
# Download test data from PyTorch torchvision dataset
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Create data loaders
test_dataloader = DataLoader(test_data, batch_size=64)

In [None]:
# Take one image as example for prediction
image, _ = test_data[0]
pil_image = transforms.ToPILImage()(image)
plt.imshow(pil_image)
plt.show()

### 获取在线预测

您将编码的输入图像数据发送到端点的`predict`请求，并获得预测结果。

In [None]:
# Serialize the sample image to JPEG bytes in memory.
buffered_image = io.BytesIO()
pil_image.save(buffered_image, format="JPEG")

# Each instance carries the image as a base64 string under the "data" key.
data = {"data": base64.b64encode(buffered_image.getvalue()).decode("utf-8")}
prediction = endpoint.predict(instances=[data])

print(prediction.predictions)

### 进行批量预测（可选）

了解如何从您的PyTorch模型进行[批量预测](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/pytorch_image_classification_with_prebuilt_serving_containers.ipynb)。

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以删除用于本教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的单个资源：

In [None]:
# Delete the deployment endpoint
endpoint.undeploy_all()
endpoint.delete()

# Delete the model from Model Registry
uploaded_model.delete()

# Delete Cloud Storage objects that were created
delete_bucket = True
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI