In [None]:
# Copyright  2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 在Vertex AI上使用PyTorch和Ray开始

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ray_on_vertex_ai/get_started_with_pytorch_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在Colab中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fray_on_vertex_ai%2Fget_started_with_pytorch_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在Colab Enterprise中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/ray_on_vertex_ai/get_started_with_pytorch_rov.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在Vertex AI Workbench中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ray_on_vertex_ai/get_started_with_pytorch_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在GitHub上查看
    </a>
  </td>
</table>

**_注意_**：此笔记本已在以下环境中进行了测试：

* Python 版本 = 3.9

## 概述

本教程演示了如何使用 Ray on Vertex AI SDK 和 Vertex AI SDK 来训练和提供一个 PyTorch 图像分类模型。

了解更多关于 [Ray on Vertex AI 概览](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview)。

### 目标

在本教程中，您将学习如何通过在Vertex AI上利用Ray来高效分发PyTorch图像分类模型的训练过程。此外，您还将学习如何将训练好的模型顺利部署到Vertex AI Endpoint。

本教程使用以下 Google Cloud ML 服务和资源：

- Ray on Vertex AI
- Vertex AI Model Registry
- Vertex AI Prediction

执行的步骤包括：

- 准备训练脚本
- 使用Ray Jobs API提交一个Ray作业
- 从PyTorch下载一个训练好的图像模型
- 创建一个自定义模型处理程序
- 将模型工件打包在模型归档文件中
- 在Vertex AI Model Registry中注册模型
- 在Vertex AI Endpoint中部署模型
- 进行在线预测

### 数据集

本教程使用[CIFAR-10数据集](https://pytorch.org/vision/stable/generated/torchvision.datasets.CIFAR10.html)，包含60000张32x32像素的彩色图片，分为10个类别，每个类别有6000张图片。

### 成本

本教程使用 Google Cloud 的可计费组件：

* Vertex AI
* Cloud Storage

了解 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)，
以及 [Cloud Storage 价格](https://cloud.google.com/storage/pricing)，
并使用 [定价计算器](https://cloud.google.com/products/calculator/)
根据您预期的使用情况生成成本估算。

## 安装

安装以下所需的软件包以执行此笔记本。

In [None]:
# Install the packages
import os

# Outside the automated test harness (IS_TESTING unset) pip installs with --user;
# under IS_TESTING the flag is dropped and pip uses its default install location.
if not os.getenv("IS_TESTING"):
    USER = "--user"
else:
    USER = ""

# Vertex AI SDK with the Ray extra, plus the Ray data/train/tune extras, all pinned.
! pip3 install {USER} google-cloud-aiplatform[ray]==1.56.0 ray[data]==2.9.3 ray[train]==2.9.3 ray[tune]==2.9.3 -q --no-warn-conflicts
# PyTorch stack used locally, plus TorchServe and the model archiver used to build the .mar file.
! pip3 install {USER} torch==2.1.2 torchvision==0.16.2 torchmetrics==1.2.1 torchserve==0.9.0 torch-model-archiver==0.9.0 -q --no-warn-conflicts
# Auth library and etils (epath is used below to build gs:// paths).
! pip3 install {USER} google-auth==2.27.0 etils==1.5.2 -q --no-warn-conflicts

### 仅限使用Colab：取消注释以下单元格以重新启动内核。

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 在开始之前

### 设置您的Google Cloud项目

**无论您使用什么笔记本环境，以下步骤都是必需的。**

1. [选择或创建一个Google Cloud项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建账号时，您将获得$300的免费信用用于计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

4. 如果您在本地运行这个笔记本，您需要安装[Cloud SDK](https://cloud.google.com/sdk)。

#### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 查看支持页面：[找到项目ID](https://support.google.com/googleapi/answer/7014113)。

In [None]:
# Google Cloud project used by vertex_ai.init and the bucket commands below; replace the placeholder.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

#### 区域

您也可以更改 Vertex AI 使用的 `REGION` 变量。了解更多关于 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)。

In [None]:
# Region passed to vertex_ai.init and used as the location of the Cloud Storage bucket.
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您正在参加现场教程会话，您可能会使用共享测试账户或项目。为避免在创建的资源之间发生名称冲突，您为每个实例会话创建一个时间戳，并将时间戳附加到您在本教程中创建的资源名称上。

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYMMDDHHMMSS) appended to resource names to keep them unique.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

### 验证您的 Google Cloud 账户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关说明进行操作。

1. Vertex AI Workbench
* 无需操作，您已通过验证。

2. 本地JupyterLab实例，取消注释并运行：

In [None]:
# ! gcloud auth login

3. Colab，取消注释并运行：

In [None]:
# from google.colab import auth
# auth.authenticate_user()

请查看如何将Cloud Storage权限授予您的服务帐户 https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples。

### 创建一个 Cloud Storage 存储桶

创建一个存储桶，用于存储中间产物，例如数据集。

In [None]:
# Bucket used as the Vertex AI staging bucket and as the root for training logs and the model archive.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储桶。

In [None]:
# Create the bucket in REGION under PROJECT_ID; skip this cell if the bucket already exists.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### 将torch-model-archiver添加到PATH

更新`PATH`环境变量以添加`torch-model-archiver`

In [None]:
import os

# Make the user-level bin directory (presumably where `pip --user` placed torch-model-archiver)
# discoverable on PATH.
# "~" is expanded here because it is not portably expanded when it appears literally inside PATH.
_user_bin = os.path.join(os.path.expanduser("~"), ".local", "bin")
# Treat a missing PATH as empty instead of turning it into the literal string "None".
_path_entries = [entry for entry in os.environ.get("PATH", "").split(os.pathsep) if entry]
# Only append once so that re-running the cell does not keep growing PATH.
if _user_bin not in _path_entries:
    _path_entries.append(_user_bin)
os.environ["PATH"] = os.pathsep.join(_path_entries)

### 在 Vertex AI 上设置一个 Ray 集群

在运行下面的代码之前，请确保在Vertex AI上[安装](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/set-up)Ray，并在Vertex AI上至少[创建](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/create-cluster)一个Ray集群。

In [None]:
import vertex_ray
from google.cloud import aiplatform as vertex_ai
from vertex_ray import Resources

#### 初始化Vertex AI SDK用于Python

为您的项目初始化Vertex AI SDK用于Python。

In [None]:
# Set the project, region and staging bucket on the Vertex AI SDK for the calls that follow.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

#### 定义集群配置

要在 Vertex AI 上配置一个 Ray 集群，您可以使用默认的配置请求，也可以根据需要指定副本数量（节点数）、机器类型、磁盘规格和加速器。

In [None]:
# Head node pool: a single n1-standard-16 machine.
head_node_type = Resources(
    machine_type="n1-standard-16",
    node_count=1,
)

# Worker pools: one pool of two n1-standard-16 machines
# (the job submitted later requests --num-workers=2).
worker_node_types = [
    Resources(
        machine_type="n1-standard-16",
        node_count=2,
    )
]

#### 创建 Ray 集群

使用与 Ray 一起使用的 Vertex AI SDK for Python 版本创建 Ray 集群。

In [None]:
# TIMESTAMP suffix keeps the cluster name unique across tutorial sessions.
cluster_name = f"ray-cluster-{TIMESTAMP}"

In [None]:
# Create the cluster from the node configuration above. The returned value is later passed to
# vertex_ray.get_ray_cluster to look the cluster up.
ray_cluster_name = vertex_ray.create_ray_cluster(
    head_node_type=head_node_type,
    worker_node_types=worker_node_types,
    cluster_name=cluster_name,
)

#### 获取Ray集群

使用Python的Vertex AI SDK来获取Ray集群。

In [None]:
# Fetch the cluster object; its dashboard_address and cluster_resource_name attributes are used
# later for job submission and cleanup.
ray_cluster = vertex_ray.get_ray_cluster(ray_cluster_name)
print("Ray cluster on Vertex AI:", ray_cluster_name)

### 设置教程文件夹

在本教程中使用的文件夹设置。

In [None]:
from pathlib import Path as path

# Local working tree for the tutorial, rooted at the notebook's current working directory.
root_path = path.cwd()
tutorial_path = root_path / "tutorial"

# One sub-folder per concern: sources, downloaded deliverables, build artifacts and test data.
src_path, deliverables_path, build_path, tests_path = (
    tutorial_path / folder_name
    for folder_name in ("src", "deliverables", "build", "tests")
)

# Create every folder (and missing parents); existing folders are left untouched.
for folder in (src_path, deliverables_path, build_path, tests_path):
    folder.mkdir(parents=True, exist_ok=True)

### 导入库

In [None]:
import base64
import io
import os
import random
import shutil
import string
import time

# Ray - Training
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
# General
from etils import epath
from matplotlib import pyplot as plt
from ray.job_submission import JobStatus, JobSubmissionClient
# Serving
from ray.tune import ExperimentAnalysis
from vertex_ray.predict import torch as ray_torch

### 设定变量

In [None]:
# training
# LOGGING_URI is passed to the training job as --logging-dir and EXPERIMENT_NAME as
# --experiment-name, so the run's results are expected under TRAINING_URI.
LOGGING_URI = epath.Path(BUCKET_URI) / "logs"
EXPERIMENT_NAME = "torch_on_rov"
TRAINING_URI = LOGGING_URI / EXPERIMENT_NAME
# Local location of the experiment folder once TRAINING_URI has been copied into deliverables_path.
TRAINING_PATH = deliverables_path / EXPERIMENT_NAME

# serving
DELIVERABLES_PATH = str(deliverables_path)
# Cloud Storage folder that receives the model archive and is registered as the model artifact_uri.
BUILD_URI = str(epath.Path(BUCKET_URI) / "build")
# NOTE(review): this serving image is tagged for PyTorch 1.11 while the notebook pins torch==2.1.2
# for training — confirm the saved state_dict and handler work on this image, or use a matching tag.
DEPLOY_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-11:latest"
MODEL_NAME = "torch_on_rov_cifar10"
# Machine type used when deploying the model to the endpoint.
DEPLOY_COMPUTE = "n1-standard-4"

### 定义辅助程序

In [None]:
def get_id(k=3):
    """Return a random identifier made of ``k`` lowercase ASCII letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    picked_chars = random.choices(alphabet, k=k)
    return "".join(picked_chars)


def plot_image_sample(test_dataset):
    """Show one randomly chosen image from ``test_dataset`` and return it as a PIL image.

    Args:
        test_dataset: Indexable dataset with a length whose items unpack into
            ``(image, label)`` pairs; the image must be accepted by
            ``transforms.ToPILImage`` (the notebook passes CIFAR-10 with ``ToTensor``).

    Returns:
        The PIL image that was displayed.
    """
    chosen_index = random.randrange(0, len(test_dataset))
    # The label is not needed for plotting, only the image.
    sample_image, _label = test_dataset[chosen_index]

    to_pil = transforms.ToPILImage()
    pil_image = to_pil(sample_image)

    plt.imshow(pil_image)
    plt.show()
    return pil_image


def predict_from_image(pil_image, endpoint):
    """Send ``pil_image`` to ``endpoint`` as a base64 JPEG and return the predictions.

    Args:
        pil_image: Object with a PIL-style ``save(fp, format=...)`` method.
        endpoint: Object with a ``predict(instances=...)`` method whose response
            exposes a ``predictions`` attribute.

    Returns:
        The ``predictions`` attribute of the endpoint response.
    """
    # Serialize the image to JPEG bytes in memory, then base64-encode for the JSON payload.
    with io.BytesIO() as jpeg_buffer:
        pil_image.save(jpeg_buffer, format="JPEG")
        encoded_image = base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")

    # Each instance is a dict with the encoded image under the "data" key.
    return endpoint.predict(instances=[{"data": encoded_image}]).predictions

在这个教程中，您将使用Ray在Vertex AI上训练一个自定义图像分类模型。

#### 准备训练应用程序

在开始训练之前，让我们看一下如何组装一个Ray作业来分发您的训练。

Ray Train（本教程安装的是 Ray 2.9.3）使用 `train_loop_per_worker` 函数来执行分布式多工作器训练功能。

在设置数据集和模型之后，首先定义您的单工作器PyTorch训练函数，然后将其转换为以下分布式多工作器训练函数：

1. 使用 `ray.train.torch.prepare_data_loader` 对数据进行包装，并使用 `DistributedSampler` 进行分布式训练。

2. 使用 `ray.train.torch.prepare_model` 函数对模型进行包装，并使用 `DistributedDataParallel` 进行分布式训练。

有了多工作器训练函数之后，您需要定义 `ScalingConfig` 来指定期望的工作器数量，并指示分布式训练过程是否需要 GPUs。

此外，您可以定义一个 `RunConfig` 来指定检查点和同步行为，以及一些额外的训练循环参数。

最后，将所有内容传递给 `TorchTrainer`，Ray 使用它来利用分布式数据并行性（使用 PyTorch 的分布式后端）来分发您的训练。

### 准备训练脚本

文件 `src/task.py` 是执行 Ray 分布式训练作业的 Python 脚本。

In [None]:
# Source of src/task.py, the entry point executed on the Ray cluster.
# It is kept as a string so the notebook can write it to disk and ship it via `working_dir`.
training_script = """
# libraries

# general libraries
import os
import argparse
from etils import epath
import tempfile

# training libraries
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchmetrics.aggregation import MeanMetric
from torchmetrics.classification.accuracy import Accuracy

# ray libraries
import ray
from ray import train
from ray.train import ScalingConfig, RunConfig, CheckpointConfig, Checkpoint
from ray.train.torch import TorchTrainer, TorchCheckpoint


# helpers
def get_args():
    # Parse the command-line options of the training job.
    parser = argparse.ArgumentParser(description='PyTorch CIFAR-10 Training')
    parser.add_argument('--batch-size', dest='batch_size',
                        type=int, default=16, help='Batch size')
    parser.add_argument('--epochs', dest='epochs',
                        type=int, default=10, help='Number of epochs')
    # The learning rate is fractional, so it must be parsed as float (int rejects values like 0.01).
    parser.add_argument('--lr', dest='lr',
                        type=float, default=1e-3, help='Learning rate')
    parser.add_argument('--num-workers', dest='num_workers',
                        type=int, default=1, help='Number of workers')
    parser.add_argument('--use-gpu', dest='use_gpu', action='store_true',
                        default=False, help='Use GPU')
    parser.add_argument('--experiment-name', dest='experiment_name', type=str,
                        default='cifar10-torch', help='Experiment name')
    parser.add_argument('--logging-dir', dest='logging_dir',
                        type=str, default='./logs', help='Logging directory')
    args = parser.parse_args()
    return args


# Create a simple model
class Cifar10Model(nn.Module):
    def __init__(self):
        super(Cifar10Model, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def train_epoch(train_loader, model, loss_fn, optimizer, device):
    # Run one optimization pass over train_loader; returns (mean loss, accuracy) as floats.
    # initiate training
    model.train()
    train_loss = MeanMetric()
    train_accuracy = Accuracy(task="multiclass", num_classes=10)

    # load metrics to device
    train_loss.to(device)
    train_accuracy.to(device)

    # training loop
    for batch, (data, target) in enumerate(train_loader):
        # move data to device
        data = data.to(device)
        target = target.to(device)

        # compute error
        output = model(data)
        loss = loss_fn(output, target)
        train_loss.update(loss)
        train_accuracy.update(output, target)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print results
        if batch % 100 == 0:
            print(f'batch {batch}, loss {loss.item():.4f}')

    # compute loss and metrics
    train_loss = Tensor.numpy(train_loss.compute(), force=True).item()
    train_accuracy = Tensor.numpy(train_accuracy.compute(), force=True).item()

    return train_loss, train_accuracy


def evaluate_epoch(test_loader, model, loss_fn, device):
    # Evaluate the model over test_loader without gradients; returns (mean loss, accuracy).
    # initiate evaluation
    model.eval()
    test_loss = MeanMetric()
    test_accuracy = Accuracy(task="multiclass", num_classes=10)

    # load metrics to device
    test_loss.to(device)
    test_accuracy.to(device)

    # evaluation loop
    for batch, (data, target) in enumerate(test_loader):
        with torch.no_grad():
            # move data to device
            data = data.to(device)
            target = target.to(device)

            # get loss and accuracy
            output = model(data)
            test_loss.update(loss_fn(output, target))
            test_accuracy.update(output, target)

    # compute loss and metrics
    test_loss = Tensor.numpy(test_loss.compute(), force=True).item()
    test_accuracy = Tensor.numpy(test_accuracy.compute(), force=True).item()

    return test_loss, test_accuracy


def train_loop_per_worker(config):
    # Training function executed by every Ray Train worker.
    # set configuration
    batch_size = config["batch_size"]
    epochs = config["epochs"]
    learning_rate = config["learning_rate"]

    # get device
    device = train.torch.get_device() if torch.cuda.is_available() else torch.device("cpu")

    # read dataset
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    train_dataset = datasets.CIFAR10(root="./train",
                                     transform=transform,
                                     train=True, download=True)

    test_dataset = datasets.CIFAR10(root="./test",
                                    transform=transform,
                                    train=False, download=True)

    # create data loaders
    # shuffle the training data; the evaluation loader keeps its sequential order
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size)
    train_loader = train.torch.prepare_data_loader(train_loader)
    test_loader = train.torch.prepare_data_loader(test_loader)

    # create model
    model = Cifar10Model()
    model = train.torch.prepare_model(model)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # train model
    for epoch in range(1, epochs + 1):
        train_loss, train_accuracy = train_epoch(train_loader, model, loss_fn, optimizer, device)
        test_loss, test_accuracy = evaluate_epoch(test_loader, model, loss_fn, device)

        # report metrics and model checkpoint
        train.report(
            metrics={"train_loss": train_loss, "train_accuracy": train_accuracy,
                     "test_loss": test_loss, "test_accuracy": test_accuracy},
            checkpoint=TorchCheckpoint.from_state_dict(model.state_dict())
        )


def main():
    # set configuration
    args = get_args()
    config = vars(args)

    # initialize ray session
    ray.init()

    # train model
    train_loop_config = {"learning_rate": config['lr'], "batch_size": config['batch_size'],
                         "epochs": config['epochs']}
    scaling_config = ScalingConfig(num_workers=config['num_workers'], use_gpu=config['use_gpu'])
    run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=1),
                           storage_path=config['logging_dir'],
                           name=config['experiment_name'])

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config=train_loop_config,
        run_config=run_config,
        scaling_config=scaling_config
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")


if __name__ == "__main__":
    main()
"""

# The context manager closes the file on exit, so no explicit close() is needed.
with open(src_path / "task.py", "w") as f:
    f.write(training_script)

### 准备`requirements`文件

`requirements.txt`文件包含你的Ray应用程序运行所需的依赖项。

In [None]:
# Pinned dependencies that the Ray job installs through its runtime environment.
# The ray/torch pins match the versions installed in the notebook at the top.
requirements = """
importlib_resources==6.1.1
etils==1.5.2
ray[data]==2.9.3
ray[train]==2.9.3
ray[tune]==2.9.3
torch==2.1.2
torchvision==0.16.2
torchmetrics==1.2.1
"""

# The context manager closes the file on exit, so no explicit close() is needed.
with open(tutorial_path / "requirements.txt", "w") as f:
    f.write(requirements)

### 使用Ray Jobs API提交一个Ray作业

使用Ray Jobs API将脚本提交到Vertex AI上的Ray集群，并使用公共Ray仪表板地址。

需要强调的是，如果您更喜欢以编程方式提交作业，Ray Jobs API是首选选项。如果您更喜欢交互式的Python开发环境，也可以使用Ray on Vertex AI SDK。

初始化用于提交作业的客户端。

In [None]:
# Job submission client pointed at the cluster's dashboard address through the vertex_ray:// scheme.
client = JobSubmissionClient(address=f"vertex_ray://{ray_cluster.dashboard_address}")

提交作业。

In [None]:
# Short random suffix so repeated submissions in the same session get distinct submission ids.
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of the kernel
# session; consider renaming it once it is confirmed that no other cell relies on this name.
id = get_id()

# Submit task.py with two workers; results are written under LOGGING_URI/EXPERIMENT_NAME.
job_id = client.submit_job(
    submission_id=f"ray-job-{TIMESTAMP}-{id}",
    entrypoint=f"python3 task.py --experiment-name={EXPERIMENT_NAME} --num-workers=2 --logging-dir={LOGGING_URI}",
    runtime_env={
        # Path to the requirements file written earlier.
        "pip": {"packages": str(tutorial_path / "requirements.txt")},
        # Folder containing task.py.
        "working_dir": str(src_path),
    },
)

检查作业状态。

In [None]:
# Poll the job once a minute until it reaches a terminal state.
# STOPPED is terminal as well: without handling it, a stopped job would keep this loop
# spinning forever.
while True:
    job_status = client.get_job_status(job_id)
    if job_status == JobStatus.SUCCEEDED:
        print("Job succeeded!")
        break
    if job_status == JobStatus.FAILED:
        print("Job failed!")
        break
    if job_status == JobStatus.STOPPED:
        print("Job was stopped!")
        break
    # Any other status means the job has not finished yet; wait before asking again.
    print("Job is running...")
    time.sleep(60)

### 检查模型产物

当Ray训练作业完成后，在Cloud Storage位置中检查模型产物。

In [None]:
# List what the training job wrote under the experiment folder in Cloud Storage.
! gsutil ls -l {TRAINING_URI}

## 提供 PyTorch 模型

您可以通过以下方式在 Vertex AI 上使用 TorchServe 来提供 PyTorch 模型：

1. 下载 Ray 训练检查点。
2. 从 Ray TorchCheckpoint 中获取 PyTorch 模型。
3. 使用 Torch Model Archiver 工具，封装训练好的模型工件，包括模型工件、模型模块和自定义处理程序，创建一个存档文件。
4. 在 Vertex AI Model Registry 中注册模型。
5. 将模型部署到 Vertex AI 端点以进行预测。

### 下载Ray训练检查点

下载Ray训练作业的所有结果检查点。

In [None]:
# Recursively copy the experiment folder into the local deliverables folder
# (TRAINING_PATH points at the resulting local copy).
! gsutil -q cp -r {TRAINING_URI} {DELIVERABLES_PATH}

### 获取最佳训练检查点

使用`ExperimentAnalysis`来根据相关指标和模式检索最佳检查点。

In [None]:
# Load the downloaded experiment and pick the trial, then the checkpoint, with the highest
# reported test_accuracy.
experiment_analysis = ExperimentAnalysis(TRAINING_PATH)
# NOTE(review): despite its name, `log_path` holds the value returned by get_best_trial
# (presumably a trial object rather than a path) — confirm before reusing it as a path.
log_path = experiment_analysis.get_best_trial(metric="test_accuracy", mode="max")
best_checkpoint = experiment_analysis.get_best_checkpoint(
    log_path, metric="test_accuracy", mode="max"
)

#### 从Ray TorchCheckpoint获取PyTorch模型

将TorchCheckpoint转换为PyTorch模型。

In [None]:
class Cifar10Model(nn.Module):
    """Small convolutional classifier with 10 output classes.

    The layer names and shapes match the model defined in the training script so that a
    state_dict saved by the training job can be loaded into this class. With the layer
    sizes below, ``fc1`` expects 16 * 5 * 5 features, which corresponds to 3-channel
    32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as in the training script.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        # Two conv -> ReLU -> max-pool stages.
        for conv_layer in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv_layer(x)))
        # Flatten everything except the batch dimension.
        hidden = torch.flatten(x, start_dim=1)
        # Two hidden dense layers with ReLU, then the output layer without activation.
        for dense_layer in (self.fc1, self.fc2):
            hidden = F.relu(dense_layer(hidden))
        return self.fc3(hidden)


# The checkpoint was saved from a state_dict, so an instance of the architecture is supplied
# for the weights to be loaded into.
model_definition = Cifar10Model()
torch_model = ray_torch.get_pytorch_model_from(best_checkpoint, model=model_definition)

### 构建PyTorch模型存档（.mar）文件

TorchServe允许您通过将所有模型工件打包到单个模型存档文件中来提供Torch模型。 在这种情况下，需要以下信息来创建一个独立的模型存档：

1. 序列化文件
2. 模型文件
3. 处理程序

#### 保存模型

`model.pt` 包含模型的 state_dict。

In [None]:
# Save only the weights (state_dict); build.sh passes model.pt to the archiver as --serialized-file.
torch.save(torch_model.state_dict(), build_path / "model.pt")

#### 创建`model`模块

`model.py`文件应包含模型架构。

In [None]:
# Source of build/model.py: the model architecture packaged into the archive through --model-file.
# It must stay identical to the architecture used for training so the saved state_dict loads.
model_script = """
import torch
import torch.nn as nn
import torch.nn.functional as F


class Cifar10Model(nn.Module):
    def __init__(self):
        super(Cifar10Model, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
"""

# The context manager closes the file on exit, so no explicit close() is needed.
with open(build_path / "model.py", "w") as model_file:
    model_file.write(model_script)

#### 创建 `custom_handler` 模块

`custom_handler.py` 文件通过扩展 TorchServe 内置的 `ImageClassifier` 处理程序来实现自定义的 TorchServe 推理逻辑。

In [None]:
# Source of build/custom_handler.py, the TorchServe handler packaged through --handler.
# It reuses the ImageClassifier base class, applies the same ToTensor + Normalize transform as
# the training script, and maps the arg-max class index to a CIFAR-10 label name.
custom_handler_script = '''
# Based on https://github.com/pytorch/serve/blob/master/examples/image_classifier/mnist/mnist_handler.py

from torchvision import transforms
from ts.torch_handler.image_classifier import ImageClassifier
from torch.profiler import ProfilerActivity


class Cifar10Classifier(ImageClassifier):
    """
    Cifar10Classifier handler class. This handler extends ImageClassifier class
    """

    label_names = [
        "plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"
    ]

    image_processing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    def __init__(self):
        super(Cifar10Classifier, self).__init__()
        self.profiler_args = {
            "activities": [ProfilerActivity.CPU],
            "record_shapes": True,
        }

    def postprocess(self, data):
        """
        Post-process function to convert the predicted class id to label
        """
        predictions = data.argmax(1).tolist()
        return [self.label_names[pred] for pred in predictions]

'''

# The context manager closes the file on exit, so no explicit close() is needed.
with open(build_path / "custom_handler.py", "w") as custom_handler_file:
    custom_handler_file.write(custom_handler_script)

#### 创建模型归档文件

使用Torch模型归档工具将模型产品打包到一个模型归档文件（.mar）中。

In [None]:
# Source of build/build.sh, which packages model.py, model.pt and custom_handler.py into
# cifar10.mar in the current directory (-f overwrites an existing archive).
# A raw string keeps the shell line continuations in the written file; in a regular string
# Python itself would swallow each backslash-newline pair.
# The shebang makes the script directly executable, and `set -e` makes it exit with an error
# as soon as the archiver fails.
build_script = r"""#!/bin/bash
set -e

torch-model-archiver -f --model-name cifar10 \
    --version 1.0 \
    --model-file model.py \
    --serialized-file model.pt \
    --handler custom_handler.py \
    --export-path .
"""

# The context manager closes the file on exit, so no explicit close() is needed.
with open(build_path / "build.sh", "w") as build_file:
    build_file.write(build_script)

In [None]:
# Run the build script from inside the build folder, because it refers to its inputs by relative name.
! cd {str(build_path)} && chmod +x ./build.sh && ./build.sh

#### 将 `mar` 文件上传到存储桶

将 .mar 文件存储到云存储桶中。

In [None]:
# Upload the archive under BUILD_URI, renaming it to model.mar
# (presumably the file name the prebuilt serving container looks for — confirm).
!gsutil cp {str(build_path)}/cifar10.mar {BUILD_URI}/model.mar

### 在Vertex AI Model Registry中注册模型

在Vertex AI模型注册表中将模型注册为模型资源。

In [None]:
# Register the model: artifact_uri is the folder holding model.mar and the serving container
# is the prebuilt PyTorch CPU image configured above.
registered_model = vertex_ai.Model.upload(
    display_name=MODEL_NAME,
    serving_container_image_uri=DEPLOY_IMAGE_URI,
    artifact_uri=BUILD_URI,
)

### 部署模型以进行预测

创建一个 Vertex AI 终端节点，并部署注册的模型以进行预测。

In [None]:
# Deploy the registered model on CPU-only machines (no accelerator requested); the returned
# endpoint object is used below for online predictions and cleanup.
endpoint = registered_model.deploy(
    deployed_model_display_name=MODEL_NAME,
    machine_type=DEPLOY_COMPUTE,
    accelerator_type=None,
    accelerator_count=0,
)

### 进行在线预测

从`CIFAR10`数据集中抽样一张图片以获取在线预测。

In [None]:
# Download the CIFAR-10 test split into the tutorial's tests folder.
# Only ToTensor is applied here: normalization is done by the serving handler instead.
test_dataset = datasets.CIFAR10(
    root=tests_path / "data",
    transform=transforms.ToTensor(),
    train=False,
    download=True,
)

将采样图像可视化。

In [None]:
# Display a random test image and keep it for the prediction request below.
pil_image = plot_image_sample(test_dataset)

发送一个预测请求并获取预测的类别。

In [None]:
# Send the sampled image to the endpoint and print each returned prediction.
predictions = predict_from_image(pil_image, endpoint)

for pred in predictions:
    print("Predicted class:", pred)

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以[删除用于本教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在此教程中创建的各个资源。

In [None]:
import os

# Opt-in switches: set a flag to True to delete the corresponding resource.
# Every deletion also runs when the IS_TESTING environment variable is set.
delete_endpoint = False
delete_model = False
delete_ray_cluster = False
delete_bucket = False
delete_tutorial = False

# Delete endpoint resource
# force=True is passed so deployed models do not block the deletion
# (presumably they are undeployed first — confirm against the SDK docs).
if delete_endpoint or os.getenv("IS_TESTING"):
    endpoint.delete(force=True)

# Delete model resource
if delete_model or os.getenv("IS_TESTING"):
    registered_model.delete()

# Delete ray on vertex cluster
if delete_ray_cluster or os.getenv("IS_TESTING"):
    vertex_ray.delete_ray_cluster(ray_cluster.cluster_resource_name)

# Delete tutorial folder
# Removes the local tutorial tree (src, deliverables, build and tests folders).
if delete_tutorial or os.getenv("IS_TESTING"):
    shutil.rmtree(tutorial_path)

# Delete Cloud Storage objects that were created
# Removes BUCKET_URI recursively; $BUCKET_URI is expanded by IPython from the Python variable.
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -q -m rm -r $BUCKET_URI