In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI模型花园 - TIMM

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在Vertex AI工作台中打开
    </a>
    (建议使用Python-3 CPU笔记本)
  </td>
</table>

## 概述

本笔记本演示如何使用[timm](https://github.com/rwightman/pytorch-image-models)库在本地运行推理、微调PyTorch [timm模型](https://github.com/huggingface/pytorch-image-models#models)，以及在[Vertex AI](https://cloud.google.com/vertex-ai)上部署模型。

### 目标

- 设置环境。
- 使用timm库在本地运行推理。
- 在Vertex AI上创建一个自定义训练作业来训练或微调模型。
- 在Vertex AI上部署模型进行在线预测。

### 成本

本教程使用Google Cloud的收费组件：

* Vertex AI
* 云存储

了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[云存储定价](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)根据您的预期使用量生成成本估算。

## 设置环境

### 设置云项目

1. [选择或创建一个谷歌云项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建帐户时，您将获得300美元的免费信用额用于计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[Cloud Storage定价](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)基于您的预期使用量生成成本估算。

3. [启用Artifact Registry](https://cloud.google.com/artifact-registry/docs/enable-service)并[创建存储Docker镜像的存储库](https://cloud.google.com/artifact-registry/docs/repositories/create-repos)。

4. [创建一个用于存储实验输出的云存储桶](https://cloud.google.com/storage/docs/creating-buckets)。

5. [启用Vertex AI API和Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component)。

6. [创建一个服务账号](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console)，并为其分配`Vertex AI用户`和`存储对象管理员`角色，用于将微调后的模型部署到Vertex AI端点。

### 配置所需库

强烈建议在 [Vertex AI 工作台](https://cloud.google.com/vertex-ai-workbench) 上运行此笔记本，无需手动安装任何额外的库。

如果您在本地运行此笔记本，您需要安装 [Cloud SDK](https://cloud.google.com/sdk) 和 [gsutil](https://cloud.google.com/storage/docs/gsutil_install)。

### 安装库

In [None]:
! pip3 install timm

以下单元格仅适用于Colab；如果您使用的是Workbench，请跳过这一部分。

In [None]:
# Colab-only setup: the branch runs only when the string form of the active
# IPython shell mentions "google.colab".
if "google.colab" in str(get_ipython()):
    # Upgrade the Vertex AI SDK package used by the job and deployment cells.
    ! pip3 install --upgrade google-cloud-aiplatform

    # Automatically restart the kernel after the install.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

    # Authenticate the Colab user.
    # NOTE(review): these lines come after the kernel shutdown request above;
    # confirm they still execute in Colab, or re-run this cell after the restart.
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

### 设置环境变量

此笔记本支持位于 https://huggingface.co/docs/timm/models 中的模型。

您还可以在本地运行
`python -c "from timm import list_models; print(list_models(pretrained=True))"`
以查看所有预训练模型。

以下模型已手动验证可与此笔记本一起使用：

* vit_tiny_patch16_224
* beit_base_patch16_224
* deit3_small_patch16_224
* efficientnet_b2
* mobilenetv2_100
* resnet50
* resnest50d
* convnext_base
* cspdarknet53
* inception_v4

In [None]:
# Notebook-wide settings. Fill in the empty values before running later cells.

# The Google Cloud project ID.
PROJECT_ID = ""  # @param {type:"string"}
# The region for running jobs.
REGION = "us-central1"  # @param {type:"string"}

# The timm model to train and serve. Pick one from the verified model list above.
# A ViT model is used as the example.
MODEL_NAME = "vit_tiny_patch16_224"  # @param {type:"string"}

# The Cloud Storage bucket name, without the gs:// prefix, for training outputs.
# For example: test_bucket
GCS_BUCKET = ""  # @param {type:"string"}

# The service account used when deploying the fine-tuned model. It looks like:
# '<account_name>@<project>.iam.gserviceaccount.com'
# Follow step 6 of the project setup above to create this account.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

## 运行本地推理

本部分使用上面选择的模型在图像上运行本地推理。

### 导入库

In [None]:
import urllib

import timm
import torch
from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

加载一个预先训练好的模型

In [None]:
# Build the architecture selected in MODEL_NAME with pretrained weights, then
# switch it to evaluation mode before running local inference.
model = timm.create_model(model_name=MODEL_NAME, pretrained=True)
model.eval()

### 载入并预处理图像

In [None]:
# Resolve the data config for the loaded model and build a transform from it.
config = resolve_data_config({}, model=model)
transform = create_transform(**config)

# The example downloads a test image. You can upload and use your own images
# by changing IMAGE_FILENAME.
! wget https://github.com/pytorch/hub/raw/master/images/dog.jpg -O test.jpg
IMAGE_FILENAME = "test.jpg"  # @param {type:"string"}

# You can also copy over images stored in a GCS bucket with the line below.
# ! gsutil cp "gs://path/to/image" "test.jpg"

# Open the image as RGB, apply the transform, and add a leading batch dimension.
img = Image.open(IMAGE_FILENAME).convert("RGB")
tensor = transform(img).unsqueeze(0)  # transform and add batch dimension
# Show the image in the cell output.
display(img)

获取模型预测结果

In [None]:
# Run the forward pass without gradient tracking, then convert the outputs of
# the first (and only) batch item into probabilities.
with torch.no_grad():
    out = model(tensor)
probabilities = torch.softmax(out[0], dim=0)
print(probabilities.shape)

获取前5个预测类别的名称

In [None]:
# Download the ImageNet class list and read it into `categories`, one entry
# per line of the file.
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

url, filename = (
    "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt",
    "imagenet_classes.txt",
)
urllib.request.urlretrieve(url, filename)
# Read back the file that was just written, via `filename` rather than a
# repeated literal, so the two cannot drift apart.
with open(filename) as f:
    categories = [line.strip() for line in f]

### 打印图像的前5个预测类别

In [None]:
# Take the five highest probabilities with their class indices and print one
# "<class name> <probability>" line per prediction, for example:
#   Samoyed 0.6425196528434753
#   Pomeranian 0.04062102362513542
top5_prob, top5_catid = torch.topk(probabilities, 5)
for prob, catid in zip(top5_prob, top5_catid):
    print(categories[catid], prob.item())

## 运行训练作业

本部分在Vertex AI上运行常规的训练作业或超参数调整作业。

在创建训练作业之前，您需要准备用于训练和评估的数据集。

例如，您可以使用存储在云存储桶上的[ImageNet-1K](https://huggingface.co/datasets/imagenet-1k)作为输入数据集。

In [None]:
# URI of the prebuilt training container image used by the job cells below.
TRAIN_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-timm-train"
)

# The path to the data directory on Cloud Storage, without the gs:// prefix.
# In the form of: <bucket-name>/path-to-data
# The job cells below prefix it with /gcs/ to form the data directory argument.
GCS_DATA_DIR = ""  # @param {type:"string"}

在 Vertex AI 上创建一个训练作业。如果您想创建超参数调整作业，可以直接跳到下一节。

In [None]:
from google.cloud import aiplatform

# Point the SDK at the project, region and staging bucket chosen above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

# Data and output locations, expressed as /gcs/ paths.
data_dir = f"/gcs/{GCS_DATA_DIR}"
output_dir = f"/gcs/{GCS_BUCKET}/timm"

# Worker pool: a single node with multiple GPUs.
machine_type = "n1-highmem-32"
num_nodes = 1
gpu_type = "NVIDIA_TESLA_P100"  # @param {type:"string"}
num_gpus = 4  # @param {type:"integer"}

# Model-specific settings.
job_name = f"pytorch-{MODEL_NAME}"
batch_size = 32
epochs = 2

# Arguments handed to the training container: launcher options first, then
# train.py with its data directory and options.
# NOTE(review): the first three flags look like torchrun-style launcher options;
# confirm against the container's entrypoint.
train_args = [
    "--standalone",
    f"--nnodes={num_nodes}",
    f"--nproc_per_node={num_gpus}",
    "train.py",
    data_dir,
    f"--model={MODEL_NAME}",
    "--pretrained",
    f"--output={output_dir}",
    f"--batch-size={batch_size}",
    f"--epochs={epochs}",
]

job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name, container_uri=TRAIN_DOCKER_URI
)
# Note: this rebinds the notebook-level name `model` to the value returned by
# job.run().
model = job.run(
    args=train_args,
    replica_count=num_nodes,
    machine_type=machine_type,
    accelerator_type=gpu_type,
    accelerator_count=num_gpus,
)

在Vertex AI上创建一个超参数调整作业

您可以使用[超参数调整](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview)作业来找到您的超参数的最佳配置。

如果您已在上一节训练了一个模型并且不想调整超参数，可以跳过此部分。

In [None]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Point the SDK at the project, region and staging bucket chosen above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

# Data and output locations, expressed as /gcs/ paths.
data_dir = f"/gcs/{GCS_DATA_DIR}"
output_dir = f"/gcs/{GCS_BUCKET}/timm"

# Model-specific settings.
job_name = f"pytorch-hp-{MODEL_NAME}"
batch_size = 32
epochs = 2

# Worker pool: a single node with multiple GPUs.
machine_type = "n1-highmem-16"
num_nodes = 1
gpu_type = "NVIDIA_TESLA_V100"  # @param {type:"string"}
num_gpus = 2  # @param {type:"integer"}

# The worker pool spec is assembled from its two parts: the machine shape and
# the container invocation.
machine_spec = {
    "machine_type": machine_type,
    "accelerator_type": gpu_type,
    "accelerator_count": num_gpus,
}
container_spec = {
    "image_uri": TRAIN_DOCKER_URI,
    "args": [
        "--standalone",
        f"--nnodes={num_nodes}",
        f"--nproc_per_node={num_gpus}",
        "train.py",
        data_dir,
        f"--model={MODEL_NAME}",
        "--pretrained",
        f"--output={output_dir}",
        f"--batch-size={batch_size}",
        f"--epochs={epochs}",
    ],
}
worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": num_nodes,
        "container_spec": container_spec,
    }
]

# Tuning setup: maximize "top1_accuracy" while searching "lr" on a log scale
# between 0.001 and 0.05, with two trials run in parallel.
metric_spec = {"top1_accuracy": "maximize"}
parameter_spec = {
    "lr": hpt.DoubleParameterSpec(min=0.001, max=0.05, scale="log"),
}
max_trial_count = 2
parallel_trial_count = 2

# Wrap the worker pool in a custom job, hand it to the tuning job, and launch.
training_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)
hp_job = aiplatform.HyperparameterTuningJob(
    display_name=job_name,
    custom_job=training_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=max_trial_count,
    parallel_trial_count=parallel_trial_count,
)
hp_job.run()

## 为在线预测部署模型

该部分将模型上传到 Model Registry 并部署到 Endpoint 资源上。

模型部署步骤将需要大约 15 分钟才能完成。

In [None]:
# URI of the prebuilt serving container image used when uploading the model.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/pytorch-timm-serve"
# The port number used by torchserve traffic; passed as the serving container port.
SERVE_PORT = 7080
# The path to the model checkpoint file, including the gs:// prefix.
# Replace the placeholder with the location of your own checkpoint.
MODEL_PT_PATH = "gs://path_to_model_best.pth.tar"  # @param {type:"string"}
# [Optional] The path to index_to_name.json, including the gs:// prefix.
INDEX_TO_NAME_FILE = "gs://path_to_index_to_name.json"  # @param {type:"string"}

### 在 Vertex AI 上上传和部署模型

In [None]:
# Upload model.
serving_env = {
    "MODEL_ID": "timm-mobilenetv2-100",
    "MODEL_NAME": MODEL_NAME,
    "MODEL_PT_PATH": MODEL_PT_PATH,
    "INDEX_TO_NAME_FILE": INDEX_TO_NAME_FILE,
    "DEPLOY_SOURCE": "notebook",
}
model = aiplatform.Model.upload(
    display_name=MODEL_NAME,
    serving_container_image_uri=SERVE_DOCKER_URI,
    serving_container_ports=[SERVE_PORT],
    serving_container_predict_route="/predictions/timm_serving",
    serving_container_health_route="/ping",
    serving_container_environment_variables=serving_env,
)
# Or reuse a pre-uploaded model.
# model = aiplatform.Model('projects/123456789/locations/us-central1/models/123456789@1')

# Create an endpoint.
endpoint = aiplatform.Endpoint.create(display_name="pytorch-timm-endpoint")
# Or reuse a pre-created endpoint.
# endpoint = aiplatform.Endpoint('projects/123456789/locations/us-central1/endpoints/123456789')

# Deploy model to endpoint.
model.deploy(
    endpoint=endpoint,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    traffic_percentage=100,
    service_account=SERVICE_ACCOUNT,
)

您可以在[模型注册表](https://console.cloud.google.com/vertex-ai/models)中管理您上传的模型，并在[端点](https://console.cloud.google.com/vertex-ai/endpoints)中管理您的端点。

### 在线预测测试

您现在将测试部署的端点。请准备一张要预测的图片。

In [None]:
import base64

# You can get the deployed endpoint object by its resource name returned by Endpoint.create(). For example:
# endpoint = aiplatform.Endpoint('projects/816369962409/locations/us-central1/endpoints/8809168414485512192')

# Please upload an image and enter its filename below.
IMAGE_FILENAME = "test.jpg"  # @param {type:"string"}

# Alternatively, uncomment the following line to download a cat image for demonstration.
# ! wget http://images.cocodataset.org/val2017/000000039769.jpg -O test.jpg

# Read the raw image bytes, then base64-encode them into a UTF-8 string.
with open(IMAGE_FILENAME, "rb") as image_file:
    image_bytes = image_file.read()
image_b64 = base64.b64encode(image_bytes).decode("utf-8")

# Build a single-instance request carrying the encoded image under
# "data" -> "b64", send it to the endpoint, and print the response.
instances = [{"data": {"b64": image_b64}}]
prediction = endpoint.predict(instances=instances)
print(prediction)

清理资源

In [None]:
# Undeploy every model from the endpoint, then delete the endpoint itself and
# the uploaded model so that neither is left behind after the tutorial.
endpoint.undeploy_all()
endpoint.delete()
model.delete()