In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI 模型花园 MediaPipe 文本分类

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_mediapipe_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中运行
    </a>
  </td>

  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_mediapipe_text_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在 GitHub 上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_mediapipe_text_classification.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在 Vertex AI 工作台中打开
    </a>
  </td>
</table>

注意：这个笔记本已在以下环境中进行测试：

* Python版本= 3.9

注意：在Colab中链接的检查点和数据集不是由谷歌拥有或分发的，而是由第三方提供。在使用检查点和数据之前，请先查看第三方提供的条款和条件。

## 概述

本笔记本演示了如何使用[MediaPipe Model Maker](https://developers.google.com/mediapipe/solutions/model_maker)在Vertex AI Model Garden中训练一个设备上的文本分类模型。

### 目标

* 训练新模型
  * 将输入数据转换为训练格式
  * 创建[自定义作业](https://cloud.google.com/vertex-ai/docs/training/create-custom-job)来训练新模型
  * 导出模型

* 清理资源

### 成本

本教程使用 Google Cloud 的按量计费组件:

* Vertex AI
* Cloud Storage

了解[Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)和[Cloud Storage 价格](https://cloud.google.com/storage/pricing)，并使用[Pricing 计算器](https://cloud.google.com/products/calculator/)根据您的预期用量生成成本估算。

## 在开始之前

### 仅限在Colab上运行
运行以下命令来安装依赖库，并在Colab上认证Google Cloud。

In [None]:
# Upgrade pip itself before installing anything else.
! pip3 install --upgrade pip

import sys

# Everything below runs only when the notebook is hosted in Colab
# (detected by the `google.colab` module being loaded).
if "google.colab" in sys.modules:
    # Install / upgrade the Vertex AI SDK used by the rest of the notebook.
    ! pip3 install --upgrade google-cloud-aiplatform

    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

    # NOTE(review): these statements follow the kernel restart request in the
    # same cell — confirm that authentication still takes effect, or re-run
    # them after the restart.
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

#### 设置项目 ID

**如果您不知道您的项目ID**，请查看支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Google Cloud project ID used throughout the notebook; replace the
# placeholder value before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id as the active project in the gcloud configuration.
! gcloud config set project {PROJECT_ID}

#### 区域

您还可以更改 Vertex AI 使用的 `REGION` 变量。了解有关[Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)的更多信息。

In [None]:
# Location passed to the Vertex AI SDK and used when creating the bucket.
REGION = "us-central1"  # @param {type: "string"}

# Text before the first "-" in REGION (e.g. "us" for "us-central1"); it is
# used to build the training container image URI.
REGION_PREFIX = REGION.partition("-")[0]

# Fail fast when the region is outside the three supported multi-regions.
assert REGION_PREFIX in {
    "asia",
    "europe",
    "us",
}, f'{REGION} is not supported. It must be prefixed by "us", "asia", or "europe".'

#### 创建 Cloud Storage 存储桶

创建一个存储桶，用于存储中间产物，如数据集。

In [None]:
# gs:// URI of the Cloud Storage bucket that holds intermediate artifacts.
# The default embeds PROJECT_ID; edit it to point at your own bucket.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**仅当您的存储桶尚不存在时**：运行以下单元格来创建您的 Cloud Storage 存储桶。

In [None]:
# Create the bucket BUCKET_URI in location REGION under project PROJECT_ID.
# Skip this cell if the bucket already exists.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### 导入库

In [None]:
import json
import os
from datetime import datetime

from google.cloud import aiplatform

### 初始化Python的Vertex AI SDK

为您的项目初始化Python的Vertex AI SDK。

In [None]:
# Timestamp that makes the staging folder (and, later, the job name) unique
# per notebook run.
now = f"{datetime.now():%Y%m%d-%H%M%S}"

# Per-run working folder inside the user's bucket.
STAGING_BUCKET = os.path.join(BUCKET_URI, f"temp/{now}")

# Exported model location under the staging folder.
EXPORTED_MODEL_OUTPUT_DIRECTORY = os.path.join(STAGING_BUCKET, "model")
EXPORTED_MODEL_OUTPUT_FILE = os.path.join(
    EXPORTED_MODEL_OUTPUT_DIRECTORY, "model.tflite"
)

# Evaluation result location under the staging folder.
EVALUATION_RESULT_OUTPUT_DIRECTORY = os.path.join(STAGING_BUCKET, "evaluation")
EVALUATION_RESULT_OUTPUT_FILE = os.path.join(
    EVALUATION_RESULT_OUTPUT_DIRECTORY, "evaluation.json"
)

# Point the Vertex AI SDK at the project, region and staging folder.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### 定义训练机器规格

In [None]:
# Display name of the custom job; the run timestamp keeps it unique.
TRAINING_JOB_DISPLAY_NAME = f"mediapipe_text_classifier_{now}"

# Training image, pulled from the Artifact Registry host that matches the
# selected region prefix.
TRAINING_CONTAINER = f"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/mediapipe-train"

# Hardware requested for the single training replica.
TRAINING_MACHINE_TYPE = "n1-highmem-16"
TRAINING_ACCELERATOR_TYPE = "NVIDIA_TESLA_V100"
TRAINING_ACCELERATOR_COUNT = 2

## 训练您的定制模型

### 获取数据集

以下代码块使用[SST-2](https://nlp.stanford.edu/sentiment/index.html)（斯坦福情感树库）数据集，其中包含67,349条用于训练的电影评论和872条用于测试的电影评论。该数据集有两类：正面和负面的电影评论。正面评论标记为1，负面评论标记为0。

SST-2数据集保存为TSV文件。TSV和CSV格式之间唯一的区别是TSV使用制表符`\t`作为分隔符，而CSV使用逗号`,`。

In [None]:
# Cloud Storage locations of the training and validation splits.
training_data_path = (
    "gs://mediapipe-tasks/text_classifier/SST-2/train.tsv"  # @param {type:"string"}
)
validation_data_path = (
    "gs://mediapipe-tasks/text_classifier/SST-2/dev.tsv"  # @param {type:"string"}
)

# The delimiter used in the dataset (a tab, because SST-2 is stored as TSV).
delimiter = "\t"  # @param {type:"string"}

# Character used to quote fields that contain special characters
# like the `delimiter`. It must differ from `delimiter`: a character cannot
# both separate fields and quote them, so the conventional double quote is
# used here.
quotechar = '"'  # @param {type:"string"}

# Sequence of keys for the CSV columns (represented as a comma
# separated list). If empty, the first row of the CSV file is used
# as the keys
fieldnames = ""  # @param {type:"string"}

# Column name for the input text.
text_column = "sentence"  # @param {type:"string"}

# Column name for the labels.
label_column = "label"  # @param {type:"string"}

### 设置微调选项

您可以在不同的模型架构之间进行选择，以进一步定制您的训练：

* 平均词嵌入模型
* BERT分类器

要设置模型架构和其他训练参数，请调整以下数值：

In [None]:
# Fine-tuning options. Each value below is rendered as an editable Colab form
# field through its trailing `# @param` annotation.

# Model architecture to train.
model_architecture = (
    "average_word_embedding"  # @param ["average_word_embedding", "mobilebert"]
)

# The learning rate to use for gradient descent-based
# optimizers. Defaults to 3e-5 for the BERT-based classifier
# and 0 for the average word-embedding classifier because
# it does not need such an optimizer.
learning_rate: float = 0.0  # @param {type:"number"}

# Batch size for training. Defaults to 32 for the average
# word-embedding classifier and 48 for the BERT-based
# classifier.
# NOTE(review): 48 is the documented BERT default, while the architecture
# selected above is average_word_embedding (documented default 32) — confirm
# this is intended.
batch_size: int = 48  # @param {type:"number"}

# Number of training iterations over the dataset. Defaults
# to 10 for the average word-embedding classifier and 3
# for the BERT-based classifier.
epochs: int = 10  # @param {type:"slider", min:0, max:100, step:1}

# An integer that indicates the number of training steps per
# epoch. If set to 0, the training pipeline calculates the
# default steps per epoch as the training dataset size
# divided by batch size.
steps_per_epoch: int = 0  # @param {type:"number"}

# Controls whether the dataset is shuffled before training.
shuffle: bool = False  # @param {type:"boolean"}

# Length of the sequence to feed into the model.
seq_len: int = 256  # @param {type:"number"}

# Whether to convert all uppercase characters to lowercase
# during preprocessing.
do_lower_case: bool = True  # @param {type:"boolean"}

# The rate for dropout.
dropout_rate: float = 0.2  # @param {type:"number"}

# Dimension of the word embedding. Only used for the Average Word
# Embedding Model.
wordvec_dim: int = 16  # @param {type:"number"}

# Number of words to generate the vocabulary from data.
# Only used for the Average Word Embedding Model.
vocab_size: int = 10000  # @param {type:"number"}

### 运行微调
准备好你的训练数据集和微调选项后，你就可以开始微调过程了。这个过程需要大量资源，根据模型架构和你可用的计算资源，可能需要几分钟到几个小时不等。在使用GPU处理的Vertex AI上，下面示例的微调需要花费2-3分钟来训练一个在SST-2数据集上的平均词嵌入模型。

要开始微调过程，请使用以下代码：

In [None]:
# Output locations, passed to the training container as
# --model_export_path and --evaluation_result_path.
model_export_path = EXPORTED_MODEL_OUTPUT_DIRECTORY
evaluation_result_path = EVALUATION_RESULT_OUTPUT_DIRECTORY

# CSV/TSV parsing settings, serialized to JSON for --preprocessing_params.
preprocessing_params = {
    "text_column": text_column,
    "label_column": label_column,
    "delimiter": delimiter,
    "quotechar": quotechar,
}
# Column keys are only sent when the user supplied them; the comma separated
# string is split into a list of trimmed names.
if fieldnames:
    preprocessing_params["fieldnames"] = [
        name.strip() for name in fieldnames.split(",")
    ]

# Training hyperparameters, serialized to JSON for --hparams.
hparams = {
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs,
    "shuffle": shuffle,
}
# A value of 0 means "not set", so the key is omitted in that case.
if steps_per_epoch:
    hparams["steps_per_epoch"] = steps_per_epoch

# Model options, serialized to JSON for --model_options. Each user-facing
# option appears exactly once; `seq_len` is forwarded so the sequence length
# chosen in the options cell is not silently dropped.
model_options = {
    "seq_len": seq_len,
    "wordvec_dim": wordvec_dim,
    "do_lower_case": do_lower_case,
    "vocab_size": vocab_size,
    "dropout_rate": dropout_rate,
}

# A single replica running the MediaPipe training container on the machine
# and accelerator configuration defined earlier in the notebook.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": TRAINING_MACHINE_TYPE,
            "accelerator_type": TRAINING_ACCELERATOR_TYPE,
            "accelerator_count": TRAINING_ACCELERATOR_COUNT,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAINING_CONTAINER,
            "command": [],
            "args": [
                "--task_name=text_classifier",
                f"--training_data_path={training_data_path}",
                f"--validation_data_path={validation_data_path}",
                f"--evaluation_result_path={evaluation_result_path}",
                f"--model_export_path={model_export_path}",
                f"--model_architecture={model_architecture}",
                f"--preprocessing_params={json.dumps(preprocessing_params)}",
                f"--hparams={json.dumps(hparams)}",
                f"--model_options={json.dumps(model_options)}",
            ],
        },
    }
]

# Create the Vertex AI custom job and start it.
training_job = aiplatform.CustomJob(
    display_name=TRAINING_JOB_DISPLAY_NAME,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)

training_job.run()

## 导出模型

微调完成后，你可以保存 TensorFlow Lite 模型，然后在 MediaPipe Studio 的[文本分类](https://mediapipe-studio.webapps.google.com/demo/text_classifier)演示中进行尝试，或者根据[文本分类任务指南](https://developers.google.com/mediapipe/solutions/text/text_classifier)将其集成到你的设备端应用中。导出的模型包含所需的模型元数据，以及一个分类标签文件。

In [None]:
import sys


def copy_model(model_source, model_dest):
    """Copy `model_source` to `model_dest` by running `gsutil cp`.

    Args:
        model_source: Path or URI handed to `gsutil cp` as the source.
        model_dest: Path or URI handed to `gsutil cp` as the destination.
    """
    ! gsutil cp {model_source} {model_dest}

# Fetch the exported TFLite model into the notebook's working directory.
copy_model(EXPORTED_MODEL_OUTPUT_FILE, "text_classification_model.tflite")

# When running in Colab, also offer the file as a browser download.
if "google.colab" in sys.modules:
    from google.colab import files

    files.download("text_classification_model.tflite")

## 清理

In [None]:
# Delete training data and jobs.
# The job is deleted only if listing by this run's display name returns a
# non-empty result.
if training_job.list(filter=f'display_name="{TRAINING_JOB_DISPLAY_NAME}"'):
    training_job.delete()

# Recursively remove this run's staging folder. The exported model and the
# evaluation output live under STAGING_BUCKET, so they are removed as well.
!gsutil rm -r {STAGING_BUCKET}