In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex SDK：AutoML 自然语言文本分类模型

## 安装

安装最新（预览）版本的Vertex SDK。

In [None]:
# Install or upgrade (-U) the Vertex SDK into the per-user site-packages (--user).
! pip3 install -U google-cloud-aiplatform --user

安装Google的云存储库。

In [None]:
# Install the Cloud Storage client library.
! pip3 install google-cloud-storage

### 重新启动内核

安装了 Vertex SDK 和 Google *cloud-storage* 之后，您需要重新启动笔记本的内核，以便它可以找到这些包。

In [None]:
import os

# Restart the kernel so the freshly installed packages can be imported.
# A non-empty AUTORUN environment variable disables the restart.
if not os.environ.get("AUTORUN"):
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## 在开始之前

### GPU 运行时

*如果有这个选项的话，请确保你在 GPU 运行时中运行这个笔记本。在 Colab 中，选择* **运行时 > 更改运行时类型 > GPU**

### 设置您的 GCP 项目

**无论您的笔记本环境如何，以下步骤都是必需的。**

1. [选择或创建一个 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建账户时，您将获得 $300 的免费信用额用于您的计算/存储成本。

2. [确保您的项目已启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 Vertex API 和 Compute Engine API。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已安装在 Google Cloud 笔记本中。

5. 在下面的单元格中输入您的项目 ID。然后运行该单元格，确保Cloud SDK在此笔记本中的所有命令中使用正确的项目。

**注意**：Jupyter 运行以`!`开头的行作为 shell 命令，并将以`$`开头的 Python 变量插入这些命令中。

In [None]:
# Placeholder value: if it is left unchanged, the next cell reads the project ID from gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make this project the one used by the Cloud SDK commands in this notebook.
! gcloud config set project $PROJECT_ID

#### 地区

您还可以更改 `REGION` 变量，该变量用于笔记本其余部分的操作。以下是支持 Vertex AI 的地区。我们建议尽可能选择离您最近的地区。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能在Vertex上使用多地区存储桶进行训练。并非所有地区都支持所有Vertex服务。有关每个地区的最新支持情况，请参见[Vertex AI服务的地区支持](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
# Region used for the Vertex AI API endpoint, the resource parent path and the bucket location.
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您正在进行现场教程会话，您可能正在使用共享的测试帐户或项目。为了避免用户在创建的资源之间发生名称冲突，您为每个实例会话创建一个时间戳，并将其附加到将在本教程中创建的资源的名称上。

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) appended to resource names
# created in this tutorial so that they do not collide.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### 验证您的GCP账户

**如果您正在使用Google Cloud笔记本**，您的环境已经经过验证。请跳过此步骤。

*注意：如果您在Vertex笔记本上运行单元格，则单元格知道跳过执行认证步骤。*

In [None]:
import os
import sys

# Authenticate to Google Cloud unless this notebook runs on a Vertex notebook
# instance. Authentication provides access to your Cloud Storage bucket and
# lets you submit training jobs and prediction requests.

# The marker file checked below identifies a Vertex notebook; when it exists,
# the whole authentication step is skipped.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        # Colab: run the interactive Colab authentication flow.
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # Local notebook: replace the string below with the path to your service
    # account key; it is exported as GOOGLE_APPLICATION_CREDENTIALS.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Runs for both Colab and local notebooks: log in to your Google Cloud account.
    ! gcloud auth login

### 创建一个云存储桶

**无论您使用的是哪种笔记本环境，以下步骤都是必需的。**

本教程旨在使用公共云存储桶中的训练数据以及本地云存储桶用于您的批量预测。您也可以使用自己存储在本地云存储桶中的训练数据。

在下面设置您的云存储桶的名称。它必须在所有云存储桶中是唯一的。

In [None]:
# Placeholder value: if it is left unchanged, the next cell derives a name from PROJECT_ID and TIMESTAMP.
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
# Derive a bucket name from the project ID and the session timestamp when
# BUCKET_NAME was left empty, unset, or at its placeholder value.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "aip-", TIMESTAMP])

仅当您的存储桶尚不存在时，才运行以下单元格来创建您的 Cloud Storage 存储桶。

In [None]:
# Create the bucket in the selected region (-l).
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查其内容来验证对您的云存储桶的访问。

In [None]:
# List the bucket contents to confirm that the bucket is accessible.
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在教程中使用的变量。

### 导入库并定义常量

#### 导入Vertex SDK

将Vertex SDK导入到我们的Python环境中。

In [None]:
import base64
import json
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson, ParseDict
from google.protobuf.struct_pb2 import Struct, Value

#### Vertex AI 常量

为 Vertex AI 设置以下常量：

- `API_ENDPOINT`：用于数据集、模型、任务、流水线和端点服务的 Vertex AI API 服务端点。
- `PARENT`：用于数据集、模型和端点资源的 Vertex AI 位置根路径。

In [None]:
# Regional Vertex AI API service endpoint
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Location root path under which the dataset, model and endpoint resources live
PARENT = "/".join(["projects", PROJECT_ID, "locations", REGION])

#### AutoML常量

接下来，设置 AutoML 文本分类数据集和训练专用的常量：

- 数据集模式：告诉托管数据集服务数据集的类型。
- 数据标注（注释）模式：告诉托管数据集服务数据如何被标记（注释）。
- 数据集训练模式：告诉Vertex AI管道服务为哪个任务（例如分类）训练模型。

In [None]:
# Text Dataset type (stored without the "gs://" scheme; the scheme is
# prepended where the dataset request is built)
TEXT_SCHEMA = "google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml"
# Text Labeling type: import format for single-label text classification
IMPORT_SCHEMA_TEXT_CLASSIFICATION = "gs://google-cloud-aiplatform/schema/dataset/ioformat/text_classification_single_label_io_format_1.0.0.yaml"
# Text Training task: training job definition for AutoML text classification
TRAINING_TEXT_CLASSIFICATION_SCHEMA = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_classification_1.0.0.yaml"

## Vertex AI 客户端

Vertex SDK 以客户端/服务器模型工作。在您这一侧（Python 脚本），您将创建一个客户端，由它向服务器（Vertex）发送请求并接收响应。

在本教程中，您将使用多个客户端，因此请提前设置它们。

- 用于管理数据集的 Dataset Service。
- 用于管理模型的 Model Service。
- 用于训练的 Pipeline Service。
- 用于部署的 Endpoint Service。
- 用于提供预测的 Prediction Service。*注意*：预测具有不同的服务端点。

In [None]:
# Every service client is configured with the same regional API endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_dataset_client():
    """Return a Dataset Service client configured with ``client_options``."""
    return aip.DatasetServiceClient(client_options=client_options)


def create_model_client():
    """Return a Model Service client configured with ``client_options``."""
    return aip.ModelServiceClient(client_options=client_options)


def create_pipeline_client():
    """Return a Pipeline Service client configured with ``client_options``."""
    return aip.PipelineServiceClient(client_options=client_options)


def create_endpoint_client():
    """Return an Endpoint Service client configured with ``client_options``."""
    return aip.EndpointServiceClient(client_options=client_options)


def create_prediction_client():
    """Return a Prediction Service client configured with ``client_options``."""
    return aip.PredictionServiceClient(client_options=client_options)


def create_job_client():
    """Return a Job Service client configured with ``client_options``."""
    return aip.JobServiceClient(client_options=client_options)


# Build each client once, in a fixed order, and keep them in a registry keyed
# by service name; the rest of the notebook looks clients up by these keys.
clients = {}
for service_name, make_client in (
    ("dataset", create_dataset_client),
    ("model", create_model_client),
    ("pipeline", create_pipeline_client),
    ("endpoint", create_endpoint_client),
    ("prediction", create_prediction_client),
    ("job", create_job_client),
):
    clients[service_name] = make_client()

# Show every (service name, client) pair.
for client in clients.items():
    print(client)

In [None]:
# Cloud Storage location of the CSV file with the labeled training text.
IMPORT_FILE = "gs://cloud-ml-data/NL-classification/happiness.csv"

In [None]:
# Peek at the first 10 lines of the import file.
! gsutil cat $IMPORT_FILE | head -n 10

*示例输出*（下面的文本为译文示意；标签列为数据集中的实际标签值）：
```
我与一个我感到同情和有联系的人成功地约会了。,affection
当我儿子在考试中得到90%的分数时我很开心。,affection
今天早上我去了健身房做瑜伽。,exercise
我们与一些最近有些靠不住的朋友进行了一次认真的谈话。他们理解了，我们度过了一个愉快的晚上。,bonding
我和孙子们去了克罗恩温室参观蝴蝶展。,affection
我昨晚冥想了。,leisure
"我尝试了一个新的农民面包食谱，结果很棒！",achievement
我从哥哥那里收到了一个非常让我惊讶的礼物。,affection
昨天是我妈妈的生日，所以我很快乐。,enjoy_the_moment
和我的三个十几岁的孩子一起看杯子蛋糕大战。,affection
```

## 创建一个数据集

### [projects.locations.datasets.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/create)

#### 请求

In [None]:
DATA_SCHEMA = TEXT_SCHEMA

# Dataset resource to create: a timestamped display name plus the text
# metadata schema (the "gs://" scheme is added here).
dataset = dict(
    display_name="happiness_" + TIMESTAMP,
    metadata_schema_uri="gs://" + DATA_SCHEMA,
)

# Preview the request as JSON; the actual call is made in a later cell.
print(
    MessageToJson(
        vars(aip.CreateDatasetRequest(parent=PARENT, dataset=dataset))["_pb"]
    )
)

*示例输出*:
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataset": {
    "displayName": "happiness_20210226015238",
    "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml"
  }
}
```

#### 调用

In [None]:
# Start dataset creation; the returned object is resolved with .result() in the next cell.
request = clients["dataset"].create_dataset(parent=PARENT, dataset=dataset)

#### 回复

In [None]:
# Resolve the create_dataset call and keep the created dataset.
result = request.result()

# Print the created dataset as JSON.
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/574578388396670976",
  "displayName": "happiness_20210226015238",
  "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml",
  "labels": {
    "aiplatform.googleapis.com/dataset_metadata_schema": "TEXT"
  },
  "metadata": {
    "dataItemSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/dataitem/text_1.0.0.yaml"
  }
}
```

In [None]:
# Full resource name of the dataset, as returned by the service
dataset_id = result.name
# Short numeric ID: the last "/"-separated segment of the resource name
dataset_short_id = dataset_id.rsplit("/", 1)[-1]

print(dataset_id)

### [projects.locations.datasets.import](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/import)

#### 请求

In [None]:
LABEL_SCHEMA = IMPORT_SCHEMA_TEXT_CLASSIFICATION

# Import configuration: the CSV file to import and the schema describing
# how its rows are labeled.
import_config = dict(
    gcs_source=dict(uris=[IMPORT_FILE]),
    import_schema_uri=LABEL_SCHEMA,
)

# Preview the request as JSON; the actual call is made in a later cell.
# NOTE(review): this preview uses the short dataset ID, while the import_data
# call in the following cell passes the full dataset_id - confirm which is intended.
print(
    MessageToJson(
        vars(
            aip.ImportDataRequest(
                name=dataset_short_id, import_configs=[import_config]
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "name": "574578388396670976",
  "importConfigs": [
    {
      "gcsSource": {
        "uris": [
          "gs://cloud-ml-data/NL-classification/happiness.csv"
        ]
      },
      "importSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/ioformat/text_classification_single_label_io_format_1.0.0.yaml"
    }
  ]
}
```

#### 调用

In [None]:
# Start the data import into the dataset (identified by its full resource
# name); the returned object is resolved with .result() in the next cell.
request = clients["dataset"].import_data(
    name=dataset_id, import_configs=[import_config]
)

#### 回复

In [None]:
# Resolve the import_data call.
result = request.result()

# Print the import result as JSON.
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{}
```

## 训练一个模型

### [projects.locations.trainingPipelines.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/create)

#### 请求

In [None]:
TRAINING_SCHEMA = TRAINING_TEXT_CLASSIFICATION_SCHEMA

# Training task inputs as a protobuf Value: multi_label is switched off.
task = json_format.ParseDict({"multi_label": False}, Value())

# Training pipeline resource: which dataset to train on (short ID), the name
# of the model to upload, and the task definition with its inputs.
training_pipeline = dict(
    display_name="happiness_" + TIMESTAMP,
    input_data_config=dict(dataset_id=dataset_short_id),
    model_to_upload=dict(display_name="happiness_" + TIMESTAMP),
    training_task_definition=TRAINING_SCHEMA,
    training_task_inputs=task,
)

# Preview the request as JSON; the actual call is made in a later cell.
print(
    MessageToJson(
        vars(
            aip.CreateTrainingPipelineRequest(
                parent=PARENT, training_pipeline=training_pipeline
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "trainingPipeline": {
    "displayName": "happiness_20210226015238",
    "inputDataConfig": {
      "datasetId": "574578388396670976"
    },
    "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_classification_1.0.0.yaml",
    "trainingTaskInputs": {
      "multi_label": false
    },
    "modelToUpload": {
      "displayName": "happiness_20210226015238"
    }
  }
}
```

#### 调用

In [None]:
# Create the training pipeline; the returned pipeline is printed in the next cell.
request = clients["pipeline"].create_training_pipeline(
    parent=PARENT, training_pipeline=training_pipeline
)

#### 回复

In [None]:
# Print the created training pipeline as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/trainingPipelines/2903115317607661568",
  "displayName": "happiness_20210226015238",
  "inputDataConfig": {
    "datasetId": "574578388396670976"
  },
  "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_classification_1.0.0.yaml",
  "trainingTaskInputs": {},
  "modelToUpload": {
    "displayName": "happiness_20210226015238"
  },
  "state": "PIPELINE_STATE_PENDING",
  "createTime": "2021-02-26T02:23:54.166560Z",
  "updateTime": "2021-02-26T02:23:54.166560Z"
}
```

In [None]:
# The full unique ID for the training pipeline
training_pipeline_id = request.name
# The short numeric ID for the training pipeline
training_pipeline_short_id = training_pipeline_id.split("/")[-1]

print(training_pipeline_id)

### [projects.locations.trainingPipelines.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/get)

### [projects.locations.trainingPipelines.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/get)

#### 调用

In [None]:
# Fetch the current state of the training pipeline by its full resource name.
request = clients["pipeline"].get_training_pipeline(name=training_pipeline_id)

#### 回复

In [None]:
# Print the fetched training pipeline as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/trainingPipelines/2903115317607661568",
  "displayName": "happiness_20210226015238",
  "inputDataConfig": {
    "datasetId": "574578388396670976"
  },
  "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_classification_1.0.0.yaml",
  "trainingTaskInputs": {},
  "modelToUpload": {
    "name": "projects/116273516712/locations/us-central1/models/2369051733671280640",
    "displayName": "happiness_20210226015238"
  },
  "state": "PIPELINE_STATE_SUCCEEDED",
  "createTime": "2021-02-26T02:23:54.166560Z",
  "startTime": "2021-02-26T02:23:54.396088Z",
  "endTime": "2021-02-26T06:08:06.548524Z",
  "updateTime": "2021-02-26T06:08:06.548524Z"
}
```

In [None]:
# Poll the training pipeline every 20 seconds until it reaches a terminal state.
# model_id is initialised first so the name exists however the loop ends.
model_id = None
while True:
    response = clients["pipeline"].get_training_pipeline(name=training_pipeline_id)
    if response.state != aip.PipelineState.PIPELINE_STATE_SUCCEEDED:
        print("Training job has not completed:", response.state)
        model_to_deploy_name = None
        # A failed or cancelled pipeline will never succeed: stop with an
        # explicit error instead of polling forever or reaching the print
        # below without a model.
        if response.state in (
            aip.PipelineState.PIPELINE_STATE_FAILED,
            aip.PipelineState.PIPELINE_STATE_CANCELLED,
        ):
            raise RuntimeError(
                "Training pipeline {} ended in state {}".format(
                    training_pipeline_id, response.state
                )
            )
    else:
        # Success: remember the uploaded model's resource name and report
        # how long training took.
        model_id = response.model_to_upload.name
        print("Training Time:", response.end_time - response.start_time)
        break
    time.sleep(20)

print(model_id)

## 评估模型

### [projects.locations.models.evaluations.list](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/list)

### [projects.locations.models.evaluations.list](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/list)

#### 调用

In [None]:
# List the evaluations of the trained model; the result is iterated in the next cell.
request = clients["model"].list_model_evaluations(parent=model_id)

#### 回复

In [None]:
# Convert every returned evaluation to a plain dict (via its JSON form).
model_evaluations = [json.loads(MessageToJson(mel.__dict__["_pb"])) for mel in request]

print(json.dumps(model_evaluations, indent=2))

# Fail with a clear message when the model has no evaluations at all.
if not model_evaluations:
    raise RuntimeError("No model evaluations were returned for {}".format(model_id))

# The evaluation slice: taken from the list that was just printed rather than
# read back from the already-iterated `request` object, so it always refers
# to the first evaluation shown above.
evaluation_slice = model_evaluations[0]["name"]

```
[
  {
    "name": "projects/116273516712/locations/us-central1/models/2369051733671280640/evaluations/1541152463304785920",
    "metricsSchemaUri": "gs://google-cloud-aiplatform/schema/modelevaluation/classification_metrics_1.0.0.yaml",
    "metrics": {
      "confusionMatrix": {
        "annotationSpecs": [
          {
            "displayName": "exercise",
            "id": "952213353537732608"
          },
          {
            "id": "1528674105841156096",
            "displayName": "achievement"
          },
          {
            "id": "3258056362751426560",
            "displayName": "leisure"
          },
          {
            "id": "3834517115054850048",
            "displayName": "bonding"
          },
          {
            "id": "5563899371965120512",
            "displayName": "enjoy_the_moment"
          },
          {
            "id": "6140360124268544000",
            "displayName": "nature"
          },
          {
            "id": "8446203133482237952",
            "displayName": "affection"
          }
        ],
        "rows": [
          [
            19.0,
            1.0,
            0.0,
            0.0,
            0.0,
            0.0,
            0.0
          ],
          [
            0.0,
            342.0,
            5.0,
            2.0,
            13.0,
            2.0,
            13.0
          ],
          [
            2.0,
            10.0,
            42.0,
            1.0,
            12.0,
            0.0,
            2.0
          ],
          [
            0.0,
            4.0,
            0.0,
            121.0,
            1.0,
            0.0,
            4.0
          ],
          [
            2.0,
            29.0,
            3.0,
            2.0,
            98.0,
            0.0,
            6.0
          ],
          [
            0.0,
            3.0,
            0.0,
            1.0,
            0.0,
            21.0,
            1.0
          ],
          [
            0.0,
            7.0,
            0.0,
            1.0,
            6.0,
            0.0,
            409.0
          ]
        ]
      },
      "confidenceMetrics": [
        {
          "f1Score": 0.25,
          "recall": 1.0,
          "f1ScoreAt1": 0.88776374,
          "precisionAt1": 0.88776374,
          "precision": 0.14285715,
          "recallAt1": 0.88776374
        },
        {
          "confidenceThreshold": 0.05,
          "recall": 0.9721519,
          "f1Score": 0.8101266,
          "recallAt1": 0.88776374,
          "f1ScoreAt1": 0.88776374,
          "precisionAt1": 0.88776374,
          "precision": 0.69439423
        },
        
        # 由于简洁起见已删除

        {
          "f1Score": 0.0033698399,
          "recall": 0.0016877637,
          "confidenceThreshold": 1.0,
          "recallAt1": 0.0016877637,
          "f1ScoreAt1": 0.0033698399,
          "precisionAt1": 1.0,
          "precision": 1.0
        }
      ],
      "auPrc": 0.95903283,
      "logLoss": 0.08260541
    },
    "createTime": "2021-02-26T06:07:48.967028Z",
    "sliceDimensions": [
      "annotationSpec"
    ]
  }
]
```

### [projects.locations.models.evaluations.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/get)

#### 调用

In [None]:
# Fetch the single evaluation selected in the previous step by its resource name.
request = clients["model"].get_model_evaluation(name=evaluation_slice)

#### 回复

In [None]:
# Print the fetched model evaluation as JSON.
print(MessageToJson(vars(request)["_pb"]))

```
{
  "name": "projects/116273516712/locations/us-central1/models/2369051733671280640/evaluations/1541152463304785920",
  "metricsSchemaUri": "gs://google-cloud-aiplatform/schema/modelevaluation/classification_metrics_1.0.0.yaml",
  "metrics": {
    "confusionMatrix": {
      "annotationSpecs": [
        {
          "displayName": "exercise",
          "id": "952213353537732608"
        },
        {
          "displayName": "achievement",
          "id": "1528674105841156096"
        },
        {
          "id": "3258056362751426560",
          "displayName": "leisure"
        },
        {
          "id": "3834517115054850048",
          "displayName": "bonding"
        },
        {
          "displayName": "enjoy_the_moment",
          "id": "5563899371965120512"
        },
        {
          "displayName": "nature",
          "id": "6140360124268544000"
        },
        {
          "id": "8446203133482237952",
          "displayName": "affection"
        }
      ],
      "rows": [
        [
          19.0,
          1.0,
          0.0,
          0.0,
          0.0,
          0.0,
          0.0
        ],
        [
          0.0,
          342.0,
          5.0,
          2.0,
          13.0,
          2.0,
          13.0
        ],
        [
          2.0,
          10.0,
          42.0,
          1.0,
          12.0,
          0.0,
          2.0
        ],
        [
          0.0,
          4.0,
          0.0,
          121.0,
          1.0,
          0.0,
          4.0
        ],
        [
          2.0,
          29.0,
          3.0,
          2.0,
          98.0,
          0.0,
          6.0
        ],
        [
          0.0,
          3.0,
          0.0,
          1.0,
          0.0,
          21.0,
          1.0
        ],
        [
          0.0,
          7.0,
          0.0,
          1.0,
          6.0,
          0.0,
          409.0
        ]
      ]
    },
    "logLoss": 0.08260541,
    "confidenceMetrics": [
      {
        "precision": 0.14285715,
        "precisionAt1": 0.88776374,
        "recall": 1.0,
        "f1ScoreAt1": 0.88776374,
        "recallAt1": 0.88776374,
        "f1Score": 0.25
      },
      {
        "f1Score": 0.8101266,
        "recall": 0.9721519,
        "precision": 0.69439423,
        "confidenceThreshold": 0.05,
        "recallAt1": 0.88776374,
        "precisionAt1": 0.88776374,
        "f1ScoreAt1": 0.88776374
      },
      
      # REMOVED FOR BREVITY
      
      {
        "confidenceThreshold": 1.0,
        "f1Score": 0.0033698399,
        "f1ScoreAt1": 0.0033698399,
        "precisionAt1": 1.0,
        "precision": 1.0,
        "recall": 0.0016877637,
        "recallAt1": 0.0016877637
      }
    ],
    "auPrc": 0.95903283
  },
  "createTime": "2021-02-26T06:07:48.967028Z",
  "sliceDimensions": [
    "annotationSpec"
  ]
}
```

## 进行批量预测

### 为批量预测准备文件

In [None]:
test_item = ! gsutil cat $IMPORT_FILE | head -n1
test_item, test_label = str(test_item[0]).split(",")

print(test_item, test_label)

*示例输出*：
```
我和一个我同情和有联系感的人成功约会。 affection
```

### 制作批量输入文件

现在制作一个批量输入文件，并将其存储在您的 Cloud Storage 存储桶中。批量输入文件可以是 CSV 或 JSONL 格式。本教程使用 JSONL 格式：每个文本文件在 JSONL 文件中占一行，每一行是一个字典条目。该字典包含以下键值对：

- `content`：文本文件的 Cloud Storage 路径。
- `mime_type`：内容类型，在本例中是 `text/plain`（在预测结果中该键显示为 `mimeType`）。

In [None]:
import json

import tensorflow as tf

# Cloud Storage locations of the test text and of the JSONL file that points to it.
test_item_uri = "gs://" + BUCKET_NAME + "/test.txt"
gcs_input_uri = "gs://" + BUCKET_NAME + "/test.jsonl"

# The single JSONL entry: where the text lives and its content type.
data = {"content": test_item_uri, "mime_type": "text/plain"}

# Write the test text, newline-terminated, to its own file in the bucket.
with tf.io.gfile.GFile(test_item_uri, "w") as f:
    f.write(test_item + "\n")

# Write the batch input file: one JSON document per line.
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    f.write(json.dumps(data) + "\n")

In [None]:
# Show the JSONL batch input file, then the text file it points to.
! gsutil cat $gcs_input_uri
! gsutil cat $test_item_uri

*示例输出*：
```
{"content": "gs://migration-ucaip-trainingaip-20210226015238/test.txt", "mime_type": "text/plain"}
我和某人成功地约会了，我感到同情和连接。
```

### [projects.locations.batchPredictionJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/create)

#### 请求

In [None]:
# Batch prediction job resource: the model to use, the JSONL input file, the
# JSONL output location under the bucket, and the machine resources.
batch_prediction_job = dict(
    display_name="happiness_" + TIMESTAMP,
    model=model_id,
    input_config=dict(
        instances_format="jsonl",
        gcs_source=dict(uris=[gcs_input_uri]),
    ),
    output_config=dict(
        predictions_format="jsonl",
        gcs_destination=dict(
            output_uri_prefix=f"gs://{BUCKET_NAME}/batch_output/"
        ),
    ),
    dedicated_resources=dict(
        machine_spec=dict(
            machine_type="n1-standard-2",
            accelerator_count=0,
        ),
        starting_replica_count=1,
        max_replica_count=1,
    ),
)

# Preview the request as JSON; the actual call is made in a later cell.
print(
    MessageToJson(
        vars(
            aip.CreateBatchPredictionJobRequest(
                parent=PARENT, batch_prediction_job=batch_prediction_job
            )
        )["_pb"]
    )
)

{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "batchPredictionJob": {
    "displayName": "happiness_20210226015238",
    "model": "projects/116273516712/locations/us-central1/models/2369051733671280640",
    "inputConfig": {
      "instancesFormat": "jsonl",
      "gcsSource": {
        "uris": [
          "gs://migration-ucaip-trainingaip-20210226015238/test.jsonl"
        ]
      }
    },
    "outputConfig": {
      "predictionsFormat": "jsonl",
      "gcsDestination": {
        "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226015238/batch_output/"
      }
    },
    "dedicatedResources": {
      "machineSpec": {
        "machineType": "n1-standard-2"
      },
      "startingReplicaCount": 1,
      "maxReplicaCount": 1
    }
  }
}

#### 调用

In [None]:
# Create the batch prediction job; the returned job is printed in the next cell.
request = clients["job"].create_batch_prediction_job(
    parent=PARENT, batch_prediction_job=batch_prediction_job
)

#### 回复

In [None]:
# Print the created batch prediction job as JSON.
print(MessageToJson(vars(request)["_pb"]))

{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/4770983263059574784",
  "displayName": "happiness_20210226015238",
  "model": "projects/116273516712/locations/us-central1/models/2369051733671280640",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210226015238/test.jsonl"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226015238/batch_output/"
    }
  },
  "state": "JOB_STATE_PENDING",
  "completionStats": {
    "incompleteCount": "-1"
  },
  "createTime": "2021-02-26T09:37:44.471843Z",
  "updateTime": "2021-02-26T09:37:44.471843Z"
}

In [None]:
# The fully qualified ID for the batch job
batch_job_id = request.name
# The short numeric ID for the batch job
batch_job_short_id = batch_job_id.split("/")[-1]

print(batch_job_id)

### [projects.locations.batchPredictionJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/get)

### [projects.locations.batchPredictionJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/get)

#### 调用

In [None]:
# Fetch the current state of the batch prediction job by its full resource name.
request = clients["job"].get_batch_prediction_job(name=batch_job_id)

#### 回复

In [None]:
# Print the fetched batch prediction job as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/4770983263059574784",
  "displayName": "happiness_20210226015238",
  "model": "projects/116273516712/locations/us-central1/models/2369051733671280640",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210226015238/test.jsonl"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210226015238/batch_output/"
    }
  },
  "state": "JOB_STATE_PENDING",
  "completionStats": {
    "incompleteCount": "-1"
  },
  "createTime": "2021-02-26T09:37:44.471843Z",
  "updateTime": "2021-02-26T09:37:44.471843Z"
}
```

In [None]:
def get_latest_predictions(gcs_out_dir):
    """ Get the latest prediction subfolder using the timestamp in the subfolder name"""
    folders = !gsutil ls $gcs_out_dir
    latest = ""
    for folder in folders:
        subfolder = folder.split("/")[-2]
        if subfolder.startswith("prediction-"):
            if subfolder > latest:
                latest = folder[:-1]
    return latest


while True:
    response = clients["job"].get_batch_prediction_job(name=batch_job_id)
    if response.state != aip.JobState.JOB_STATE_SUCCEEDED:
        print("The job has not completed:", response.state)
        if response.state == aip.JobState.JOB_STATE_FAILED:
            break
    else:
        folder = get_latest_predictions(
            response.output_config.gcs_destination.output_uri_prefix
        )
        ! gsutil ls $folder/prediction*.jsonl

        ! gsutil cat $folder/prediction*.jsonl
        break
    time.sleep(60)

示例输出：
```
gs://migration-ucaip-trainingaip-20210226015238/batch_output/prediction-happiness_20210226015238-2021-02-26T09:37:44.261133Z/predictions_00001.jsonl
{"instance":{"content":"gs://migration-ucaip-trainingaip-20210226015238/test.txt","mimeType":"text/plain"},"prediction":{"ids":["8446203133482237952","3834517115054850048","1528674105841156096","5563899371965120512","952213353537732608","3258056362751426560","6140360124268544000"],"displayNames":["affection","bonding","achievement","enjoy_the_moment","exercise","leisure","nature"],"confidences":[0.9183423,0.045685068,0.024327256,0.0057157497,0.0040851077,0.0012627868,5.8173126E-4]}}
```

## 进行在线预测

### [projects.locations.endpoints.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/create)

#### 请求

In [None]:
# Endpoint resource to create: only a timestamped display name is set.
endpoint = dict(display_name="happiness_" + TIMESTAMP)

# Preview the request as JSON; the actual call is made in a later cell.
print(
    MessageToJson(
        vars(aip.CreateEndpointRequest(parent=PARENT, endpoint=endpoint))["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "endpoint": {
    "displayName": "happiness_20210226015238"
  }
}
```

#### 调用

In [None]:
# Start endpoint creation; the returned object is resolved with .result() in the next cell.
request = clients["endpoint"].create_endpoint(parent=PARENT, endpoint=endpoint)

#### 回复

In [None]:
# Resolve the create_endpoint call and keep the created endpoint.
result = request.result()

# Print the created endpoint as JSON.
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/endpoints/7367713068517687296"
}
```

In [None]:
# The fully qualified ID for the endpoint
endpoint_id = result.name
# The short numeric ID for the endpoint
endpoint_short_id = endpoint_id.split("/")[-1]

print(endpoint_id)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

#### 请求

In [None]:
# Deployment description: which model to deploy, its display name, and
# automatically managed resources fixed at exactly one replica.
deployed_model = dict(
    model=model_id,
    display_name="happiness_" + TIMESTAMP,
    automatic_resources=dict(min_replica_count=1, max_replica_count=1),
)

# All traffic (100) is assigned to key "0" - presumably the model deployed by
# this request; see the deployModel reference linked above to confirm.
traffic_split = {"0": 100}

# Preview the request as JSON; the actual call is made in a later cell.
print(
    MessageToJson(
        vars(
            aip.DeployModelRequest(
                endpoint=endpoint_id,
                deployed_model=deployed_model,
                traffic_split=traffic_split,
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/7367713068517687296",
  "deployedModel": {
    "model": "projects/116273516712/locations/us-central1/models/2369051733671280640",
    "displayName": "happiness_20210226015238",
    "automaticResources": {
      "minReplicaCount": 1,
      "maxReplicaCount": 1
    }
  },
  "trafficSplit": {
    "0": 100
  }
}
```

#### 调用

In [None]:
# Start deploying the model to the endpoint; the returned object is resolved
# with .result() in the next cell.
request = clients["endpoint"].deploy_model(
    endpoint=endpoint_id, deployed_model=deployed_model, traffic_split=traffic_split
)

#### 回复

In [None]:
# Resolve the deploy_model call and keep its result.
result = request.result()

# Print the deployment result as JSON.
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "deployedModel": {
    "id": "418518105996656640"
  }
}
```

In [None]:
# The unique ID for the deployed model
deployed_model_id = result.deployed_model.id

print(deployed_model_id)

### [projects.locations.endpoints.predict](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

### [projects.locations.endpoints.predict](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

#### 请求

In [None]:
test_item = ! gsutil cat $IMPORT_FILE | head -n1
test_item, test_label = str(test_item[0]).split(",")

instances_list = [{"content": test_item}]
instances = [json_format.ParseDict(s, Value()) for s in instances_list]

request = aip.PredictRequest(
    endpoint=endpoint_id,
)
request.instances.append(instances)

print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/7367713068517687296",
  "instances": [
    [
      {
        "content": "我和一个我感到同情和连接的人成功约会了。"
      }
    ]
  ]
}
```

#### 调用

In [None]:
# Send the online prediction request to the endpoint; the response is printed in the next cell.
request = clients["prediction"].predict(endpoint=endpoint_id, instances=instances)

#### 回复

In [None]:
# Print the prediction response as JSON.
print(MessageToJson(vars(request)["_pb"]))

```
{
  "predictions": [
    {
      "confidences": [
        0.8867673277854919,
        0.024743923917412758,
        0.0034913308918476105,
        0.07936617732048035,
        0.0013463868526741862,
        0.0002393187169218436,
        0.0040455833077430725
      ],
      "displayNames": [
        "affection",
        "achievement",
        "enjoy_the_moment",
        "bonding",
        "leisure",
        "nature",
        "exercise"
      ],
      "ids": [
        "8446203133482237952",
        "1528674105841156096",
        "5563899371965120512",
        "3834517115054850048",
        "3258056362751426560",
        "6140360124268544000",
        "952213353537732608"
      ]
    }
  ],
  "deployedModelId": "418518105996656640"
}
```

### [projects.locations.endpoints.undeployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/undeployModel)

#### 调用

In [None]:
# Start undeploying the model from the endpoint, passing an empty traffic
# split; the returned object is resolved with .result() in the next cell.
request = clients["endpoint"].undeploy_model(
    endpoint=endpoint_id, deployed_model_id=deployed_model_id, traffic_split={}
)

#### 回复

In [None]:
# Resolve the undeploy_model call.
result = request.result()

# Print the undeploy result as JSON.
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{}
```

## 清理

要清理此项目中使用的所有GCP资源，您可以删除用于本教程的[GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
delete_dataset = True
delete_model = True
delete_endpoint = True
delete_pipeline = True
delete_batchjob = True
delete_bucket = True

# Delete the dataset using the Vertex AI fully qualified identifier for the dataset
try:
    if delete_dataset:
        clients["dataset"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the model using the Vertex AI fully qualified identifier for the model
try:
    if delete_model:
        clients["model"].delete_model(name=model_id)
except Exception as e:
    print(e)

# Delete the endpoint using the Vertex AI fully qualified identifier for the endpoint
try:
    if delete_endpoint:
        clients["endpoint"].delete_endpoint(name=endpoint_id)
except Exception as e:
    print(e)

# Delete the training pipeline using the Vertex AI fully qualified identifier for the training pipeline
try:
    if delete_pipeline:
        clients["pipeline"].delete_training_pipeline(name=training_pipeline_id)
except Exception as e:
    print(e)

# Delete the batch job using the Vertex AI fully qualified identifier for the batch job
try:
    if delete_batchjob:
        clients["job"].delete_batch_prediction_job(name=batch_job_id)
except Exception as e:
    print(e)

if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME