In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex SDK：数据标注

## 安装

安装最新（预览）版本的Vertex SDK。

In [None]:
# Install (or upgrade, via -U) google-cloud-aiplatform into the user site-packages (--user).
! pip3 install -U google-cloud-aiplatform --user

安装 Google *cloud-storage* 库。

In [None]:
# Install the google-cloud-storage client library.
! pip3 install google-cloud-storage

### 重新启动内核

一旦您安装了 Vertex SDK 和 Google *cloud-storage*，您需要重新启动笔记本内核，以便它能找到这些包。

In [None]:
import os

# Restart the kernel so the packages installed above become importable.
# Nothing happens when the AUTORUN environment variable is set to a
# non-empty value.
if not os.getenv("AUTORUN"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

## 开始之前

### GPU 运行时

*确保如果有选择的话，在GPU运行时中运行此笔记本。在Colab中，选择* **Runtime > Change Runtime Type > GPU**

### 设置您的GCP项目

**无论您使用的是哪种笔记本环境，以下步骤都是必须的。**

1. [选择或创建一个GCP项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得$300的免费信用用于您的计算/存储成本。

2. [确保为您的项目启用了计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 Vertex APIs 和 Compute Engine APIs。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已经安装在Google Cloud笔记本中。

5. 在下方的单元格中输入您的项目ID。然后运行该单元格，以确保Cloud SDK为此笔记本中的所有命令使用正确的项目。

**注意**: Jupyter会将带有 `!` 前缀的行作为shell命令运行，并将用 `$` 前缀的Python变量插入这些命令中。

In [None]:
# Google Cloud project ID. If the placeholder is left as-is, the next cell
# tries to read the project from the gcloud configuration instead.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make gcloud use the selected project; IPython substitutes $PROJECT_ID with the Python variable.
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改`REGION`变量，该变量用于笔记本其余部分的操作。 以下是Vertex AI支持的区域。我们建议在可能的情况下选择距离您最近的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太：`asia-east1`

您不能在 Vertex 中使用多区域存储桶进行训练。并非所有区域都支持所有 Vertex 服务。有关每个区域的最新支持情况，请参阅[Vertex AI服务的区域支持](https://cloud.google.com/vertex-ai/docs/general/locations)。

In [None]:
# Region used for the bucket, the API endpoint and the parent resource path below.
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您在进行现场教程会话，您可能会使用共享测试帐户或项目。为了避免用户在创建的资源上发生命名冲突，您需要为每个实例会话创建一个时间戳，并附加到本教程中将要创建的资源的名称上。

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS, local time), taken once and appended to
# resource names so that they do not collide between sessions.
TIMESTAMP = "{:%Y%m%d%H%M%S}".format(datetime.now())

### 认证您的GCP账户

**如果您正在使用Google云笔记本**，您的环境已经通过认证。可以跳过这一步。

*注意：如果您使用的是Vertex笔记本并运行该单元格，则该单元格会知道跳过执行认证步骤。*

In [None]:
import os
import sys

# Authenticate this notebook against Google Cloud.
#
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# The presence of this file is used as the marker for a Vertex notebook
# instance; when it exists, the whole authentication step is skipped.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    # Colab: use the Colab auth helper.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud.
    # NOTE: this line sits outside the if/else above, so it runs for both the
    # Colab and the local case.
    ! gcloud auth login

### 创建一个云存储桶

**无论您使用的是哪种笔记本环境，都需要按照以下步骤操作。**

本教程旨在使用存储在公共云存储桶中的训练数据以及用于批量预测的本地云存储桶。您也可以使用您在本地云存储桶中存储的自有训练数据。

请在下方设置您的云存储桶的名称。该名称必须在所有云存储桶中保持唯一。

In [None]:
# Cloud Storage bucket name. If the placeholder is left as-is, the next cell
# derives a name from PROJECT_ID and TIMESTAMP.
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When no bucket name was supplied (empty, None or the placeholder), build one
# from the project ID, the literal "aip-" and the session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "aip-", TIMESTAMP])

只有当您的储存桶尚不存在时才运行以下单元格以创建您的云存储桶。

In [None]:
# Create the bucket gs://$BUCKET_NAME in $REGION (both substituted by IPython).
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查云存储桶的内容来验证对其的访问权限：

In [None]:
# List the bucket contents to confirm that the bucket is accessible.
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在教程中使用的变量。

### 导入库并定义常量

#### 导入Vertex SDK

将Vertex SDK导入到我们的Python环境中。

In [None]:
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson, ParseDict
from google.protobuf.struct_pb2 import Struct, Value

#### Vertex AI 常量

为 Vertex AI 设置以下常量:

- `API_ENDPOINT`: 用于数据集、模型、作业、流水线和端点服务的 Vertex AI API 服务端点。
- `API_PREDICT_ENDPOINT`: 用于预测的 Vertex AI API 服务端点。
- `PARENT`: 用于数据集、模型和端点资源的 Vertex AI 位置根路径。

In [None]:
# Regional host name handed to every service client as its API endpoint
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Vertex AI location root path for your dataset, model and endpoint resources
PARENT = "/".join(["projects", PROJECT_ID, "locations", REGION])

#### AutoML 常量

接下来，设置 AutoML 图像分类数据集和训练所用的常量：

- 数据集模式：告诉托管数据集服务数据集的类型是什么。
- 数据标记（注释）模式：告诉托管数据集服务数据如何标记（注释）。
- 数据集训练模式：告诉 Vertex AI 管道服务为哪项任务（例如分类）训练模型。

In [None]:
# Dataset metadata schema for image datasets. Stored WITHOUT the "gs://"
# scheme; the scheme is prepended where the dataset dict is built.
IMAGE_SCHEMA = (
    "google-cloud-aiplatform/schema/dataset/metadata/"
    "image_1.0.0.yaml"
)
# Import (I/O format) schema for single-label image classification
IMPORT_SCHEMA_IMAGE_CLASSIFICATION = (
    "gs://google-cloud-aiplatform/schema/dataset/ioformat/"
    "image_classification_single_label_io_format_1.0.0.yaml"
)
# Inputs schema for an image classification data labeling job
LABELING_SCHEMA_IMAGE = (
    "gs://google-cloud-aiplatform/schema/datalabelingjob/inputs/"
    "image_classification_1.0.0.yaml"
)

## 客户端

Vertex SDK 采用客户端/服务器模型工作。在您这一端（Python 脚本），您将创建一个客户端，由它向服务器（Vertex）发送请求并接收响应。

在本教程中，您将使用多个客户端，因此请提前设置它们。

- 为托管数据集的数据集服务。
- 用于批处理作业和自定义训练的作业服务。

In [None]:
# Every service client is pointed at the same regional endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_dataset_client():
    """Return a DatasetServiceClient configured with the shared client options."""
    return aip.DatasetServiceClient(client_options=client_options)


def create_job_client():
    """Return a JobServiceClient configured with the shared client options."""
    return aip.JobServiceClient(client_options=client_options)


# Registry of service clients, keyed by a short service name.
clients = {
    "dataset": create_dataset_client(),
    "job": create_job_client(),
}

# Show each (name, client) pair that was created.
for name_and_client in clients.items():
    print(name_and_client)

In [None]:
import tensorflow as tf

LABELING_FILES = [
    "https://raw.githubusercontent.com/googleapis/python-aiplatform/master/samples/snippets/resources/daisy.jpg"
]

IMPORT_FILE = "gs://" + BUCKET_NAME + "/labeling.csv"
with tf.io.gfile.GFile(IMPORT_FILE, "w") as f:
    for lf in LABELING_FILES:
        ! wget {lf} | gsutil cp {lf.split("/")[-1]} gs://{BUCKET_NAME}
        f.write("gs://" + BUCKET_NAME + "/" + lf.split("/")[-1] + "\n")

In [None]:
# Print the import file that was just written to the bucket.
! gsutil cat $IMPORT_FILE

*示例输出*：
```
gs://migration-ucaip-trainingaip-20210303215432/daisy.jpg
```

## 创建一个数据集

### [projects.locations.datasets.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/create)

#### 请求

In [None]:
DATA_SCHEMA = IMAGE_SCHEMA

# Dataset definition: a session-unique display name plus the metadata schema
# URI (the "gs://" scheme is added here because DATA_SCHEMA omits it).
dataset = dict(
    display_name="labeling_" + TIMESTAMP,
    metadata_schema_uri="gs://" + DATA_SCHEMA,
)

# Preview only: build the request message and print it as JSON. Nothing is
# sent to the service in this cell.
print(
    MessageToJson(
        vars(aip.CreateDatasetRequest(parent=PARENT, dataset=dataset))["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataset": {
    "displayName": "labeling_20210303215432",
    "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/image_1.0.0.yaml"
  }
}
```

#### 调用

In [None]:
# Submit the dataset creation. The returned object is kept in `request`;
# its result() is read in the next cell.
request = clients["dataset"].create_dataset(parent=PARENT, dataset=dataset)

#### 响应

In [None]:
# Obtain the outcome of the create_dataset call (presumably blocks until the
# operation has finished -- confirm against the client library docs).
result = request.result()

# Print the returned message as JSON.
print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/1165112889535627264",
  "displayName": "labeling_20210303215432",
  "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/image_1.0.0.yaml",
  "labels": {
    "aiplatform.googleapis.com/dataset_metadata_schema": "IMAGE"
  },
  "metadata": {
    "dataItemSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/dataitem/image_1.0.0.yaml"
  }
}
```

In [None]:
# Fully qualified resource name of the dataset, as returned by the service
dataset_id = result.name
# Short ID: the segment after the last "/" of the resource name
dataset_short_id = dataset_id.rsplit("/", 1)[-1]

print(dataset_id)

### [projects.locations.datasets.import](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/import)

#### 请求

In [None]:
LABEL_SCHEMA = IMPORT_SCHEMA_IMAGE_CLASSIFICATION

# Import configuration: where the import file lives and which schema
# describes its format.
import_config = dict(
    gcs_source={"uris": [IMPORT_FILE]},
    import_schema_uri=LABEL_SCHEMA,
)

# Preview only: print the request message as JSON.
# NOTE(review): this preview passes the short dataset ID as `name`, whereas
# the actual import_data call in this notebook passes the full `dataset_id`.
print(
    MessageToJson(
        vars(
            aip.ImportDataRequest(
                name=dataset_short_id, import_configs=[import_config]
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "name": "1165112889535627264",
  "importConfigs": [
    {
      "gcsSource": {
        "uris": [
          "gs://migration-ucaip-trainingaip-20210303215432/labeling.csv"
        ]
      },
      "importSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/ioformat/image_classification_single_label_io_format_1.0.0.yaml"
    }
  ]
}
```

#### 调用

In [None]:
# Start importing the data listed in the import file into the dataset,
# addressing the dataset by its full resource name (`dataset_id`).
request = clients["dataset"].import_data(
    name=dataset_id, import_configs=[import_config]
)

#### 响应

In [None]:
# Obtain the outcome of the import_data call (presumably blocks until the
# import has finished -- confirm against the client library docs).
result = request.result()

# Print the returned message as JSON.
print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{}
```

## 创建数据标注专家池

如果您无法访问标注服务，请执行本部分。

In [None]:
# Add a client for the specialist pool service to the shared `clients`
# registry, using the same client options as the other clients.
clients["specialist_pool"] = aip.SpecialistPoolServiceClient(
    client_options=client_options
)

### [projects.locations.specialistPools.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.specialistPools/create)

#### 请求

在这个部分，您将用您的电子邮件地址替换 [your-email-address] 。 这将使您成为标记请求的专家和接收者。

In [None]:
# Replace the placeholder with your own address; it is listed as the
# specialist manager of the pool.
EMAIL = "[your-email-address]"

specialist_pool = dict(
    name="labeling_" + TIMESTAMP,  # the resource name of the SpecialistPool
    display_name="labeling_" + TIMESTAMP,  # user-defined name of the SpecialistPool
    specialist_manager_emails=[EMAIL],
)

# Preview only: print the request message as JSON.
print(
    MessageToJson(
        vars(
            aip.CreateSpecialistPoolRequest(
                parent=PARENT, specialist_pool=specialist_pool
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "specialistPool": {
    "name": "labeling_20210303215432",
    "displayName": "labeling_20210303215432",
    "specialistManagerEmails": [
      "dev@fourteen33.com"
    ]
  }
}
```

#### 调用

In [None]:
# Submit the specialist pool creation. The returned object is kept in
# `request`; its result() is read in the next cell.
request = clients["specialist_pool"].create_specialist_pool(
    parent=PARENT, specialist_pool=specialist_pool
)

#### 响应

In [None]:
# Obtain the outcome of the create_specialist_pool call (presumably blocks
# until the operation has finished -- confirm against the client library docs).
result = request.result()

# Print the returned message as JSON.
print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/specialistPools/1167839678372511744"
}
```

In [None]:
# Fully qualified resource name of the specialist pool
specialist_name = result.name

# Short ID: the segment after the last "/" of the resource name
specialist_id = specialist_name.rsplit("/", 1)[-1]

print(specialist_name)

## 创建数据标注作业

### [projects.locations.dataLabelingJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.dataLabelingJobs/create)

In [None]:
# create placeholder file for valid PDF file with instruction for data labeling
! echo "this is instruction" >> instruction.txt | gsutil cp instruction.txt gs://$BUCKET_NAME

#### 请求

In [None]:
# The variable name's spelling ("LABLEING") is kept as-is so that the
# notebook namespace is unchanged.
LABLEING_SCHEMA = LABELING_SCHEMA_IMAGE
INSTRUCTION_FILE = "gs://" + BUCKET_NAME + "/instruction.txt"

# Job inputs as a protobuf Value: the label(s) offered to the labelers.
inputs = ParseDict({"annotation_specs": ["rose"]}, Value())

# Labeling job definition: which dataset to label, how many labelers per
# item, where the instructions are, and which specialist pool does the work.
data_labeling_job = dict(
    display_name="labeling_" + TIMESTAMP,
    datasets=[dataset_id],
    labeler_count=1,
    instruction_uri=INSTRUCTION_FILE,
    inputs_schema_uri=LABLEING_SCHEMA,
    inputs=inputs,
    annotation_labels={
        "aiplatform.googleapis.com/annotation_set_name": "data_labeling_job_specialist_pool"
    },
    specialist_pools=[specialist_name],
)

# Preview only: print the request message as JSON.
print(
    MessageToJson(
        vars(
            aip.CreateDataLabelingJobRequest(
                parent=PARENT, data_labeling_job=data_labeling_job
            )
        )["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataLabelingJob": {
    "displayName": "labeling_20210303215432",
    "datasets": [
      "projects/116273516712/locations/us-central1/datasets/1165112889535627264"
    ],
    "labelerCount": 1,
    "instructionUri": "gs://migration-ucaip-trainingaip-20210303215432/instruction.txt",
    "inputsSchemaUri": "gs://google-cloud-aiplatform/schema/datalabelingjob/inputs/image_classification_1.0.0.yaml",
    "inputs": {
      "annotation_specs": [
        "rose"
      ]
    },
    "annotationLabels": {
      "aiplatform.googleapis.com/annotation_set_name": "data_labeling_job_specialist_pool"
    },
    "specialistPools": [
      "projects/116273516712/locations/us-central1/specialistPools/1167839678372511744"
    ]
  }
}
```

#### 调用

In [None]:
# Create the data labeling job. The value returned by the call is stored in
# `request` and is printed and read (`request.name`) by the following cells.
request = clients["job"].create_data_labeling_job(
    parent=PARENT, data_labeling_job=data_labeling_job
)

#### 响应

In [None]:
# Despite its name, `request` holds the value returned by
# create_data_labeling_job; print it as JSON.
print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/dataLabelingJobs/3830883229125050368",
  "displayName": "labeling_20210303215432",
  "datasets": [
    "projects/116273516712/locations/us-central1/datasets/1165112889535627264"
  ],
  "labelerCount": 1,
  "instructionUri": "gs://migration-ucaip-trainingaip-20210303215432/instruction.txt",
  "inputsSchemaUri": "gs://google-cloud-aiplatform/schema/datalabelingjob/inputs/image_classification_1.0.0.yaml",
  "inputs": {
    "annotationSpecs": [
      "rose"
    ]
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-03-03T21:55:31.239049Z",
  "updateTime": "2021-03-03T21:55:31.239049Z"
}
```

In [None]:
# Resource name of the labeling job, taken from the create call's return
# value; used below to get, cancel and delete the job.
labeling_task_name = request.name

print(labeling_task_name)

### [projects.locations.dataLabelingJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.dataLabelingJobs/get)

#### 调用

In [None]:
# Fetch the labeling job by its resource name; the returned job overwrites `request`.
request = clients["job"].get_data_labeling_job(name=labeling_task_name)

#### 响应

In [None]:
# `request` holds the value returned by get_data_labeling_job; print it as JSON.
print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/dataLabelingJobs/3830883229125050368",
  "displayName": "labeling_20210303215432",
  "datasets": [
    "projects/116273516712/locations/us-central1/datasets/1165112889535627264"
  ],
  "labelerCount": 1,
  "instructionUri": "gs://migration-ucaip-trainingaip-20210303215432/instruction.txt",
  "inputsSchemaUri": "gs://google-cloud-aiplatform/schema/datalabelingjob/inputs/image_classification_1.0.0.yaml",
  "inputs": {
    "annotationSpecs": [
      "rose"
    ]
  },
  "state": "JOB_STATE_PENDING",
  "createTime": "2021-03-03T21:55:31.239049Z",
  "updateTime": "2021-03-03T21:55:31.239049Z",
  "specialistPools": [
    "projects/116273516712/locations/us-central1/specialistPools/1167839678372511744"
  ]
}
```

### [projects.locations.dataLabelingJobs.cancel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.dataLabelingJobs/cancel)

#### 调用

In [None]:
# Ask the service to cancel the labeling job; the call's return value is kept in `request`.
request = clients["job"].cancel_data_labeling_job(name=labeling_task_name)

#### 响应

In [None]:
# Show what cancel_data_labeling_job returned.
print(request)

*示例输出*：
```
None
```

In [None]:
# Poll the job once a minute until the cancellation has taken effect.
# The loop also stops on the other terminal states (succeeded / failed):
# a job that finished before the cancel request landed never becomes
# CANCELLED, and would otherwise keep this cell running forever.
while True:
    response = clients["job"].get_data_labeling_job(name=labeling_task_name)
    if response.state == aip.JobState.JOB_STATE_CANCELLED:
        print("Labeling job CANCELED")
        break
    elif response.state in (
        aip.JobState.JOB_STATE_SUCCEEDED,
        aip.JobState.JOB_STATE_FAILED,
    ):
        print("Labeling job ended without being canceled:", response.state)
        break
    else:
        print("Canceling labeling job:", response.state)
        time.sleep(60)

## 清理

要清理此项目中使用的所有 GCP 资源，您可以删除用于本教程的 [GCP 项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
delete_dataset = True
delete_job = True
delete_specialist_pool = True
delete_bucket = True

# Delete the dataset using the Vertex AI fully qualified identifier for the dataset
try:
    if delete_dataset:
        clients["dataset"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the labeling job using the Vertex AI fully qualified identifier for the dataset
try:
    if delete_job:
        request = clients["job"].delete_data_labeling_job(name=labeling_task_name)
except Exception as e:
    print(e)

# Delete the specialist pool using the Vertex AI fully qualified identifier for the dataset
try:
    if delete_specialist_pool:
        clients["specialist_pool"].delete_specialist_pool(name=specialist_name)
except Exception as e:
    print(e)


if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME