In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# AutoML SDK：数据标注

*免责声明*：本笔记本仅用于说明目的。

## 安装

安装最新版本（预览版）的AutoML SDK。

In [None]:
# Install or upgrade the AutoML client library (pip `--user` install).
! pip3 install -U google-cloud-automl --user


In [None]:
# Install or upgrade the Data Labeling client library (pip `--user` install).
! pip3 install -U google-cloud-datalabeling --user


安装谷歌*云存储*库。

In [None]:
# Install the Cloud Storage client library.
! pip3 install google-cloud-storage


### 重新启动内核

安装了AutoML SDK和Google *cloud-storage*之后，您需要重新启动笔记本内核，以便它能找到这些包。

In [None]:
import os

# Restart the kernel so the packages installed above become importable.
# The restart is skipped when the AUTORUN environment variable is set.
if not os.getenv("AUTORUN"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)


## 开始之前

### GPU 运行时

*如果条件允许，请确保在 GPU 运行时中运行此笔记本。在 Colab 中，选择* **运行时 > 更改运行时类型 > GPU**

### 设置您的 GCP 项目

**无论您的笔记本环境如何，以下步骤都是必需的。**

1. [选择或创建一个 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建一个帐户时，您将获得 $300 的免费信用用于计算/存储费用。

2. [确保为您的项目启用了计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 AutoML API 和 Compute Engine API。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 在 AutoML 笔记本中已经安装。

5. 在下面的单元格中输入您的项目 ID。然后运行该单元格，确保 Cloud SDK 对这个笔记本中的所有命令使用正确的项目。

**注意**：Jupyter 运行以 `!` 为前缀的行作为 shell 命令，并将以 `$` 为前缀的 Python 变量插入这些命令中。

In [None]:
# Project ID used by every gcloud/gsutil command and API call in this notebook.
# Leave the placeholder to have the next cell read it from the gcloud config.
PROJECT_ID = "[your-project-id]" #@param {type:"string"}


In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)


In [None]:
# Make PROJECT_ID the active project for subsequent gcloud commands.
! gcloud config set project $PROJECT_ID


#### 区域

您还可以更改“REGION”变量，该变量在本笔记本的其余部分中使用。下面是AutoML支持的区域。我们建议尽可能选择离您最近的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能使用多区域存储桶进行 AutoML 训练。并非所有区域都支持所有 AutoML 服务。有关每个区域的最新支持情况，请参阅 AutoML 服务的区域支持文档。

In [None]:
# Region used for the bucket and for the AutoML resource path.
REGION = "us-central1"  #@param {type: "string"}


#### 时间戳

如果您正在进行实时教程会话，您可能正在使用共享的测试账号或项目。为了避免用户在创建资源时发生名称冲突，您为每个实例会话创建一个时间戳，并将其附加到在本教程中将要创建的资源的名称中。

In [None]:
from datetime import datetime

# Session timestamp (YYYYMMDDHHMMSS, local time) appended to resource names
# so that users sharing a project do not collide.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"


### 验证您的GCP账户

**如果您正在使用AutoML笔记本**，您的环境已经被验证过。请跳过这一步。

*注意：如果您在AutoML笔记本上运行单元格，该单元格会自动跳过执行验证步骤。*

In [None]:
import os
import sys

# Authenticate this notebook session against Google Cloud.
#
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on AutoML, then don't execute this code
# (the presence of the env_version file is the marker that is checked).
if not os.path.exists('/opt/deeplearning/metadata/env_version'):
    if 'google.colab' in sys.modules:
        # Colab: use the interactive Colab auth helper.
        from google.colab import auth as google_auth
        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud
    # (runs in both the Colab and the local branch).
    ! gcloud auth login


### 创建一个云存储桶

**无论您使用的笔记本环境如何，都需要按照以下步骤进行操作。**

本教程旨在使用公共云存储桶中的训练数据以及本地云存储桶进行批量预测。您也可以选择使用您自己存储在本地云存储桶中的训练数据。

在下方设置您的云存储桶的名称。它必须在所有云存储桶中保持唯一。

In [None]:
# Cloud Storage bucket used by this notebook. Leave the placeholder to have
# the next cell derive a name from PROJECT_ID and TIMESTAMP.
BUCKET_NAME = "[your-bucket-name]" #@param {type:"string"}


In [None]:
# When no bucket name was supplied, derive one from the project ID and the
# session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "aip-", TIMESTAMP])


只有当您的存储桶不存在时：运行以下单元格以创建您的云存储桶。

In [None]:
# Create the bucket gs://$BUCKET_NAME in $REGION.
! gsutil mb -l $REGION gs://$BUCKET_NAME


最后，通过查看云存储桶中的内容来验证对其的访问权限：

In [None]:
# List the bucket contents to confirm that it is accessible.
! gsutil ls -al gs://$BUCKET_NAME


### 设置变量

接下来，设置一些在教程中使用的变量。

### 导入库并定义常量

#### 导入 AutoML SDK

将AutoML SDK导入我们的Python环境。

In [None]:
import json
import os
import sys
import time


from google.cloud import automl
from google.cloud import datalabeling_v1beta1 as datalabeling


from google.protobuf.json_format import MessageToJson
from google.protobuf.json_format import ParseDict
from googleapiclient.discovery import build


#### AutoML 常量

为AutoML设置以下常量：

- `PARENT`：数据集、模型和端点资源的AutoML位置根路径。

In [None]:
# Location root path under which the dataset, model and endpoint resources live.
PARENT = "/".join(["projects", PROJECT_ID, "locations", REGION])


## 客户端

AutoML SDK 以客户端/服务器模型运行。在您的一侧（Python 脚本）中，您将创建一个客户端，向服务器（AutoML）发送请求并接收响应。

在本教程中，您将使用多个客户端，因此请提前设置它们。

In [None]:
def data_labeling_client():
    """Return a new client for the Data Labeling service."""
    return datalabeling.DataLabelingServiceClient()


def operations_client():
    """Return the operations client held by a new Data Labeling client.

    Note: this reaches into the service client's private ``_transport``
    attribute.
    """
    service_client = datalabeling.DataLabelingServiceClient()
    return service_client._transport.operations_client


# Create every client up front; later cells look them up by key.
clients = {
    "labeling": data_labeling_client(),
    "operations": operations_client(),
}

# Show each (key, client) pair.
for entry in clients.items():
    print(entry)


In [None]:
import tensorflow as tf


LABELING_FILES = [
    "https://raw.githubusercontent.com/googleapis/python-aiplatform/master/samples/snippets/resources/daisy.jpg"
]

IMPORT_FILE = "gs://" + BUCKET_NAME + '/labeling.csv'
with tf.io.gfile.GFile(IMPORT_FILE, 'w') as f:
    for lf in LABELING_FILES:
        ! wget {lf} | gsutil cp {lf.split("/")[-1]} gs://{BUCKET_NAME}
        f.write("gs://" + BUCKET_NAME + "/" + lf.split("/")[-1] + "\n")
    

In [None]:
# Print the CSV index that was just written to the bucket.
! gsutil cat $IMPORT_FILE


*示例输出*：
```
gs://migration-ucaip-trainingaip-20210303171756/daisy.jpg
```

## 创建一个数据集

### [projects.locations.datasets.create](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/create)

#### 请求

In [None]:
# Dataset resource to create; the session timestamp is part of both fields.
dataset = dict(
    display_name="labeling_" + TIMESTAMP,
    description="labeling_" + TIMESTAMP,
)

# Build the request message only to preview its JSON form; nothing is sent here.
print(
    MessageToJson(
        vars(datalabeling.CreateDatasetRequest(parent=PARENT, dataset=dataset))["_pb"]
    )
)


*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataset": {
    "displayName": "labeling_20210303171756",
    "description": "labeling_20210303171756"
  }
}
```

#### 调用

In [None]:
# Send the create-dataset call; `request` receives the value the client returns.
request = clients["labeling"].create_dataset(parent=PARENT, dataset=dataset)


#### 响应

In [None]:
# Print the returned message as JSON via its underlying `_pb` protobuf.
print(MessageToJson(vars(request)["_pb"]))


*示例输出*：
```
{
  "name": "projects/migration-ucaip-training/datasets/60401f09_0000_2cac_bcb5_3c286d3b27b6",
  "displayName": "labeling_20210303171756",
  "description": "labeling_20210303171756",
  "createTime": "2021-03-04T13:13:20.227435060Z"
}
```

In [None]:
# Full resource name of the dataset, taken from the create response.
dataset_id = request.name
# Trailing path segment of that resource name.
dataset_short_id = dataset_id.rsplit("/", 1)[-1]

print(dataset_id)



### [projects.locations.datasets.importData](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/importData)

#### 请求

In [None]:
# Describe the import: image data listed in the CSV index stored in the bucket.
input_config = dict(
    data_type="IMAGE",
    gcs_source=dict(input_uri=IMPORT_FILE, mime_type="text/csv"),
)

# Build the request message only to preview its JSON form; nothing is sent here.
print(
    MessageToJson(
        vars(
            datalabeling.ImportDataRequest(name=dataset_id, input_config=input_config)
        )["_pb"]
    )
)


*示例输出*：
```
{
  "name": "projects/migration-ucaip-training/datasets/60401f09_0000_2cac_bcb5_3c286d3b27b6",
  "inputConfig": {
    "dataType": "IMAGE",
    "gcsSource": {
      "inputUri": "gs://migration-ucaip-trainingaip-20210303171756/labeling.csv",
      "mimeType": "text/csv"
    }
  }
}
```

#### 调用

In [None]:
# Start the import; the next cell waits on the returned object via `.result()`.
request = clients["labeling"].import_data(name=dataset_id, input_config=input_config)


#### 响应

In [None]:
# Wait for the import operation to finish and fetch its result message.
result = request.result()

# Serialize the underlying protobuf (`_pb`), as every other response cell in this
# notebook does. MessageToJson expects a raw protobuf message, not the wrapper
# object returned by the client.
print(MessageToJson(result.__dict__["_pb"]))


*示例输出*：
```
{}
```

## 创建数据标注工作
本节中的方法仅用于示例目的。

### [projects.annotationSpecSets.create](https://cloud.google.com/ai-platform/data-labeling/docs/reference/rest/v1beta1/projects.annotationSpecSets/create)

#### 请求

In [None]:
# Annotation spec set with a single label ("rose") for the labeling task.
annotation_spec_set = dict(
    display_name="labeling_" + TIMESTAMP,
    description="description",
    annotation_specs=[
        dict(display_name="rose", description="rose description"),
    ],
)

# Build the request message only to preview its JSON form; nothing is sent here.
print(
    MessageToJson(
        vars(
            datalabeling.CreateAnnotationSpecSetRequest(
                parent="projects/" + PROJECT_ID,
                annotation_spec_set=annotation_spec_set,
            )
        )["_pb"]
    )
)


*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training",
  "annotationSpecSet": {
    "displayName": "labeling_20210303171756",
    "description": "description",
    "annotationSpecs": [
      {
        "displayName": "rose",
        "description": "rose description"
      }
    ]
  }
}
```

#### 调用

In [None]:
# Create the annotation spec set under the project (no location in the parent path).
project_parent = "projects/" + PROJECT_ID
request = clients["labeling"].create_annotation_spec_set(
    parent=project_parent, annotation_spec_set=annotation_spec_set
)


#### 响应

In [None]:
# Print the returned message as JSON via its underlying `_pb` protobuf.
print(MessageToJson(vars(request)["_pb"]))


*示例输出*：
```
{
  "name": "projects/migration-ucaip-training/annotationSpecSets/60401d46_0000_2c8d_aa13_883d24f7e3c8",
  "displayName": "labeling_20210303171756",
  "description": "description",
  "annotationSpecs": [
    {
      "displayName": "rose",
      "description": "rose description"
    }
  ]
}
```

In [None]:
# Resource name of the annotation spec set; the labeling request cell and the
# cleanup cell both use it.
annotation_spec_set_name = request.name

print(annotation_spec_set_name)


### [projects.instructions.create](https://cloud.google.com/ai-platform/data-labeling/docs/reference/rest/v1beta1/projects.instructions/create)

In [None]:
# create placeholder file for valid PDF file with instruction for data labeling
! echo "this is instruction" >> instruction.txt | gsutil cp instruction.txt gs://$BUCKET_NAME    


#### 请求

In [None]:
# Bucket location of the placeholder instruction file uploaded above.
INSTRUCTION_FILE = "".join(["gs://", BUCKET_NAME, "/instruction.txt"])

# Instruction resource pointing at that file.
instruction = dict(
    display_name="labeling_" + TIMESTAMP,
    description="description",
    data_type="IMAGE",
    pdf_instruction=dict(gcs_file_uri=INSTRUCTION_FILE),
)

# Build the request message only to preview its JSON form; nothing is sent here.
print(
    MessageToJson(
        vars(
            datalabeling.CreateInstructionRequest(
                parent="projects/" + PROJECT_ID,
                instruction=instruction,
            )
        )["_pb"]
    )
)
   

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training",
  "instruction": {
    "displayName": "labeling_20210303171756",
    "description": "description",
    "dataType": "IMAGE",
    "pdfInstruction": {
      "gcsFileUri": "gs://migration-ucaip-trainingaip-20210303171756/instruction.txt"
    }
  }
}
```

#### 调用

In [None]:
# Start instruction creation; the next cell waits on the returned object via `.result()`.
project_parent = "projects/" + PROJECT_ID
request = clients["labeling"].create_instruction(
    parent=project_parent, instruction=instruction
)


#### 响应

In [None]:
# Wait for the operation to finish, then print its result message as JSON.
result = request.result()
print(MessageToJson(vars(result)["_pb"]))


*示例输出*：
```
{
  "name": "projects/migration-ucaip-training/instructions/60401ef8_0000_2cac_bcb5_3c286d3b27b6",
  "displayName": "labeling_20210303171756",
  "description": "description",
  "createTime": "1970-01-01T00:00:00Z",
  "dataType": "IMAGE",
  "pdfInstruction": {
    "gcsFileUri": "gs://migration-ucaip-trainingaip-20210303171756/instruction.txt"
  }
}
```

In [None]:
# Resource name of the instruction; the labeling request cell and the cleanup
# cell both use it.
instruction_name = result.name

print(instruction_name)


### [projects.datasets.image.label](https://cloud.google.com/ai-platform/data-labeling/docs/reference/rest/v1beta1/projects.datasets.image/label)

#### 请求

In [None]:
# Contributor e-mail listed on the labeling task.
# NOTE(review): hard-coded address; replace with your own contributor before running.
EMAIL = "dev@fourteen33.com"

# Settings common to the labeling task: which instruction to follow, how the
# annotated dataset is named, and who labels it.
basic_config = dict(
    instruction=instruction_name,
    annotated_dataset_display_name="labeling_" + TIMESTAMP,
    label_group="rose",
    replica_count=1,
    contributor_emails=[EMAIL],
)

feature = "CLASSIFICATION"

# Classification-specific settings: label set, single-label, majority vote.
config = dict(
    annotation_spec_set=annotation_spec_set_name,
    allow_multi_label=False,
    answer_aggregation_type="MAJORITY_VOTE",
)

# Build the request message only to preview its JSON form; nothing is sent here.
print(
    MessageToJson(
        vars(
            datalabeling.LabelImageRequest(
                parent=dataset_id,
                basic_config=basic_config,
                feature=feature,
                image_classification_config=config,
            )
        )["_pb"]
    )
)


*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1/datasets/60401f09_0000_2cac_bcb5_3c286d3b27b6",
  "basicConfig": {
    "instruction": "projects/migration-ucaip-training/instructions/60401ef8_0000_2cac_bcb5_3c286d3b27b6",
    "annotatedDatasetDisplayName": "labeling_20210303171756",
    "labelGroup": "rose",
    "replicaCount": 1,
    "contributorEmails": [
      "dev@fourteen33.com"
    ]
  },
  "feature": "CLASSIFICATION",
  "imageClassificationConfig": {
    "annotationSpecSet": "projects/migration-ucaip-training/annotationSpecSets/60401d46_0000_2c8d_aa13_883d24f7e3c8",
    "answerAggregationType": "MAJORITY_VOTE"
  }
}
```

#### 调用

In [None]:
# Start the labeling job; all fields are passed as one request mapping.
label_image_request = dict(
    parent=dataset_id,
    basic_config=basic_config,
    feature=feature,
    image_classification_config=config,
)
request = clients["labeling"].label_image(request=label_image_request)


#### 响应

In [None]:
# `label_image` returns a long-running-operation future, not a response message,
# so it has no `_pb` entry in `__dict__`. Its `.operation` attribute (also read
# in the next cell) is the operation protobuf that MessageToJson can serialize.
print(MessageToJson(request.operation))


*示例输出*：

In [None]:
# Name of the labeling operation; used below to get, cancel and delete it.
labeling_job_name = request.operation.name

print(labeling_job_name)


### [projects.operations.get](https://cloud.google.com/ai-platform/data-labeling/docs/reference/rest/v1beta1/projects.operations/get)

#### 调用

In [None]:
# Look up the current state of the labeling operation by name.
request = clients["operations"].get_operation(name=labeling_job_name)


#### 响应

In [None]:
# The operations client returns a plain protobuf Operation message, which
# MessageToJson accepts directly; there is no `_pb` wrapper attribute to unwrap.
print(MessageToJson(request))


*示例输出*：

### [projects.operations.cancel](https://cloud.google.com/ai-platform/data-labeling/docs/reference/rest/v1beta1/projects.operations/cancel)

#### 调用

In [None]:
# Ask the service to cancel the labeling operation.
request = clients["operations"].cancel_operation(name=labeling_job_name)


#### 响应

In [None]:
# `cancel_operation` returns None on success (failures surface as exceptions),
# so there is no message to serialize; print the return value as-is.
print(request)


*示例输出*:

# 清理工作

要清理本项目中使用的所有GCP资源，您可以 [删除用于教程的GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的单个资源。

In [None]:
delete_dataset = True
delete_annotation_spec = True
delete_instruction = True
delete_labeling_job = True
delete_bucket = True


try:
    if delete_dataset:
        clients["operations"].delete_operation(name=labeling_job_name)
except Exception as e:
    print(e)
    
try:
    if delete_dataset:
        clients["labeling"].delete_instruction(name=instruction_name)
except Exception as e:
    print(e)
    
try:
    if delete_dataset:
        clients["labeling"].delete_annotation_spec_set(name=annotation_spec_set_name)
except Exception as e:
    print(e)

try:
    if delete_dataset:
        clients["labeling"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)


if delete_bucket and 'BUCKET_NAME' in globals():
    ! gsutil rm -r gs://$BUCKET_NAME
