In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 自动机器学习自然语言文本分类模型

## 安装

安装最新版本的AutoML SDK。

In [None]:
! pip3 install google-cloud-automl

安装谷歌*云存储*库。

In [None]:
! pip3 install google-cloud-storage

### 重新启动内核

安装完AutoML SDK和谷歌*云存储*后，您需要重新启动笔记本内核，以便它能找到这些软件包。

In [None]:
import os

# Restart the kernel so the packages installed above become importable.
# The restart is skipped whenever the AUTORUN environment variable is set to a
# non-empty value.
if not os.environ.get("AUTORUN"):
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(restart=True)

## 开始之前

### GPU 运行时

*如果可以选择，请确保在 GPU 运行时中运行此笔记本。在 Colab 中，选择* **Runtime > Change Runtime Type > GPU**

### 设置你的 GCP 项目

**无论你的笔记本环境如何，以下步骤都是必要的。**

1. [选择或创建一个 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当你第一次创建一个账户时，你会获得 $300 的免费信用用于支持你的计算/存储成本。

2. [确保你的项目已启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 AutoML API 和 Compute Engine API。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已经在 AutoML 笔记本中安装好了。

5. 在下面的单元格中输入你的项目 ID。然后运行该单元格，确保 Cloud SDK 在本笔记本中的所有命令中都使用正确的项目。

**注意**: Jupyter 运行以 `!` 开头的行作为 shell 命令，并且它插入带有 `$` 前缀的 Python 变量到这些命令中。

#### 项目 ID

**如果你不知道你的项目ID**，请尝试使用`gcloud`命令在下面执行第二个单元格来获取你的项目ID。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### 地区

您还可以更改`REGION`变量，该变量用于本笔记本的其余操作。以下是AutoML支持的地区。我们建议在可能的情况下，选择离您最近的地区。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能使用多地区存储桶来进行AutoML训练。并非所有地区都支持所有AutoML服务。有关每个地区的最新支持情况，请参阅 AutoML 服务地区支持文档。

In [None]:
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您正在进行现场教程会话，则可能正在使用共享测试账户或项目。为了避免在创建的资源上发生用户之间的名称冲突，您需要为每个实例会话创建一个时间戳，并附加到在本教程中将创建的资源的名称上。

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### 对您的GCP账户进行身份验证

**如果您正在使用AutoML笔记本**，您的环境已经通过身份验证。跳过这一步。

*提示: 如果您正在使用AutoML笔记本并运行该单元格，单元格会自动跳过执行身份验证步骤。*

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# Skip authentication entirely when the marker file below exists.
# NOTE(review): the file is presumably present only on Google-managed notebook
# images, which are already authenticated — confirm.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud. This runs after either branch
    # above (Colab or local).
    ! gcloud auth login

### 创建一个云存储桶

**无论您使用的笔记本环境如何，下面的步骤都是必需的。**

本教程旨在使用位于公共云存储桶中的训练数据，并为您的批量预测使用本地云存储桶。您也可以使用自己存储在本地云存储桶中的训练数据。

请在下方设置您的云存储桶的名称。该名称必须在所有云存储桶中唯一。

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When no bucket name was supplied (empty, None, or the placeholder), derive
# one from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = PROJECT_ID + "aip-" + TIMESTAMP

只有在您的存储桶不存在时才运行以下单元格来创建您的云存储存储桶。

In [None]:
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查云存储桶的内容来验证对其的访问。

In [None]:
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在教程中使用的变量。

### 导入库并定义常量

#### 导入AutoML SDK

将 AutoML SDK 导入我们的 Python 环境中。

In [None]:
import json
import time

from google.cloud import automl
from google.protobuf.json_format import MessageToJson

#### 自动机器学习常量

为自动机器学习设置以下常量：

- `PARENT`：用于数据集、模型和端点资源的AutoML根路径。

In [None]:
# AutoML location root path for your dataset, model and endpoint resources,
# built from the project ID and region chosen above.
PARENT = "projects/" + PROJECT_ID + "/locations/" + REGION

## 客户端

AutoML SDK 作为一个客户端/服务器模型运行。在你的一侧（Python脚本），你将创建一个客户端，该客户端向服务器（AutoML）发送请求并接收响应。

在本教程中，您将使用多个客户端，因此请提前设置好它们。

In [None]:
def automl_client():
    """Return a new AutoML client (`automl.AutoMlClient`)."""
    return automl.AutoMlClient()


def prediction_client():
    """Return a new prediction client (`automl.PredictionServiceClient`)."""
    return automl.PredictionServiceClient()


def operations_client():
    """Return the operations client held by a fresh AutoML client's transport."""
    transport = automl.AutoMlClient()._transport
    return transport.operations_client


# Build every client up front; later cells look them up by key.
clients = {
    "automl": automl_client(),
    "prediction": prediction_client(),
    "operations": operations_client(),
}

# Show each (name, client) pair that was created.
for client in clients.items():
    print(client)

In [None]:
IMPORT_FILE = "gs://cloud-ml-data/NL-classification/happiness.csv"

In [None]:
! gsutil cat $IMPORT_FILE | head -n 10

*示例输出*：
```
我和一个我感到同情和联系的人成功约会了.,affection
当我儿子考试得了90%的分数时我很高兴,affection
今天早上我去了健身房做瑜伽.,exercise
我们和最近有点靠不住的一些朋友认真地谈了一次。他们理解了，我们一起度过了一个愉快的晚上。,bonding
我和孙子孙女一起去了克罗恩植物园的蝴蝶展,affection
昨晚我冥想了.,leisure
"我做了一个新的农民面包食谱，而且做得非常棒!",achievement
我收到了哥哥送的礼物，真的让我很惊讶,affection
昨天是我妈妈的生日，所以我很享受,enjoy_the_moment
我和我三个青少年孩子一起看杯子蛋糕大战。,affection
```

## 创建数据集

### [projects.locations.datasets.create](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/create) 

### [项目位置数据集创建](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/create)

#### 请求

In [None]:
# Dataset definition: a timestamped display name plus text-classification
# metadata with the MULTICLASS classification type.
dataset = dict(
    display_name="happiness_" + TIMESTAMP,
    text_classification_dataset_metadata=dict(classification_type="MULTICLASS"),
)

# Preview the request as JSON without sending it.
print(
    MessageToJson(
        automl.CreateDatasetRequest(dataset=dataset, parent=PARENT).__dict__["_pb"]
    )
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataset": {
    "displayName": "happiness_20210228224317",
    "textClassificationDatasetMetadata": {
      "classificationType": "MULTICLASS"
    }
  }
}
```

#### 调用

In [None]:
request = clients["automl"].create_dataset(parent=PARENT, dataset=dataset)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/TCN2705019056410329088"
}
```

In [None]:
# Fully qualified resource name of the dataset, taken from the create response
dataset_id = result.name
# Short ID of the dataset: the last path segment of the resource name
dataset_short_id = dataset_id.rsplit("/", 1)[-1]

print(dataset_id)

### [项目.位置.数据集.导入数据](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/importData)

#### 请求

In [None]:
input_config = {"gcs_source": {"input_uris": [IMPORT_FILE]}}

print(
    MessageToJson(
        automl.ImportDataRequest(name=dataset_id, input_config=input_config).__dict__[
            "_pb"
        ]
    )
)

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/TCN2705019056410329088",
  "inputConfig": {
    "gcsSource": {
      "inputUris": [
        "gs://cloud-ml-data/NL-classification/happiness.csv"
      ]
    }
  }
}
```

#### 调用

In [None]:
request = clients["automl"].import_data(name=dataset_id, input_config=input_config)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result))

示例输出：
```
{}
```

## 训练模型

### [projects.locations.models.create](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/create) 

### [项目.位置.模型.创建](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/create)

#### 请求

In [None]:
model = automl.Model(
    display_name="happiness_" + TIMESTAMP,
    dataset_id=dataset_short_id,
    text_classification_model_metadata=automl.TextClassificationModelMetadata(),
)

print(
    MessageToJson(automl.CreateModelRequest(parent=PARENT, model=model).__dict__["_pb"])
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "model": {
    "displayName": "happiness_20210228224317",
    "datasetId": "TCN2705019056410329088",
    "textClassificationModelMetadata": {}
  }
}
```


#### 调用

In [None]:
request = clients["automl"].create_model(parent=PARENT, model=model)

#### 回复

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

示例输出：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TCN5333697920992542720"
}
```

In [None]:
# The full unique ID (resource name) of the trained model, taken from the
# create-model response
model_id = result.name
# The short ID of the model: the last path segment of the resource name
model_short_id = model_id.split("/")[-1]

print(model_short_id)

## 评估模型

### [projects.locations.models.modelEvaluations.list](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/list)

### [项目.位置.模型.模型评估.列表](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/list)

#### 调用

In [None]:
request = clients["automl"].list_model_evaluations(parent=model_id, filter="")

#### 回复

In [None]:
# Iterate the pager itself rather than `request.model_evaluation`: the
# attribute only exposes the page currently held by the pager, while iteration
# also fetches any further pages.
model_evaluations = list(request)

# Convert each evaluation to a plain dict so the list can be pretty-printed.
evaluations_list = [
    json.loads(MessageToJson(me.__dict__["_pb"])) for me in model_evaluations
]

print(json.dumps(evaluations_list, indent=2))

# Fail with a clear message instead of an IndexError when nothing came back.
if not model_evaluations:
    raise ValueError("No model evaluations were returned for " + model_id)

# The evaluation slice: resource name of the first evaluation returned
evaluation_slice = model_evaluations[0].name

*示例输出*：
```
[
  {
    "name": "projects/116273516712/locations/us-central1/models/TCN5333697920992542720/modelEvaluations/1436745357261371663",
    "annotationSpecId": "3130761503557287936",
    "createTime": "2021-03-01T02:56:28.878044Z",
    "evaluatedExampleCount": 1193,
    "classificationEvaluationMetrics": {
      "auPrc": 0.99065405,
      "confidenceMetricsEntry": [
        {
          "recall": 1.0,
          "precision": 0.01424979,
          "f1Score": 0.028099174
        },
        {
          "confidenceThreshold": 0.05,
          "recall": 1.0,
          "precision": 0.5862069,
          "f1Score": 0.73913044
        },
        {
          "confidenceThreshold": 0.94,
          "recall": 0.64705884,
          "precision": 1.0,
          "f1Score": 0.7857143
        },
        
        # 省略部分
        
        {
          "confidenceThreshold": 0.999,
          "recall": 0.21372032,
          "precision": 1.0,
          "f1Score": 0.35217392
        },
        {
          "confidenceThreshold": 1.0,
          "recall": 0.0026385225,
          "precision": 1.0,
          "f1Score": 0.005263158
        }
      ],
      "logLoss": 0.14686257
    },
    "displayName": "achievement"
  }
]
```

### [projects.locations.models.modelEvaluations.get](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/get)

### [projects.locations.models.modelEvaluations.get](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/get)

#### 调用

In [None]:
request = clients["automl"].get_model_evaluation(name=evaluation_slice)

#### 回复

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TCN5333697920992542720/modelEvaluations/1436745357261371663",
  "annotationSpecId": "3130761503557287936",
  "createTime": "2021-03-01T02:56:28.878044Z",
  "evaluatedExampleCount": 1193,
  "classificationEvaluationMetrics": {
    "auPrc": 0.99065405,
    "confidenceMetricsEntry": [
      {
        "recall": 1.0,
        "precision": 0.01424979,
        "f1Score": 0.028099174
      },
      {
        "confidenceThreshold": 0.05,
        "recall": 1.0,
        "precision": 0.5862069,
        "f1Score": 0.73913044
      },
      
      # 省略部分
      
      {
        "confidenceThreshold": 0.999,
        "recall": 0.23529412,
        "precision": 1.0,
        "f1Score": 0.3809524
      },
      {
        "confidenceThreshold": 1.0,
        "precision": 1.0
      }
    ],
    "logLoss": 0.005436425
  },
  "displayName": "exercise"
}
```

## 进行批量预测

### 为批处理预测准备文件

In [None]:
test_item = ! gsutil cat $IMPORT_FILE | head -n1
test_item, test_label = str(test_item[0]).split(",")

print(test_item, test_label)

In [None]:
import json

import tensorflow as tf

# Upload the test sentence as its own text file in the bucket.
test_item_uri = f"gs://{BUCKET_NAME}/test.txt"
with tf.io.gfile.GFile(test_item_uri, mode="w") as f:
    f.write(test_item + "\n")

# Write the batch input file: one line holding the URI of the text file above.
gcs_input_uri = f"gs://{BUCKET_NAME}/batch.csv"
with tf.io.gfile.GFile(gcs_input_uri, mode="w") as f:
    f.write(test_item_uri + "\n")

In [None]:
! gsutil cat $gcs_input_uri
! gsutil cat $test_item_uri

示例输出：
```
gs://migration-ucaip-trainingaip-20210228224317/test.txt
我跟一个我感到同情和有连接的人成功约会了。
```

### [projects.locations.models.batchPredict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/batchPredict)

### [项目.地点.模型.批量预测](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/batchPredict)

#### 请求

In [None]:
# Input: the batch file written above. Output: a folder in the same bucket.
input_config = dict(gcs_source=dict(input_uris=[gcs_input_uri]))

output_config = dict(
    gcs_destination=dict(output_uri_prefix=f"gs://{BUCKET_NAME}/batch_output/")
)

# Preview the batch prediction request as JSON without sending it.
print(
    MessageToJson(
        automl.BatchPredictRequest(
            name=model_id,
            input_config=input_config,
            output_config=output_config,
        ).__dict__["_pb"]
    )
)

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TCN5333697920992542720",
  "inputConfig": {
    "gcsSource": {
      "inputUris": [
        "gs://migration-ucaip-trainingaip-20210228224317/batch.csv"
      ]
    }
  },
  "outputConfig": {
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210228224317/batch_output/"
    }
  }
}
```

#### 调用

In [None]:
request = clients["prediction"].batch_predict(
    name=model_id, input_config=input_config, output_config=output_config
)

#### 回复

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{}
```

In [None]:
# Output prefix configured for the batch job, minus its final character (the
# trailing "/"), so the shell globs below can append their own path segments.
destination_uri = output_config["gcs_destination"]["output_uri_prefix"][:-1]

# List everything under the output prefix, then print the prediction results
# (JSONL files inside the prediction* folders).
! gsutil ls $destination_uri/*
! gsutil cat $destination_uri/prediction*/*.jsonl

*示例输出*：
```
gs://migration-ucaip-trainingaip-20210228224317/batch_output/prediction-happiness_20210228224317-2021-03-01T02:57:02.004934Z/text_classification_1.jsonl
gs://migration-ucaip-trainingaip-20210228224317/batch_output/prediction-happiness_20210228224317-2021-03-01T02:57:02.004934Z/text_classification_2.jsonl
{"textSnippet":{"contentUri":"gs://migration-ucaip-trainingaip-20210228224317/test.txt"},"annotations":[{"annotationSpecId":"5436604512770981888","classification":{"score":0.93047273},"displayName":"affection"},{"annotationSpecId":"3707222255860711424","classification":{"score":0.002518793},"displayName":"achievement"},{"annotationSpecId":"7742447521984675840","classification":{"score":1.3182563E-4},"displayName":"enjoy_the_moment"},{"annotationSpecId":"824918494343593984","classification":{"score":0.06613126},"displayName":"bonding"},{"annotationSpecId":"1977839998950440960","classification":{"score":1.5267624E-5},"displayName":"leisure"},{"annotationSpecId":"8318908274288099328","classification":{"score":8.887557E-6},"displayName":"nature"},{"annotationSpecId":"3130761503557287936","classification":{"score":7.2130124E-4},"displayName":"exercise"}]}
```

## 进行在线预测

### [projects.locations.models.deploy](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/deploy)

### [projects.locations.models.deploy](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/deploy)

#### 调用

In [None]:
request = clients["automl"].deploy_model(name=model_id)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result))

*示例输出*：
``` 
{}
```

### [projects.locations.models.predict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/predict)

### [projects.locations.models.predict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/predict)

### 为在线预测准备数据项

In [None]:
test_item = ! gsutil cat $IMPORT_FILE | head -n1
test_item, test_label = str(test_item[0]).split(",")

#### 请求

In [None]:
payload = {"text_snippet": {"content": test_item, "mime_type": "text/plain"}}

request = automl.PredictRequest(name=model_id, payload=payload)

print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TCN5333697920992542720",
  "payload": {
    "textSnippet": {
      "content": "我和一个我感到同情和连接的人成功约会了。",
      "mimeType": "text/plain"
    }
  }
}
```

#### 呼叫

In [None]:
request = clients["prediction"].predict(request=request)

#### 响应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

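*示例输出*：
```
{
  "payload": [
    {
      "annotationSpecId": "5436604512770981888",
      "classification": {
        "score": 0.9272586
      },
      "displayName": "affection"
    },
    {
      "annotationSpecId": "824918494343593984",
      "classification": {
        "score": 0.068884976
      },
      "displayName": "bonding"
    },
    {
      "annotationSpecId": "3707222255860711424",
      "classification": {
        "score": 0.0028119811
      },
      "displayName": "achievement"
    },
    {
      "annotationSpecId": "3130761503557287936",
      "classification": {
        "score": 0.0008869726
      },
      "displayName": "exercise"
    },
    {
      "annotationSpecId": "7742447521984675840",
      "classification": {
        "score": 0.00013229548
      },
      "displayName": "enjoy_the_moment"
    },
    {
      "annotationSpecId": "1977839998950440960",
      "classification": {
        "score": 1.5584701e-05
      },
      "displayName": "leisure"
    },
    {
      "annotationSpecId": "8318908274288099328",
      "classification": {
        "score": 9.5975e-06
      },
      "displayName": "nature"
    }
  ]
}
```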

## 清理

要清理此项目中使用的所有GCP资源，您可以[删除您用于本教程的GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源。

In [None]:
# Switches controlling which resources created by this tutorial are removed.
delete_dataset = True
delete_model = True
delete_bucket = True

# Delete the dataset using the AutoML fully qualified identifier for the dataset.
# Best-effort: any error (including dataset_id never having been defined) is
# printed and the cleanup continues.
# NOTE(review): the return value of delete_dataset is not awaited here —
# confirm whether the deletion should be waited on.
try:
    if delete_dataset:
        clients["automl"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the model using the AutoML fully qualified identifier for the model.
# Same best-effort handling as for the dataset above.
try:
    if delete_model:
        clients["automl"].delete_model(name=model_id)
except Exception as e:
    print(e)


# Remove the bucket recursively, but only if BUCKET_NAME was actually defined
# in this session.
if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME