In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 自动机器学习文本情感分析

## 安装

安装最新版本的AutoML SDK。

In [None]:
! pip3 install google-cloud-automl

安装Google云存储库。

In [None]:
! pip3 install google-cloud-storage

### 重新启动内核

安装完AutoML SDK和Google *cloud-storage*后，您需要重新启动笔记本内核，以便它可以找到这些包。

In [None]:
import os

# Restart the notebook kernel so that the packages installed by the earlier
# pip cells can be imported. The restart is skipped whenever the AUTORUN
# environment variable is set to a non-empty value.
if not os.getenv("AUTORUN"):
    # Automatically restart kernel after installs
    import IPython

    # Grab the running IPython application and ask its kernel to shut down;
    # the True argument is intended to request a restart (per the section text).
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## 开始之前

### GPU 运行时

*如果有的话，请确保在 GPU 运行时中运行此笔记本。在 Colab 中，选择* **运行时 > 更改运行时类型 > GPU**

### 设置您的 GCP 项目

**以下步骤是必需的，无论您的笔记本环境如何。**

1. [选择或创建一个 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得 $300 的免费信用，用于计算/存储成本。

2. [确保您的项目已启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 AutoML API 和 Compute Engine API。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已经安装在 AutoML 笔记本中。

5. 在下面的单元格中输入您的项目 ID。然后运行单元格，确保 Cloud SDK 对本笔记本中的所有命令使用正确的项目。

**注意**: Jupyter 会将以 `!` 开头的行作为 shell 命令运行，并将以 `$` 为前缀的 Python 变量的值插入这些命令中。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改`REGION`变量，该变量用于笔记本的其余部分的操作。以下是AutoML支持的区域。我们建议尽可能选择最接近您的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能使用多区域存储桶进行AutoML训练。并非所有区域都支持所有AutoML服务。有关每个区域的最新支持情况，请参阅AutoML文档中关于区域支持的说明。

In [None]:
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您在一个直播教程会话中，您可能正在使用一个共享的测试账户或项目。为了避免在创建的资源中用户之间的名称冲突，您为每个实例会话创建一个时间戳，并附加到将在本教程中创建的资源的名称上。

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### 验证您的GCP账号

**如果您正在使用AutoML笔记本**，您的环境已经经过身份验证。请跳过这一步。

*注意：如果您在AutoML笔记本上并运行该单元格，该单元格将自动跳过执行身份验证步骤。*

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# Skip authentication entirely when this metadata file exists; the notebook
# text treats that case as a hosted notebook that is already authenticated.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    # Colab is detected by the presence of the google.colab module; use its
    # helper to authenticate the current user.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud
    # (this runs after either branch above, i.e. for Colab and local notebooks)
    ! gcloud auth login

### 创建一个云端存储桶

**无论您的笔记本环境如何，都需要执行以下步骤。**

本教程旨在使用存储在公共云端存储桶中的训练数据以及本地云端存储桶进行批量预测。您也可以使用您存储在本地云端存储桶中的自己的训练数据。

在下方设置您的云端存储桶的名称。它必须在所有云端存储桶中是唯一的。

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
# Nothing usable was entered above (empty, None, or the untouched placeholder):
# derive a bucket name from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join((PROJECT_ID, "aip-", TIMESTAMP))

只有当您的存储桶不存在时：运行以下单元格以创建您的云存储存储桶。

In [None]:
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查云存储桶的内容来验证访问权限。

In [None]:
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在本教程中使用的变量。
### 导入库并定义常量

#### 导入 AutoML SDK

将 AutoML SDK 导入我们的 Python 环境。

In [None]:
import json
import os
import sys
import time

from google.cloud import automl
from google.protobuf.json_format import MessageToJson
from google.protobuf.struct_pb2 import Value

#### AutoML常量

为AutoML设置以下常量：

- `PARENT`：用于数据集、模型和端点资源的AutoML位置根路径。

In [None]:
# AutoML location root path ("projects/<project>/locations/<region>") under
# which the dataset and model resources of this tutorial are created
PARENT = "/".join(("projects", PROJECT_ID, "locations", REGION))

#### 客户端

AutoML SDK 采用客户端/服务器模型工作。在您这一侧（Python 脚本），您将创建一个客户端，用于向服务器（AutoML）发送请求并接收响应。

在本教程中，你将使用多个客户端，因此请提前设置它们。

In [None]:
def automl_client():
    """Create the AutoML service client.

    This client is used in this notebook for dataset, model and evaluation
    calls (create, import, list, get, deploy, delete).
    """
    return automl.AutoMlClient()


def predictions_client():
    """Create the prediction service client used for batch and online predictions."""
    return automl.PredictionServiceClient()


# Backward-compatible alias: the factory was originally published under this
# misspelled name, so keep it available for any cell that still refers to it.
perdictions_client = predictions_client


def operations_client():
    """Return the operations client exposed by a new AutoML client's transport."""
    return automl.AutoMlClient()._transport.operations_client


# Build every client once up front; later cells look them up by key.
clients = {}
clients["automl"] = automl_client()
clients["predictions"] = predictions_client()
clients["operations"] = operations_client()

# Each item is a (key, client) pair; print them to confirm the setup.
for client in clients.items():
    print(client)

In [None]:
import tensorflow as tf

# Pull the public sample CSV into memory, one string per row.
IMPORT_FILE = "gs://cloud-samples-data/language/claritin.csv"
with tf.io.gfile.GFile(IMPORT_FILE, "r") as source_file:
    content = source_file.readlines()

# Write a copy into the tutorial bucket, keeping only the text that precedes
# each row's last comma (i.e. the final column is dropped). From here on
# IMPORT_FILE refers to this copy.
IMPORT_FILE = "gs://" + BUCKET_NAME + "/claritin.csv"
with tf.io.gfile.GFile(IMPORT_FILE, "w") as bucket_file:
    for row in content:
        # rpartition yields "" for a row without any comma, matching a
        # split/join that keeps all but the last field.
        bucket_file.write(row.rpartition(",")[0] + "\n")

In [None]:
! gsutil cat $IMPORT_FILE | head -n 10

*示例输出*:
```
@freewrytin 上帝对克拉利汀太好了, 2
我需要克拉利汀。太糟糕了。我什么时候开始被过敏困扰了？, 3
感谢上帝赐予克拉利汀。, 4
"更糟糕的是，昨天我达到了鼻喷雾的3天限制，这意味着我必须依赖克拉利汀。", 2
是时候服用一些克拉利汀或艾力吉或其他药物了。我需要我的声音, 3
哦我的RT @imsydneycharles：无论如何，我只是想在某个地方记录下我同时服用了克拉利汀和苯海拉明...以防我昏倒, 2
大约服用一颗克拉利汀_Ûª_Û_ÛÌâ FML !!, 3
Commander Loratadine Generic A Sarcelles: 沙尔塞尔克拉利汀指挥官 Loratadine Generic A Sarcelles =Ûª_Û_ http: //t.co/mOleL8AM, 2
"齐尔特，克拉利汀，速效头孢，鼻喷雾..我觉得自己像个吸毒成瘾的人，服用这些过敏药物。请过敏季节消失!!", 1
"Ûª_Ûª_ÛªÕ@SheLovesThatD：如果她有过敏，请给她克拉利汀D。Ûª_Û_Ì_å @Sweeno_thakid41 @B_Original16 @luke_CYwalker14", 3
```

## 创建一个数据集

### [projects.locations.datasets.create](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/create)

### [项目.地点.数据集.创建](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/create)

#### 请求

In [None]:
dataset = {
    "display_name": "claritin_" + TIMESTAMP,
    "text_sentiment_dataset_metadata": {"sentiment_max": 4},
}


print(
    MessageToJson(
        automl.CreateDatasetRequest(parent=PARENT, dataset=dataset).__dict__["_pb"]
    )
)

*示例输出*：
```
{
   "parent":"projects/migration-ucaip-training/locations/us-central1",
   "dataset":{
      "displayName":"claritin_20210304132912",
      "textSentimentDatasetMetadata":{
         "sentimentMax":4
      }
   }
}
```

#### 调用

In [None]:
request = clients["automl"].create_dataset(parent=PARENT, dataset=dataset)

#### 回应

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/TST1994716952680988672"
}
```

In [None]:
# The fully qualified resource name of the dataset, taken from the response
dataset_id = result.name
# The short dataset ID: the last "/"-separated segment of the resource name
dataset_short_id = dataset_id.split("/")[-1]

print(dataset_id)

### [projects.locations.datasets.importData](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/importData)

### [项目.位置.数据集.导入数据](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.datasets/importData)

#### 请求

In [None]:
input_config = {"gcs_source": {"input_uris": [IMPORT_FILE]}}

print(
    MessageToJson(
        automl.ImportDataRequest(name=dataset_id, input_config=input_config).__dict__[
            "_pb"
        ]
    )
)

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/TST1994716952680988672",
  "inputConfig": {
    "gcsSource": {
      "inputUris": [
        "gs://migration-ucaip-trainingaip-20210304132912/claritin.csv"
      ]
    }
  }
}
```

#### 调用

In [None]:
request = clients["automl"].import_data(name=dataset_id, input_config=input_config)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result))

*示例输出*：
``` 
{}
```

## 训练一个模型

### [projects.locations.models.create](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/create) 

### [项目.位置.模型.创建](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/create)

#### 请求

In [None]:
model = {
    "display_name": "claritin_" + TIMESTAMP,
    "dataset_id": dataset_short_id,
    "text_sentiment_model_metadata": {},
}

print(
    MessageToJson(automl.CreateModelRequest(parent=PARENT, model=model).__dict__["_pb"])
)

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "model": {
    "displayName": "claritin_20210304132912",
    "datasetId": "TST1994716952680988672",
    "textSentimentModelMetadata": {}
  }
}
```

#### 调用

In [None]:
request = clients["automl"].create_model(parent=PARENT, model=model)

#### 回应

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272"
}
```

In [None]:
# The fully qualified resource name of the model, taken from the response
model_id = result.name
# The short model ID: the last "/"-separated segment of the resource name
model_short_id = model_id.split("/")[-1]

print(model_short_id)

## 评估模型

### [projects.locations.models.modelEvaluations.list](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/list)

### [projects.locations.models.modelEvaluations.list](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/list)

#### 调用

In [None]:
request = clients["automl"].list_model_evaluations(parent=model_id, filter="")

#### 回复

In [None]:
# Iterate the list response once and convert every evaluation to a plain dict.
model_evaluations = [json.loads(MessageToJson(me.__dict__["_pb"])) for me in request]

print(json.dumps(model_evaluations, indent=2))

# Fail with a clear message rather than an IndexError if nothing came back.
if not model_evaluations:
    raise RuntimeError("The list call returned no model evaluations.")

# The evaluation slice: take the name of the first listed evaluation from the
# materialized list, instead of reading it back through the response object
# after it has already been iterated.
evaluation_slice = model_evaluations[0]["name"]

{
  "示例输出": [
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/54870628009945864",
      "annotationSpecId": "8301667931964571648",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "textSentimentEvaluationMetrics": {
        "precision": 0.33333334,
        "recall": 0.16666667,
        "f1Score": 0.22222222
      },
      "displayName": "4"
    },
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/1597159550285093673",
      "annotationSpecId": "1384138904323489792",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "textSentimentEvaluationMetrics": {
        "precision": 0.5,
        "recall": 0.296875,
        "f1Score": 0.37254903
      },
      "displayName": "1"
    },
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/3521790980763365687",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "evaluatedExampleCount": 452,
      "textSentimentEvaluationMetrics": {
        "precision": 0.6238938,
        "recall": 0.6238938,
        "f1Score": 0.6238938,
        "meanAbsoluteError": 0.47566372,
        "meanSquaredError": 0.69690263,
        "linearKappa": 0.41007927,
        "quadraticKappa": 0.45938763,
        "confusionMatrix": {
          "annotationSpecId": [
            "7148746427357724672",
            "1384138904323489792",
            "5995824922750877696",
            "3689981913537183744",
            "8301667931964571648"
          ],
          "row": [
            {
              "exampleCount": [
                2,
                4,
                1,
                1,
                1
              ]
            },
            {
              "exampleCount": [
                3,
                19,
                14,
                28,
                0
              ]
            },
            {
              "exampleCount": [
                0,
                7,
                67,
                63,
                1
              ]
            },
            {
              "exampleCount": [
                1,
                8,
                19,
                191,
                4
              ]
            },
            {
              "exampleCount": [
                0,
                0,
                0,
                15,
                3
              ]
            }
          ],
          "displayName": [
            "0",
            "1",
            "2",
            "3",
            "4"
          ]
        }
      }
    },
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/3727703410992997127",
      "annotationSpecId": "3689981913537183744",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "textSentimentEvaluationMetrics": {
        "precision": 0.6409396,
        "recall": 0.85650223,
        "f1Score": 0.7332054
      },
      "displayName": "3"
    },
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/4692810493650008310",
      "annotationSpecId": "7148746427357724672",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "textSentimentEvaluationMetrics": {
        "precision": 0.33333334,
        "recall": 0.22222222,
        "f1Score": 0.26666668
      },
      "displayName": "0"
    },
    {
      "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/8390011688796741170",
      "annotationSpecId": "5995824922750877696",
      "createTime": "2021-03-04T17:15:51.851420Z",
      "textSentimentEvaluationMetrics": {
        "precision": 0.6633663,
        "recall": 0.48550725,
        "f1Score": 0.5606694
      },
      "displayName": "2"
    }
  ]
}

### [projects.locations.models.modelEvaluations.get](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/get) 

### [projects.locations.models.modelEvaluations.get](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models.modelEvaluations/get)

#### 调用

In [None]:
request = clients["automl"].get_model_evaluation(name=evaluation_slice)

#### 响应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

**示例输出**：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272/modelEvaluations/54870628009945864",
  "annotationSpecId": "8301667931964571648",
  "createTime": "2021-03-04T17:15:51.851420Z",
  "textSentimentEvaluationMetrics": {
    "precision": 0.33333334,
    "recall": 0.16666667,
    "f1Score": 0.22222222
  },
  "displayName": "4"
}
```

## 进行批量预测

### 制作批量输入文件

要从 AutoML 文本情感分析模型请求一批预测结果，请创建一个 CSV 文件，每行列出一个待分析文本文件的 Cloud Storage 路径。下面的单元格会先将两个示例文本文件复制到您的存储桶中，再把它们在存储桶中的路径写入该 CSV 文件。

In [None]:
import tensorflow as tf

# Build the batch input file in the tutorial bucket. Each line written to it
# is the Cloud Storage path of one text file copied into the same bucket.
gcs_input_uri = "gs://" + BUCKET_NAME + "/test.csv"
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    # Copy the first public sample into the bucket, then list the copy.
    item_1 = "gs://cloud-samples-data/language/sentiment-positive.txt"
    ! gsutil cp $item_1 gs://$BUCKET_NAME
    f.write("gs://" + BUCKET_NAME + "/sentiment-positive.txt" + "\n")

    # Same for the second sample; this last line is written without a
    # trailing newline.
    item_2 = "gs://cloud-samples-data/language/sentiment-negative.txt"
    ! gsutil cp $item_2 gs://$BUCKET_NAME
    f.write("gs://" + BUCKET_NAME + "/sentiment-negative.txt")

# Print the generated input file for inspection.
! gsutil cat $gcs_input_uri

示例输出：
```
gs://migration-ucaip-trainingaip-20210304132912/sentiment-positive.txt
gs://migration-ucaip-trainingaip-20210304132912/sentiment-negative.txt
```

### [projects.locations.models.batchPredict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/batchPredict) 

### [项目.位置.模型.批处理预测](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/batchPredict)

#### 请求

In [None]:
input_config = {"gcs_source": {"input_uris": [gcs_input_uri]}}

output_config = {
    "gcs_destination": {"output_uri_prefix": "gs://" + f"{BUCKET_NAME}/batch_output/"}
}

print(
    MessageToJson(
        automl.BatchPredictRequest(
            name=model_id, input_config=input_config, output_config=output_config
        ).__dict__["_pb"]
    )
)

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272",
  "inputConfig": {
    "gcsSource": {
      "inputUris": [
        "gs://migration-ucaip-trainingaip-20210304132912/test.csv"
      ]
    }
  },
  "outputConfig": {
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210304132912/batch_output/"
    }
  }
}
```

#### 调用

In [None]:
request = clients["predictions"].batch_predict(
    name=model_id, input_config=input_config, output_config=output_config
)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result.__dict__["_pb"]))

*示例输出*：
``` 
{}
```

## 进行在线预测

### 为在线预测准备数据项

In [None]:
test_data = ! gsutil cat $IMPORT_FILE | head -n1

test_item = str(test_data[0]).split(",")[0]
test_label = str(test_data[0]).split(",")[1]

print((test_item, test_label))

*示例输出*：
```
('@freewrytin God is way too good for Claritin', '2')
```

### [projects.locations.models.deploy](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/deploy)

### [projects.locations.models.deploy](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/deploy)

#### 调用

In [None]:
request = clients["automl"].deploy_model(name=model_id)

#### 响应

In [None]:
result = request.result()

print(MessageToJson(result))

*示例输出*：
```
{} 
```

### [projects.locations.models.predict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/predict) 

### [projects.locations.models.predict](https://cloud.google.com/automl/docs/reference/rest/v1beta1/projects.locations.models/predict)

In [None]:
payload = {"text_snippet": {"content": test_item, "mime_type": "text/plain"}}

prediction_request = automl.PredictRequest(
    name=model_id,
    payload=payload,
)

print(MessageToJson(prediction_request.__dict__["_pb"]))

示例输出：
```
{
  "name": "projects/116273516712/locations/us-central1/models/TST4078882474816438272",
  "payload": {
    "textSnippet": {
      "content": "@freewrytin God is way too good for Claritin",
      "mimeType": "text/plain"
    }
  }
}
```

#### 调用

In [None]:
request = clients["predictions"].predict(request=prediction_request)

#### 响应

In [None]:
print(MessageToJson(request.__dict__["_pb"]))

*示例输出*：
```
{
  "payload": [
    {
      "textSentiment": {
        "sentiment": 3
      }
    }
  ],
  "metadata": {
    "sentiment_score": "0.30955505"
  }
}
```

## 清理

要清理此项目中使用的所有GCP资源，您可以[删除用于本教程的GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在此教程中创建的各个资源。

In [None]:
delete_dataset = True
delete_model = True
delete_bucket = True

# Delete the dataset using the AutoML fully qualified identifier for the dataset
try:
    if delete_dataset:
        clients["automl"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the model using the AutoML fully qualified identifier for the model
try:
    if delete_model:
        clients["automl"].delete_model(name=model_id)
except Exception as e:
    print(e)

if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME