In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI AutoML 文本实体提取模型

## 安装

安装最新（预览）版本的Vertex SDK。

In [None]:
# Install or upgrade (-U) the Vertex SDK into the per-user site-packages (--user).
! pip3 install -U google-cloud-aiplatform --user

安装Google *云存储*库。

In [None]:
# Install the Google Cloud Storage client library.
! pip3 install google-cloud-storage

### 重新启动内核

安装了Vertex SDK和Google *cloud-storage*后，您需要重新启动笔记本内核，这样它才能找到这些包。

In [None]:
import os

# AUTORUN unset or empty means an interactive session: restart the kernel so
# that the freshly installed packages can be imported.
if os.environ.get("AUTORUN", "") == "":
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

## 在开始之前

### GPU 运行时

*如果您有这个选项，请确保在 GPU 运行时中运行此笔记本。在 Colab 中，选择* **运行时 > 更改运行时类型 > GPU**

### 设置您的 GCP 项目

**无论您的笔记本环境如何，以下步骤都是必需的。**

1. [选择或创建 GCP 项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建账户时，您会获得 $300 的免费信用，用于您的计算/存储成本。

2. [确保为您的项目启用计费。](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [启用 Vertex APIs 和 Compute Engine APIs。](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) 已经安装在 Google Cloud Notebooks 中。

5. 在下面的单元格中输入您的项目 ID。然后运行该单元格，确保 Cloud SDK 对该笔记本中所有命令使用正确的项目。

**注意**：Jupyter 运行以 `!` 开头的行作为 shell 命令，并将以 `$` 开头的 Python 变量插入这些命令中。

#### 项目标识

**如果你不知道你的项目标识**，请尝试使用`gcloud`命令，在下面执行第二个单元格以获取你的项目标识。

In [None]:
# Google Cloud project ID used by the gcloud commands and resource paths in
# this notebook; replace the placeholder or let the next cell look it up.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the default project for subsequent gcloud commands.
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改“REGION”变量，该变量用于本笔记本其余部分的操作。以下是 Vertex AI 支持的区域。我们建议尽可能选择离您最近的区域。

- 美洲：`us-central1`
- 欧洲：`europe-west4`
- 亚太地区：`asia-east1`

您不能在 Vertex 上使用多区域存储桶进行训练。并非所有区域都支持所有 Vertex 服务。有关最新的每个区域支持情况，请参阅 [Vertex AI 服务的区域支持](https://cloud.google.com/vertex-ai/docs/general/locations)。

In [None]:
# Region used for the bucket location and the Vertex AI resource parent path.
REGION = "us-central1"  # @param {type: "string"}

#### 时间戳

如果您正在进行直播教程会话，您可能正在使用共享的测试账户或项目。为了避免用户在创建的资源上发生名称冲突，您为每个实例会话创建一个时间戳，并将其附加到在本教程中将要创建的资源的名称上。

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS, local time) appended to resource names
# created in this notebook to avoid name collisions.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

### 认证您的GCP帐户

**如果您正在使用Google Cloud笔记本**，您的环境已经通过身份验证。请跳过此步骤。

*注意：如果您正在使用Vertex笔记本并运行该单元格，则单元格知道跳过执行认证步骤。*

In [None]:
import os
import sys

# Authenticate this session against Google Cloud.
#
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on Vertex, then don't execute this code. The presence of the env_version
# file is used as the marker for that environment.
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        # Colab: use the interactive Colab authentication flow.
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud (runs for both the Colab and the
    # local branch above).
    ! gcloud auth login

### 创建一个云存储桶

**无论您使用的是什么笔记本环境，都需要执行以下步骤。**

本教程旨在使用位于公共云存储桶中的训练数据以及本地云存储桶用于批量预测。您也可以使用自己储存在本地云存储桶中的训练数据。

在下方设置您的云存储桶的名称。它必须在所有云存储桶中是唯一的。

In [None]:
# Cloud Storage bucket used for the batch prediction input and output files;
# replace the placeholder or let the next cell derive a name.
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

In [None]:
# Derive a default bucket name from the project ID and the session timestamp
# when no bucket name was supplied.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "aip-", TIMESTAMP])

**仅当您的存储桶尚不存在时**：运行以下单元格来创建您的 Cloud Storage 存储桶。

In [None]:
# Create the bucket in REGION (-l sets the bucket location).
! gsutil mb -l $REGION gs://$BUCKET_NAME

最后，通过检查云存储桶的内容来验证对其的访问。

In [None]:
# List the bucket contents to confirm that it exists and is accessible.
! gsutil ls -al gs://$BUCKET_NAME

### 设置变量

接下来，设置一些在教程中使用的变量。

### 导入库并定义常量

#### 导入Vertex SDK

将Vertex SDK导入到我们的Python环境中。

In [None]:
import json
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson
from google.protobuf.struct_pb2 import Struct, Value

### Vertex AI 常量

为 Vertex AI 设置以下常量：

- `API_ENDPOINT`：用于数据集、模型、作业、管道和端点服务的 Vertex AI API 服务端点。
- `PARENT`：用于数据集、模型和端点资源的 Vertex AI 位置根路径。

In [None]:
# API Endpoint
# Derived from REGION so the service endpoint always matches the location in
# PARENT; a hard-coded us-central1 endpoint would reject requests for any
# other region chosen above.
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

# Vertex AI location root path for your dataset, model and endpoint resources
PARENT = "projects/" + PROJECT_ID + "/locations/" + REGION

#### AutoML常量

接下来，设置与AutoML文本实体提取数据集和训练相关的常量：

- 数据集模式：告诉托管的数据集服务它属于哪种类型的数据集。
- 数据标注（注释）模式：告诉托管的数据集服务数据如何被标记（注释）。
- 数据集训练模式：告诉Vertex AI管道服务要为哪种任务（例如分类）训练模型。

In [None]:
# Text Dataset type (metadata schema for a text dataset).
# NOTE: unlike the two URIs below, this value has no "gs://" prefix; the
# prefix is added where the dataset request is built.
TEXT_SCHEMA = "google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml"
# Text Labeling type (import schema for text-extraction annotations).
IMPORT_SCHEMA_TEXT_EXTRACTION = "gs://google-cloud-aiplatform/schema/dataset/ioformat/text_extraction_io_format_1.0.0.yaml"
# Text Training task (training task definition for AutoML text extraction).
TRAINING_TEXT_EXTRACTION_SCHEMA = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_extraction_1.0.0.yaml"

#### 客户端

Vertex SDK 以客户端/服务器模型运行。在您这一侧（Python 脚本），您将创建一个客户端，由它向服务器（Vertex）发送请求并接收响应。

在本教程中，您将使用几个客户端，因此请提前设置它们所有。

- 用于受管理数据集的数据集服务。
- 用于受管理模型的模型服务。
- 用于训练的管道服务。
- 用于部署的端点服务。
- 用于批处理作业和自定义训练的作业服务。
- 用于提供服务的预测服务。*注意*：预测有不同的服务端点。

In [None]:
# Every service client below is pointed at the same API endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_dataset_client():
    """Return a DatasetServiceClient configured with `client_options`."""
    return aip.DatasetServiceClient(client_options=client_options)


def create_model_client():
    """Return a ModelServiceClient configured with `client_options`."""
    return aip.ModelServiceClient(client_options=client_options)


def create_pipeline_client():
    """Return a PipelineServiceClient configured with `client_options`."""
    return aip.PipelineServiceClient(client_options=client_options)


def create_endpoint_client():
    """Return an EndpointServiceClient configured with `client_options`."""
    return aip.EndpointServiceClient(client_options=client_options)


def create_prediction_client():
    """Return a PredictionServiceClient configured with `client_options`."""
    return aip.PredictionServiceClient(client_options=client_options)


def create_job_client():
    """Return a JobServiceClient configured with `client_options`."""
    return aip.JobServiceClient(client_options=client_options)


# Registry of service clients, keyed by the short service name used in the
# rest of the notebook.
clients = {
    "dataset": create_dataset_client(),
    "model": create_model_client(),
    "pipeline": create_pipeline_client(),
    "endpoint": create_endpoint_client(),
    "prediction": create_prediction_client(),
    "job": create_job_client(),
}

# Show each (name, client) pair that was created.
for name_and_client in clients.items():
    print(name_and_client)

In [None]:
# Public Cloud Storage JSONL file with the annotated text used to populate the dataset.
IMPORT_FILE = "gs://cloud-samples-data/language/ucaip_ten_dataset.jsonl"

In [None]:
# Print the first record of the import file to inspect its format.
! gsutil cat $IMPORT_FILE | head -n 1

*示例输出*：
```
{'text_segment_annotations': [{'endOffset': 54, 'startOffset': 27, 'displayName': 'SpecificDisease'}, {'endOffset': 173, 'startOffset': 156, 'displayName': 'SpecificDisease'}, {'endOffset': 179, 'startOffset': 176, 'displayName': 'SpecificDisease'}, {'endOffset': 246, 'startOffset': 243, 'displayName': 'Modifier'}, {'endOffset': 340, 'startOffset': 337, 'displayName': 'Modifier'}, {'endOffset': 698, 'startOffset': 695, 'displayName': 'Modifier'}], 'textContent': '1301937\tPennsylvania Dutch 中的Berks CountyPseudodeficiency和差六糖酶A缺陷的分子基础。\t在两名出生有Tay-Sachs病（TSD）的婴儿后，对一个非犹太，宾夕法尼亚荷兰族进行了TSD携带者的生化分析筛查。检测到了一群看起来是TSD杂合子的个体的高频率（Kelly等人，1975年）。临床和生化证据表明携带者频率的增加是由于至少两个差六糖酶A alpha亚基的改变突变。我们现在报告在这个宾夕法尼亚荷兰族中的两个突变突变，以及一个多态性。最初报告在一名法国TSD患者（Akli等人，1991年）身上的一个突变，是位于第9内含子的供体剪接位点的GT --> AT过渡。第二个，位于核苷酸739（Arg247Trp）的C--> T过渡，被Triggs-Raine等人（1992年）显示为与人工底物针对性酶活性降低相关的臨床良性的“假缺陷”等位基因。最后，描述了一个多态性 [G--> A（759）]，保留有效通路编码253处的缬氨酸，无变化。\n '}
```

## 创建一个数据集

### [projects.locations.datasets.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/create)

### [projects.locations.datasets.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/create)

### 请求

In [None]:
DATA_SCHEMA = TEXT_SCHEMA

# Dataset resource to create; TEXT_SCHEMA carries no scheme, so "gs://" is
# prepended here.
dataset = dict(
    display_name="ten_" + TIMESTAMP,
    metadata_schema_uri="gs://" + DATA_SCHEMA,
)

# Build the request object only to display its JSON form.
create_dataset_request = aip.CreateDatasetRequest(parent=PARENT, dataset=dataset)
print(MessageToJson(vars(create_dataset_request)["_pb"]))

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "dataset": {
    "displayName": "ten_20210301154552",
    "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml"
  }
}
```

#### 调用

In [None]:
# Start dataset creation; `request` is the handle whose result is awaited in
# the next cell.
request = clients["dataset"].create_dataset(
    dataset=dataset,
    parent=PARENT,
)

#### 响应

In [None]:
# Wait for the create-dataset call to finish, then show the returned resource.
result = request.result()
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/1309228077611483136",
  "displayName": "ten_20210301154552",
  "metadataSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/metadata/text_1.0.0.yaml",
  "labels": {
    "aiplatform.googleapis.com/dataset_metadata_schema": "TEXT"
  },
  "metadata": {
    "dataItemSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/dataitem/text_1.0.0.yaml"
  }
}
```

In [None]:
# Full resource name of the dataset.
dataset_id = result.name
# Trailing path segment of the resource name (the short numeric ID).
dataset_short_id = dataset_id.rsplit("/", 1)[-1]

print(dataset_id)

### [projects.locations.datasets.import](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/import)

### [projects.locations.datasets.import](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.datasets/import)

#### 请求

In [None]:
LABEL_SCHEMA = IMPORT_SCHEMA_TEXT_EXTRACTION

# Where to read the annotated data from and which schema describes it.
import_config = dict(
    gcs_source={"uris": [IMPORT_FILE]},
    import_schema_uri=LABEL_SCHEMA,
)

# Build the request object only to display its JSON form.
import_data_request = aip.ImportDataRequest(
    name=dataset_id, import_configs=[import_config]
)
print(MessageToJson(vars(import_data_request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/datasets/1309228077611483136",
  "importConfigs": [
    {
      "gcsSource": {
        "uris": [
          "gs://cloud-samples-data/language/ucaip_ten_dataset.jsonl"
        ]
      },
      "importSchemaUri": "gs://google-cloud-aiplatform/schema/dataset/ioformat/text_extraction_io_format_1.0.0.yaml"
    }
  ]
}
```

#### 调用

In [None]:
# Start the data import into the dataset; the result is awaited in the next cell.
request = clients["dataset"].import_data(
    import_configs=[import_config],
    name=dataset_id,
)

#### 响应

In [None]:
# Wait for the import to finish, then print the returned message as JSON.
result = request.result()
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{}
```

## 训练模型

### [projects.locations.trainingPipelines.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/create)

### [projects.locations.trainingPipelines.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/create)

#### 请求

In [None]:
TRAINING_SCHEMA = TRAINING_TEXT_EXTRACTION_SCHEMA

# Training task inputs, expressed as a protobuf Struct wrapped in a Value.
task_fields = {
    "multi_label": Value(bool_value=False),
    "budget_milli_node_hours": Value(number_value=1000),
    "model_type": Value(string_value="CLOUD"),
    "disable_early_stopping": Value(bool_value=False),
}
task = Value(struct_value=Struct(fields=task_fields))

# The pipeline and the model it uploads share the same display name.
pipeline_display_name = "ten_" + TIMESTAMP
training_pipeline = dict(
    display_name=pipeline_display_name,
    input_data_config={"dataset_id": dataset_short_id},
    model_to_upload={"display_name": pipeline_display_name},
    training_task_definition=TRAINING_SCHEMA,
    training_task_inputs=task,
)

# Build the request object only to display its JSON form.
create_pipeline_request = aip.CreateTrainingPipelineRequest(
    parent=PARENT,
    training_pipeline=training_pipeline,
)
print(MessageToJson(vars(create_pipeline_request)["_pb"]))

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "trainingPipeline": {
    "displayName": "ten_20210301154552",
    "inputDataConfig": {
      "datasetId": "1309228077611483136"
    },
    "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_extraction_1.0.0.yaml",
    "trainingTaskInputs": {
      "budget_milli_node_hours": 1000.0,
      "multi_label": false,
      "model_type": "CLOUD",
      "disable_early_stopping": false
    },
    "modelToUpload": {
      "displayName": "ten_20210301154552"
    }
  }
}
```

#### 调用

In [None]:
# Submit the training pipeline; the returned pipeline resource is kept in `request`.
request = clients["pipeline"].create_training_pipeline(
    training_pipeline=training_pipeline,
    parent=PARENT,
)

#### 响应

In [None]:
# Show the created training pipeline as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/trainingPipelines/4643220011912003584",
  "displayName": "ten_20210301154552",
  "inputDataConfig": {
    "datasetId": "1309228077611483136"
  },
  "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_extraction_1.0.0.yaml",
  "trainingTaskInputs": {},
  "modelToUpload": {
    "displayName": "ten_20210301154552"
  },
  "state": "PIPELINE_STATE_PENDING",
  "createTime": "2021-03-01T15:55:29.352065Z",
  "updateTime": "2021-03-01T15:55:29.352065Z"
}
```

In [None]:
# Full resource name of the training pipeline.
training_pipeline_id = request.name
# Trailing path segment of the resource name (the short numeric ID).
training_pipeline_short_id = training_pipeline_id.rsplit("/", 1)[-1]

print(training_pipeline_id)

### [projects.locations.trainingPipelines.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/get)

### [projects.locations.trainingPipelines.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.trainingPipelines/get)

#### 调用

In [None]:
# Fetch the current state of the training pipeline by its resource name.
request = clients["pipeline"].get_training_pipeline(
    name=training_pipeline_id,
)

#### 响应

In [None]:
# Show the fetched training pipeline as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/trainingPipelines/4643220011912003584",
  "displayName": "ten_20210301154552",
  "inputDataConfig": {
    "datasetId": "1309228077611483136"
  },
  "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_extraction_1.0.0.yaml",
  "trainingTaskInputs": {},
  "modelToUpload": {
    "displayName": "ten_20210301154552"
  },
  "state": "PIPELINE_STATE_PENDING",
  "createTime": "2021-03-01T15:55:29.352065Z",
  "updateTime": "2021-03-01T15:55:29.352065Z"
}
```

In [None]:
# Poll the training pipeline once a minute until it reaches a terminal state.
# model_id is only set on success; it is initialised up front so that a failed
# or cancelled run is reported explicitly instead of surfacing as a NameError
# (or silently reusing a value left over from an earlier run).
model_id = None
while True:
    response = clients["pipeline"].get_training_pipeline(name=training_pipeline_id)
    if response.state == aip.PipelineState.PIPELINE_STATE_SUCCEEDED:
        model_id = response.model_to_upload.name
        print("Training Time:", response.end_time - response.start_time)
        break
    print("Training job has not completed:", response.state)
    # FAILED and CANCELLED never turn into SUCCEEDED, so stop polling on both.
    if response.state in (
        aip.PipelineState.PIPELINE_STATE_FAILED,
        aip.PipelineState.PIPELINE_STATE_CANCELLED,
    ):
        break
    time.sleep(60)

if model_id is None:
    raise RuntimeError(
        "Training pipeline {} ended without a model (state: {})".format(
            training_pipeline_id, response.state
        )
    )

print(model_id)

## 评估模型

### [projects.locations.models.evaluations.list](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/list)

### [projects.locations.models.evaluations.list](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/list)

#### 调用

In [None]:
# List the evaluations attached to the trained model.
request = clients["model"].list_model_evaluations(
    parent=model_id,
)

#### 响应

In [None]:
# Materialise the listing once. Iterating `request` walks through the results,
# and reading `request.model_evaluations` afterwards depends on which page the
# pager currently holds, so the first evaluation is taken from the list instead.
evaluations = list(request)
if not evaluations:
    raise RuntimeError("No model evaluations were returned for " + str(model_id))

model_evaluations = [
    json.loads(MessageToJson(mel.__dict__["_pb"])) for mel in evaluations
]

# Resource name of the first evaluation; used by the get call that follows.
evaluation_slice = evaluations[0].name

print(json.dumps(model_evaluations, indent=2))

*示例输出*：
```
[
  {
    "name": "projects/116273516712/locations/us-central1/models/4400738115568795648/evaluations/7959028222912364544",
    "metricsSchemaUri": "gs://google-cloud-aiplatform/schema/modelevaluation/text_extraction_metrics_1.0.0.yaml",
    "metrics": {
      "confusionMatrix": {
        "rows": [
          [
            0.0,
            24.0,
            23.0,
            1.0,
            27.0
          ],
          [
            9.0,
            40.0,
            0.0,
            0.0,
            10.0
          ],
          [
            11.0,
            0.0,
            87.0,
            0.0,
            2.0
          ],
          [
            3.0,
            0.0,
            0.0,
            5.0,
            0.0
          ],
          [
            32.0,
            16.0,
            7.0,
            1.0,
            186.0
          ]
        ],
        "annotationSpecs": [
          {
            "displayName": "NULL"
          },
          {
            "id": "2041829376663748608",
            "displayName": "DiseaseClass"
          },
          {
            "displayName": "Modifier",
            "id": "4347672385877442560"
          },
          {
            "displayName": "CompositeMention",
            "id": "6653515395091136512"
          },
          {
            "id": "7806436899697983488",
            "displayName": "SpecificDisease"
          }
        ]
      },
      "confidenceMetrics": [
        {
          "precision": 0.74125874,
          "f1Score": 0.7589499,
          "recall": 0.7775061,
          "confidenceThreshold": 0.04
        },
        {
          "recall": 0.7457213,
          "confidenceThreshold": 0.96,
          "precision": 0.8333333,
          "f1Score": 0.7870968
        },
        
        # 省略部分
        
        {
          "f1Score": 0.7596154,
          "recall": 0.77261615,
          "confidenceThreshold": 0.44,
          "precision": 0.7470449
        }
      ]
    },
    "createTime": "2021-03-01T17:59:23.638307Z",
    "sliceDimensions": [
      "annotationSpec"
    ]
  }
]
```

### [projects.locations.models.evaluations.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/get)

### [projects.locations.models.evaluations.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.models.evaluations/get)

#### 调用

In [None]:
# Fetch a single evaluation by the resource name selected above.
request = clients["model"].get_model_evaluation(
    name=evaluation_slice,
)

#### 响应

In [None]:
# Show the fetched model evaluation as JSON.
print(MessageToJson(vars(request)["_pb"]))

{
  "name": "projects/116273516712/locations/us-central1/models/4400738115568795648/evaluations/7959028222912364544",
  "metricsSchemaUri": "gs://google-cloud-aiplatform/schema/modelevaluation/text_extraction_metrics_1.0.0.yaml",
  "metrics": {
    "confusionMatrix": {
      "rows": [
        [
          0.0,
          24.0,
          23.0,
          1.0,
          27.0
        ],
        [
          9.0,
          40.0,
          0.0,
          0.0,
          10.0
        ],
        [
          11.0,
          0.0,
          87.0,
          0.0,
          2.0
        ],
        [
          3.0,
          0.0,
          0.0,
          5.0,
          0.0
        ],
        [
          32.0,
          16.0,
          7.0,
          1.0,
          186.0
        ]
      ],
      "annotationSpecs": [
        {
          "displayName": "NULL"
        },
        {
          "id": "2041829376663748608",
          "displayName": "DiseaseClass"
        },
        {
          "displayName": "Modifier",
          "id": "4347672385877442560"
        },
        {
          "id": "6653515395091136512",
          "displayName": "CompositeMention"
        },
        {
          "displayName": "SpecificDisease",
          "id": "7806436899697983488"
        }
      ]
    },
    "confidenceMetrics": [
      {
        "precision": 0.74125874,
        "recall": 0.7775061,
        "confidenceThreshold": 0.04,
        "f1Score": 0.7589499
      },
      {
        "f1Score": 0.7870968,
        "recall": 0.7457213,
        "confidenceThreshold": 0.96,
        "precision": 0.8333333
      },
      
      # 去掉以保持简洁
      
      {
        "precision": 0.745283,
        "f1Score": 0.7587035,
        "recall": 0.77261615,
        "confidenceThreshold": 0.43
      },
      {
        "precision": 0.7470449,
        "recall": 0.77261615,
        "confidenceThreshold": 0.44,
        "f1Score": 0.7596154
      }
    ]
  },
  "createTime": "2021-03-01T17:59:23.638307Z",
  "sliceDimensions": [
    "annotationSpec"
  ]
}

## 进行批量预测

### 生成一个批量预测文件

In [None]:
import json

import tensorflow as tf

test_item = 'Molecular basis of hexosaminidase A deficiency and pseudodeficiency in the Berks County Pennsylvania Dutch.\\tFollowing the birth of two infants with Tay-Sachs disease ( TSD ) , a non-Jewish , Pennsylvania Dutch kindred was screened for TSD carriers using the biochemical assay . A high frequency of individuals who appeared to be TSD heterozygotes was detected ( Kelly et al . , 1975 ) . Clinical and biochemical evidence suggested that the increased carrier frequency was due to at least two altered alleles for the hexosaminidase A alpha-subunit . We now report two mutant alleles in this Pennsylvania Dutch kindred , and one polymorphism . One allele , reported originally in a French TSD patient ( Akli et al . , 1991 ) , is a GT-- > AT transition at the donor splice-site of intron 9 . The second , a C-- > T transition at nucleotide 739 ( Arg247Trp ) , has been shown by Triggs-Raine et al . ( 1992 ) to be a clinically benign " pseudodeficient " allele associated with reduced enzyme activity against artificial substrate . Finally , a polymorphism [ G-- > A ( 759 ) ] , which leaves valine at codon 253 unchanged , is described'

gcs_test_item = "gs://" + BUCKET_NAME + "/test.txt"
with tf.io.gfile.GFile(gcs_test_item, "w") as f:
    f.write(test_item + "\n")

gcs_input_uri = "gs://" + BUCKET_NAME + "/test.jsonl"
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    f.write(json.dumps({"content": gcs_test_item, "mime_type": "text/plain"}) + "\n")

! gsutil cat $gcs_input_uri
! gsutil cat $gcs_test_item

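*示例输出*：
```
{"content": "gs://migration-ucaip-trainingaip-20210301154552/test.txt", "mime_type": "text/plain"}
Molecular basis of hexosaminidase A deficiency and pseudodeficiency in the Berks County Pennsylvania Dutch.\tFollowing the birth of two infants with Tay-Sachs disease ( TSD ) , a non-Jewish , Pennsylvania Dutch kindred was screened for TSD carriers using the biochemical assay . A high frequency of individuals who appeared to be TSD heterozygotes was detected ( Kelly et al . , 1975 ) . Clinical and biochemical evidence suggested that the increased carrier frequency was due to at least two altered alleles for the hexosaminidase A alpha-subunit . We now report two mutant alleles in this Pennsylvania Dutch kindred , and one polymorphism . One allele , reported originally in a French TSD patient ( Akli et al . , 1991 ) , is a GT-- > AT transition at the donor splice-site of intron 9 . The second , a C-- > T transition at nucleotide 739 ( Arg247Trp ) , has been shown by Triggs-Raine et al . ( 1992 ) to be a clinically benign " pseudodeficient " allele associated with reduced enzyme activity against artificial substrate . Finally , a polymorphism [ G-- > A ( 759 ) ] , which leaves valine at codon 253 unchanged , is described
```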

### [projects.locations.batchPredictionJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/create)

### [projects.locations.batchPredictionJobs.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/create)

#### 请求

In [None]:
# Batch prediction job: read the JSONL manifest from the bucket and write JSONL
# predictions under batch_output/ in the same bucket.
batch_input_config = {
    "instances_format": "jsonl",
    "gcs_source": {"uris": [gcs_input_uri]},
}
batch_output_config = {
    "predictions_format": "jsonl",
    "gcs_destination": {"output_uri_prefix": f"gs://{BUCKET_NAME}/batch_output/"},
}
batch_resources = {
    "machine_spec": {"machine_type": "n1-standard-2", "accelerator_count": 0},
    "starting_replica_count": 1,
    "max_replica_count": 1,
}

batch_prediction_job = dict(
    display_name="ten_" + TIMESTAMP,
    model=model_id,
    input_config=batch_input_config,
    output_config=batch_output_config,
    dedicated_resources=batch_resources,
)

# Build the request object only to display its JSON form.
create_batch_request = aip.CreateBatchPredictionJobRequest(
    parent=PARENT, batch_prediction_job=batch_prediction_job
)
print(MessageToJson(vars(create_batch_request)["_pb"]))

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "batchPredictionJob": {
    "displayName": "ten_20210301154552",
    "model": "projects/116273516712/locations/us-central1/models/4400738115568795648",
    "inputConfig": {
      "instancesFormat": "jsonl",
      "gcsSource": {
        "uris": [
          "gs://migration-ucaip-trainingaip-20210301154552/test.jsonl"
        ]
      }
    },
    "outputConfig": {
      "predictionsFormat": "jsonl",
      "gcsDestination": {
        "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210301154552/batch_output/"
      }
    },
    "dedicatedResources": {
      "machineSpec": {
        "machineType": "n1-standard-2"
      },
      "startingReplicaCount": 1,
      "maxReplicaCount": 1
    }
  }
}
```

#### 调用

In [None]:
# Submit the batch prediction job; the returned job resource is kept in `request`.
request = clients["job"].create_batch_prediction_job(
    batch_prediction_job=batch_prediction_job,
    parent=PARENT,
)

#### 响应

In [None]:
# Show the created batch prediction job as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/3588251799200464896",
  "displayName": "ten_20210301154552",
  "model": "projects/116273516712/locations/us-central1/models/4400738115568795648",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210301154552/test.jsonl"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210301154552/batch_output/"
    }
  },
  "state": "JOB_STATE_PENDING",
  "completionStats": {
    "incompleteCount": "-1"
  },
  "createTime": "2021-03-01T17:59:42.777083Z",
  "updateTime": "2021-03-01T17:59:42.777083Z"
}
```

In [None]:
# Full resource name of the batch prediction job.
batch_job_id = request.name
# Trailing path segment of the resource name (the short numeric ID).
batch_job_short_id = batch_job_id.rsplit("/", 1)[-1]

print(batch_job_id)

### [projects.locations.batchPredictionJobs.get](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.batchPredictionJobs/get)

#### 调用

In [None]:
# Fetch the current state of the batch prediction job by its resource name.
request = clients["job"].get_batch_prediction_job(
    name=batch_job_id,
)

#### 响应

In [None]:
# Show the fetched batch prediction job as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/batchPredictionJobs/3588251799200464896",
  "displayName": "ten_20210301154552",
  "model": "projects/116273516712/locations/us-central1/models/4400738115568795648",
  "inputConfig": {
    "instancesFormat": "jsonl",
    "gcsSource": {
      "uris": [
        "gs://migration-ucaip-trainingaip-20210301154552/test.jsonl"
      ]
    }
  },
  "outputConfig": {
    "predictionsFormat": "jsonl",
    "gcsDestination": {
      "outputUriPrefix": "gs://migration-ucaip-trainingaip-20210301154552/batch_output/"
    }
  },
  "state": "JOB_STATE_PENDING",
  "completionStats": {
    "incompleteCount": "-1"
  },
  "createTime": "2021-03-01T17:59:42.777083Z",
  "updateTime": "2021-03-01T17:59:42.777083Z"
}
```

In [None]:
while True:
    response = clients["job"].get_batch_prediction_job(name=batch_job_id)
    if response.state != aip.JobState.JOB_STATE_SUCCEEDED:
        print("The job has not completed:", response.state)
        if response.state == aip.JobState.JOB_STATE_FAILED:
            break
    else:
        folder = response.output_config.gcs_destination.output_uri_prefix[:-1]
        ! gsutil ls $folder/prediction*/*.jsonl

        ! gsutil cat $folder/prediction*/*.jsonl
        break
    time.sleep(60)

*示例输出*：
```
gs://migration-ucaip-trainingaip-20210301154552/batch_output/prediction-ten_20210301154552-2021-03-01T17:59:42.638222Z/predictions_00001.jsonl
{"instance":{"content":"gs://migration-ucaip-trainingaip-20210301154552/test.txt","mimeType":"text/plain"},"prediction":{"ids":["7806436899697983488","7806436899697983488","7806436899697983488","4347672385877442560","4347672385877442560","4347672385877442560"],"displayNames":["SpecificDisease","SpecificDisease","SpecificDisease","Modifier","Modifier","Modifier"],"textSegmentStartOffsets":["149","19","169","236","688","330"],"textSegmentEndOffsets":["165","45","171","238","690","332"],"confidences":[0.99957836,0.9995628,0.9995044,0.9993287,0.9993144,0.99927235]}}
```

## 进行在线预测

### [projects.locations.endpoints.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/create) 

### [projects.locations.endpoints.create](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/create)

#### 请求

In [None]:
# Endpoint resource to create; only a display name is supplied.
endpoint = dict(display_name="ten_" + TIMESTAMP)

# Build the request object only to display its JSON form.
create_endpoint_request = aip.CreateEndpointRequest(parent=PARENT, endpoint=endpoint)
print(MessageToJson(vars(create_endpoint_request)["_pb"]))

*示例输出*：
```
{
  "parent": "projects/migration-ucaip-training/locations/us-central1",
  "endpoint": {
    "displayName": "ten_20210301154552"
  }
}
```

#### 调用

In [None]:
# Start endpoint creation; the result is awaited in the next cell.
request = clients["endpoint"].create_endpoint(
    endpoint=endpoint,
    parent=PARENT,
)

#### 响应

In [None]:
# Wait for endpoint creation to finish, then show the returned resource.
result = request.result()
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "name": "projects/116273516712/locations/us-central1/endpoints/8916247652891361280"
}
```

In [None]:
# Full resource name of the endpoint.
endpoint_id = result.name
# Trailing path segment of the resource name (the short numeric ID).
endpoint_short_id = endpoint_id.rsplit("/", 1)[-1]

print(endpoint_id)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

### [projects.locations.endpoints.deployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/deployModel)

#### 请求

In [None]:
# Deployment description: which model to serve and with how many replicas.
deployed_model = dict(
    model=model_id,
    display_name="ten_" + TIMESTAMP,
    automatic_resources={"min_replica_count": 1, "max_replica_count": 1},
)

# Traffic split map sent with the deployment: key "0" receives 100 percent.
traffic_split = {"0": 100}

# Build the request object only to display its JSON form.
deploy_model_request = aip.DeployModelRequest(
    endpoint=endpoint_id,
    deployed_model=deployed_model,
    traffic_split=traffic_split,
)
print(MessageToJson(vars(deploy_model_request)["_pb"]))

*示例输出*：
```
{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/8916247652891361280",
  "deployedModel": {
    "model": "projects/116273516712/locations/us-central1/models/4400738115568795648",
    "displayName": "ten_20210301154552",
    "automaticResources": {
      "minReplicaCount": 1,
      "maxReplicaCount": 1
    }
  },
  "trafficSplit": {
    "0": 100
  }
}
```

#### 调用

In [None]:
# Start deploying the model to the endpoint; the result is awaited in the next cell.
request = clients["endpoint"].deploy_model(
    traffic_split=traffic_split,
    deployed_model=deployed_model,
    endpoint=endpoint_id,
)

#### 响应

In [None]:
# Wait for the deployment to finish, then show the returned message.
result = request.result()
print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{
  "deployedModel": {
    "id": "3958065938133155840"
  }
}
```

In [None]:
# ID assigned to the deployed model, taken from the deploy result.
deployed_model_id = result.deployed_model.id
print(deployed_model_id)

### [projects.locations.endpoints.predict](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

### [projects.locations.endpoints.predict](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/predict)

#### 为在线预测准备数据项

In [None]:
# Text to send for online prediction. Note that "\\t" in this literal is a
# backslash followed by "t", not a tab character.
test_item = 'Molecular basis of hexosaminidase A deficiency and pseudodeficiency in the Berks County Pennsylvania Dutch.\\tFollowing the birth of two infants with Tay-Sachs disease ( TSD ) , a non-Jewish , Pennsylvania Dutch kindred was screened for TSD carriers using the biochemical assay . A high frequency of individuals who appeared to be TSD heterozygotes was detected ( Kelly et al . , 1975 ) . Clinical and biochemical evidence suggested that the increased carrier frequency was due to at least two altered alleles for the hexosaminidase A alpha-subunit . We now report two mutant alleles in this Pennsylvania Dutch kindred , and one polymorphism . One allele , reported originally in a French TSD patient ( Akli et al . , 1991 ) , is a GT-- > AT transition at the donor splice-site of intron 9 . The second , a C-- > T transition at nucleotide 739 ( Arg247Trp ) , has been shown by Triggs-Raine et al . ( 1992 ) to be a clinically benign " pseudodeficient " allele associated with reduced enzyme activity against artificial substrate . Finally , a polymorphism [ G-- > A ( 759 ) ] , which leaves valine at codon 253 unchanged , is described'

请求

In [None]:
# Each instance is a dict that carries the text under the "content" key.
instances_list = [{"content": test_item}]

# Convert every instance dict into a protobuf Value message.
instances = [json_format.ParseDict(s, Value()) for s in instances_list]

# Build a PredictRequest only to display what gets sent.
prediction_request = aip.PredictRequest(
    endpoint=endpoint_id,
)
# `extend` adds each instance as its own element. `append(instances)` would
# insert the whole list as ONE nested element, so the printed request would not
# match the flat `instances` list that is passed to `predict()`.
prediction_request.instances.extend(instances)

print(MessageToJson(prediction_request.__dict__["_pb"]))

*示例输出*：
```
{
  "endpoint": "projects/116273516712/locations/us-central1/endpoints/8916247652891361280",
  "instances": [
    [
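      {
        "content": "Molecular basis of hexosaminidase A deficiency and pseudodeficiency in the Berks County Pennsylvania Dutch.\\tFollowing the birth of two infants with Tay-Sachs disease ( TSD ) , a non-Jewish , Pennsylvania Dutch kindred was screened for TSD carriers using the biochemical assay . A high frequency of individuals who appeared to be TSD heterozygotes was detected ( Kelly et al . , 1975 ) . Clinical and biochemical evidence suggested that the increased carrier frequency was due to at least two altered alleles for the hexosaminidase A alpha-subunit . We now report two mutant alleles in this Pennsylvania Dutch kindred , and one polymorphism . One allele , reported originally in a French TSD patient ( Akli et al . , 1991 ) , is a GT-- > AT transition at the donor splice-site of intron 9 . The second , a C-- > T transition at nucleotide 739 ( Arg247Trp ) , has been shown by Triggs-Raine et al . ( 1992 ) to be a clinically benign \" pseudodeficient \" allele associated with reduced enzyme activity against artificial substrate . Finally , a polymorphism [ G-- > A ( 759 ) ] , which leaves valine at codon 253 unchanged , is described"
      }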
    ]
  ]
}
```

#### 调用

In [None]:
# Send the online prediction for the prepared instances. Note that, despite its
# name, `request` is bound to whatever `predict()` returns.
request = clients["prediction"].predict(
    endpoint=endpoint_id,
    instances=instances,
)

#### 响应

In [None]:
# Show the protobuf message behind the predict() return value as JSON.
print(MessageToJson(vars(request)["_pb"]))

*示例输出*：
```
{
  "predictions": [
    {
      "displayNames": [
        "SpecificDisease",
        "SpecificDisease",
        "SpecificDisease",
        "Modifier",
        "Modifier",
        "Modifier"
      ],
      "confidences": [
        0.9995627999305725,
        0.9995783567428589,
        0.9995043873786926,
        0.9993286728858948,
        0.999272346496582,
        0.9993144273757935
      ],
      "textSegmentStartOffsets": [
        19.0,
        149.0,
        169.0,
        236.0,
        330.0,
        688.0
      ],
      "ids": [
        "7806436899697983488",
        "7806436899697983488",
        "7806436899697983488",
        "4347672385877442560",
        "4347672385877442560",
        "4347672385877442560"
      ],
      "textSegmentEndOffsets": [
        46.0,
        166.0,
        172.0,
        239.0,
        333.0,
        691.0
      ]
    }
  ],
  "deployedModelId": "3958065938133155840"
}
```

### [projects.locations.endpoints.undeployModel](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/undeployModel)

### [项目.位置.端点.取消部署模型](https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.endpoints/undeployModel)

#### 调用

In [None]:
# Ask the endpoint service to undeploy the model identified by
# `deployed_model_id`; an empty traffic split is passed along with it.
request = clients["endpoint"].undeploy_model(
    endpoint=endpoint_id, deployed_model_id=deployed_model_id, traffic_split=dict()
)

#### 响应

In [None]:
# Resolve the undeploy request, then show the underlying protobuf message as JSON.
result = request.result()

print(MessageToJson(vars(result)["_pb"]))

*示例输出*：
```
{}
```

# 清理工作

要清理此项目中使用的所有GCP资源，您可以删除用于教程的[GCP项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在此教程中创建的单个资源。

In [None]:
delete_dataset = True
delete_model = True
delete_endpoint = True
delete_pipeline = True
delete_batchjob = True
delete_bucket = True

# Delete the dataset using the Vertex AI fully qualified identifier for the dataset
try:
    if delete_dataset:
        clients["dataset"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the model using the Vertex AI fully qualified identifier for the model
try:
    if delete_model:
        clients["model"].delete_model(name=model_id)
except Exception as e:
    print(e)

# Delete the endpoint using the Vertex AI fully qualified identifier for the endpoint
try:
    if delete_endpoint:
        clients["endpoint"].delete_endpoint(name=endpoint_id)
except Exception as e:
    print(e)

# Delete the training pipeline using the Vertex AI fully qualified identifier for the training pipeline
try:
    if delete_pipeline:
        clients["pipeline"].delete_training_pipeline(name=training_pipeline_id)
except Exception as e:
    print(e)

# Delete the batch job using the Vertex AI fully qualified identifier for the batch job
try:
    if delete_batchjob:
        clients["job"].delete_batch_prediction_job(name=batch_job_id)
except Exception as e:
    print(e)


if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r gs://$BUCKET_NAME