In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用 Vertex AI Feature Store 在线特征服务和向量检索 BigQuery 数据


<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_and_vector_retrieval_bigquery_data_with_feature_store.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fblob%2Fmain%2Fnotebooks%2Fofficial%2Ffeature_store%2Fonline_feature_serving_and_vector_retrieval_bigquery_data_with_feature_store.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在 Colab Enterprise 中打开
    </a>
  </td>  
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/online_feature_serving_and_vector_retrieval_bigquery_data_with_feature_store.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在 Workbench 中打开
    </a>
  </td> 
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_and_vector_retrieval_bigquery_data_with_feature_store.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在 GitHub 上查看
    </a>
  </td>                                                                                              
</table>

## 概述

本教程演示如何使用 Vertex AI Feature Store 对 BigQuery 中的特征值进行在线服务和向量检索。

了解更多关于[Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore/overview)。

### 目标

在本教程中，您将学习如何创建并使用在线存储实例来托管和提供 BigQuery 中的数据，并借助 Vertex AI Feature Store 完成端到端的特征服务和向量检索流程。

本教程使用以下Google Cloud ML服务和资源：

- Vertex AI Feature Store

执行的步骤包括：

- 配置一个在线特征存储实例来托管和提供数据。
- 创建一个特征视图（FeatureView）来提供 BigQuery 表中的数据。
- 使用在线服务器搜索最近的邻居。

### 注意
此功能目前为公开预览版。使用该功能即表示您确认已了解其尚存的问题，并知悉该预览版按“原样”提供，且受预发布服务条款约束。

### 数据集

本教程使用了来自BigQuery公共数据集的[Google专利公开数据](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data)数据集。

### 成本

本教程使用Google Cloud的可计费组件：

* Vertex AI
* BigQuery

了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing) 和
[BigQuery定价](https://cloud.google.com/bigquery/pricing)
并使用[定价计算器](https://cloud.google.com/products/calculator/)
基于您的预期使用量生成成本估算。

## 开始吧

### 为 Python 安装 Vertex AI SDK 和其他所需的软件包

In [None]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform\
                                 google-cloud-bigquery\
                                 db-dtypes

### 重新启动运行时（仅限Colab）

为了使用新安装的软件包，您必须在Google Colab上重新启动运行时。

In [None]:
import sys

# Only Colab needs an explicit kernel restart to pick up the freshly
# installed packages; in other environments this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # do_shutdown(True) asks the kernel to shut down with restart=True.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ 内核即将重新启动，请等待它完成后再继续进行下一步。⚠️</b>
</div>

### 在 Google Colab 上对笔记本环境进行身份验证

如果您在 Google Colab 上运行此笔记本，请先完成身份验证。

In [None]:
import sys

# sys is imported again because the previous cell may have restarted the kernel.
# The interactive sign-in is only triggered when running inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### 设置谷歌云项目信息并为 Python 初始化 Vertex AI SDK

要开始使用 Vertex AI，您必须拥有一个现有的谷歌云项目并启用 [Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于 [设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment) 的信息。

In [None]:
from google.cloud import aiplatform

# Replace the placeholder with your own project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type: "string"}

# Point the Vertex AI SDK at the chosen project and location.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Host name built from the location: "<LOCATION>-aiplatform.googleapis.com".
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(LOCATION)

### 导入库

In [None]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1.types import NearestNeighborQuery
from vertexai.resources.preview import (FeatureOnlineStore, FeatureView,
                                        FeatureViewBigQuerySource)
from vertexai.resources.preview.feature_store import utils

## 在BigQuery中设置数据源

### 要求
数据源必须是一个BigQuery表或BigQuery视图，具有以下列要求：
1. [*必需*] 一个实体id列，类型：字符串
2. [*必需*] 一个嵌入列，类型：double数组
3. [*可选*] 一个或多个过滤列，类型：字符串或字符串数组
4. [*可选*] 一个拥挤（crowding）列，类型：整数。拥挤机制用于保证结果的多样性：在返回的总共 k 个邻居中，具有同一拥挤属性值的邻居最多只返回 k' 个（k' < k）。

### 测试数据源

从 `patents-public-data.google_patents_research.publications_202304` 表中选择一部分列，并排除 Feature Store 不支持的重复记录（REPEATED RECORD）类型的列：

为了演示目的创建一个小数据集（<=100MB），如果需要可以使用完整数据集。

In [None]:
# Full extraction query over the public patents table.
# - embedding_v1 is exposed under the alias "embedding".
# - title_translated / abstract_translated are cast to INT.
# - only the filing_date of the first element of cited_by and of similar is
#   kept (safe_offset(0)), instead of the whole repeated record.
FEATURE_EXTRACT_QUERY_FULL = """
SELECT publication_number, embedding_v1 as embedding, url, country, publication_description,
cpc_low, cpc_inventive_low, top_terms, title, CAST(title_translated as INT) as title_translated,
abstract, CAST(abstract_translated as INT) as abstract_translated,
cited_by[safe_offset(0)].filing_date as cited_by_filing_date,
similar[safe_offset(0)].filing_date as similar_filing_date
FROM `patents-public-data.google_patents_research.publications_202304`
"""

# Demo-sized variant: same projection, restricted to rows that have a
# cited_by filing date and capped at 1000 rows.
FEATURE_EXTRACT_QUERY_SMALL = (
    FEATURE_EXTRACT_QUERY_FULL
    + " WHERE cited_by[safe_offset(0)].filing_date is not NULL"
    + " LIMIT 1000"
)

该数据源包含可用作过滤列的列（例如 `country`）和可用作拥挤列的列（例如 `cited_by_filing_date`）。以下是本指南所用专利出版物数据表的架构：

|列名 | 类型 | 模式 |
|------------|--------|----------|
|publication_number | STRING | NULLABLE |
|embedding | FLOAT | REPEATED |
|url | STRING | NULLABLE |
|country | STRING | NULLABLE |
|publication_description | STRING | NULLABLE |
|cpc_low | STRING | REPEATED |
|cpc_inventive_low | STRING | REPEATED |
|top_terms | STRING | REPEATED |
|title | STRING | NULLABLE |
|title_translated | INTEGER | NULLABLE |
|abstract | STRING | NULLABLE |
|abstract_translated | INTEGER | NULLABLE |
|cited_by_filing_date | INTEGER | NULLABLE |
|similar_filing_date | INTEGER | NULLABLE |

查看检索到的数据。

In [None]:
# BigQuery client bound to the tutorial project; later cells reuse it.
bq_client = bigquery.Client(project=PROJECT_ID)

# Run the demo-sized query and load the rows into a DataFrame.
small_query_job = bq_client.query(FEATURE_EXTRACT_QUERY_SMALL)
product_data = small_query_job.result().to_dataframe()

print(product_data.shape)
product_data.head()

### 创建 BigQuery 数据集

创建一个 BigQuery 数据集来保存本教程所需的 BigQuery 表。数据集必须与您的在线存储实例位于同一区域。由于本教程的源数据位于 `US` 区域，您可以将数据集复制到所需的区域。如果您使用自己的数据和数据集，也可以直接使用该数据集来创建 BigQuery 表。

In [None]:
def create_bq_dataset(datasetId, region):
    """Create a BigQuery dataset in the tutorial project and report it.

    Relies on the notebook globals ``PROJECT_ID`` and ``bq_client``.

    Args:
        datasetId: Dataset ID (without the project prefix).
        region: Value assigned to the dataset's ``location``.
    """
    dataset_spec = bigquery.Dataset(f"{PROJECT_ID}.{datasetId}")
    dataset_spec.location = region

    # exists_ok=True: an already existing dataset is not treated as an error.
    dataset = bq_client.create_dataset(dataset_spec, exists_ok=True, timeout=30)

    print(f"Created dataset {dataset} in region {region}")

In [None]:
# Step 1: make sure a dataset exists to hold the feature store source table.
BQ_DATASET_ID = "featurestore_demo_us"  # @param {type:"string"}
create_bq_dataset(datasetId=BQ_DATASET_ID, region="US")

#### 创建一个 BigQuery 表

In [None]:
# Second, execute the query and store the results into a table
BQ_TABLE_ID = "publications_202304_small"  # @param {type:"string"}
BQ_TABLE_ID_FQN = f"{PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}"

# Write the query result into the destination table instead of a temporary one.
job_config = bigquery.QueryJobConfig(destination=BQ_TABLE_ID_FQN)
query_job = bq_client.query(FEATURE_EXTRACT_QUERY_SMALL, job_config=job_config)

try:
    query_job.result()
except Exception as e:
    # Best effort: a failure here (e.g. the table already exists) must not
    # stop the notebook. Not every exception has a ``message`` attribute, so
    # fall back to the exception itself.
    print("Error: ", getattr(e, "message", e))
else:
    # Report success only when the job actually completed.
    print(f"Created table: {BQ_TABLE_ID}")

# URI form of the table, used as the feature view source later on.
DATA_SOURCE = f"bq://{BQ_TABLE_ID_FQN}"

## 设置并启动在线服务

为了在 Feature Store 中提供嵌入（embedding）数据，您需要执行以下操作：

1. 创建一个在线存储集群来托管数据。
2. 定义要由新创建的实例提供的数据（FeatureView）。

### 创建特征在线存储

创建一个经过优化的特征在线存储（FeatureOnlineStore）。长时间运行的操作 (LRO) 完成后，将显示结果。

> **注意：** 这个操作可能需要长达 10 分钟的时间才能完成。

In [None]:
# ID of the feature online store that the following cells create and look up.
FEATURE_ONLINE_STORE_ID = "my_feature_online_store_unique"  # @param {type: "string"}

In [None]:
# Create an optimized feature online store with the configured ID and keep a handle to it.
my_fos = FeatureOnlineStore.create_optimized_store(FEATURE_ONLINE_STORE_ID)

In [None]:
# Display the full resource description of the feature online store created above.
my_fos.gca_resource

通过获取在线存储实例，验证 `FeatureOnlineStore` 实例是否已创建。

In [None]:
# Look the store up again by its ID and display its resource, to verify that it was created.
FeatureOnlineStore(FEATURE_ONLINE_STORE_ID).gca_resource

列出此区域中的所有在线存储。

In [None]:
# List the feature online stores and print each one's resource description,
# to verify that the new store shows up.
all_foss = FeatureOnlineStore.list()
for fos in all_foss:
    print(fos.gca_resource)

#### 可选：使用已有的在线存储

In [None]:
# another_fos = FeatureOnlineStore("an_existing_feature_online_store")  # @param {type: "string"}

### 创建特征视图实例

在创建`FeatureOnlineStore`实例之后，您可以定义要使用的特征。要做到这一点，创建一个`FeatureView`实例，其中指定以下内容：

* 一个数据源（BigQuery表或视图URI或FeatureGroup/features）同步到用于提供服务的`FeatureOnlineStore`实例。
* 运行同步流水线的cron计划。

在特征视图创建过程中，会安排一个同步作业，可以立即启动或按照cron计划启动。在同步作业中，数据将被导出，索引被构建，并部署到特征存储后端。

In [None]:
# ID of the feature view created inside the online store.
FEATURE_VIEW_ID = "feature_view_publications"  # @param {type: "string"}
# A schedule is created based on cron setting.
# Fields are "minute hour day-of-month month day-of-week", so this value means
# 13:00 on August 11, evaluated in the America/Los_Angeles time zone.
CRON_SCHEDULE = "TZ=America/Los_Angeles 00 13 11 8 *"  # @param {type: "string"}

In [None]:
# Index configs
# Length of each embedding vector (presumably must match the arrays stored in
# EMBEDDING_COLUMN; verify against the source table).
DIMENSIONS = 64  # @param {type: "number"}
# Column of the source table that holds the embedding; the extraction query
# exposes embedding_v1 under this name.
EMBEDDING_COLUMN = "embedding"  # @param {type: "string"}
# Optional
LEAF_NODE_EMBEDDING_COUNT = 10000  # @param {type: "number"}
# Optional: integer column used for crowding.
CROWDING_COLUMN = "cited_by_filing_date"  # @param {type: "string"}
# Optional: columns that nearest-neighbor queries may filter on.
FILTER_COLUMNS = ["country"]  # @param

In [None]:
# Source of the feature view: the BigQuery table written earlier, keyed by
# publication_number.
big_query_source = FeatureViewBigQuerySource(
    uri=DATA_SOURCE, entity_id_columns=["publication_number"]
)

# Vector index settings. LEAF_NODE_EMBEDDING_COUNT is forwarded to the tree-AH
# algorithm config so that the value chosen in the form above takes effect
# instead of being silently ignored.
index_config = utils.IndexConfig(
    embedding_column=EMBEDDING_COLUMN,
    dimensions=DIMENSIONS,
    crowding_column=CROWDING_COLUMN,
    filter_columns=FILTER_COLUMNS,
    algorithm_config=utils.TreeAhConfig(
        leaf_node_embedding_count=LEAF_NODE_EMBEDDING_COUNT
    ),
)

print(f"index_config: {index_config}")

# Create the feature view inside the online store created earlier.
my_fv = my_fos.create_feature_view(
    FEATURE_VIEW_ID,
    source=big_query_source,
    sync_config=CRON_SCHEDULE,  # Optional, can be set to None.
    index_config=index_config,
)

In [None]:
# Display the full resource description of the feature view created above.
my_fv.gca_resource

### 验证特征视图实例创建

通过获取特征视图来验证`FeatureView`实例是否已创建。

In [None]:
# Look the feature view up by its ID within the online store and display its
# resource, to verify that it was created.
FeatureView(
    FEATURE_VIEW_ID, feature_online_store_id=FEATURE_ONLINE_STORE_ID
).gca_resource

通过列出在线存储中的所有特征视图，验证 `FeatureView` 实例是否已创建。

In [None]:
# List the feature views of the online store and print each one's resource description.
all_fvs = FeatureView.list(feature_online_store_id=FEATURE_ONLINE_STORE_ID)
for fv in all_fvs:
    print(fv.gca_resource)

### 特征视图同步

同步流水线根据`FeatureView`实例中指定的时间表执行。

要跳过等待并立即执行同步流水线，请手动启动同步。

In [None]:
# Start a sync of the feature view now instead of waiting for the cron schedule.
sync_response = my_fv.sync()

`sync_response` 包含了同步作业的 ID。

#### 使用 `get_sync` 检查作业的状态

In [None]:
import time

# The sync ID is the last path segment of the sync's resource name. Taking the
# last segment avoids depending on a fixed position in the path, and it is
# computed once because it does not change between polls.
sync_id = sync_response.resource_name.split("/")[-1]

# Poll every 30 seconds until the sync reports an end time.
while True:
    feature_view_sync = my_fv.get_sync(sync_id).gca_resource
    if feature_view_sync.run_time.end_time.seconds > 0:
        # A final status code of 0 is reported as success.
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}. \n {feature_view_sync}")
        # wait a little more for the job to properly shutdown
        time.sleep(30)
        break
    print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)

使用 `list_syncs` 查看该特征视图的所有同步作业。

In [None]:
# List the sync runs of the feature view created in this notebook.
# my_fv is used explicitly: a loop variable left over from an earlier listing
# cell could point at a different feature view.
fv_syncs = my_fv.list_syncs()
for fv_sync in fv_syncs:
    print(f" * {fv_sync.gca_resource}")

## 开始在线服务

数据同步完成后，使用`FetchFeatureValuesRequest`和`SearchNearestEntities` API来检索数据。

#### 设置 `NearestNeighborQuery.StringFilter`

In [None]:
# Pick one publication number from the source table to use as the sample entity.
sample_query = f"select publication_number from {BQ_TABLE_ID_FQN} limit 1"
results_df = bq_client.query(sample_query).result().to_dataframe()

ENTITY_ID = results_df.loc[0, "publication_number"]
print(f"Sample publication number: {ENTITY_ID}")

In [None]:
# String filter on the "country" column (one of FILTER_COLUMNS), passed to the
# searches below via string_filters.
country_filter = NearestNeighborQuery.StringFilter(
    name="country",
    allow_tokens=["WIPO (PCT)"],  # try different allow tokens
    deny_tokens=["United States"],  # try different deny tokens
)

#### 使用`ENTITY_ID`进行搜索

In [None]:
# It takes some time for the DNS to be fully ready
# NOTE: `time` is imported in the sync-polling cell above; run that cell first.
time.sleep(300)

# Search for the 5 nearest neighbors of the sample entity, applying the country filter.
my_fv.search(
    entity_id=ENTITY_ID,
    neighbor_count=5,
    string_filters=[country_filter],
    return_full_entity=True,  # returning entities with metadata
)

#### 使用 `embedding` 进行搜索

In [None]:
# Dummy query vector: DIMENSIONS entries, all equal to 1.
EMBEDDINGS = [1] * DIMENSIONS

In [None]:
# Search for the 10 nearest neighbors of the query vector, applying the country filter.
my_fv.search(
    embedding_value=EMBEDDINGS,
    neighbor_count=10,
    string_filters=[country_filter],
    return_full_entity=True,  # returning entities with metadata
)

#### 使用 `FetchFeatureValues` API 直接检索完整数据（无需搜索）

In [None]:
# Read the stored feature values of the sample entity directly by key.
my_fv.read(key=[ENTITY_ID])

## 清理

要清理此项目中使用的所有谷歌云资源，您可以[删除用于本教程的谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的个别资源。

In [None]:
# Delete Feature View
my_fv.delete()

# Delete Feature Online Store
my_fos.delete(force=True)

# Delete the BigQuery dataset. Uncomment and run the command below if you want to delete the BigQuery dataset.
# Do this only if the dataset was created for this demo: delete_contents=True also removes its tables.
# bq_client.delete_dataset(BQ_DATASET_ID, delete_contents=True, not_found_ok=True)