In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 基于 Vertex AI 特征存储的 LLM 接地教程

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在 GitHub 上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/feature_store/vertex_ai_feature_store_based_llm_grounding_tutorial.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在 Vertex AI Workbench 中打开
    </a>
  </td>                                                                                               
</table>

## 概述

在本教程中，您将学习如何对用户提供的数据进行分块，并使用具有嵌入生成功能的Vertex LLM（大型语言模型）为每个块生成嵌入向量。然后，可以将生成的嵌入向量数据集加载到Vertex AI特征存储中，实现快速特征检索和高效的在线服务。

了解有关[Vertex AI特征存储](https://cloud.google.com/vertex-ai/docs/featurestore/overview)的更多信息。

### 目标

在本教程中，您将学习如何在特征提供（feature serving）与向量检索的端到端工作流中使用 Vertex AI Feature Store，并通过在线存储实例托管和提供BigQuery中的数据。

本教程使用以下谷歌云ML服务和资源：

- Vertex AI Feature Store

执行的步骤包括：

- 配置一个在线特征存储实例来托管和提供数据。
- 创建一个特征视图（`FeatureView`）实例来提供BigQuery表。
- 使用在线服务器搜索最近的邻居。

### 注意
这是一个预览版本。使用该功能即表示您已知晓其仍存在尚未解决的问题，并且该预览版本依据正式发布前（Pre-GA）服务条款按“原样”提供。

### 数据集

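本教程使用了来自BigQuery公共数据集的[Google专利公共数据](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data)。注意：下文“准备在Google Cloud存储（GCS）中的数据”一节中的代码实际下载的是 [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) 语料库（`nfcorpus.tar.gz`），并将其中的文本文件上传到GCS作为管道的输入。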

### 成本

此教程使用 Google Cloud 的可计费组件：

* Vertex AI
* BigQuery
* Cloud Storage

了解 [Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing)、[BigQuery 定价](https://cloud.google.com/bigquery/pricing)、[Cloud Storage 定价](https://cloud.google.com/storage/pricing)，
并使用 [定价计算器](https://cloud.google.com/products/calculator/) 根据您的预期使用量生成成本估算。

## 安装

安装以下所需的软件包以运行这个笔记本。

In [None]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform\
                                 google-cloud-bigquery\
                                 db-dtypes

! pip3 install --upgrade kfp -q --no-warn-conflicts

仅限 Colab：取消注释下面的单元格以重新启动内核。

In [None]:
# # Automatically restart the kernel after installation so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 开始之前

### 设置你的谷歌云项目

**无论你使用什么笔记本环境，下面的步骤都是必需的。**

1. [选择或创建一个谷歌云项目](https://console.cloud.google.com/cloud-resource-manager)。当你首次创建账户时，你会获得$300的免费信用用于支付计算/存储成本。

2. [确保为你的项目启用了计费功能](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

4. 如果你在本地运行这个笔记本，你需要安装[Cloud SDK](https://cloud.google.com/sdk)。

#### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
- 运行 `gcloud config list`。
- 运行 `gcloud projects list`。
- 查看支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)。

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

#### 地区

您也可以更改Vertex AI使用的`REGION`变量。了解有关[Vertex AI地区](https://cloud.google.com/vertex-ai/docs/general/locations)的更多信息。请注意，colab中显示的新的特征存储功能目前仅在以下地区提供：
* `us-central1`
* `us-east1`
* `us-west1`
* `europe-west4`
* `asia-southeast1`

In [None]:
REGION = "us-central1"  # @param {type: "string"}

### 验证您的谷歌云帐户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下说明进行操作：

**1. Vertex AI Workbench**
* 什么也不用做，因为您已经进行了身份验证。

**2. 本地JupyterLab实例，取消注释并运行：**

In [None]:
# ! gcloud auth login

**3. Colab，取消注释并运行：**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

4. 服务账户或其他
* 请查看如何授予您的服务账户云存储权限的方法，网址为https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples。

### 导入库

In [None]:
import uuid

from google.cloud import aiplatform, bigquery
from google.cloud.aiplatform_v1 import (FeatureOnlineStoreAdminServiceClient,
                                        FeatureOnlineStoreServiceClient)
from google.cloud.aiplatform_v1.types import NearestNeighborQuery
from google.cloud.aiplatform_v1.types import \
    feature_online_store as feature_online_store_pb2
from google.cloud.aiplatform_v1.types import \
    feature_online_store_admin_service as \
    feature_online_store_admin_service_pb2
from google.cloud.aiplatform_v1.types import \
    feature_online_store_service as feature_online_store_service_pb2
from google.cloud.aiplatform_v1.types import feature_view as feature_view_pb2

### 初始化用于 Python 的 Vertex AI SDK

为您的项目初始化用于 Python 的 Vertex AI SDK。

In [None]:
aiplatform.init(project=PROJECT_ID, location=REGION)

API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

## 设置和启动在线服务

要在Vertex AI特征存储中提供嵌入数据，请执行以下操作：

1. 在BigQuery中准备数据源。
2. 创建一个FeatureOnlineStore实例来托管数据。
3. 定义要由新创建实例提供的数据（`FeatureView`）。

为特征视图创建准备BigQuery数据源

In [None]:
GCS_BUCKET = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

只有当您的存储桶不存在时才运行以下单元格以创建您的云存储存储桶。

In [None]:
! gsutil mb -l {REGION} -p {PROJECT_ID} {GCS_BUCKET}

准备在Google Cloud存储（GCS）中的数据

In [None]:
INPUT_TEXT_GCS_DIR = f"{GCS_BUCKET}/fs_grounding/data"

import tarfile
from urllib.request import urlretrieve


def untar(file_name):
    """Extract a ``.tar.gz`` archive into a folder named after the archive.

    Args:
      file_name: Path to the archive. Its last seven characters (the length of
        the ``.tar.gz`` suffix) are dropped to form the output folder name.

    Returns:
      The path of the folder the archive was extracted into.
    """
    output_folder_name = file_name[:-7]
    # Open the archive in a context manager so its handle is always closed,
    # even when extraction raises part-way through.
    with tarfile.open(file_name) as archive:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        archive.extractall(output_folder_name)
    return output_folder_name


# Download data from https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz
url = "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz"
filename = "nfcorpus.tar.gz"
path, _ = urlretrieve(url, filename)
print(f"Downloaded {path}")

# Copy text files to GCS.
output_folder_name = f"{untar(path)}/nfcorpus"
dev_all_queries = f"{output_folder_name}/dev.all.queries"
dev_docs = f"{output_folder_name}/dev.docs"
! gsutil cp {dev_all_queries} {INPUT_TEXT_GCS_DIR}/queries
! gsutil cp {dev_docs} {INPUT_TEXT_GCS_DIR}/docs

创建BigQuery数据集

In [None]:
# BigQuery client bound to the tutorial project.
bq_client = bigquery.Client(project=PROJECT_ID)

BQ_DATASET_ID = "fs_grounding"  # @param {type:"string"}
dataset = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET_ID}")
dataset.location = REGION
# exists_ok=True ignores "already exists" errors, so the cell can be re-run.
dataset = bq_client.create_dataset(
    dataset, exists_ok=True, timeout=30
)  # Make an API request.

# Confirm dataset created. Print the fully-qualified "project.dataset" ID;
# interpolating the Dataset object itself would print its repr instead.
print(f"Created dataset {dataset.project}.{dataset.dataset_id}")

启动管道

In [None]:
# Random identifier for this run; it is embedded in the GCS output paths below
# and in the pipeline job name built by run_pipeline.
run_id = str(uuid.uuid4())

# Pipeline template to run, and the BigQuery tables used as batch prediction
# input and output (both live in the dataset created above).
PIPELINE_TEMPLATE_URI = "gs://vertex-evaluation-pipeline-templates/20240117_0005/feature_store_grounding_pipeline_pipeline.yaml"
BIGQUERY_BP_INPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_input"
BIGQUERY_BP_OUTPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET_ID}.batch_predict_output"

# Parameter values handed to the pipeline job.
# NOTE(review): the accepted keys and their meaning are defined by the pipeline
# template, which is not visible here — confirm against the template.
PARAMS = {
    "project": PROJECT_ID,
    "location": REGION,
    "bigquery_bp_input_uri": BIGQUERY_BP_INPUT_URI,
    "bigquery_bp_output_uri": BIGQUERY_BP_OUTPUT_URI,
    "input_text_gcs_dir": INPUT_TEXT_GCS_DIR,
    # Output locations are namespaced by run_id so separate runs do not collide.
    "output_text_gcs_dir": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_output",
    "output_error_file_path": f"{GCS_BUCKET}/fs_grounding_{run_id}/chunking_error_output",
    "model_name": "publishers/google/models/textembedding-gecko@latest",
    "generation_threshold_microseconds": "0",
}


def run_pipeline(
    parameters: dict,
    project: str,
    pipeline_root: str,
    location: str = "us-central1",
) -> aiplatform.PipelineJob:
    """Create and submit the feature store grounding pipeline job.

    The job's display name and job ID are the same string, built from a fixed
    prefix, the pipeline name and the notebook-level ``run_id``. The pipeline
    definition is read from the notebook-level ``PIPELINE_TEMPLATE_URI``.

    Args:
      parameters: Parameter values passed to the pipeline job.
      project: Google Cloud project used to initialize the Vertex AI SDK.
      pipeline_root: Passed through as the job's ``pipeline_root``.
      location: Region used to initialize the Vertex AI SDK.

    Returns:
      The ``aiplatform.PipelineJob`` after ``submit()`` has been called on it.
    """
    aiplatform.init(project=project, location=location)

    test_prefix = "your-test-prefix"  # @param {type:"string"}
    pipeline_name = "feature-store-grounding-pipeline"  # @param {type:"string"}

    # A single identifier is used for both the display name and the job ID.
    job_identifier = f"{test_prefix}-{pipeline_name}-{run_id}"
    job_options = dict(
        display_name=job_identifier,
        template_path=PIPELINE_TEMPLATE_URI,
        job_id=job_identifier,
        pipeline_root=pipeline_root,
        parameter_values=parameters,
        enable_caching=False,
    )

    pipeline_job = aiplatform.PipelineJob(**job_options)
    pipeline_job.submit()
    return pipeline_job


# Submit the pipeline with the parameters defined above, writing pipeline
# artifacts under the tutorial bucket.
job = run_pipeline(
    parameters=PARAMS,
    project=PROJECT_ID,
    pipeline_root=f"{GCS_BUCKET}/fs_based/pipeline_root",
    location=REGION,
)
# Wait for the job here: the following cells read the BigQuery tables that the
# pipeline is given as its batch prediction input and output.
job.wait()

#### BQ 格式转换

In [None]:
def compose_bq_query_format_conversion(
    bigquery_bp_input_uri: str, bigquery_bp_output_uri: str
) -> str:
    """Build the BigQuery UPDATE statement that fills the ``embedding`` column.

    The statement updates the input table's ``embedding`` column with the
    float64 array parsed from ``$.embeddings.values`` of each element of the
    output table's ``predictions`` column, matching rows on
    ``vertex_generated_chunk_id``.

    Args:
      bigquery_bp_input_uri: URI of the BigQuery table used as batch prediction
        input; this is the table being updated. A leading ``bq://`` is removed.
      bigquery_bp_output_uri: URI of the BigQuery table holding the batch
        prediction output. A leading ``bq://`` is removed.

    Returns:
      The complete SQL statement, terminated by a semicolon.
    """
    scheme = "bq://"

    def _table_path(uri: str) -> str:
        # Only URIs that start with the scheme are rewritten.
        return uri.replace(scheme, "") if uri.startswith(scheme) else uri

    dest_table = _table_path(bigquery_bp_input_uri)
    source_table = _table_path(bigquery_bp_output_uri)

    # Fragments are joined with single spaces to form one statement.
    clauses = [
        f"UPDATE `{dest_table}` destTable",
        "SET embedding=ARRAY( select cast (str_element as float64) from",
        "unnest(JSON_VALUE_ARRAY(prediction, '$.embeddings.values')) as",
        "str_element)",
        "FROM (SELECT vertex_generated_chunk_id, prediction FROM",
        f"`{source_table}` cross join",
        "unnest(JSON_EXTRACT_ARRAY(predictions)) as prediction) sourceTable",
        "WHERE",
        "destTable.vertex_generated_chunk_id=sourceTable.vertex_generated_chunk_id",
    ]
    return " ".join(clauses) + ";"


bq_query = compose_bq_query_format_conversion(
    bigquery_bp_input_uri=BIGQUERY_BP_INPUT_URI,
    bigquery_bp_output_uri=BIGQUERY_BP_OUTPUT_URI,
)

bq_job = bq_client.query(bq_query)
bq_job.result()

### 初始化管理员服务客户端

加载特征存储（Feature Store）SDK。

In [None]:
admin_client = FeatureOnlineStoreAdminServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

### 创建在线存储实例

创建一个启用了嵌入管理功能的在线特征存储（`FeatureOnlineStore`）实例。

In [None]:
FEATURE_ONLINE_STORE_ID = "my_feature_online_store_unique"  # @param {type: "string"}

In [None]:
online_store_config = feature_online_store_pb2.FeatureOnlineStore(
    optimized=feature_online_store_pb2.FeatureOnlineStore.Optimized(),
)

create_store_lro = admin_client.create_feature_online_store(
    feature_online_store_admin_service_pb2.CreateFeatureOnlineStoreRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}",
        feature_online_store_id=FEATURE_ONLINE_STORE_ID,
        feature_online_store=online_store_config,
    )
)

### 验证在线商店实例创建

长时间运行的操作（LRO）完成后，显示结果。

> **注意：** 该操作可能需要最多10分钟才能完成。

In [None]:
# Wait for the LRO to finish and get the LRO result.
print(create_store_lro.result())

通过检索在线商店实例来验证“FeatureOnlineStore”实例的创建

In [None]:
# Use get to verify the store is created.
admin_client.get_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

列出该位置（location）下的所有在线存储实例

In [None]:
# Use list to verify the store is created.
admin_client.list_feature_online_stores(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}"
)

### 创建特征视图实例

在创建 `FeatureOnlineStore` 实例之后，您可以定义要提供的特征。要做到这一点，创建一个 `FeatureView` 实例，其中指定以下内容：

* 数据源（BigQuery表或视图URI或 `FeatureGroup/features`）与用于提供服务的 `FeatureOnlineStore` 实例同步。
* [cron](https://en.wikipedia.org/wiki/Cron) 计划以运行同步管道。

在创建特征视图时，将安排一个同步作业，要么立即开始，要么按照 cron 计划。在同步作业中，数据将被导出，索引将被构建并部署到 GKE 集群。

In [None]:
FEATURE_VIEW_ID = "fs_grounding_test_new"  # @param {type: "string"}
# A schedule will be created based on cron setting.
# If cron is unspecified, a sync job is started immediately.
CRON_SCHEDULE = "TZ=America/Los_Angeles 00 13 11 8 *"  # @param {type: "string"}

In [None]:
# Index building configs
DIMENSIONS = 768  # @param {type: "number"}
EMBEDDING_COLUMN = "embedding"  # @param {type: "string"}
# Optional
LEAF_NODE_EMBEDDING_COUNT = 10000  # @param {type: "number"}
# Optional
# CROWDING_COLUMN = "cited_by_filing_date"  # @param {type: "string"}
# # Optional
# FILTER_COLUMNS = ["country"]  # @param

In [None]:
DATA_SOURCE = BIGQUERY_BP_INPUT_URI

In [None]:
# BigQuery table backing the feature view; rows are keyed by the chunk ID.
big_query_source = feature_view_pb2.FeatureView.BigQuerySource(
    uri=DATA_SOURCE, entity_id_columns=["vertex_generated_chunk_id"]
)

sync_config = feature_view_pb2.FeatureView.SyncConfig(cron=CRON_SCHEDULE)

index_config = feature_view_pb2.FeatureView.IndexConfig(
    embedding_column=EMBEDDING_COLUMN,
    # filter_columns=FILTER_COLUMNS,
    # crowding_column=CROWDING_COLUMN,
    embedding_dimension=DIMENSIONS,
    # Apply the LEAF_NODE_EMBEDDING_COUNT setting from the index-config cell;
    # an empty TreeAHConfig() would leave that setting without any effect.
    tree_ah_config=feature_view_pb2.FeatureView.IndexConfig.TreeAHConfig(
        leaf_node_embedding_count=LEAF_NODE_EMBEDDING_COUNT
    ),
)

print(f"index_config: {index_config}")

# Returns a long-running operation; its result is read in the next cell.
create_view_lro = admin_client.create_feature_view(
    feature_online_store_admin_service_pb2.CreateFeatureViewRequest(
        parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}",
        feature_view_id=FEATURE_VIEW_ID,
        feature_view=feature_view_pb2.FeatureView(
            big_query_source=big_query_source,
            sync_config=sync_config,
            index_config=index_config,
        ),
    )
)

等待LRO完成并显示结果。

In [None]:
print(create_view_lro.result())

验证特征视图创建

通过检索特征视图来验证`FeatureView`实例创建。

In [None]:
admin_client.get_feature_view(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

通过列出在线存储中的所有特征视图，验证`FeatureView`实例已创建。

In [None]:
admin_client.list_feature_views(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

In [None]:
# Optional: Delete feature views to avoid exceeding the deployed index nodes quota.
# views = admin_client.list_feature_views(
#     parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
# )
# for view in views:
#     admin_client.delete_feature_view(name=view.name)

特征视图同步

同步管道根据“FeatureView”实例中指定的计划执行。

要跳过等待并立即执行同步管道，请手动启动同步。

In [None]:
sync_response = admin_client.sync_feature_view(
    feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

`sync_response`包含同步作业的ID。

#### 使用`get_feature_view_sync`来检查作业的状态

In [None]:
import time

# Poll the sync job every 30 seconds until it reports an end time, then print
# whether it succeeded or failed.
while True:
    feature_view_sync = admin_client.get_feature_view_sync(
        name=sync_response.feature_view_sync
    )
    if feature_view_sync.run_time.end_time.seconds <= 0:
        # No end time recorded yet, so the sync is still running.
        print("Sync ongoing, waiting for 30 seconds.")
        time.sleep(30)
        continue

    # A final status code of 0 is reported as success; anything else as failure.
    if feature_view_sync.final_status.code == 0:
        status = "Succeed"
    else:
        status = "Failed"
    print(f"Sync {status} for {feature_view_sync.name}.")
    # wait a little more for the job to properly shutdown
    time.sleep(30)
    break

使用`list_feature_view_syncs`来查看您所有的同步。

In [None]:
admin_client.list_feature_view_syncs(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

### 开始在线服务

数据同步完成后，即可使用`FetchFeatureValues`和`SearchNearestEntities` API来检索数据。

获取公共终端点域名。

In [None]:
# Fetch the online store and read its public endpoint domain name; the data
# client created in a later cell connects to this endpoint.
featore_online_store_instance = admin_client.get_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)
PUBLIC_ENDPOINT = (
    featore_online_store_instance.dedicated_serving_endpoint.public_endpoint_domain_name
)

print(f"PUBLIC_ENDPOINT for online serving: {PUBLIC_ENDPOINT}")

初始化数据客户端

In [None]:
# It will take some time for the DNS to be fully ready
time.sleep(300)

data_client = FeatureOnlineStoreServiceClient(
    client_options={"api_endpoint": PUBLIC_ENDPOINT}
)

使用`ENTITY_ID`搜索

In [None]:
# Read a single row from the feature view's source table so that a real chunk
# ID is available to query with.
bq_query = f'SELECT * FROM `{BIGQUERY_BP_INPUT_URI.replace("bq://", "")}` LIMIT 1'

bq_query_job = bq_client.query(bq_query)
result = bq_query_job.result().to_dataframe()

print(result)

# Entity ID used by the search and fetch calls below.
# NOTE(review): indexing [0] raises if the table is empty — confirm the
# pipeline populated the table before running this cell.
ENTITY_ID = result["vertex_generated_chunk_id"][0]

In [None]:
# A vertex_generated_chunk_id for testing
data_client.search_nearest_entities(
    request=feature_online_store_service_pb2.SearchNearestEntitiesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        query=NearestNeighborQuery(
            entity_id=ENTITY_ID,
            neighbor_count=5,
        ),
        return_full_entity=True,  # returning entities with metadata
    )
)

#### 使用 `Embedding` 搜索

In [None]:
EMBEDDINGS = [1] * DIMENSIONS

In [None]:
data_client.search_nearest_entities(
    request=feature_online_store_service_pb2.SearchNearestEntitiesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        query=NearestNeighborQuery(
            embedding=NearestNeighborQuery.Embedding(value=EMBEDDINGS),
            neighbor_count=10,
        ),
        return_full_entity=True,  # returning entities with metadata
    )
)

使用`FetchFeatureValues` API来检索完整的数据，而无需搜索

In [None]:
data_client.fetch_feature_values(
    request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        data_key=feature_online_store_service_pb2.FeatureViewDataKey(key=ENTITY_ID),
    )
)

## 清理

为了清理这个项目中使用的所有谷歌云资源，请[删除您用于本教程的谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，删除您在本教程中创建的个别资源。

In [None]:
# Delete Feature View
admin_client.delete_feature_view(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

# Delete Feature Online Store
# NOTE(review): the result of the delete call above is not waited on before
# this one; force=True is passed here — confirm it covers a feature view
# whose deletion is still in progress.
admin_client.delete_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}",
    force=True,
)

# Delete Cloud Storage objects that were created
# The bucket is only removed when delete_bucket is set to True or the
# IS_TESTING environment variable is set.
# NOTE(review): the BigQuery dataset created earlier in this notebook is not
# deleted by this cell; remove it separately if it is no longer needed.
import os

delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $GCS_BUCKET