In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用 Vertex AI Vector Search 和 Vertex AI embeddings for text 处理 StackOverflow 问题
![ ](https://www.google-analytics.com/collect?v=2&tid=G-L6X3ECH596&cid=1&en=page_view&sid=1&dt=sdk_vector_search_create_stack_overflow_embeddings_vertex.ipynb&dl=notebooks%2Fofficial%2Fvector_search%2Fsdk_vector_search_create_stack_overflow_embeddings_vertex.ipynb)
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/vector_search/sdk_vector_search_create_stack_overflow_embeddings_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"><br> 在Colab中运行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fvector_search%2Fsdk_vector_search_create_stack_overflow_embeddings_vertex.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在Colab Enterprise中打开
    </a>
  </td> 
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/vector_search/sdk_vector_search_create_stack_overflow_embeddings_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br>
      在GitHub上查看
    </a>
  </td>
      <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/vector_search/sdk_vector_search_create_stack_overflow_embeddings_vertex">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br>
      在Vertex AI Workbench中打开
    </a>
  </td>
</table>

## 概述

此示例演示了如何使用Vertex AI文本嵌入和StackOverflow数据集对文本嵌入进行编码。这些数据被上传到Vertex AI矢量搜索服务，这是一个用于在大型语料库中查找相似向量的高规模、低延迟的解决方案。矢量搜索是一个完全托管的产品，进一步降低了运营成本。它是基于Google研究开发的[近似最近邻（ANN）技术](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html)构建的。

了解有关[Vertex AI矢量搜索](https://cloud.google.com/vertex-ai/docs/vector-search/overview)和[Vertex AI文本嵌入](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)的更多信息。

### 目标

在这个笔记本中，您将学习如何将文本编码为嵌入向量，创建一个近似最近邻（ANN）索引，并针对该索引进行查询。

本教程使用以下 Google Vertex AI 服务:

- Vertex AI Vector Search
- Vertex AI embeddings for text

执行的步骤包括:

* 将 BigQuery 数据集转换为嵌入。
* 创建一个索引。
* 将嵌入上传到索引。
* 创建一个索引端点。
* 将索引部署到索引端点。
* 执行在线查询。

### 数据集

本教程使用的数据集是[StackOverflow数据集](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow)。

> Stack Overflow 是程序员学习、分享知识和推进职业发展的最大在线社区。这个BigQuery数据集每季度更新一次，包括Stack Overflow内容的存档，包括帖子、投票、标签和徽章。该数据集更新以反映互联网档案馆上的Stack Overflow内容，并且还可通过Stack Exchange数据浏览器获取。

开始吧

### 为Python安装Vertex AI SDK和其他必需的软件包

In [None]:
# Install the packages
! pip3 install --upgrade google-cloud-aiplatform \
                        google-cloud-storage \
                        'google-cloud-bigquery[pandas]' 

### 重新启动运行时（仅限Colab）

要使用新安装的软件包，您必须重新启动Google Colab上的运行时。

In [None]:
import sys

# Restart the kernel only when running inside Google Colab, where newly
# installed packages are not usable until the runtime has been restarted.
if "google.colab" in sys.modules:

    import IPython

    app = IPython.Application.instance()
    # The True argument requests a restart rather than a plain shutdown.
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️内核将重新启动。请等待完成后再继续下一步。⚠️</b>
</div>

### 对笔记本环境进行身份验证（仅适用于Colab）

在Google Colab上对您的环境进行身份验证。

In [None]:
import sys

# Interactive user authentication is only performed when running in Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### 设置Google Cloud项目信息并初始化Python的Vertex AI SDK

要开始使用Vertex AI，您必须具有现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
# Google Cloud project and region used by the rest of the notebook.
# Replace the placeholder project ID with your own before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

import vertexai

# Set the default project and location for the Vertex AI SDK.
vertexai.init(project=PROJECT_ID, location=LOCATION)

创建一个云存储桶

创建一个存储桶来存储中间产物，例如数据集。

In [None]:
# Cloud Storage bucket URI for intermediate artifacts; the name embeds PROJECT_ID.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

如果您的存储桶尚不存在：运行以下单元格以创建您的云存储存储桶。

In [None]:
# Create the bucket in LOCATION under PROJECT_ID (only needed if it does not exist yet).
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

准备数据

您可以使用托管在BigQuery上的[Stack Overflow数据集](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow)中的问题和答案。

> 这个公共数据集托管在Google BigQuery中，并包含在BigQuery的每月1TB免费处理额度中。这意味着每个用户每月可以获得1TB的免费BigQuery处理，可用于在这个公共数据集上运行查询。

BigQuery表太大而无法放入内存，因此您需要编写一个名为`query_bigquery_chunks`的生成器，用于产生数据框的块进行处理。此外，还添加了一个额外的列`title_with_body`，它是问题标题和正文的连接。

In [None]:
import math
from typing import Any, Generator

import pandas as pd
from google.cloud import bigquery

# BigQuery client bound to the configured project; used by query_bigquery_chunks.
client = bigquery.Client(project=PROJECT_ID)
# Paginated query template: {limit} and {offset} are filled in for every chunk.
# NOTE(review): the ORDER BY sits inside the subquery while LIMIT/OFFSET apply
# to the outer SELECT DISTINCT — confirm the row order is stable across chunks.
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title, q.body
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` where Score>0 ORDER BY View_Count desc) AS q 
        LIMIT {limit} OFFSET {offset};
        """


def query_bigquery_chunks(
    max_rows: int, rows_per_chunk: int, start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    """Yield the query results as a sequence of DataFrame chunks.

    Args:
        max_rows: Exclusive upper bound on the row offsets that are queried.
        rows_per_chunk: Number of rows requested per query (the SQL LIMIT).
        start_chunk: Zero-based index of the first chunk to fetch; earlier
            chunks are skipped.

    Yields:
        One DataFrame per query, holding the selected columns plus a
        ``title_with_body`` column (title and body joined by a newline).
    """
    # `start_chunk` counts chunks, not rows: convert it to a row offset so
    # that chunk k always starts at row k * rows_per_chunk.
    first_offset = start_chunk * rows_per_chunk
    for offset in range(first_offset, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = client.query(query)
        # result() blocks until the query job has finished.
        rows = query_job.result()
        df = rows.to_dataframe()
        df["title_with_body"] = df.title + "\n" + df.body
        yield df

In [None]:
# Get a dataframe of 1000 rows for demonstration purposes
# (max_rows equals rows_per_chunk, so the first chunk is the only one).
df = next(query_bigquery_chunks(max_rows=1000, rows_per_chunk=1000))

# Examine the data
df.head()

实例化文本编码模型

使用由谷歌开发的[Vertex AI embeddings for text API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)来将文本转换为嵌入向量。

文本嵌入是内容的一种密集向量表示，如果两个内容在语义上相似，它们各自的嵌入会在嵌入向量空间中靠近彼此。这种表示可以用来解决常见的自然语言处理任务，例如：
- 语义搜索：按语义相似性对文本进行排名搜索。
- 推荐：返回具有与给定文本相似文本属性的项目。
- 分类：返回具有与给定文本相似文本属性的项目类别。
- 聚类：对具有与给定文本相似文本属性的项目进行聚类。
- 异常检测：返回与给定文本最不相关的文本属性的项目。

#### 定义一个编码函数

定义一个函数，以后可用它将句子转换为嵌入向量。

In [None]:
from typing import List, Optional

# Load the "Vertex AI Embeddings for Text" model
from vertexai.language_models import TextEmbeddingModel

# Module-level model handle that encode_texts_to_embeddings calls into.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


# Embedding helper built on the module-level `model`
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    """Embed every sentence with a single call to the embedding model.

    Returns one vector (list of floats) per sentence. If the call raises,
    the error is swallowed and a list of ``None`` of the same length is
    returned instead, so callers can filter out the failed inputs.
    """
    try:
        return [result.values for result in model.get_embeddings(sentences)]
    except Exception:
        # Best effort: mark the whole batch as failed rather than aborting.
        return [None] * len(sentences)

#### 定义另外两个辅助函数来将文本转换为嵌入向量

- `generate_batches`: 根据文档，每个请求最多可以处理五个文本实例。因此，这个方法在发送到嵌入API之前将 `sentences` 分成五个一组的批次。
- `encode_text_to_embedding_batched`: 这个方法调用 `generate_batches` 来处理分批，然后通过 `encode_texts_to_embeddings` 调用嵌入API。 它还使用 `time.sleep` 处理速率限制。 对于生产用例，最好使用一个考虑重试的更复杂的速率限制机制。

In [None]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple

import numpy as np
from tqdm.auto import tqdm


# Lazily split a list of sentences into consecutive fixed-size slices
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Yield consecutive slices of ``sentences`` of at most ``batch_size`` items.

    The last slice is shorter when the length is not a multiple of
    ``batch_size``; an empty input yields nothing.
    """
    yield from (
        sentences[start : start + batch_size]
        for start in range(0, len(sentences), batch_size)
    )


def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:
    """Embed ``sentences`` in batches while throttling request submission.

    Args:
        sentences: Texts to embed.
        api_calls_per_second: Maximum rate at which batches are submitted.
        batch_size: Number of sentences sent per request.

    Returns:
        A tuple ``(is_successful, embeddings)``. ``is_successful`` holds one
        flag per sentence. ``embeddings`` is always 2-D with one row per
        successfully embedded sentence, in input order; it has shape
        ``(0, 0)`` when no sentence could be embedded.
    """
    embeddings_list: List[List[float]] = []

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            generate_batches(sentences, batch_size),
            total=math.ceil(len(sentences) / batch_size),
            position=0,
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Simple client-side rate limit: space out the submissions.
            time.sleep(seconds_per_job)

        # Collect in submission order so rows stay aligned with `sentences`.
        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for _, embedding in zip(sentences, embeddings_list)
    ]
    successful = [embedding for embedding in embeddings_list if embedding is not None]
    if not successful:
        # np.stack raises on an empty list; return an empty 2-D array instead.
        return is_successful, np.empty((0, 0))

    # Flatten to (n_successful, dim) without np.squeeze, which would also drop
    # the batch axis when exactly one embedding succeeded.
    embeddings_array = np.stack(successful).reshape(len(successful), -1)
    return is_successful, embeddings_array

测试编码功能

对数据的一个子集进行编码，并查看嵌入和距离度量是否合理。

In [None]:
# Use the first 500 question titles as a validation sample
questions = df.title.tolist()[:500]
is_successful, question_embeddings = encode_text_to_embedding_batched(
    sentences=questions
)
# Drop titles without an embedding so titles and embedding rows stay aligned
questions = np.array(questions)[is_successful]

保存嵌入的维度，以备稍后创建索引时使用。

In [None]:
# Embedding dimensionality, read off the first embedded question.
DIMENSIONS = len(question_embeddings[0])

print(DIMENSIONS)

按照相似度的顺序排序问题。

根据[嵌入文档](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings#colab_example_of_semantic_search_using_embeddings)，嵌入的相似度是使用点积来计算的。

- 使用`np.dot`计算向量相似度。
- 按照相似度进行排序。
- 打印结果以便检查。

In [None]:
import random

# Pick the query among the questions that were actually embedded. A fixed
# upper bound of 99 raises IndexError whenever fewer than 100 embeddings
# succeeded, so cap the range at the number of available questions.
question_index = random.randrange(min(100, len(questions)))

print(f"Query question = {questions[question_index]}")

# Get similarity scores for each embedding by using dot-product.
scores = np.dot(question_embeddings[question_index], question_embeddings.T)

# Print top 20 matches, most similar first
for index, (question, score) in enumerate(
    sorted(zip(questions, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {question}: {score}")

将嵌入保存为JSONL格式

数据必须以JSONL格式进行格式化，这意味着每个嵌入字典都作为单独的JSON对象写在自己的一行上。

有关更多信息，请参阅[输入数据格式和结构](https://cloud.google.com/vertex-ai/docs/vector-search/setup/format-structure#data-file-formats)。

In [None]:
import tempfile
from pathlib import Path

# Create a temporary directory to write the embedding files into
# (mkdtemp returns a directory path, despite the variable's name).
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

将嵌入分批写入，以防止内存错误

In [None]:
import gc
import json

BQ_NUM_ROWS = 50000
BQ_CHUNK_SIZE = 1000
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

# Index of the first BigQuery chunk to process
START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

# Embed the dataset one BigQuery chunk at a time and write every chunk to its
# own JSONL file, so only one chunk has to fit in memory.
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, rows_per_chunk=BQ_CHUNK_SIZE, start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )

    # Compute the embeddings before opening the output file, so the file is
    # not held open during the API calls and no empty file is left behind
    # if the embedding step raises.
    is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
        sentences=df.title_with_body.to_list(),
        api_calls_per_second=API_CALLS_PER_SECOND,
        batch_size=ITEMS_PER_REQUEST,
    )

    # One JSON object per line; only rows whose embedding succeeded are kept.
    embeddings_formatted = [
        json.dumps(
            {
                "id": str(question_id),
                "embedding": [str(value) for value in embedding],
            }
        )
        + "\n"
        for question_id, embedding in zip(
            df.id[is_successful], question_chunk_embeddings
        )
    ]

    # Write mode (not append): each chunk owns its file, so re-running the
    # cell must replace the records rather than duplicate them.
    with open(chunk_path, "w") as f:
        f.writelines(embeddings_formatted)

    # Delete the DataFrame and any other large data structures
    del df
    gc.collect()

将训练数据上传至Google Cloud Storage存储桶。

In [None]:
# Destination folder in the bucket, named after the local embeddings directory.
remote_folder = f"{BUCKET_URI}/{embeddings_file_path.stem}/"
# Copy every file from the local embeddings directory (-m: parallel, -r: recursive).
! gsutil -m cp -r {embeddings_file_path}/* {remote_folder}

## 创建索引

### 创建ANN索引（用于生产使用）

In [None]:
# Display name and description for the Vector Search index; the index endpoint
# created further down reuses DISPLAY_NAME.
DISPLAY_NAME = "stack_overflow"
DESCRIPTION = "question titles and bodies from stackoverflow"

创建索引配置

In [None]:
from google.cloud import aiplatform

# Initialize the SDK with the project, region and the bucket used for staging.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [None]:
# Prefer the dimensionality measured from the real embeddings earlier in the
# notebook, so the index always matches the embedding model in use. Fall back
# to the previously hard-coded 768 only if that cell was not run.
try:
    DIMENSIONS
except NameError:
    DIMENSIONS = 768

# Build a Tree-AH index from the embedding files uploaded to `remote_folder`.
# DOT_PRODUCT_DISTANCE matches the dot-product similarity used when the
# embeddings were validated above.
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=remote_folder,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description=DESCRIPTION,
)

In [None]:
# Keep the index's resource name so the index can be looked up again by name.
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

使用资源名称检索现有的索引资源。

In [None]:
# Re-create the index handle from its resource name instead of building a new index.
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

创建一个索引端点

In [None]:
# Create an index endpoint with the public endpoint enabled.
# NOTE(review): `description` is set to DISPLAY_NAME here — confirm that
# DESCRIPTION was not intended.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    public_endpoint_enabled=True,
)

部署索引

部署 ANN（近似最近邻）索引

In [None]:
# Identifier for the deployed index; the same ID is passed again when querying.
DEPLOYED_INDEX_ID = "deployed_index_id_unique"

DEPLOYED_INDEX_ID

In [None]:
# Deploy the index to the endpoint; the return value of deploy_index is
# rebound to my_index_endpoint and used for all later calls.
my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

# Show what is now deployed on the endpoint
my_index_endpoint.deployed_indexes

#### 验证声明的项目数量是否与嵌入数量匹配

每个IndexEndpoint可以部署多个索引。对于每个索引，您可以使用`index_endpoint._gca_resource.index_stats.vectors_count`来检索部署的向量数量。由于可能存在使用嵌入服务时的潜在故障，这些数字可能不会完全匹配。

In [None]:
# Add up the vector counts of every index deployed on the endpoint
number_of_vectors = 0
for deployed_index in my_index_endpoint.deployed_indexes:
    index_stats = aiplatform.MatchingEngineIndex(
        deployed_index.index
    )._gca_resource.index_stats
    number_of_vectors += index_stats.vectors_count

print(f"Expected: {BQ_NUM_ROWS}, Actual: {number_of_vectors}")

创建在线查询

在构建索引之后，您可以针对已部署的索引进行查询，以找到最近的邻居。

注意：对于`DOT_PRODUCT_DISTANCE`距离类型，每个MatchNeighbor返回的“distance”属性实际上是指相似度。

In [None]:
# Embed a single query sentence; the helper returns a list with one entry,
# which is None if the embedding call raised.
test_embeddings = encode_texts_to_embeddings(sentences=["Install GPU for Tensorflow"])

In [None]:
# Test query: ask the deployed index for the nearest neighbours of each
# embedding in test_embeddings.
NUM_NEIGHBOURS = 10

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
)

response

通过检查StackOverflow链接验证获取的结果是否相关。

In [None]:
# response[0] holds the neighbours of the first (and only) query; each
# neighbour id is a StackOverflow question id, so it maps directly to a URL.
for neighbor in response[0]:
    print(f"https://stackoverflow.com/questions/{neighbor.id}")

清理

要清理此项目中使用的所有Google Cloud资源，您可以[删除用于教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。
您还可以通过运行以下代码手动删除创建的资源。

In [None]:
import os

# Set to True to also remove the Cloud Storage bucket and everything in it.
delete_bucket = False

# Force undeployment of indexes and delete endpoint
my_index_endpoint.delete(force=True)

# Delete indexes
tree_ah_index.delete()

# The bucket is also removed when the IS_TESTING environment variable is set.
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -rf {BUCKET_URI}