In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 利用嵌入进行语义搜索

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/text_embedding_api_semantic_search_with_scann.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fgenerative_ai%2Ftext_embedding_api_semantic_search_with_scann.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在 Colab Enterprise 中打开
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/text_embedding_api_semantic_search_with_scann.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在 Workbench 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/text_embedding_api_semantic_search_with_scann.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在 GitHub 上查看
    </a>
  </td>
</table>

## 概述

语义搜索是一种利用词语、短语和语境的含义来查找最相关结果的搜索类型。语义搜索依赖于最能匹配用户查询与最相似结果的向量嵌入。

在这里，嵌入是表示文本的向量。在向量空间中，两个条目的距离越近，它们就越相似。因此，当您用某个嵌入进行查询时，返回的是已建立索引的数据中与该查询最接近的条目。

了解更多关于[文本嵌入](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)。

### 目标

在本教程中，我们将演示如何从文本创建嵌入并进行语义搜索。嵌入由 Vertex AI 文本嵌入模型生成，并使用[Google ScaNN：高效的向量相似度搜索](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html)建立索引和执行检索。

本教程使用以下谷歌云ML服务和资源：
- Vertex LLM SDK
- ScaNN [github](https://github.com/google-research/google-research/tree/master/scann)

执行的步骤包括：
- 安装和导入
- 创建嵌入数据集
- 创建索引
- 查询索引

### 数据集

在本教程中，您将使用由Google Brain开发的名为"wide_and_deep_trainer_container_tests_input.jsonl"的样本数据集。

接下来的几步中，您将从Google Cloud示例数据桶导入该文件。

这里是文件的链接：

In [None]:
# Cloud Storage URI of the sample JSONL dataset used in this tutorial.
DATASET_URI = "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl"  # @param {type:"string"}

### 费用

此教程使用 Google Cloud 的可计费组件：

* Vertex AI
* Cloud Storage

了解 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)，
以及 [Cloud Storage 价格](https://cloud.google.com/storage/pricing)，
并使用 [定价计算器](https://cloud.google.com/products/calculator/)
根据您的预期使用情况生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK和其他所需的软件包

In [None]:
!pip3 install google-cloud-aiplatform "shapely<2.0.0" --quiet
!pip install scann --quiet

### 重新启动运行时（仅限Colab）

为了使用新安装的包，你必须在Google Colab上重新启动运行时。

In [None]:
import sys

# Freshly installed packages are only visible to a new kernel, so when running
# on Colab shut the current kernel down, passing True to do_shutdown().
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
    <b>⚠️ 内核即将重新启动。请等待它完成后再继续下一步。⚠️</b>
</div>

### 在Google Colab上对笔记本环境进行身份验证

在Google Colab上对您的环境进行身份验证。

In [None]:
import sys

# Outside Colab this cell is a no-op; on Colab it runs the user authentication
# flow provided by google.colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### 设置谷歌云项目信息并初始化 Vertex AI SDK

要开始使用Vertex AI，您必须拥有一个现有的谷歌云项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的信息。

In [None]:
import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the Vertex AI SDK at the project and region chosen above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

### 导入库

In [None]:
import json
import time

import numpy as np
import pandas as pd
import scann
from vertexai.preview.language_models import TextEmbeddingModel

使用谷歌预训练模型初始化文本嵌入模型。

In [None]:
# Load the pretrained text embedding model. `model` is a notebook-level global
# used by the embedding and search cells further down.
model = TextEmbeddingModel.from_pretrained("google/textembedding-gecko@001")

## 创建嵌入数据集

该数据集展示了文本嵌入API与向量数据库的使用。它不打算用于任何其他目的，比如评估模型。该数据集很小，不代表所有可能文本的全面样本。

In [None]:
# Copy the sample JSONL file from Cloud Storage into the current working directory.
!gsutil cp gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl .

In [None]:
# Parse the JSON Lines file into a list with one record per non-empty line.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(
    "wide_and_deep_trainer_container_tests_input.jsonl", encoding="utf-8"
) as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    records = [json.loads(line) for line in f if line.strip()]

In [None]:
# Load the parsed records into a DataFrame and display its first ten rows.
df = pd.DataFrame(data=records)
df.head(n=10)

In [None]:
def get_embedding(text: str) -> list:
    """Return the embedding vector for ``text``.

    Sends ``text`` to the notebook-level ``model`` and returns the ``values``
    of the first embedding in the response.

    Args:
        text: The string to embed.

    Returns:
        The embedding values, or an empty list if the request raised an
        exception. The failure is printed so it is not silently lost.
    """
    try:
        embeddings = model.get_embeddings([text])
        return embeddings[0].values
    except Exception as exc:
        # Best-effort: one failed request must not abort the whole run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted.
        print(f"get_embedding failed, returning []: {exc!r}")
        return []


# Embed the text of every row; get_embedding is called once per row.
# This may take several minutes to complete.
df["embedding"] = df["textContent"].apply(get_embedding)

In [None]:
# Display the first five rows, now including the "embedding" column.
df.head(n=5)

## 创建一个索引

In [None]:
record_count = len(records)
embeddings = [df.embedding[i] for i in range(record_count)]

# get_embedding() returns [] when a request fails. A single empty row would
# make the array ragged, so stop here with a clear message instead of an
# obscure NumPy shape error further down.
missing = [i for i, values in enumerate(embeddings) if len(values) == 0]
if missing:
    raise ValueError(
        f"{len(missing)} record(s) have no embedding (first rows: {missing[:5]}); "
        "re-run the embedding cell before building the index."
    )

dataset = np.array(embeddings)

# Scale every row to unit L2 norm so the "dot_product" score used below
# equals cosine similarity.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1, keepdims=True)

# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README
# The leaf counts and training sample size are all set to the number of
# records, which is only practical because this dataset is small.

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = (
    scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product")
    .tree(
        num_leaves=record_count,
        num_leaves_to_search=record_count,
        training_sample_size=record_count,
    )
    .score_ah(2, anisotropic_quantization_threshold=0.2)
    .reorder(100)
    .build()
)

## 查询索引

这是如何使用ScaNN库执行近似最近邻搜索的一个很好的例子。该函数以查询字符串作为输入，返回查询的前3个最近邻居。该函数高效且可用于快速搜索大型数据集。

In [None]:
def search(query: str, num_neighbors: int = 3) -> None:
    """Embed ``query``, look up its nearest neighbours and print them.

    Uses the notebook-level ``model`` to embed the query, ``searcher`` to find
    the neighbours and ``df`` to look up the matching text. For each neighbour
    it prints the document id, the score and the first 125 characters of the
    text, followed by the elapsed time in milliseconds.

    Args:
        query: The search text.
        num_neighbors: How many neighbours to request from the index.
            Defaults to 3.
    """
    # The measured latency covers both the embedding request and the index
    # lookup; perf_counter is monotonic, so the interval cannot go negative.
    start = time.perf_counter()
    query_embedding = model.get_embeddings([query])[0].values
    neighbors, distances = searcher.search(
        query_embedding, final_num_neighbors=num_neighbors
    )
    end = time.perf_counter()

    for doc_id, dist in zip(neighbors, distances):
        print(f"[docid:{doc_id}] [{dist}] -- {df.textContent[int(doc_id)][:125]}...")
    print("Latency (ms):", 1000 * (end - start))

你可以测试一些查询。

In [None]:
# Example query: prints the nearest neighbours and the lookup latency.
search("tell me about an animal")

In [None]:
# Second example query, using a longer natural-language prompt.
search("tell me about an important moment or event in your life")