In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 文本嵌入新API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/text_embedding_new_api.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fgenerative_ai%2Ftext_embedding_new_api.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在 Colab Enterprise 中打开
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/text_embedding_new_api.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在工作台中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/text_embedding_new_api.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在 GitHub 上查看
    </a>
  </td>
</table>

## 概述

本笔记本是一个代码示例，展示如何调用我们新发布的文本嵌入模型（text-embedding-004、text-multilingual-embedding-002）。

了解更多关于[文本嵌入 API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings#api_changes_to_models_released_in_or_after_august_2023)的信息。

### 目标

在本教程中，您将学习如何调用两个新的GA模型text-embedding-004和text-multilingual-embedding-002上的最新文本嵌入API。

本教程使用以下Google Cloud ML服务和资源：

- Vertex LLM SDK

执行的步骤包括：

- 安装和导入
- 生成嵌入

### 成本

本教程使用 Google Cloud 的计费组件：

* Vertex AI

了解 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing)，
并使用 [定价计算器](https://cloud.google.com/products/calculator/)
根据您的预计使用量生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK以及其他必需的包

In [None]:
# Install Vertex AI SDK for Python
!pip3 install --quiet --upgrade google-cloud-aiplatform

### 重新启动运行时（仅限Colab）

为了使用新安装的软件包，您必须重新启动Google Colab上的运行时。

In [None]:
import sys

# Only Colab needs an explicit restart; elsewhere this cell is a no-op.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    import IPython

    # Shut the kernel down with restart=True so it comes back up with the
    # newly installed packages available.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️内核将重新启动。在继续下一步之前，请等待它完成。⚠️</b>
</div>

### 认证您的笔记本环境（仅限Colab）

在Google Colab上验证您的环境。

In [None]:
import sys

# Interactive authentication is only performed when running inside Colab.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Opens the Colab sign-in flow for the current user.
    auth.authenticate_user()

### 设置Google Cloud项目信息并为Python初始化Vertex AI SDK

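要开始使用Vertex AI，您必须拥有现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解有关[设置项目和开发环境](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)的更多信息。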

In [None]:
from google.cloud import aiplatform

# Replace the placeholder with your own Google Cloud project ID; LOCATION is
# the region passed to the SDK initializer below.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize the Vertex AI SDK with the project and location chosen above.
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

### 导入库

In [None]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

### 生成嵌入

1. 设置模型名称。最新的模型是
     * 英语使用 "text-embedding-004"。
     * i18n使用 "text-multilingual-embedding-002"。
     
     可查看[语言覆盖范围](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#language_coverage_for_textembedding-gecko-multilingual_models)获取支持的语言列表。
     
     查看[公共文档](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings)获取支持模型的完整列表。
2. 设置任务类型、文本和标题（*可选并仅适用于任务类型 "RETRIEVAL_DOCUMENT"*）。有效的任务类型包括：
     * "RETRIEVAL_QUERY"
     * "RETRIEVAL_DOCUMENT"
     * "SEMANTIC_SIMILARITY"
     * "CLASSIFICATION"
     * "CLUSTERING"
     * "QUESTION_ANSWERING"（仅适用于最新模型）
     * "FACT_VERIFICATION"（仅适用于最新模型）
3. 设置输出维度（*可选并仅适用于最新模型*）。

In [None]:
# @title { run: "auto" }
MODEL = "text-embedding-004"  # @param ["text-embedding-004", "text-multilingual-embedding-002","text-embedding-preview-0409", "text-multilingual-embedding-preview-0409", "textembedding-gecko@003", "textembedding-gecko-multilingual@001"]
TASK = "RETRIEVAL_DOCUMENT"  # @param ["RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING", "QUESTION_ANSWERING", "FACT_VERIFICATION"]
TEXT = "Banana Muffin?"  # @param {type:"string"}
TITLE = ""  # @param {type:"string"}
OUTPUT_DIMENSIONALITY = 256  # @param [1, 768, "None"] {type:"raw", allow-input:true}

# Models for which this notebook allows OUTPUT_DIMENSIONALITY and the newer
# task types. Defined once so both checks below cannot drift apart.
LATEST_MODELS = (
    "text-embedding-004",
    "text-multilingual-embedding-002",
    "text-embedding-preview-0409",
    "text-multilingual-embedding-preview-0409",
)

# Task types that are only accepted together with one of LATEST_MODELS.
LATEST_ONLY_TASKS = ("QUESTION_ANSWERING", "FACT_VERIFICATION")


def validate_embedding_params(
    model: str,
    task: str,
    text: str,
    title: str = "",
    output_dimensionality=None,
) -> None:
    """Check that a combination of embedding parameters is consistent.

    Args:
        model: Embedding model name; must be non-empty.
        task: Embedding task type.
        text: Text to embed; must be non-empty.
        title: Optional title; only allowed for task "RETRIEVAL_DOCUMENT".
        output_dimensionality: Optional output size; only allowed (i.e. may be
            something other than None) for models listed in LATEST_MODELS.

    Raises:
        ValueError: If any of the rules above is violated.
    """
    if not model:
        raise ValueError("MODEL must be specified.")
    if not text:
        raise ValueError("TEXT must be specified.")
    if title and task != "RETRIEVAL_DOCUMENT":
        raise ValueError("TITLE can only be specified for TASK 'RETRIEVAL_DOCUMENT'")

    is_latest_model = model in LATEST_MODELS
    if output_dimensionality is not None and not is_latest_model:
        # The message names the variable exactly as it appears in the form.
        raise ValueError(
            f"OUTPUT_DIMENSIONALITY cannot be specified for model '{model}'."
        )
    if task in LATEST_ONLY_TASKS and not is_latest_model:
        raise ValueError(f"TASK '{task}' is not valid for model '{model}'.")


# Validate the values selected in the form above before they are used.
validate_embedding_params(MODEL, TASK, TEXT, TITLE, OUTPUT_DIMENSIONALITY)

In [None]:
def embed_text(
    model_name: str,
    task_type: str,
    text: str,
    title: str = "",
    output_dimensionality=None,
) -> list:
    """Embed a single text with a pretrained text embedding model.

    Args:
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        task_type: Task type forwarded to ``TextEmbeddingInput``.
        text: The text to embed.
        title: Title forwarded to ``TextEmbeddingInput``.
        output_dimensionality: When truthy, forwarded to ``get_embeddings`` as
            the ``output_dimensionality`` keyword; otherwise the keyword is
            omitted from the call.

    Returns:
        The ``values`` of the first embedding returned by the model.
    """
    # Only pass the keyword when a (truthy) dimensionality was requested.
    request_options = {}
    if output_dimensionality:
        request_options["output_dimensionality"] = output_dimensionality

    embedder = TextEmbeddingModel.from_pretrained(model_name)
    single_input = TextEmbeddingInput(text=text, title=title, task_type=task_type)

    # The model is called with a one-element batch, so the first result is
    # the embedding for ``text``.
    results = embedder.get_embeddings([single_input], **request_options)
    first_result = results[0]
    return first_result.values


# Embed TEXT using the model, task, title and dimensionality chosen in the
# form cell.
embedding = embed_text(MODEL, TASK, TEXT, TITLE, OUTPUT_DIMENSIONALITY)

# Show the vector length; the intent is that it equals OUTPUT_DIMENSIONALITY
# when that value is set.
embedding_length = len(embedding)
print(embedding_length)

## 清理工作

要清理此项目中使用的所有Google Cloud资源，您可以删除用于本教程的[Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。