In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AutoSxS：对照人类偏好数据集检查自动评分器的对齐情况

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/model_based_llm_evaluation/autosxs_check_alignment_against_human_preference_data.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> 在 Colab 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fmodel_evaluation%2Fmodel_based_llm_evaluation%2Fautosxs_check_alignment_against_human_preference_data.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> 在 Colab Enterprise 中打开
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/model_evaluation/model_based_llm_evaluation/autosxs_check_alignment_against_human_preference_data.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> 在 Workbench 中打开
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/model_based_llm_evaluation/autosxs_check_alignment_against_human_preference_data.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> 在 GitHub 上查看
    </a>
  </td>
</table>

## 概述

本笔记本演示了如何使用Vertex AI自动对比（AutoSxS）来检查自动评分器与人工评分器的对齐程度。

自动对比（AutoSxS）是一个模型辅助评估工具，可以帮助您将两个大型语言模型（LLMs）进行对比。作为AutoSxS的预览发布的一部分，我们目前仅支持对总结和问题回答任务的模型比较。我们将在将来支持更多任务和定制功能。

了解更多关于[Vertex AI AutoSxS模型评估](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#autosxs)。

### 目标

在本教程中，您将学习如何使用`Vertex AI Pipelines`和`google_cloud_pipeline_components`，借助人类偏好数据检查自动评分器的对齐情况。

本教程使用以下Google Cloud ML服务和资源：

- 云存储
- Vertex AI PaLM API
- Vertex AI Pipelines
- Vertex AI Batch Prediction

执行的步骤包括：
- 创建一个包含预测和人类偏好数据的评估数据集。
- 在本地对数据进行预处理并将其保存在云存储中。
- 创建并运行Vertex AI AutoSxS流水线，生成判断结果，并基于这些判断计算一组AutoSxS指标。
- 打印判断和AutoSxS指标。
- 清理在此笔记本中创建的资源。

### 成本

本教程使用Google Cloud的计费组件：

* Vertex AI
* Cloud Storage

了解[Vertex AI价格](https://cloud.google.com/vertex-ai/pricing)，
以及[Cloud Storage价格](https://cloud.google.com/storage/pricing)，
并使用[Pricing Calculator](https://cloud.google.com/products/calculator/)
根据您预期的使用量生成成本估算。

## 开始吧

### 为Python安装Vertex AI SDK和其他必要的软件包

In [1]:
! pip3 install --upgrade --quiet \
    google-cloud-aiplatform \
    google-cloud-pipeline-components \
    gcsfs

重新启动运行时（仅限Colab）

为了使用新安装的软件包，您必须在Google Colab上重新启动运行时。

In [None]:
import sys

# Only Colab needs an explicit kernel restart to pick up freshly installed packages.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    import IPython

    # Shut the kernel down with restart=True so it comes back up automatically.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️内核即将重新启动。在继续下一步之前请等待完成。⚠️</b>
</div>

### 在Google Colab上对您的笔记本环境进行身份验证

在Google Colab上对您的环境进行身份验证。

In [None]:
import sys

# Interactive user authentication is only required (and only available) on Colab.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### 设置Google Cloud项目信息并初始化Python的Vertex AI SDK

要开始使用Vertex AI，您必须拥有现有的Google Cloud项目并[启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。了解更多关于[设置项目和开发环境的信息](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)。

In [None]:
# Google Cloud project used for the bucket and the Vertex AI SDK; replace the placeholder before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region passed to `gsutil mb -l` and to `aiplatform.init(location=...)` further down.
LOCATION = "us-central1"  # @param {type:"string"}

UUID

定义一个UUID生成函数，以避免在笔记本中创建的资源名称冲突。

In [None]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of ``length`` characters drawn, with replacement, from ``a-z0-9``.
        This is a short random suffix, not an RFC 4122 UUID.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Suffix shared by resources created in this notebook to avoid name collisions.
UUID = generate_uuid()

创建一个云存储桶

创建一个存储桶来存储AutoSxS管道的中间产物。

In [None]:
# Cloud Storage bucket URI; if the placeholder is left as-is, the next cell derives a name from PROJECT_ID and UUID.
BUCKET_URI = "gs://[your-bucket-name-unique]"  # @param {type:"string"}

只有在您的存储桶不存在时才运行以下单元格以创建您的云存储存储桶。

In [2]:
if (
    BUCKET_URI == ""
    or BUCKET_URI is None
    or BUCKET_URI == "gs://[your-bucket-name-unique]"
):
    BUCKET_URI = "gs://" + PROJECT_ID + "-aip-" + UUID

! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

### 导入库

导入Vertex AI Python SDK和其他必需的Python库。

In [3]:
import os

import pandas as pd
from google.cloud import aiplatform
from google_cloud_pipeline_components.v1 import model_evaluation
from kfp import compiler

初始化Vertex AI SDK用于Python

为您的项目和相应的存储桶初始化Python的Vertex SDK。

In [None]:
# Point the Vertex AI SDK at the project, region and staging bucket configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

### 教程
自动评分器（autorater）不太可能在所有客户用例中都达到与人类评分者相同的水平，尤其是在人类评分者需要具备专业知识的情况下。

下面的教程展示了在您拥有真实（ground truth）的人类偏好数据后，AutoSxS如何帮助您判断是否可以信任自动评分器。

### 生成用于 AutoSxS 人类对齐检查的评估数据集

在下面，您将创建您的数据集，指定一组提示、两个模型的预测和人类偏好数据。

在这个笔记本中，我们：
- 为 AutoSxS 创建一个包含 10 个示例的评估数据集。
  - 列“context”中的数据将被视为自动评分器的推理上下文（inference_context）。
  - 列“questions”中的数据将被视为问题，即推理指令（inference_instruction）。
  - 列“pred_a”中的数据将被视为模型 A 的响应。
  - 列“pred_b”中的数据将被视为模型 B 的响应。
  - 列“actuals”中的数据将被视为人类偏好数据。
- 将其存储为 JSON 文件在云存储中。

#### **注意：为了获得最佳结果，我们建议至少使用 100 个示例。超过 400 个示例后收益递减。**

In [4]:
# Define context, questions, predictions and human preference data.
context = [
    "Beginning in the late 1910s and early 1920s, Whitehead gradually turned his attention from mathematics to philosophy of science, and finally to metaphysics. He developed a comprehensive metaphysical system which radically departed from most of western philosophy. Whitehead argued that reality consists of processes rather than material objects, and that processes are best defined by their relations with other processes, thus rejecting the theory that reality is fundamentally constructed by bits of matter that exist independently of one another. Today Whitehead's philosophical works – particularly Process and Reality – are regarded as the foundational texts of process philosophy.",
    "The gills have an adnate attachment to the cap, are narrow to moderately broad, closely spaced, and eventually separate from the stem. Young gills are cinnamon-brown in color, with lighter edges, but darken in maturity because they become covered with the dark spores. The stem is 6 to 8 cm (2+3⁄8 to 3+1⁄8 in) long by 1.5 to 2 mm (1⁄16 to 3⁄32 in) thick, and roughly equal in width throughout except for a slightly enlarged base. The lower region of the stem is brownish in color and has silky 'hairs' pressed against the stem; the upper region is grayish and pruinose (lightly dusted with powdery white granules). The flesh turns slightly bluish or greenish where it has been injured. The application of a drop of dilute potassium hydroxide solution on the cap or flesh will cause a color change to pale to dark yellowish to reddish brown; a drop on the stem produces a less intense or no color change.",
    "Go to Device Support. Choose your device. Scroll to Getting started and select Hardware &amp; phone details. Choose Insert or remove SIM card and follow the steps. Review the Account Summary page for details. Image 13 Activate online Go to att.com/activateprepaid ((att.com/activarprepaid for Spanish)) and follow the prompts. Activate over the phone Call us at 877.426.0525 for automated instructions. You will need to know your SIM/eSIM ICCID &amp; IMEI number for activation. Note: Look for your SIM (( ICCID )) number on your box or SIM card Now youre ready to activate your phone 1. Start with your new device powered off. 2. To activate a new line of service or a replacement device, please go to the AT&amp;T Activation site or call 866.895.1099. You download the eSIM to your device over Wi-Fi®. The eSIM connects your device to our wireless network. How do I activate my phone with an eSIM? Turn your phone on, connect to Wi-Fi, and follow the prompts. Swap active SIM cards AT&amp;T Wireless SM SIM Card Turn your device off. Remove the old SIM card. Insert the new one. Turn on your device.",
    "According to chief astronaut Deke Slayton's autobiography, he chose Bassett for Gemini 9 because he was 'strong enough to carry' both himself and See. Slayton had also assigned Bassett as command module pilot for the second backup Apollo crew, alongside Frank Borman and William Anders.",
    "Adaptation of the endosymbiont to the host's lifestyle leads to many changes in the endosymbiont–the foremost being drastic reduction in its genome size. This is due to many genes being lost during the process of metabolism, and DNA repair and recombination. While important genes participating in the DNA to RNA transcription, protein translation and DNA/RNA replication are retained. That is, a decrease in genome size is due to loss of protein coding genes and not due to lessening of inter-genic regions or open reading frame (ORF) size. Thus, species that are naturally evolving and contain reduced sizes of genes can be accounted for an increased number of noticeable differences between them, thereby leading to changes in their evolutionary rates. As the endosymbiotic bacteria related with these insects are passed on to the offspring strictly via vertical genetic transmission, intracellular bacteria goes through many hurdles during the process, resulting in the decrease in effective population sizes when compared to the free living bacteria. This incapability of the endosymbiotic bacteria to reinstate its wild type phenotype via a recombination process is called as Muller's ratchet phenomenon. Muller's ratchet phenomenon together with less effective population sizes has led to an accretion of deleterious mutations in the non-essential genes of the intracellular bacteria. This could have been due to lack of selection mechanisms prevailing in the rich environment of the host.",
    "The National Archives Building in downtown Washington holds record collections such as all existing federal census records, ships' passenger lists, military unit records from the American Revolution to the Philippine–American War, records of the Confederate government, the Freedmen's Bureau records, and pension and land records.",
    "Standard 35mm photographic film used for cinema projection has a much higher image resolution than HDTV systems, and is exposed and projected at a rate of 24 frames per second (frame/s). To be shown on standard television, in PAL-system countries, cinema film is scanned at the TV rate of 25 frame/s, causing a speedup of 4.1 percent, which is generally considered acceptable. In NTSC-system countries, the TV scan rate of 30 frame/s would cause a perceptible speedup if the same were attempted, and the necessary correction is performed by a technique called 3:2 Pulldown: Over each successive pair of film frames, one is held for three video fields (1/20 of a second) and the next is held for two video fields (1/30 of a second), giving a total time for the two frames of 1/12 of a second and thus achieving the correct average film frame rate.",
    "Maria Deraismes was initiated into Freemasonry in 1882, then resigned to allow her lodge to rejoin their Grand Lodge. Having failed to achieve acceptance from any masonic governing body, she and Georges Martin started a mixed masonic lodge that actually worked masonic ritual. Annie Besant spread the phenomenon to the English speaking world. Disagreements over ritual led to the formation of exclusively female bodies of Freemasons in England, which spread to other countries. Meanwhile, the French had re-invented Adoption as an all-female lodge in 1901, only to cast it aside again in 1935. The lodges, however, continued to meet, which gave rise, in 1959, to a body of women practising continental Freemasonry.",
    "Excavation of the foundations began in November 1906, with an average of 275 workers during the day shift and 100 workers during the night shift. The excavation was required to be completed in 120 days. To remove the spoils from the foundation, three temporary wooden platforms were constructed to street level. Hoisting engines were installed to place the beams for the foundation, while the piers were sunk into the ground under their own weight. Because of the lack of space in the area, the contractors' offices were housed beneath the temporary platforms. During the process of excavation, the Gilsey Building's foundations were underpinned or shored up, because that building had relatively shallow foundations descending only 18 feet (5.5 m) below Broadway.",
    "Dopamine consumed in food cannot act on the brain, because it cannot cross the blood–brain barrier. However, there are also a variety of plants that contain L-DOPA, the metabolic precursor of dopamine. The highest concentrations are found in the leaves and bean pods of plants of the genus Mucuna, especially in Mucuna pruriens (velvet beans), which have been used as a source for L-DOPA as a drug. Another plant containing substantial amounts of L-DOPA is Vicia faba, the plant that produces fava beans (also known as 'broad beans'). The level of L-DOPA in the beans, however, is much lower than in the pod shells and other parts of the plant. The seeds of Cassia and Bauhinia trees also contain substantial amounts of L-DOPA.",
]

questions = [
    "What was the predominant theory of reality that Whitehead opposed?",
    "Why do the gills on the Psilocybe pelliculosa mushroom darken as they mature?",
    "user: How do I provision my AT&T SIM card?",
    "Why did chief astronaut Deke Slayton choose Charles Bassett for Gemini 9, according to Slayton's autobiography?",
    "What is the main alteration in an endosymbiont when it adapts to a host?",
    "What's the earliest war The National Archives Building has military unit records for",
    "To be shown on SDTV in PAL-system countries, at what rate is cinema film scanned?",
    "What year was the all-female masonic lodge cast aside?",
    "Why did the Gilsey Building have underpinned and shored up foundations?",
    "Why can dopamine consumed in food not act on the brain?",
]
predictions_a = [
    "bits of matter that exist independently of one another",
    "The gills darken in maturity because they become covered with the dark spores.",
    "Go to Device Support. Choose your device. Scroll to Getting started and select Hardware &amp; phone details. Choose Insert or remove SIM card and follow the steps.",
    "he was 'smart enough to carry' both himself and See",
    "drastic reduction in its genome size",
    "American Revolution to the Philippine–American War",
    "Cinema film is scanned at the TV rate of 25 frame/s.",
    "1935",
    "The Gilsey Building's foundations were shored up because they were only 18 feet below Broadway.",
    "The blood–brain barrier does not allow dopamine consumed in food to enter the brain.",
]
predictions_b = [
    "independent bits of matter",
    "Young gills are cinnamon-brown in color, with lighter edges, but darken in maturity because they become covered with the dark spores.",
    "Go to Device Support.",
    "he was 'strong enough to carry' both himself and See, as stated by chief astronaut Deke Slayton in his autobiography",
    "its genome size decrease",
    "American Revolution",
    "25 frame/s, causing a speedup of 4.1 percent",
    "1901",
    "The Gilsey Building's foundations were underpinned or shored up.",
    "Mucuna pruriens (velvet beans) have been used as a source for L-DOPA as a drug. Another plant containing substantial amounts of L-DOPA is Vicia faba, the plant that produces fava beans (also known as 'broad beans').",
]

# One human label per example, index-aligned with `questions`, `predictions_a` and `predictions_b`.
# Presumably "A" means the rater preferred the model A response and "B" the model B response — confirm
# against the AutoSxS human-preference column specification.
human_preference = [
    "A",
    "B",
    "A",
    "B",
    "A",
    "A",
    "B",
    "A",
    "A",
    "A",
]

# Assemble the evaluation dataset: one row per example, one column per field.
# The column names are the ones the pipeline parameters refer to later.
column_data = {
    "context": context,
    "questions": questions,
    "pred_a": predictions_a,
    "pred_b": predictions_b,
    "actuals": human_preference,
}
examples = pd.DataFrame(data=column_data)
examples.head()

[可选]从云存储加载您的JSONL评估数据集。

您也可以从云存储加载自己的JSONL数据集。

In [None]:
# # Uncomment to read from Cloud Storage.
# # Assign the result to `examples` so that the upload cell below picks it up.
# GCS_PATH = 'gs://your-own-evaluation-dataset-with-human-preference-data.jsonl'
# examples = pd.read_json(GCS_PATH, lines=True)

#### 将您的数据集上传到云存储

最后，我们将评估数据集上传到云存储，以用作AutoSxS的输入。

In [5]:
# Upload predictions to the Cloud Storage bucket.
examples.to_json(
    "evaluation_dataset_with_human_preference.json", orient="records", lines=True
)
! gsutil cp evaluation_dataset_with_human_preference.json $BUCKET_URI/input/evaluation_dataset_with_human_preference.json
DATASET = f"{BUCKET_URI}/input/evaluation_dataset_with_human_preference.json"

### 创建并运行AutoSxS作业

为了运行AutoSxS，我们需要定义一个具有以下参数的`autosxs_pipeline`作业。有关AutoSxS管道配置的更多详细信息，请参阅[此处](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.9.0/api/preview/model_evaluation.html#preview.model_evaluation.autosxs_pipeline)。

**必需参数:**
  - **evaluation_dataset:** 包含评估示例的JSONL数据集的云存储路径列表。
  - **task:** 以{task}@{version}形式指定的评估任务。task可以是“summarization”或“question_answering”之一；version是一个3位数字的整数或“latest”。例如：summarization@001或question_answering@latest。
  - **id_columns:** 区分唯一评估示例的列。
  - **autorater_prompt_parameters:** 将autorater提示参数映射到列或模板。期望的参数包括：
      - inference_instruction - 执行任务的详细信息。
      - inference_context - 执行任务所需要参考的内容。

此外，我们需要指定候选模型（模型A和模型B）的预测来自何处。AutoSxS可以通过运行Vertex Batch Prediction来获取预测值，或者可以在评估数据集中提供预定义的预测列。

**如果使用Batch Prediction进行模型参数设置（假设模型A）:**
  - **model_a:** 完全合格的模型资源名称。如果指定了模型A响应，则此参数为可选参数。
  - **model_a_prompt_parameters:** 将模型A提示模板参数映射到列或模板。对于[text-bison](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#request_body)的情况，唯一需要的参数是`prompt`。
  - **model_a_parameters:** 控制来自模型A的预测的参数，例如模型温度等。

**如果使用自己带有的预测值进行模型参数设置（假设模型A）:**
  - **response_column_a:** 包含模型A响应的列。如果为模型A提供了任何响应表，则此项为必需项。

最后，还有一些参数可配置其他功能，例如导出判断或将判断与人类偏好数据集进行比较，以检查AutoRater与人类评分者的对齐情况。
  - **judgments_format:** 写入判断的格式。可以是'json'或'bigquery'。
  - **bigquery_destination_prefix:** 如果指定格式为'bigquery'，则写入判断的BigQuery表。
  - **human_preference_column:** 包含基准真相的列。仅在用户想要检查autorater与人类偏好对齐时才需要。

在本笔记本中，我们将使用两个模型的预测（位于评估数据集`examples`的`pred_a`列和`pred_b`列）和人类偏好数据（位于同一数据集的`actuals`列），评估自动评分器与人类评分者的对齐程度。正在执行的任务是问答。

首先，在本地编译AutoSxS流水线。

In [None]:
# Local path where the compiled AutoSxS pipeline definition is written.
template_uri = "pipeline.yaml"

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=model_evaluation.autosxs_pipeline, package_path=template_uri
)

下面的代码会启动一个Vertex AI Pipelines作业，您可以在Vertex AI控制台界面中查看。该流水线作业大约需要10分钟。

这里的日志将包含当前管道的URL，这样您就可以跟踪管道的进度并访问/查看管道的输出。

In [6]:
# A fresh random suffix keeps the job id unique across re-runs of this cell.
display_name = f"autosxs-question-answering-human-alignment-checking-{generate_uuid()}"

# Column names of the evaluation dataset built earlier in this notebook.
context_column = "context"
question_column = "questions"
response_column_a = "pred_a"
response_column_b = "pred_b"
human_preference_column = "actuals"

parameters = {
    "evaluation_dataset": DATASET,
    # Each question identifies one evaluation example.
    "id_columns": [question_column],
    "autorater_prompt_parameters": {
        "inference_context": {"column": context_column},
        "inference_instruction": {"column": question_column},
    },
    "task": "question_answering",
    # Predictions are supplied in the dataset, so no batch prediction models are configured.
    "response_column_a": response_column_a,
    "response_column_b": response_column_b,
    # Enables the autorater-vs-human alignment metrics.
    "human_preference_column": human_preference_column,
}

# Build the pipeline root as a URI with forward slashes. os.path.join is meant
# for local filesystem paths and would insert backslashes on Windows.
pipeline_root = f"{BUCKET_URI.rstrip('/')}/{display_name}"

job = aiplatform.PipelineJob(
    job_id=display_name,
    display_name=display_name,
    pipeline_root=pipeline_root,
    template_path=template_uri,
    parameter_values=parameters,
    enable_caching=False,
)
# Blocks until the pipeline run finishes; the log output includes the run's URL.
job.run()

### 获取判断和AutoSxS指标

接下来，我们可以从完成的AutoSxS作业中加载判断。

结果将被写入您在AutoSxS作业请求中指定的云存储输出桶中。

In [7]:
# To use an existing pipeline, override job using the line below.
# job = aiplatform.PipelineJob.get('projects/[PROJECT_NUMBER]/locations/[LOCATION]/pipelineJobs/[PIPELINE_RUN_NAME]')

# Find the pairwise evaluation task. Fail loudly when it is missing instead of
# silently continuing with whichever task happened to be iterated last.
details = next(
    (
        task
        for task in job.task_details
        if task.task_name == "online-evaluation-pairwise"
    ),
    None,
)
if details is None:
    raise RuntimeError(
        "Task 'online-evaluation-pairwise' was not found in the pipeline job; "
        "check that the job completed successfully."
    )

# Judgments
judgments_uri = details.outputs["judgments"].artifacts[0].uri
judgments_df = pd.read_json(judgments_uri, lines=True)
judgments_df.head()

如果任何示例在AutoSxS中未能获得结果，它们的错误消息将被存储在错误表中。如果错误表为空，则意味着在评估过程中没有失败的示例。

In [8]:
# Find the pairwise evaluation task. Fail loudly when it is missing instead of
# silently continuing with whichever task happened to be iterated last.
details = next(
    (
        task
        for task in job.task_details
        if task.task_name == "online-evaluation-pairwise"
    ),
    None,
)
if details is None:
    raise RuntimeError(
        "Task 'online-evaluation-pairwise' was not found in the pipeline job; "
        "check that the job completed successfully."
    )

# Error table
error_messages_uri = details.outputs["error_messages"].artifacts[0].uri
errors_df = pd.read_json(error_messages_uri, lines=True)
errors_df.head()

我们还可以查看从判断中计算出的AutoSxS指标。

在提供了人类偏好数据的情况下，AutoSxS会输出AutoRater的胜率以及一组人类偏好对齐指标。您可以在[这里](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#autosxs)找到有关AutoSxS指标的更多详细信息。

In [9]:
# Metrics
# Find the metrics task. Fail loudly when it is missing instead of silently
# continuing with whichever task happened to be iterated last.
details = next(
    (
        task
        for task in job.task_details
        if task.task_name == "model-evaluation-text-generation-pairwise"
    ),
    None,
)
if details is None:
    raise RuntimeError(
        "Task 'model-evaluation-text-generation-pairwise' was not found in the "
        "pipeline job; check that the job completed successfully."
    )

# The metrics are stored as metadata on the task's first `autosxs_metrics` artifact; show them as a one-row table.
pd.DataFrame([details.outputs["autosxs_metrics"].artifacts[0].metadata])

## 清理

要清理此项目中使用的所有谷歌云资源，您可以删除用于教程的[谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源：

将`delete_bucket`设置为**True**以删除云存储桶。

In [None]:
import os

job.delete()

# Delete Cloud Storage objects that were created
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI