In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Falcon Instruct（PEFT）

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_falcon_instruct_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_falcon_instruct_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_falcon_instruct_peft.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在Vertex AI Workbench中打开
    </a> （建议使用Python-3 GPU笔记本）
  </td>
</table>

## 概述

本笔记本演示了如何在本地使用预构建的Falcon Instruct模型运行推理、部署预构建的Falcon Instruct模型、使用参数高效微调库（[PEFT](https://github.com/huggingface/peft)）对Falcon Instruct模型进行微调和部署、使用[GPTQ](https://arxiv.org/abs/2210.17323)对Falcon Instruct模型进行量化和部署，以及在Vertex AI中评估经PEFT微调的Falcon Instruct模型。

### 目标

- 使用预构建的Falcon Instruct模型在本地运行推理
- 部署预构建的Falcon Instruct模型
- 使用PEFT微调和部署Falcon Instruct模型
- 使用GPTQ量化和部署Falcon Instruct模型
- 评估经PEFT微调的Falcon Instruct模型

| 模型 | LoRA |
| :- | :- |
| [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) | 是 |
| [tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) | 是 |

### 成本

本教程使用谷歌云的收费组件：

* Vertex AI
* Cloud Storage

了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[Cloud Storage定价](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)根据您的预期使用量生成成本估算。

## 在开始之前

**注意**：Jupyter以带有`!`前缀的行作为shell命令运行，并将以`$`前缀的Python变量插入这些命令中。

使用Falcon Instruct模型在本地进行推理需要GPU。

### 仅适用于 Colab
请在 Colab 中运行以下命令；如果您使用 Workbench，请跳过此部分。

In [None]:
import sys

# Colab-only setup: install the Google Cloud client libraries, authenticate
# the current user, then request a kernel restart so the freshly installed
# packages are the ones that get imported.
if "google.colab" in sys.modules:
    ! pip3 install --upgrade google-cloud-aiplatform
    ! pip3 install google-cloud-language==2.10.0
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
# Pinned packages installed in every environment (not only Colab).
# NOTE(review): in Colab these lines come after the kernel restart request
# above - confirm they still execute, or re-run this cell after the restart.
! pip3 install transformers==4.31.0
! pip3 install einops==0.6.1
! pip3 install accelerate==0.21.0

### 设置Google Cloud项目

1. [选择或创建一个Google Cloud项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得300美元的免费信用额度，用于您的计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Vertex AI API，Compute Engine API和Cloud Natural Language API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,language.googleapis.com)。

4. [创建一个Cloud Storage存储桶](https://cloud.google.com/storage/docs/creating-buckets)来存储实验输出。

5. [创建一个服务账号](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console)，具有`Vertex AI User`和`Storage Object Admin`角色，用于将经过微调的模型部署到Vertex AI端点。

为实验环境设置以下变量。指定的Cloud Storage存储桶（`BUCKET_URI`）应位于指定的区域（`REGION`）中。请注意，多区域存储桶（例如“us”）不被视为与该多区域范围所覆盖的单个区域（例如“us-central1”）相匹配。

In [None]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# The region you want to launch jobs in.
REGION = ""  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output with gs:// prefix.
BUCKET_URI = "gs://"  # @param {type:"string"}

# Point the gcloud CLI at the project configured above.
! gcloud config set project $PROJECT_ID

import os

# Locations derived from BUCKET_URI and reused by later cells:
# a staging area, the experiment root, and data/model sub-folders under it.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
DATA_BUCKET = os.path.join(EXPERIMENT_BUCKET, "data")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")

# The service account looks like:
# '<account_name>@<project_id>.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

### 初始化 Vertex AI API

In [None]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK with the project, region and staging bucket
# configured earlier in the notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### 定义常量

In [None]:
# Pre-built Model Garden container images, one per stage of the notebook.
# Image used by the PEFT finetuning and quantization jobs.
TRAIN_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-peft-train:20231207_0936_RC00"
)
# Image used to serve base and LoRA-finetuned models.
PREDICTION_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-peft-serve:20231129_0948_RC00"
)
# vLLM serving image for non-GPTQ models.
VLLM_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20231127_0916_RC00"
)
# vLLM serving image selected when the quantization method is "gptq".
VLLM_GPTQ_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:gptq"
)
# Image for the evaluation section of the notebook.
EVAL_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-lm-evaluation-harness:20231011_0934_RC00"
)

### 定义常用函数

In [None]:
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform, language


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to `prefix`.

    Used to name the training and deployment jobs launched in Vertex AI.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS` taken from `datetime.now()`.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model(
    model_name: str,
    model_id: str,
    finetuned_lora_model_path: str,
    service_account: str,
    task: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model backed by the PEFT serving image and deploys it.

    A new endpoint named `<model_name>-endpoint` is created first, then the
    model is uploaded with `PREDICTION_DOCKER_URI` and deployed onto it.

    Args:
        model_name: Display name of the uploaded model; also the endpoint
            name prefix.
        model_id: Value passed to the container as `MODEL_ID`.
        finetuned_lora_model_path: Value passed to the container as
            `FINETUNED_LORA_MODEL_PATH`; the variable is omitted when empty.
        service_account: Service account used by the deployment.
        task: Value passed to the container as `TASK`.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator attached to each serving node.
        accelerator_count: Number of accelerators per serving node.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Only export the LoRA path when one was actually provided.
    lora_env = (
        {"FINETUNED_LORA_MODEL_PATH": finetuned_lora_model_path}
        if finetuned_lora_model_path
        else {}
    )
    container_env = {
        "MODEL_ID": model_id,
        "TASK": task,
        "DEPLOY_SOURCE": "notebook",
        **lora_env,
    }

    upload_kwargs = dict(
        display_name=model_name,
        serving_container_image_uri=PREDICTION_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/peft_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    model = aiplatform.Model.upload(**upload_kwargs)

    deploy_kwargs = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_kwargs)
    return model, endpoint


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the vLLM API server and deploys it.

    A new endpoint named `<model_name>-endpoint` is created first, then the
    model is uploaded with a vLLM serving image and deployed onto it.

    Args:
        model_name: Display name of the uploaded model; also the endpoint
            name prefix.
        model_id: Value of the vLLM `--model` argument.
        service_account: Service account used by the deployment.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator attached to each serving node.
        accelerator_count: Number of accelerators per serving node; also used
            as the vLLM tensor parallel size.
        quantization_method: Optional value of the vLLM `--quantization`
            argument; "gptq" additionally selects the GPTQ serving image.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # The quantization flag is appended only when a method was requested.
    quantization_args = (
        [f"--quantization={quantization_method}"] if quantization_method else []
    )
    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--disable-log-stats",
        "--dtype=float16",
        "--trust-remote-code",
    ] + quantization_args

    # GPTQ models use the dedicated image; everything else uses the default.
    serving_image_uri = (
        VLLM_GPTQ_DOCKER_URI if quantization_method == "gptq" else VLLM_DOCKER_URI
    )

    upload_kwargs = dict(
        display_name=model_name,
        serving_container_image_uri=serving_image_uri,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
    )
    model = aiplatform.Model.upload(**upload_kwargs)

    deploy_kwargs = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_kwargs)
    return model, endpoint


def moderate_text(text: str) -> language.ModerateTextResponse:
    """Runs text moderation on `text` with the Cloud Natural Language client.

    Args:
        text: Plain text to analyze.

    Returns:
        The response of `LanguageServiceClient.moderate_text` for a
        plain-text document built from `text`.
    """
    language_client = language.LanguageServiceClient()
    return language_client.moderate_text(
        document=language.Document(
            content=text,
            type_=language.Document.Type.PLAIN_TEXT,
        )
    )


def show_text_moderation(text: str, response: language.ModerateTextResponse) -> None:
    """Prints the analyzed text and a table of its moderation categories.

    Categories are listed from highest to lowest confidence, and confidence
    values are rendered as whole percentages.

    Args:
        text: The text that was moderated.
        response: Moderation response whose `moderation_categories` are shown.
    """
    import pandas as pd

    ranked_categories = sorted(
        response.moderation_categories,
        key=lambda category: category.confidence,
        reverse=True,
    )
    rows = [(category.name, category.confidence) for category in ranked_categories]
    table = pd.DataFrame(columns=["category", "confidence"], data=rows)

    print(f"Text analyzed:\n{text}")
    print(table.to_markdown(index=False, tablefmt="presto", floatfmt=".0%"))

## 使用预先构建的Falcon Instruct模型在本地运行推理

您需要至少16GB的内存才能快速运行Falcon-7B-Instruct的推理。

In [None]:
import torch
import transformers
from transformers import AutoTokenizer

# Hugging Face model id used for the local inference demo.
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
# Text-generation pipeline loaded in bfloat16. `trust_remote_code=True` allows
# transformers to run code shipped with the model repository, and
# `device_map="auto"` leaves device placement to the library.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
# Sample a single continuation of the prompt (top-k sampling with k=10),
# stopping at the tokenizer's EOS token or at max_length=200.
sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Girafatron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

## 部署预先构建的Falcon Instruct模型

本节在端点上部署预构建的Falcon Instruct模型。模型部署步骤需要15到40分钟才能完成。

[tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)和[tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct)的GPU内存峰值使用分别为约15.5G和约84G，具有默认设置。请根据需要调整机器类型、加速器类型和加速器数量。我们在部署中使用V100作为示例。请注意，V100服务通常比L4服务提供更好的吞吐量和延迟性能，而L4服务通常比V100服务更具成本效益。V100和L4 GPU的服务效率不及A100 GPU，但如果没有A100配额，V100和L4 GPU仍然是良好的服务解决方案。

设置预构建模型ID。

In [None]:
# Hugging Face id of the pre-built model deployed in the next cell.
prebuilt_model_id = "tiiuae/falcon-7b-instruct"  # @param ["tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]

我们使用PEFT服务镜像来部署预构建的Falcon Instruct模型，并将微调后的LoRA模型路径设置为空。

In [None]:
# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/predictions/configure-compute
# NOTE(review): A2 machine types appear to bundle a fixed number of GPUs
#  (e.g. `a2-highgpu-4g`, `a2-ultragpu-2g`); when enabling an A100 option
#  below, confirm `machine_type` matches `accelerator_count`.

# Sets V100 (16G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may deploy tiiuae/falcon-40b-instruct with
#  multiple V100s. Please keep in mind that the efficiency of serving with
#  multiple V100s is inferior to that of serving with A100s.
# Compared with L4, V100 serving can have better throughput and latency.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 8  # for tiiuae/falcon-40b-instruct

# Sets L4 (24G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may deploy tiiuae/falcon-40b-instruct with
#  multiple L4s. Please keep in mind that the efficiency of serving with
#  multiple L4s is inferior to that of serving with A100s.
# Compared with V100, L4 serving can be more cost efficient.

# For tiiuae/falcon-7b-instruct.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# For tiiuae/falcon-40b-instruct.
# machine_type = "g2-standard-48"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 4

# Sets A100 (40G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 4  # for tiiuae/falcon-40b-instruct

# Sets A100 (80G) to deploy falcon-40b-instruct models for faster inferences.
# machine_type = "a2-ultragpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100_80GB"
# accelerator_count = 2

# Upload the pre-built model and deploy it to a new endpoint.
model_without_peft, endpoint_without_peft = deploy_model(
    model_name=get_job_name_with_datetime(prefix="falcon-instruct-serve"),
    model_id=prebuilt_model_id,
    finetuned_lora_model_path="",  # Empty path: serve the base model without LoRA weights.
    service_account=SERVICE_ACCOUNT,
    task="instruct-lora",
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
print("endpoint_name:", endpoint_without_peft.name)

注意：部署成功后，预先构建的模型权重将从原始位置即时下载。因此，在上述模型部署步骤成功后，需要额外等待10-30分钟，然后才能运行下面的下一步。否则，在发送请求到端点时，您可能会看到“ServiceUnavailable: 503 502:Bad Gateway”错误。

一旦部署成功，您可以通过文本提示向端点发送请求。

示例：

```
人类：汽车是什么？
助手：汽车，或者叫机动车，是一种用于将人或货物从一个地方运送到另一个地方的与道路相连的人类交通系统。该术语还包括各种车辆，包括摩托艇、火车和飞机。汽车通常有四个轮子、一个乘客舱和一个发动机或马达。它们自19世纪初以来就存在，并且现在是最受欢迎的交通方式之一，用于日常通勤、购物和其他目的。
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_without_peft.name` allows us to get the
#   endpoint name of the endpoint `endpoint_without_peft` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_without_peft.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_without_peft = aiplatform.Endpoint(aip_endpoint_name)

# Request payload: one dict per prompt, carrying the prompt text together
# with its generation settings.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# Query the endpoint serving the pre-built model and print each prediction.
response = endpoint_without_peft.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

## 使用PEFT微调和部署Falcon Instruct模型

本节演示如何使用PEFT微调和部署Falcon Instruct模型。

设置基本模型ID。

In [None]:
# Hugging Face id of the base model to finetune and then deploy with LoRA weights.
model_id = "tiiuae/falcon-7b-instruct"  # @param ["tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]

### 微调

使用Vertex AI SDK，基于Vertex AI Model Garden的训练镜像创建并运行自定义训练作业。

这个示例使用数据集[timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco)。您可以使用[huggingface的数据集](https://huggingface.co/datasets)或存储在Cloud Storage中的[Vertex文本模型数据集格式](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format)中的自定义JSONL数据集。`template`参数是可选的。

LoRA模型在[tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)和[tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct)的微调过程中，默认训练参数和示例数据集下，使用的显存峰值分别为~11G和~34G。Falcon-7b-instruct可以在1个P100/V100上进行微调，而falcon-40b-instruct可以在1个A100（40G）上进行微调。

#### [可选] 使用自定义数据集进行微调

要使用自定义数据集，您应该在下面的`dataset_name`中提供`gs://` URI指向一个符合[Vertex文本模型数据集格式](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format)的JSONL文件。

例如，这是样本数据集`gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`中的一个数据点：

```json
{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
```

要使用包含`input_text`和`output_text`字段的样本数据集，请将`dataset_name`设置为`gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`，并将`template`设置为`vertex_sample`。要进一步使用自定义数据集字段，请参阅[模板示例](https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json)，并提供您自己的JSON模板作为`gs://` URIs。

In [None]:
# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Worker pool spec.
# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/training/configure-compute

# Uses V100 (16G) to finetune falcon-7b-instruct.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Uses L4 (24G) to finetune falcon-7b-instruct.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Uses V100 (16G) to finetune falcon-40b-instruct.
# machine_type = "n1-standard-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 4

# Uses L4 (24G) to finetune falcon-40b-instruct.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Uses A100 (40G) to finetune falcon-40b-instruct.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1

replica_count = 1

# Setup training job.
job_name = get_job_name_with_datetime("falcon-instruct-lora-train")
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)
# Each run writes to its own folder under MODEL_BUCKET. The training container
# receives the `/gcs/...` form of the path, while `output_dir` keeps the
# `gs://` form that is reused by the deployment cell.
output_dir = os.path.join(MODEL_BUCKET, job_name)
output_dir_gcsfuse = output_dir.replace("gs://", "/gcs/")

# Pass training arguments and launch job.
# `max_steps` is also used later to locate the `checkpoint-<max_steps>` folder.
max_steps = 10
train_job.run(
    args=[
        "--task=instruct-lora",
        f"--pretrained_model_id={model_id}",
        f"--dataset_name={dataset_name}",
        f"--output_dir={output_dir_gcsfuse}",
        "--lora_rank=64",
        "--lora_alpha=16",
        "--lora_dropout=0.1",
        "--warmup_ratio=0.03",
        f"--max_steps={max_steps}",
        "--max_seq_length=512",
        "--learning_rate=2e-4",
        f"--template={template}",
    ],
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
)

print("Trained models were saved in: ", output_dir)

### 部署
该部分将模型上传至模型注册表并在端点上部署。

模型部署步骤将需要15分钟至40分钟完成。

[tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)和[tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct)在默认设置下使用LoRA权重的峰值GPU内存使用分别为约15.5G和约84G。请根据需要调整机器类型、加速器类型和加速器数量。我们在部署中使用V100作为示例。请注意，V100服务通常比L4服务提供更好的吞吐量和延迟性能，而L4服务通常比V100服务更具成本效益。V100和L4 GPU的服务效率不及A100 GPU，但如果您没有A100配额，V100和L4 GPU仍然是不错的服务解决方案。

In [None]:
# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/predictions/configure-compute
# NOTE(review): A2 machine types appear to bundle a fixed number of GPUs
#  (e.g. `a2-highgpu-4g`, `a2-ultragpu-2g`); when enabling an A100 option
#  below, confirm `machine_type` matches `accelerator_count`.

# Sets V100 (16G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may deploy tiiuae/falcon-40b-instruct with
#  multiple V100s. Please keep in mind that the efficiency of serving with
#  multiple V100s is inferior to that of serving with A100s.
# Compared with L4, V100 serving can have better throughput and latency.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 8  # for tiiuae/falcon-40b-instruct

# Sets L4 (24G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may deploy tiiuae/falcon-40b-instruct with
#  multiple L4s. Please keep in mind that the efficiency of serving with
#  multiple L4s is inferior to that of serving with A100s.
# Compared with V100, L4 serving can be more cost efficient.

# For tiiuae/falcon-7b-instruct.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# For tiiuae/falcon-40b-instruct.
# machine_type = "g2-standard-48"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 4

# Sets A100 (40G) to deploy tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 4  # for tiiuae/falcon-40b-instruct

# Sets A100 (80G) to deploy falcon-40b-instruct models for faster inferences.
# machine_type = "a2-ultragpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100_80GB"
# accelerator_count = 2

# Deploy the base model together with the LoRA weights from the training run.
# The LoRA path points at `checkpoint-<max_steps>` under the training output
# folder - presumably the final checkpoint written by the trainer; verify the
# folder exists before deploying.
model_with_peft, endpoint_with_peft = deploy_model(
    model_name=get_job_name_with_datetime(prefix="falcon-instruct-peft-serve"),
    model_id=model_id,
    finetuned_lora_model_path=os.path.join(output_dir, "checkpoint-" + str(max_steps)),
    service_account=SERVICE_ACCOUNT,
    task="instruct-lora",
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
print("endpoint_name:", endpoint_with_peft.name)

注意：部署成功后，基本模型权重将会从原始位置实时下载，并且 LoRA 模型权重将从在训练中使用的 GCS 存储桶中下载。因此，在上述模型部署步骤成功之后，需要额外等待 10-30 分钟，**然后**才能运行下面的下一步。否则，在向端点发送请求时可能会看到“ServiceUnavailable: 503 502: Bad Gateway”错误。

一旦部署成功，您就可以发送文本提示请求到端点。

例如：

```
人类：汽车是什么？
助手：汽车，或者叫汽车，是一种用于将人或货物从一处运到另一处的与道路连接的人类交通系统。这个名词还包括各种车辆，如摩托艇、火车和飞机。汽车通常有四个车轮、一个乘客舱和一个发动机。从19世纪初就存在，现在是最流行的交通工具之一，常用于日常通勤、购物等各种目的。
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_with_peft.name` allows us to get the
#   endpoint name of the endpoint `endpoint_with_peft` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_with_peft.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_with_peft = aiplatform.Endpoint(aip_endpoint_name)

# Request payload: one dict per prompt, carrying the prompt text together
# with its generation settings.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# Query the endpoint serving the PEFT-finetuned model and print each prediction.
response = endpoint_with_peft.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

## 对Falcon Instruct模型进行量化并部署

本节展示了使用Vertex Custom Job对Falcon Instruct模型进行后训练量化。量化可以减少模型所需的内存，同时尽量保持相同的性能。更多关于GPTQ的信息请阅读以下文章：[GPTQ: 准确的生成式预训练Transformer后训练量化](https://arxiv.org/abs/2210.17323)。

### 使用Google Cloud文本审查部署预量化模型
这里提供了许多GPTQ-量化模型（链接：https://huggingface.co/TheBloke?search_models=-gptq）。

本节将模型上传至模型注册表并在端点上部署。

模型部署步骤将花费15分钟到1小时的时间，取决于模型大小。

请注意，部署一个量化模型所需的GPU资源要少得多。我们只需使用两个L4s即可部署一个量化的40B模型，而不是四个。

In [None]:
# Hugging Face id of an already GPTQ-quantized model to serve with vLLM.
quantized_model_id = "TheBloke/Falcon-7B-Instruct-GPTQ"  # @param ["TheBloke/Falcon-7B-Instruct-GPTQ", "TheBloke/falcon-40b-instruct-GPTQ"]

# "gptq" makes `deploy_model_vllm` pass `--quantization=gptq` and pick the
# GPTQ serving image.
quantization_method = "gptq"

# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy Falcon Instruct 7B model.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Sets 2 L4's (24G) to deploy Falcon Instruct 40B model.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Upload the pre-quantized model and deploy it to a new endpoint.
model_prequantized_vllm, endpoint_prequantized_vllm = deploy_model_vllm(
    model_name=get_job_name_with_datetime(
        prefix="falcon-instruct-serve-vllm-prequantized"
    ),
    model_id=quantized_model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    quantization_method=quantization_method,
)

注意：在部署成功后，模型权重将会即时下载。因此在上述模型部署步骤成功之后，以及在您运行下面的下一步之前，需要额外10~40分钟的等待时间（取决于模型大小）。否则，当您向端点发送请求时，可能会看到`ServiceUnavailable: 503 502: Bad Gateway`错误。

一旦部署成功，您就可以使用文本提示向端点发送请求。

示例：

```
人类：什么是汽车？
助手：汽车，或称为机动车，是一种用于将人们或货物从一处运送到另一处的道路交通系统。这个术语还包括了各种车辆，如摩托艇、火车和飞机。汽车通常有四个轮子、一个乘客舱和一个引擎或马达。它们自19世纪初以来就存在了，并且现在已经成为最受欢迎的交通方式之一，用于日常通勤、购物和其他用途。
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_prequantized_vllm.name` allows us to get the
#   endpoint name of the endpoint `endpoint_prequantized_vllm` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_prequantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_prequantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Overrides the max_tokens and top_k parameters during inference.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the generation length, e.g. set max_tokens to 20.
# If you are using L4 GPUs to serve Falcon Instruct 40B models, you should set
# max_tokens to around 1,000 tokens or fewer. If you need longer generated
# sequences, please file a request with Vertex to allowlist your project for a
# longer timeout threshold with Vertex endpoints.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# Query the endpoint serving the pre-quantized model and print each prediction.
response = endpoint_prequantized_vllm.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

文本审核会根据一份安全属性列表对文档进行分析，其中包括“有害类别”和可能被视为敏感的主题。

In [None]:
# Moderate every text returned by the most recent endpoint prediction.
# The moderation result gets its own name so that the prediction `response`
# is left intact and this cell can be re-run without repeating the prediction.
for generated_text in response.predictions:
    # Send a request to the API.
    moderation_response = moderate_text(generated_text)
    # Show the results.
    show_text_moderation(generated_text, moderation_response)

### 量化Falcon Instruct模型

量化通过降低权重的比特精度来减少部署模型所需的GPU数量，同时尽量减少性能下降。要在vLLM上部署量化模型，需要将模型量化为4位。建议首先搜索该模型是否已有公开可用的量化版本：[GPTQ](https://huggingface.co/TheBloke?search_models=-gptq)。

使用GPTQ量化Falcon Instruct 7B模型需大约40分钟，使用1个NVIDIA_L4 GPU；使用4个NVIDIA_L4 GPU量化Falcon Instruct 40B模型需大约4小时。

经过微调的模型也可以被量化，只要LoRA权重与基础模型合并。

设置基本模型ID。

In [None]:
# Hugging Face id of the base model used by the quantization cell below.
model_id = "tiiuae/falcon-7b-instruct"  # @param ["tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]

In [None]:
# Set up quantization job.

# Set `finetuned_model_path` to the GCS path of the merged finetuned model
# from the section above, if not set, the base model will be quantized.
finetuned_model_path = ""  # @param {type:"string"}
if finetuned_model_path:
    prequantized_model_path = finetuned_model_path
else:
    prequantized_model_path = model_id

quantization_method = "gptq"
quantization_job_name = get_job_name_with_datetime(
    f"falcon-instruct-{quantization_method}-quantize"
)

# Each run writes to its own folder under MODEL_BUCKET. The job receives the
# `/gcs/...` form of the path, while `quantization_output_dir` keeps the
# `gs://` form that is reused by the deployment cell.
quantization_output_dir = os.path.join(MODEL_BUCKET, quantization_job_name)
quantization_output_dir_gcsfuse = quantization_output_dir.replace("gs://", "/gcs/")

# Worker pool spec.

# Set 1 L4 (24G) for quantizing 7b model.
machine_type = "g2-standard-32"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Set 4 L4 (24G) for quantizing 40b model.
# machine_type = "g2-standard-48"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 4

# Quantization parameters.
quantization_precision_mode = "4bit"

# The original datasets used in GPTQ paper.
gptq_dataset_name = "c4"  # @param ["wikitext2","c4","c4-new","ptb","ptb-new"]
group_size = -1
damp_percent = 0.1
desc_act = True
quantization_args = [
    "--task=quantize-model",
    f"--quantization_method={quantization_method}",
    # Quantize the model selected above: the finetuned model when
    # `finetuned_model_path` is set, otherwise the base `model_id`.
    # NOTE(review): confirm the container accepts the path form used for
    # `finetuned_model_path`.
    f"--pretrained_model_id={prequantized_model_path}",
    f"--quantization_precision_mode={quantization_precision_mode}",
    f"--quantization_output_dir={quantization_output_dir_gcsfuse}",
    f"--quantization_dataset_name={gptq_dataset_name}",
    f"--group_size={group_size}",
    f"--damp_percent={damp_percent}",
    f"--desc_act={desc_act}",
    "--cache_examples_on_gpu=False",
]

# Pass quantization arguments and launch job.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "disk_spec": {
            "boot_disk_type": "pd-ssd",
            "boot_disk_size_gb": 500,
        },
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "env": [
                {
                    "name": "PYTORCH_CUDA_ALLOC_CONF",
                    "value": "max_split_size_mb:32",
                },
            ],
            "command": [],
            "args": quantization_args,
        },
    }
]

print(f"Quantizing {prequantized_model_path}.")
quantize_job = aiplatform.CustomJob(
    display_name=quantization_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
quantize_job.run()

print("Quantized models were saved in: ", quantization_output_dir)

### 使用Google Cloud Text Moderation部署量化模型
本节将模型上传至模型注册表并部署到端点。

模型部署步骤需要15分钟至1小时不等的时间才能完成，取决于模型大小。

注意，部署一个量化模型需要更少的GPU。我们可以只用两个L4 GPU来部署一个量化的40B模型，而不是四个。

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy Falcon Instruct 7B.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Sets 2 L4's (24G) to deploy Falcon Instruct 40B models.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Deploy the model written by the quantization job. `quantization_output_dir`
# and `quantization_method` come from the quantization cell above.
model_quantized_vllm, endpoint_quantized_vllm = deploy_model_vllm(
    model_name=get_job_name_with_datetime(
        prefix="falcon-instruct-serve-vllm-quantized"
    ),
    model_id=quantization_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    quantization_method=quantization_method,
)

注意：部署成功后，模型权重将会即时下载。因此，在上述模型部署步骤成功之后、运行下面的下一步之前，需要额外等待10到40分钟（取决于模型大小）。否则，当您向端点发送请求时，可能会看到`ServiceUnavailable: 503 502: Bad Gateway`错误。

一旦部署成功，您就可以使用文本提示向端点发送请求。

例：
```
人类：什么是汽车？
助手：汽车，或称机动车，是一种与道路相连的人类交通系统，用于将人或货物从一地运送到另一地。这个术语还包括各种车辆，包括摩托艇、火车和飞机。汽车通常有四个轮子、一个乘客舱以及一个发动机或电动机。汽车自19世纪初以来一直存在，现在是最受欢迎的交通方式之一，用于日常通勤、购物和其他用途。
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_quantized_vllm.name` allows us to get the
#   endpoint name of the endpoint `endpoint_quantized_vllm` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_quantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_quantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Overrides the max_tokens and top_k parameters during inference.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the generation length, e.g. set max_tokens to 20.
# If you are using L4 GPUs to serve Falcon Instruct 40B models, you should set
# max_tokens to around 1,000 tokens or fewer. If you need longer generated
# sequences, please file a request with Vertex to allowlist your project for a
# longer timeout threshold with Vertex endpoints.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# Query the endpoint serving the quantized model and print each prediction.
response = endpoint_quantized_vllm.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

文本审核会对照一组安全属性来分析文档，这些属性包括“有害类别”以及可能被视为敏感的主题。

In [None]:
# Run text moderation on every generated text returned by the endpoint.
# The moderation result is kept under its own name so that `response` (the
# prediction response being iterated) is not overwritten. Rebinding `response`
# inside the loop would leave it pointing at a moderation result, and re-running
# this cell would then no longer iterate over the endpoint predictions.
for generated_text in response.predictions:
    # Send a request to the API.
    moderation_response = moderate_text(generated_text)
    # Show the results.
    show_text_moderation(generated_text, moderation_response)

## 评估使用PEFT微调的Falcon Instruct模型

本节展示如何使用EleutherAI的[语言模型评估工具（lm-evaluation-harness）](https://github.com/EleutherAI/lm-evaluation-harness)和 Vertex CustomJob 评估使用PEFT LoRA微调的Falcon Instruct模型。请参考用于服务的峰值GPU内存使用情况，并相应调整机器类型、加速器类型和加速器数量。

本示例使用数据集[TruthfulQA](https://arxiv.org/abs/2109.07958)。所有支持的任务均列在[任务表格](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)中。

In [None]:
# Evaluation task name; it is passed to the evaluation command's `--tasks` flag.
eval_dataset = "truthfulqa_mc"  # @param {type:"string"}

# Worker pool spec.
# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/training/configure-compute
# Exactly one machine/accelerator configuration below should be active; the
# alternatives are kept commented out so they can be swapped in.

# Sets V100 (16G) to evaluate tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may evaluate tiiuae/falcon-40b-instruct with
#  multiple V100s. Please keep in mind that the efficiency of evaluating with
#  multiple V100s is inferior to that of evaluating with A100s.
# Compared with L4, V100 inference can have better throughput and latency.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 8  # for tiiuae/falcon-40b-instruct

# Sets L4 (24G) to evaluate tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# If A100 is not available, you may evaluate tiiuae/falcon-40b-instruct with
#  multiple L4s. Please keep in mind that the efficiency of evaluating with
#  multiple L4s is inferior to that of evaluating with A100s.
# Compared with V100, L4 inference can be more cost efficient.

# For tiiuae/falcon-7b-instruct.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# For tiiuae/falcon-40b-instruct.
# machine_type = "g2-standard-48"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 4

# Sets A100 (40G) to evaluate tiiuae/falcon-7b-instruct or tiiuae/falcon-40b-instruct.
# NOTE(review): A2 machine types appear to bundle a fixed number of GPUs (the
#  `-1g` suffix meaning one GPU), so `accelerator_count = 4` likely also needs
#  `machine_type = "a2-highgpu-4g"` — confirm on the configure-compute page
#  linked above before using the 40b setting.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1  # for tiiuae/falcon-7b-instruct
# accelerator_count = 4  # for tiiuae/falcon-40b-instruct

# Sets A100 (80G) to evaluate falcon-40b-instruct models for faster inferences.
# NOTE(review): for the same reason, `accelerator_count = 2` likely needs
#  `machine_type = "a2-ultragpu-2g"` — confirm on the configure-compute page.
# machine_type = "a2-ultragpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100_80GB"
# accelerator_count = 2

replica_count = 1

# Setup evaluation job.
# - `job_name` is reused as the CustomJob display name.
# - `eval_output_dir` is the `gs://` location under MODEL_BUCKET for this run.
# - `eval_output_dir_gcsfuse` is the same location with `gs://` swapped for
#   `/gcs/`; it is the path handed to the evaluation command as `--output_path`
#   (presumably the Cloud Storage FUSE mount inside the job container).
job_name = get_job_name_with_datetime(prefix="falcon-instruct-peft-eval")
eval_output_dir = os.path.join(MODEL_BUCKET, job_name)
eval_output_dir_gcsfuse = eval_output_dir.replace("gs://", "/gcs/")

In [None]:
# Assemble the argv list that runs the evaluation harness (`main.py`) inside
# the evaluation container.
# - `trust_remote_code=True`: evaluating the model requires executing code
#   from the model repository.
# - `use_accelerate=True`: enables evaluation across multiple GPUs.
eval_model_args = ",".join(
    [
        f"pretrained={model_id}",
        f"peft={output_dir_gcsfuse}",
        "trust_remote_code=True",
        "use_accelerate=True",
        "device_map_option=auto",
    ]
)

# Each flag is appended together with its value so the pairs stay readable.
eval_command = ["python", "main.py"]
eval_command += ["--model", "hf-causal-experimental"]
eval_command += ["--model_args", eval_model_args]
eval_command += ["--tasks", f"{eval_dataset}"]
eval_command += ["--output_path", f"{eval_output_dir_gcsfuse}"]

提交评估 CustomJob。

In [None]:
# Hardware for the evaluation worker, taken from the configuration cell.
eval_machine_spec = {
    "machine_type": machine_type,
    "accelerator_type": accelerator_type,
    "accelerator_count": accelerator_count,
}

# Container that executes the prepared evaluation command; no extra args.
eval_container_spec = {
    "image_uri": EVAL_DOCKER_URI,
    "command": eval_command,
    "args": [],
}

# A single worker pool with a 500 GB boot disk.
worker_pool_specs = [
    {
        "machine_spec": eval_machine_spec,
        "replica_count": replica_count,
        "disk_spec": {"boot_disk_size_gb": 500},
        "container_spec": eval_container_spec,
    }
]

# Create the Vertex AI CustomJob for the evaluation and run it.
eval_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    base_output_dir=eval_output_dir,
)
eval_job.run()

print("Evaluation results were saved in:", eval_output_dir)

### 获取并打印评估结果

In [None]:
import json

from google.cloud import storage

# Fetch evaluation results.
# The evaluation command was given the `/gcs/` form of `eval_output_dir` as its
# `--output_path`, so the result is read back from that same object here.
storage_client = storage.Client()
# Bucket name = BUCKET_URI without the `gs://` scheme.
BUCKET_NAME = BUCKET_URI.split("gs://")[1]
bucket = storage_client.get_bucket(BUCKET_NAME)
# Object path inside the bucket: drop the BUCKET_URI prefix and the `/` that
# follows it (assumes `eval_output_dir` starts with BUCKET_URI — TODO confirm
# against how MODEL_BUCKET is defined).
RESULT_FILE_PATH = eval_output_dir[len(BUCKET_URI) + 1 :]
blob = bucket.blob(RESULT_FILE_PATH)
# `Blob.download_as_string()` is a deprecated alias of `download_as_bytes()`;
# call the supported method directly. `json.loads` accepts the returned bytes.
raw_result = blob.download_as_bytes()

# Print evaluation results.
result = json.loads(raw_result)
result_formatted = json.dumps(result, indent=2)
print(f"Evaluation result:\n{result_formatted}")

## 清理资源

In [None]:
# Clean up the resources referenced by this notebook: jobs first, then
# endpoints, then models.
# NOTE: the statements run strictly in order and every name must exist. If a
# section of the notebook was skipped (so a variable is undefined) or a delete
# call raises, execution stops at that line and the remaining resources are
# left in place — delete those manually.

# Delete custom train, quantization, and evaluation jobs.
train_job.delete()
quantize_job.delete()
eval_job.delete()

# Undeploy models and delete endpoints.
# `force=True` is what lets an endpoint be deleted while models are still
# deployed to it.
endpoint_without_peft.delete(force=True)
endpoint_with_peft.delete(force=True)
endpoint_prequantized_vllm.delete(force=True)
endpoint_quantized_vllm.delete(force=True)

# Delete models.
# Kept after the endpoint deletions (presumably a model cannot be deleted while
# it is still deployed — confirm before reordering).
model_without_peft.delete()
model_with_peft.delete()
model_prequantized_vllm.delete()
model_quantized_vllm.delete()