In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI 模型花园 - OpenLLaMA (PEFT)

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在 GitHub 上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在 Vertex AI Workbench 中打开
    </a> (建议使用 Python-3 GPU 笔记本)
  </td>
</table>

## 概述

本笔记本演示了如何使用预构建的OpenLLaMA进行本地推理，部署预构建的OpenLLaMA，使用[vLLM](https://github.com/vllm-project/vllm)部署预构建的OpenLLaMA，使用参数高效微调库（[PEFT](https://github.com/huggingface/peft)）对OpenLLaMA进行微调和部署，使用AWQ或GPTQ对OpenLLaMA进行量化和部署，并在Vertex AI中评估PEFT微调的OpenLLaMA。

### 目标

- 使用预构建的OpenLLaMA进行本地推理
- 部署预构建的OpenLLaMA
- 使用[vLLM](https://github.com/vllm-project/vllm)部署预构建的OpenLLaMA以提高服务吞吐量
- 使用PEFT对OpenLLaMA进行微调和部署
- 使用AWQ或GPTQ对OpenLLaMA模型进行量化和部署
- 评估经过PEFT微调的OpenLLaMA

| 模型 | LoRA |
| :- | :- |
| [openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b) | 是 |
| [openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b) | 是 |
| [openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b) | 是 |

### 成本

此教程使用Google Cloud的收费组件：

* Vertex AI
* 云存储

了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[云存储定价](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)根据您的预期使用情况生成成本估算。

## 在开始之前

注意：Jupyter将带有“!”前缀的行视为shell命令，并将带有“$”前缀的Python变量插入这些命令中。

在OpenLLaMA中运行本地推理需要GPU。

### 仅限Colab
以下单元格中 `if "google.colab" in sys.modules:` 分支内的步骤仅在 Colab 中执行；如果您使用的是 Workbench，该分支会被自动跳过。分支之后的 `pip3 install` 命令在两种环境中都会执行。

In [None]:
import sys

# Colab-only setup: the branch below runs only when the google.colab module is
# already loaded in this interpreter.
if "google.colab" in sys.modules:
    ! pip3 install --upgrade google-cloud-aiplatform
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
# Pinned packages installed unconditionally (outside the Colab branch).
# transformers is imported by the local-inference cell of this notebook.
# NOTE(review): in Colab these installs come after the kernel restart request
# above -- confirm they still execute there.
! pip3 install transformers==4.31.0
! pip3 install sentencepiece==0.1.99
! pip3 install accelerate==0.21.0

### 设置谷歌云项目

1. [选择或创建一个谷歌云项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得300美元的免费信用，用于计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Vertex AI API和Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component)。

4. [创建一个云存储桶](https://cloud.google.com/storage/docs/creating-buckets) 用于存储实验输出。

5. [创建一个服务帐号](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) ，并为部署经过微调的模型到 Vertex AI 端点分配 `Vertex AI User` 和 `Storage Object Admin` 角色。

为实验环境设置以下变量。指定的云存储桶（`BUCKET_URI`）应位于指定的区域（`REGION`）中。请注意，多区域存储桶（例如“us”）不被视为与该多区域范围所覆盖的单个区域（例如“us-central1”）相匹配。

In [None]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# The region you want to launch jobs in.
REGION = ""  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

# Point the gcloud CLI at the project. Jupyter substitutes $PROJECT_ID with the
# Python variable of the same name before running the shell command.
! gcloud config set project $PROJECT_ID

import os

# Sub-paths of BUCKET_URI used by the rest of the notebook:
#   STAGING_BUCKET    - passed to aiplatform.init and to the CustomJob calls.
#   EXPERIMENT_BUCKET - "peft" folder; parent of the two folders below.
#   DATA_BUCKET       - "data" folder under EXPERIMENT_BUCKET.
#   MODEL_BUCKET      - "model" folder under EXPERIMENT_BUCKET; LoRA adapters,
#                       merged models and quantized models are written beneath it.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
DATA_BUCKET = os.path.join(EXPERIMENT_BUCKET, "data")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
# It is passed as `service_account` to every model deployment in this notebook.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

### 初始化Vertex AI API

In [None]:
from google.cloud import aiplatform

# Configure the SDK defaults (project, region, staging bucket) used by the
# aiplatform calls in the rest of the notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### 定义常量

In [None]:
# The pre-built training and serving docker images.
# Every image shares the same repository path; only the image name and tag
# differ, so the common prefix is spelled once.
_MODEL_GARDEN_DOCKER_REPO = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers"
)

# Fine-tuning, merging and quantization jobs.
TRAIN_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-peft-train:20231130_0936_RC00"
# Serving through the PEFT prediction container.
PREDICTION_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-peft-serve:20231130_0948_RC00"
# Serving through vLLM (default image, and the image selected for GPTQ models).
VLLM_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-vllm-serve:20231127_0916_RC00"
VLLM_GPTQ_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-vllm-serve:gptq"
# Evaluation harness image.
EVAL_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-lm-evaluation-harness:20231011_0934_RC00"

### 定义常见函数

In [None]:
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform


def create_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by a `_YYYYMMDD_HHMMSS` timestamp.

    The timestamp is taken from the local clock at call time, so names built
    for training or deployment jobs in Vertex AI differ from run to run.

    Args:
        prefix: Text placed in front of the timestamp suffix.

    Returns:
        The prefix, an underscore, and the formatted current date and time.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return prefix + "_" + timestamp


def deploy_model(
    model_name: str,
    model_id: str,
    finetuned_lora_model_path: str,
    service_account: str,
    task: str,
    precision_loading_mode: str = "float16",
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model backed by the PEFT serving image and deploys it.

    A new endpoint is created first, then the model is uploaded with
    `PREDICTION_DOCKER_URI` as its serving container, and finally the model is
    deployed to that endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value of the container's `MODEL_ID` environment variable.
        finetuned_lora_model_path: Value of `FINETUNED_LORA_MODEL_PATH`. The
            variable is only set when this string is non-empty.
        service_account: Service account used for the deployment.
        task: Value of the container's `TASK` environment variable.
        precision_loading_mode: Value of `PRECISION_LOADING_MODE`.
        machine_type: Machine type of the deployed replica.
        accelerator_type: Accelerator type attached to the replica.
        accelerator_count: Number of accelerators attached to the replica.

    Returns:
        A `(model, endpoint)` tuple for the uploaded model and its endpoint.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Environment handed to the serving container.
    container_env = dict(
        MODEL_ID=model_id,
        PRECISION_LOADING_MODE=precision_loading_mode,
        TASK=task,
        DEPLOY_SOURCE="notebook",
    )
    # The adapter path is optional: leave the variable unset when it is empty.
    if finetuned_lora_model_path:
        container_env["FINETUNED_LORA_MODEL_PATH"] = finetuned_lora_model_path

    upload_options = dict(
        display_name=model_name,
        serving_container_image_uri=PREDICTION_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/peft_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    model = aiplatform.Model.upload(**upload_options)

    deploy_options = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_options)
    return model, endpoint


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the vLLM API server and deploys it.

    A new endpoint is created first, then the model is uploaded with a vLLM
    serving image, and finally the model is deployed to that endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Passed to the vLLM server as `--model`.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the deployed replica.
        accelerator_type: Accelerator type attached to the replica.
        accelerator_count: Number of accelerators; also passed to vLLM as
            `--tensor-parallel-size`.
        quantization_method: When non-empty, passed as `--quantization`. The
            value `"gptq"` additionally selects `VLLM_GPTQ_DOCKER_URI`.

    Returns:
        A `(model, endpoint)` tuple for the uploaded model and its endpoint.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Command-line arguments of the vLLM API server.
    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--max-num-batched-tokens=4096",
        "--disable-log-stats",
    ]
    if quantization_method:
        server_args += [f"--quantization={quantization_method}"]

    # GPTQ models are served from a dedicated image.
    image_uri = (
        VLLM_GPTQ_DOCKER_URI if quantization_method == "gptq" else VLLM_DOCKER_URI
    )

    # MODEL_ID here is a fixed string, not the `model_id` argument.
    container_env = dict(
        MODEL_ID="openlm-research/open_llama",
        DEPLOY_SOURCE="notebook",
    )
    upload_options = dict(
        display_name=model_name,
        serving_container_image_uri=image_uri,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    model = aiplatform.Model.upload(**upload_options)

    deploy_options = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_options)
    return model, endpoint

## 使用预构建的OpenLLaMA在本地运行推理

In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_path = "openlm-research/open_llama_3b"

# Load the tokenizer and the model weights (float16, automatic device map).
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="auto"
)

prompt = "Q: What is the largest animal?\nA:"
# Tokenize the prompt to PyTorch tensors and move the ids to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate up to 32 new tokens and print the decoded first sequence.
generation_output = model.generate(input_ids=input_ids, max_new_tokens=32)
print(tokenizer.decode(generation_output[0]))

## 使用vLLM部署预构建的OpenLLaMA

本部分使用[vLLM](https://github.com/vllm-project/vllm)将预构建的OpenLLaMA模型部署到端点上。模型部署步骤需要大约15分钟才能完成。

vLLM是一个高度优化的LLM服务框架，可以显著提高服务吞吐量。您拥有的QPS越高，使用vLLM可以获得的性能优势就越多。

设置预先构建的模型id。

In [None]:
# Hugging Face id of the prebuilt OpenLLaMA checkpoint. The deployment cells
# branch on this value and pass it to the serving container.
prebuilt_model_id = "openlm-research/open_llama_7b"  # @param ["openlm-research/open_llama_3b", "openlm-research/open_llama_7b", "openlm-research/open_llama_13b"]

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets V100 to deploy open_llama_3b and open_llama_7b.
# V100 serving has better throughput and latency performance than L4 serving.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets L4 to deploy open_llama_3b and open_llama_7b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to deploy open_llama_13b.
# V100 serving has better throughput and latency performance than L4 serving.
# machine_type = "n1-standard-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to deploy open_llama_13b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

if prebuilt_model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B, so the 3B checkpoint is
    # served through the PEFT prediction container instead.
    precision_loading_mode = "float16"
    model_without_peft, endpoint_without_peft = deploy_model(
        # The helper defined in this notebook is `create_name_with_datetime`.
        model_name=create_name_with_datetime(prefix="openllama-serve"),
        # Deploy the prebuilt checkpoint selected above. `model_id` (the
        # fine-tuning base model) is not assigned until a later section.
        model_id=prebuilt_model_id,
        # Empty path: deploy_model leaves FINETUNED_LORA_MODEL_PATH unset.
        finetuned_lora_model_path="",
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    model_without_peft, endpoint_without_peft = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-serve-vllm"),
        model_id=prebuilt_model_id,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

注意：在部署成功后，预建模型权重将从原始位置实时下载。因此，在上述模型部署步骤成功后，需要额外等待5分钟，在可以运行下面的下一个步骤之前。否则，当向端点发送请求时，可能会出现`ServiceUnavailable: 503 502:Bad Gateway`错误。

一旦部署成功，您可以使用文本提示向端点发送请求。如果您对额外的服务参数感兴趣，请参考vLLM GitHub的[examples/api_client.py](https://github.com/vllm-project/vllm/blob/main/examples/api_client.py)获取更多详细信息。

In [None]:
# One request: the prompt plus the sampling parameters sent with it.
instance = dict(
    prompt="Hi, Google. How are you doing?",
    n=1,
    max_tokens=32,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)
# Send a single-instance batch to the prebuilt-model endpoint and print the
# first prediction.
response = endpoint_without_peft.predict(instances=[instance])
first_prediction = response.predictions[0]
print(first_prediction)

## 使用PEFT对OpenLLaMA进行微调并部署

本部分演示了如何对OpenLLaMA-7b模型进行微调，将微调后的LoRA适配器与基本模型合并，并使用vLLM进行服务。

设置基本模型ID。

In [None]:
# Base model id; the fine-tuning cell passes it to the training job as
# --pretrained_model_id.
model_id = "openlm-research/open_llama_7b"  # @param ["openlm-research/open_llama_3b", "openlm-research/open_llama_7b", "openlm-research/open_llama_13b"]

### 微调

使用Vertex AI SDK来创建和运行带有Vertex AI Model Garden训练图像的自定义训练任务。

此示例使用数据集[Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes)。您可以使用[huggingface的数据集](https://huggingface.co/datasets)，或者存储在Cloud Storage中的[Vertex文本模型数据集格式](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format)中的自定义JSONL数据集。 `template`参数为可选。

为了有效地进行微调，我们启用了量化，以加载用于微调LoRA模型的预训练模型。精度选项包括`"4bit"`、`"8bit"`、`"float16"`(默认)和`"float32"`，精度可以通过`"--precision_mode"`进行设置。针对[openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b)、[openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b)和[openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b)，使用默认训练参数和示例数据集进行微调LoRA模型时，峰值GPU内存使用量分别为~7G、~10G和~16G。`open_llama_3b`和`open_llama_7b`可以在**1个V100（16G）**和**1个L4（24G）**上进行微调，而`open_llama_13b`可以在**1个L4（24G）**上进行微调。

在本节中，微调后的LoRA适配器将保存在下面的`lora_adapter_dir`变量指定的GCS存储桶中；我们将LoRa适配器与基础模型合并，并保存在下面的`merged_model_output_dir`变量指定的另一个GCS存储桶中。

#### [可选] 使用自定义数据集进行微调

要使用自定义数据集，您应该在下面的`dataset_name`中提供一个指向[Vertex文本模型数据集格式](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format)中的JSONL文件的`gs://` URI。

例如，这是来自示例数据集`gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`中的一个数据点：

```json
{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
```

要使用包含`input_text`和`output_text`字段的这个示例数据集，将`dataset_name`设置为`gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`，并将`template`设置为`vertex_sample`。对于使用自定义数据集字段的高级用法，请参阅[模板示例](https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json)，并提供自己的JSON模板作为`gs://` URI。

In [None]:
# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "Abirate/english_quotes"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Worker pool spec.
# Finetunes open_llama_3b and open_llama_7b with 1 V100 (16G).
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Finetunes open_llama_3b and open_llama_7b with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Finetunes open_llama_13b with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Finetunes open_llama_13b with 1 A100 (40G).
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1

# Number of worker replicas; also read by the hyperparameter tuning cell.
replica_count = 1


# Setup training job.
# job_name is reused as the display name of the hyperparameter tuning job.
job_name = create_name_with_datetime("openllama-lora-train")
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)

# Create a GCS folder to store the LORA adapter.
# The *_gcsfuse variant swaps the gs:// prefix for /gcs/ and is the form
# passed to the training container as --output_dir.
lora_adapter_dir = create_name_with_datetime("openllama-lora-adapter")
lora_output_dir = os.path.join(MODEL_BUCKET, lora_adapter_dir)
lora_output_dir_gcsfuse = lora_output_dir.replace("gs://", "/gcs/")

# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_dir = create_name_with_datetime("openllama-merged-model")
merged_model_output_dir = os.path.join(MODEL_BUCKET, merged_model_dir)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Pass training arguments and launch job.
# --template is always passed, with an empty value when `template` is "".
train_job.run(
    args=[
        "--task=causal-language-modeling-lora",
        f"--pretrained_model_id={model_id}",
        f"--dataset_name={dataset_name}",
        f"--output_dir={lora_output_dir_gcsfuse}",
        f"--merge_base_and_lora_output_dir={merged_model_output_dir_gcsfuse}",
        "--lora_rank=16",
        "--lora_alpha=32",
        "--lora_dropout=0.05",
        "--warmup_steps=10",
        "--max_steps=10",
        "--learning_rate=2e-4",
        f"--template={template}",
    ],
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
)

# Report the gs:// locations (not the /gcs/ forms) of both outputs.
print("The finetuned Lora adapter can be found at: ", lora_output_dir)
print(
    "The finetuned Lora adapter merged with the base model can be found at: ",
    merged_model_output_dir,
)

### [可选] 超参数调优

您可以使用Vertex AI SDK 创建和运行[超参数调优作业](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview)，通过尝试不同的超参数（例如学习率）来获得更好的性能。

定义以下规范：

- `worker_pool_specs`：指定机器类型和Docker镜像的字典。

- `parameter_spec`：指定要优化的参数的字典。字典键是您的训练应用程序代码中为每个超参数分配的命令行参数的字符串，字典值是参数规范。参数规范包括超参数的类型、最小/最大值和规模。

- `metric_spec`：指定要优化的指标的字典。字典键是您在训练应用程序代码中设置的超参数指标标签，值是优化目标。

以下是针对ARC Challenge数据集评估的4位QLoRA实验结果，用于参考超参数调整的有效性：

| 模型         | 训练时间    | Trials | 并行 Trials | GPU  | ∆arc_challenge | ∆hellaswag | ∆truthfulqa_mc | 成本       |
|---------------|------------|--------|-------------|------|------------|------------|----------------|------------|
| Openllama-3b  | 2天10小时   | 8      | 1           | L4x1 | +1.62      | +7.32      | +3.34          | \$29.0232 |
| Openllama-7b  | 1天4小时    | 8      | 2           | L4x1 | +2.82      | +3.55      | +6.68          | \$47.8016 |
| Openllama-13b | 6天10小时   | 8      | 2           | L4x1 | +1.01      | +3.67      | +6.19          | \$87.9208 |

以下示例在`timdettmers/openassistant-guanaco`上运行8个试验，使用不同的学习率，并在`arc_challenge`数据集上评估模型。您可以通过扩展学习率范围、添加LoRA等参数来定制搜索空间。请参阅[超参数调优文档](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview)获取更多信息。

In [None]:
# These assignments replace the dataset, template and machine settings left by
# the fine-tuning cell; the hyperparameter tuning cell below reads them.

# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Passed to the tuning trials as --precision_mode.
hpt_precision_mode = "4bit"

# Worker pool spec for 4bit finetuning.

# Finetunes Openllama 3B / 7B / 13B with 1 L4 (24G).
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

### [可选] 自定义评估数据集

为了获得在某些特定任务上性能更好的模型，您可能希望使用自定义评估数据集运行超参数调整。超参数调整服务将根据评估数据集和您选择的指标选择模型。您可以在下面的代码单元格中将任何以下任务用作`eval_task`：

1. [lm-evaluation-harness 任务](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/lm_eval/tasks)的名称。

2. `custom_likelihood`。然后，添加一个标志 `--eval_dataset_path=<您的 JSONL 数据集的云存储 URI>`。JSONL 文件必须采用 Vertex AI 语言模型的[准备评估数据集](https://cloud.google.com/vertex-ai/docs/generative-ai/models/evaluate-models#classification)页面中的格式。

3. `builtin_eval`。将使用训练器的内置评估循环来评估模型，而不是使用[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)库。您可以通过指定 `--eval_dataset_path`、`--eval_split`、`--eval_template` 和 `--eval_column` 提供与训练数据集相同格式的任何评估数据集。

In [None]:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

eval_task = "arc_challenge"  # @param {type:"string"}
eval_metric_name = "acc_norm"  # @param {type:"string"}

# Runs 10 training steps as a minimal example. Use 1000 to reproduce the experiment results.
max_steps = 10  # @param {type:"integer"}
# Evaluates the model on 10 examples. Use 10000 to reproduce the experiment results.
eval_limit = 10  # @param {type:"integer"}

# Command-line flags of one training trial. Each entry becomes `--name=value`
# in the container arguments, in this order.
flags = dict(
    learning_rate=1e-5,
    precision_mode=hpt_precision_mode,
    task="instruct-lora",
    pretrained_model_id=model_id,
    output_dir=lora_output_dir_gcsfuse,
    warmup_steps=10,
    max_steps=max_steps,
    lora_rank=32,
    lora_alpha=64,
    lora_dropout=0.05,
    dataset_name=dataset_name,
    eval_steps=max_steps + 1,  # Only evaluates in the end.
    eval_tasks=eval_task,
    eval_limit=eval_limit,
    eval_metric_name=eval_metric_name,
    merge_base_and_lora_output_dir=merged_model_output_dir_gcsfuse,
)
trial_args = [f"--{flag_name}={flag_value}" for flag_name, flag_value in flags.items()]

# Single worker pool: machine shape plus the training container.
hpt_machine_spec = dict(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
hpt_container_spec = dict(image_uri=TRAIN_DOCKER_URI, args=trial_args)
worker_pool_specs = [
    dict(
        machine_spec=hpt_machine_spec,
        replica_count=replica_count,
        container_spec=hpt_container_spec,
    )
]

# Maximize the reported metric while searching the learning rate linearly.
metric_spec = dict(model_performance="maximize")
parameter_spec = dict(
    learning_rate=hpt.DoubleParameterSpec(min=1e-5, max=1e-4, scale="linear"),
)

# The custom job is the template every tuning trial is launched from.
train_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
train_hpt_job = aiplatform.HyperparameterTuningJob(
    display_name=f"{job_name}_hpt",
    custom_job=train_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=8,
    parallel_trial_count=2,
)
train_hpt_job.run()

print("Trained models were saved in: ", lora_output_dir)

接着，从超参数调整工作中找到最佳试验。

In [None]:
# Pick the trial whose first reported final metric is the highest.
best_trial_id = max(
    train_hpt_job.trials, key=lambda trial: trial.final_measurement.metrics[0].value
).id

# Make the cell safe to re-run: if lora_output_dir already ends in a
# `trial_<id>` folder from an earlier execution, step back to its parent first
# so the path does not grow to `.../trial_X/trial_Y`.
hpt_output_root = lora_output_dir.rstrip("/")
if os.path.basename(hpt_output_root).startswith("trial_"):
    hpt_output_root = os.path.dirname(hpt_output_root)

# Later cells read both the gs:// form and the /gcs/ form.
lora_output_dir = os.path.join(hpt_output_root, f"trial_{best_trial_id}")
lora_output_dir_gcsfuse = lora_output_dir.replace("gs://", "/gcs/")
print(f"Best trial {best_trial_id} saved model in:", lora_output_dir)

### 使用vLLM部署
该部分将模型上传到模型注册表，并将其部署在端点上。vLLM目前不支持为经过微调的[openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b)提供服务。

模型部署步骤将需要大约15分钟的时间才能完成。

使用LoRA权重的[openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b)，[openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b)，和[openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b)的峰值GPU内存使用量分别为~5.3G，~8.7G和~15.2G（使用默认设置）。

注意：vLLM需要一个合并模型，其中包含基础模型和经过微调的LoRA适配器。根据您的业务需求，如果您需要基础模型和经过微调的LoRA权重分开提供服务，请考虑改用正常的Vertex服务。

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets V100 to deploy open_llama_3b and open_llama_7b.
# V100 serving has better throughput and latency performance than L4 serving.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets L4 to deploy open_llama_3b and open_llama_7b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to deploy open_llama_13b.
# V100 serving has better throughput and latency performance than L4 serving.
# machine_type = "n1-standard-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to deploy open_llama_13b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Branch on the fine-tuning base model (`model_id`), since that is the model
# being deployed here, not on the prebuilt model chosen in the earlier section.
if model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B, so the base model and the
    # LoRA adapter are served through the PEFT prediction container.
    precision_loading_mode = "float16"
    model_with_peft, endpoint_with_peft = deploy_model(
        # The helper defined in this notebook is `create_name_with_datetime`.
        model_name=create_name_with_datetime(prefix="openllama-peft-serve"),
        model_id=model_id,
        # gs:// location of the fine-tuned LoRA adapter.
        finetuned_lora_model_path=lora_output_dir,
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    # vLLM is given the merged model (base model + LoRA adapter).
    model_with_peft, endpoint_with_peft = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-peft-serve-vllm"),
        model_id=merged_model_output_dir,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

print("endpoint_name:", endpoint_with_peft.name)

注意：部署成功后，基础模型权重将从原始位置动态下载，LoRA模型权重将从训练中使用的GCS存储桶中下载。因此，在上述模型部署步骤成功之后，需要额外等待5分钟，再进行下面的下一步。否则，当您向端点发送请求时，您可能会看到`ServiceUnavailable: 503 502:Bad Gateway`错误。

一旦部署成功，您可以使用文本提示向端点发送请求。vLLM支持的参数可以在[这里](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64)找到。

In [None]:
# One request: the prompt plus the sampling parameters sent with it.
instance = dict(
    prompt="Hi, Google. How are you doing?",
    n=1,
    max_tokens=32,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)
# Send a single-instance batch to the fine-tuned-model endpoint and print the
# first prediction.
response = endpoint_with_peft.predict(instances=[instance])
first_prediction = response.predictions[0]
print(first_prediction)

### [可选] 将之前训练过的LoRA适配器与基础模型合并

本节介绍如何将之前训练过的LoRA适配器与基础模型合并，并将合并后的模型保存到GCS存储桶中。请注意，LoRA适配器应该在相同的基础模型上进行训练。

In [None]:
merge_job_name = create_name_with_datetime(prefix="openllama-peft-merge")

# The base model to be merged upon. It can be a huggingface model id, or a GCS
# path where the base model was stored.
base_model_dir = "gs://"  # @param {type:"string"}
# The previously trained LoRA adapter. It needs to be stored in a GCS path.
finetuned_lora_adapter_dir = ""  # @param {type:"string"}

# The GCS path to save the merged model; the /gcs/ form is what the container
# receives.
merged_model_output_dir = os.path.join(MODEL_BUCKET, merge_job_name)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Worker pool spec.
# Merges open_llama_3b and open_llama_7b with 1 V100 (16G).
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"

# Merges open_llama_3b and open_llama_7b with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"

# Merges open_llama_13b with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"

# Merges open_llama_13b with 1 A100 (40G).
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"

# Arguments of the merge task run inside the training container.
merge_args = [
    "--task=merge-causal-language-model-lora",
    "--merge_model_precision_mode=float16",
    f"--pretrained_model_id={base_model_dir}",
    f"--finetuned_lora_model_dir={finetuned_lora_adapter_dir}",
    f"--merge_base_and_lora_output_dir={merged_model_output_dir_gcsfuse}",
]

# One replica with a single accelerator.
worker_pool_specs = [
    dict(
        machine_spec=dict(
            machine_type=machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=1,
        ),
        replica_count=1,
        container_spec=dict(
            image_uri=TRAIN_DOCKER_URI,
            command=[],
            args=merge_args,
        ),
    )
]

merge_custom_job = aiplatform.CustomJob(
    display_name=merge_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
merge_custom_job.run()

print("The merged model is stored at: ", merged_model_output_dir)

## 量化并部署OpenLLaMA模型

本节展示了使用Vertex Custom Job 对OpenLLaMA模型进行训练后量化。量化可以减少模型所需的内存，同时尽量保持相同的性能。其中两种算法为AWQ和GPTQ。想了解更多关于AWQ的信息，请阅读以下论文：[AWQ: 面向LLM压缩与加速的激活感知权重量化](https://arxiv.org/abs/2306.00978)。想了解更多关于GPTQ的信息，请阅读以下论文：[GPTQ: 面向生成式预训练Transformer的精确训练后量化](https://arxiv.org/abs/2210.17323)。

### 对OpenLLaMA模型进行量化

量化通过减少权重的位精度来减少提供模型所需的GPU数量，同时最小化性能下降。在VLLM上提供量化模型需要将模型量化为4位。建议首先搜索是否已经有模型被量化并公开可用：[AWQ](https://huggingface.co/TheBloke?search_models=-awq) 和 [GPTQ](https://huggingface.co/TheBloke?search_models=-gptq)。

使用1个NVIDIA_L4 GPU进行AWQ量化，OpenLLaMA 3B大约需要20分钟，OpenLLaMA 7B大约需要30分钟，OpenLLaMA 13B大约需要1小时。

使用1个NVIDIA_L4 GPU进行GPTQ量化，OpenLLaMA 3B大约需要30分钟，OpenLLaMA 7B大约需要45分钟，OpenLLaMA 13B大约需要1.5小时。经过微调的模型也可以被量化，只要LoRA权重已与基础模型合并即可。

In [None]:
# Setup quantization job.

# Set `finetuned_model_path` to `merged_model_output_dir` from the previous
# section above to quantize the finetuned model, if not set the base model will
# be quantized.
finetuned_model_path = ""  # @param {type:"string"}
if finetuned_model_path:
    prequantized_model_path = finetuned_model_path
else:
    prequantized_model_path = model_id

quantization_method = "awq"  # @param ["awq", "gptq"]
# The helper defined in this notebook is `create_name_with_datetime`.
quantization_job_name = create_name_with_datetime(
    f"openllama-{quantization_method}-quantize"
)

# gs:// location of the quantized model; the /gcs/ form is what the container
# receives.
quantization_output_dir = os.path.join(MODEL_BUCKET, quantization_job_name)
quantization_output_dir_gcsfuse = quantization_output_dir.replace("gs://", "/gcs/")

# Worker pool spec.

# Sets 1 L4 (24G) to quantize OpenLLaMA model.
machine_type = "g2-standard-16"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1


# Quantization parameters.
quantization_precision_mode = "4bit"
if quantization_method == "awq":
    awq_dataset_name = "pileval"
    group_size = 64
    quantization_dataset_name = awq_dataset_name
    method_specific_args = []
else:
    # The original datasets used in GPTQ paper ["wikitext2","c4","c4-new","ptb","ptb-new"].
    gptq_dataset_name = "c4"  # @param {type:"string"}
    # Not read by this cell; kept so the notebook's global names are unchanged.
    gptq_precision_mode = "4bit"
    group_size = -1
    damp_percent = 0.1
    desc_act = True
    quantization_dataset_name = gptq_dataset_name
    method_specific_args = [
        f"--damp_percent={damp_percent}",
        f"--desc_act={desc_act}",
    ]

# Arguments shared by both methods, followed by the method-specific ones.
# --pretrained_model_id receives `prequantized_model_path` so that a non-empty
# `finetuned_model_path` is actually the model that gets quantized (and matches
# the "Quantizing ..." message printed below).
quantization_args = [
    "--task=quantize-model",
    f"--quantization_method={quantization_method}",
    f"--pretrained_model_id={prequantized_model_path}",
    f"--quantization_precision_mode={quantization_precision_mode}",
    f"--quantization_output_dir={quantization_output_dir_gcsfuse}",
    f"--quantization_dataset_name={quantization_dataset_name}",
    f"--group_size={group_size}",
] + method_specific_args

# Pass quantization arguments and launch job.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "disk_spec": {
            "boot_disk_type": "pd-ssd",
            "boot_disk_size_gb": 500,
        },
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "env": [
                {
                    "name": "PYTORCH_CUDA_ALLOC_CONF",
                    "value": "max_split_size_mb:32",
                },
            ],
            "command": [],
            "args": quantization_args,
        },
    }
]

print(f"Quantizing {prequantized_model_path}.")
quantize_job = aiplatform.CustomJob(
    display_name=quantization_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
quantize_job.run()

print("Quantized models were saved in: ", quantization_output_dir)

### 使用vLLM部署量化模型
本部分将模型上传至模型注册表，并在端点上部署它。

模型部署步骤将需要15分钟到1小时的时间来完成，具体取决于模型大小。

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy OpenLLaMA models.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1


# Branch on `model_id`, the base model the quantization cell works from,
# not on the prebuilt model chosen in the earlier deployment section.
if model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B.
    precision_loading_mode = "float16"
    model_quantized_vllm, endpoint_quantized_vllm = deploy_model(
        # The helper defined in this notebook is `create_name_with_datetime`.
        model_name=create_name_with_datetime(prefix="openllama-quantized-serve"),
        model_id=quantization_output_dir,
        # Empty path: deploy_model leaves FINETUNED_LORA_MODEL_PATH unset.
        finetuned_lora_model_path="",
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    model_quantized_vllm, endpoint_quantized_vllm = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-quantized-serve-vllm"),
        model_id=quantization_output_dir,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        # Tell vLLM how the weights were quantized: deploy_model_vllm turns
        # this into `--quantization=<method>` and picks the GPTQ serving image
        # when the method is "gptq".
        quantization_method=quantization_method,
    )

注意：部署成功后，模型权重将会动态下载。因此，在上述模型部署步骤成功之后、下一步之前，需要额外等待 10 到 40 分钟（取决于模型大小）。否则，在向端点发送请求时，可能会出现 `ServiceUnavailable: 503 502: Bad Gateway` 错误。

一旦部署成功，您可以使用文本提示向端点发送请求。

例如：

```
人类: 什么是汽车？
助手: 汽车，或称机动车，是一种与道路相连的人类交通系统，用于将人或货物从一地移到另一地。这个词还包括一系列车辆，包括摩托艇、火车和飞机。汽车通常有四个轮子，一个载客舱以及一个引擎或发动机。它们自19世纪初以来就存在，现在是最受欢迎的交通方式之一，用于日常通勤、购物和其他目的。
```

In [None]:
# To send the request to an already-existing endpoint instead of the one
# deployed in the previous cell, uncomment the lines below:
# - `endpoint_name = endpoint_quantized_vllm.name` picks up the name of the
#   endpoint `endpoint_quantized_vllm` created in the cell above.
# - `endpoint_name = "1234567890123456789"` loads an existing endpoint with
#   the ID 1234567890123456789 instead.

# endpoint_name = endpoint_quantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_quantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Per-request generation settings sent with the prompt.
# If you encounter an error like
# `ServiceUnavailable: 503 Took too long to respond when processing`,
# lower `max_tokens` (for example to 20) to shorten the generation.
car_prompt_request = dict(
    prompt="What is a car?",
    max_tokens=50,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)
instances = [car_prompt_request]
response = endpoint_quantized_vllm.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

评估使用PEFT LoRA微调的OpenLLaMA

本节展示如何使用EleutherAI的[语言模型评估工具（lm-evaluation-harness）](https://github.com/EleutherAI/lm-evaluation-harness)和Vertex CustomJob评估微调后的OpenLLaMA模型。

此示例使用数据集[TruthfulQA](https://arxiv.org/abs/2109.07958)。所有支持的任务都列在[此任务表](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)中。

In [None]:
eval_dataset = "truthfulqa_mc"  # @param {type:"string"}

# Worker pool spec.
# NOTE: these names were also assigned by the deployment cell; the values
# below replace them for the evaluation job.
# Sets 1 V100 to evaluate open_llama_3b and open_llama_7b.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets L4 to evaluate open_llama_3b and open_llama_7b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to evaluate open_llama_13b.
# machine_type = "n1-standard-8"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to evaluate open_llama_13b.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

replica_count = 1

# Setup evaluation job.
# The timestamped job name is also used as the output location under
# MODEL_BUCKET. The second path is the same location with the `gs://` prefix
# swapped for `/gcs/` (presumably the Cloud Storage FUSE mount seen inside the
# job container -- confirm against the Vertex AI docs).
job_name = create_name_with_datetime(prefix="openllama-peft-eval")
eval_output_dir = os.path.join(MODEL_BUCKET, job_name)
eval_output_dir_gcsfuse = eval_output_dir.replace("gs://", "/gcs/")

In [None]:
# Build the command line that runs the evaluation harness inside the job
# container. `use_accelerate=True` in the model args enables evaluation across
# multiple GPUs.
eval_model_args = ",".join(
    [
        f"pretrained={merged_model_output_dir_gcsfuse}",
        "use_accelerate=True",
        "device_map_option=auto",
    ]
)
# Each flag is kept next to its value; the pairs are flattened below into the
# flat argv list that the container spec expects.
eval_flags = [
    ("--model", "hf-causal-experimental"),
    ("--model_args", eval_model_args),
    ("--tasks", str(eval_dataset)),
    ("--output_path", str(eval_output_dir_gcsfuse)),
]
eval_command = ["python", "main.py"] + [
    token for flag_and_value in eval_flags for token in flag_and_value
]

### 提交评估 CustomJob

In [None]:
# Describe the evaluation worker pool, then submit it as a Vertex AI CustomJob.
eval_machine_spec = dict(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
eval_container_spec = dict(
    image_uri=EVAL_DOCKER_URI,
    # The whole harness invocation is supplied as the container command, so no
    # separate args are needed.
    command=eval_command,
    args=[],
)
worker_pool_specs = [
    dict(
        machine_spec=eval_machine_spec,
        replica_count=replica_count,
        disk_spec=dict(boot_disk_size_gb=500),
        container_spec=eval_container_spec,
    )
]

eval_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    base_output_dir=eval_output_dir,
)

eval_job.run()

print("Evaluation results were saved in:", eval_output_dir)

获取并打印评估结果

In [None]:
import json

from google.cloud import storage

# Fetch evaluation results.
# The bucket name and object path are parsed from `eval_output_dir` itself
# (`gs://<bucket>/<object path>`), so the lookup keeps working when BUCKET_URI
# carries a trailing slash or a path prefix after the bucket name.
storage_client = storage.Client()
BUCKET_NAME, _, RESULT_FILE_PATH = eval_output_dir[len("gs://") :].partition("/")
bucket = storage_client.get_bucket(BUCKET_NAME)
blob = bucket.blob(RESULT_FILE_PATH)
# `download_as_bytes` replaces the deprecated `download_as_string` alias;
# `json.loads` accepts the returned bytes directly.
raw_result = blob.download_as_bytes()

# Print evaluation results.
result = json.loads(raw_result)
result_formatted = json.dumps(result, indent=2)
print(f"Evaluation result:\n{result_formatted}")

清理资源

In [None]:
# Clean up the resources referenced by this notebook's job, endpoint and model
# variables.
# NOTE: every name below must exist in the kernel. If a section of the
# notebook was skipped, the corresponding line raises NameError and the
# deletions after it do not run -- comment out the lines that do not apply.

# Delete custom train and evaluation jobs.
train_job.delete()
eval_job.delete()
quantize_job.delete()

# Undeploy models and delete endpoints.
# `force=True` undeploys any models still attached before the endpoint is
# deleted.
endpoint_without_peft.delete(force=True)
endpoint_with_peft.delete(force=True)
endpoint_quantized_vllm.delete(force=True)

# Delete models.
# Runs after the endpoints above are gone, so the models are no longer deployed.
model_without_peft.delete()
model_with_peft.delete()
model_quantized_vllm.delete()