In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI - 使用LoRA进行Llama2微调并在TPUv5e上提供服务

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/tpuv5e_llama2_pytorch_finetuning_and_serving.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/tpuv5e_llama2_pytorch_finetuning_and_serving.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/training/tpuv5e_llama2_pytorch_finetuning_and_serving.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在Vertex AI Workbench中打开
    </a> 
  </td>
</table>

# 配额 - 确保在开始之前完成！

为了运行此示例，您需要获得以下TPUv5e的配额批准。您可以通过IAM和管理 > 配额 或联系您的Google帐户团队发出请求：

- `aiplatform.googleapis.com/custom_model_serving_tpu_v5e`（4-8个芯片；Llama2 7B至少需要4个芯片）
- `aiplatform.googleapis.com/custom_model_training_tpu_v5e`（至少16个芯片）

查看[TPU价格页](https://cloud.google.com/tpu/pricing)了解区域可用性和定价。

## 概述

本笔记本演示了如何使用[LoRA](https://huggingface.co/docs/peft/v0.9.0/en/package_reference/lora#peft.LoraConfig)对Llama2 7B模型进行微调，并在TPUv5e上提供服务。微调基于一个[Hugging Face示例](https://huggingface.co/google/gemma-7b/blob/main/examples/example_fsdp.py)，该示例使用[PyTorch XLA中的完全分片数据并行（FSDP）](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/)和[SPMD](https://pytorch.org/blog/pytorch-xla-spmd/)。点击链接了解更多信息。


微调通过[Vertex AI自定义训练作业](https://cloud.google.com/vertex-ai/docs/training/create-custom-job)进行。Vertex AI自定义训练作业允许对微调作业进行更高级别的定制和控制。本笔记本中的所有示例都使用参数高效的低秩适配方法（Low-Rank Adaptation，[LoRA](https://huggingface.co/docs/peft/en/package_reference/lora)）来降低训练和存储成本。

本笔记本使用Hex-LLM部署模型。Hex-LLM是由Google Cloud开发的、基于XLA构建的高效大型语言模型服务解决方案。


### 目标

- 使用Vertex AI自定义训练作业和Vertex Prediction端点微调和部署Llama2模型。
- 发送预测请求到您微调的Llama2模型。


### 成本

本教程使用Google Cloud的收费组件：

* Vertex AI
* Cloud Storage

了解[Vertex AI价格](https://cloud.google.com/vertex-ai/pricing)、[Cloud Storage价格](https://cloud.google.com/storage/pricing)以及使用[Pricing Calculator](https://cloud.google.com/products/calculator/)根据您的预计使用情况生成成本估算。

### 数据集

在这个例子中，您将使用Hugging Face提供的english_quotes数据集来微调模型。有关数据集的详细信息可以在此处找到：https://huggingface.co/datasets/Abirate/english_quotes

成本

本教程使用 Google Cloud 的计费组件：

Vertex AI（训练、预测、TPUv5e）、Cloud 存储

请了解[Vertex AI 定价](https://cloud.google.com/vertex-ai/pricing)，[Cloud 存储定价](https://cloud.google.com/storage/pricing)，[Cloud NL API 定价](https://cloud.google.com/natural-language/pricing)并使用[定价计算器](https://cloud.google.com/products/calculator/)根据您的预期使用情况生成成本估算。

## 安装

安装以下必需的软件包以执行此笔记本。

运行以下命令安装支持 TPUv5e 的最新 Google 云平台库。

In [None]:
import os

# (optional) update gcloud if needed
if os.getenv("IS_TESTING"):
    ! gcloud components update --quiet

! pip3 install --upgrade --quiet google-cloud-aiplatform

仅限Colab：取消注释下面的单元格以重新启动内核。

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## 在开始之前

### 设置您的 Google Cloud 项目

**无论您使用什么笔记本环境，都需要进行以下步骤。**

1. [选择或创建一个 Google Cloud 项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建账户时，您会获得 $300 的免费信用额度用于支付计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用 Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

4. 如果您在本地运行此笔记本，您需要安装 [Cloud SDK](https://cloud.google.com/sdk)。

5. [选择或创建一个 Cloud 存储存储桶](https://cloud.google.com/storage/docs/creating-buckets) 用于存储实验输出。

6. [创建一个服务帐号](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console)，并赋予 `Vertex AI User` 和 `Storage Object Admin` 角色，用于将微调模型部署到 Vertex AI 端点。

#### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 参考支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one for the gcloud commands in this notebook.
! gcloud config set project {PROJECT_ID}

#### 区域

您还可以更改 Vertex AI 使用的 `REGION` 变量。详细了解 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)。

TPUv5e 可在[此处列出的以下区域](https://cloud.google.com/tpu/pricing)中使用。

In [None]:
# Region used for the bucket, the Artifact Registry repository and the Vertex AI
# SDK/job client in the cells below.
REGION = "us-west1"  # @param {type: "string"}

### 验证您的Google Cloud帐户

根据您的Jupyter环境，您可能需要手动验证。请按照以下相关说明操作。

1. Vertex AI Workbench
* 无需操作，您已经通过身份验证。

2. 本地JupyterLab实例，取消注释并运行：

In [None]:
# ! gcloud auth login

3. Colab，取消注释并运行：

In [None]:
# from google.colab import auth
# auth.authenticate_user()

请参阅 [gsutil iam 示例](https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples)，了解如何为您的服务帐号授予 Cloud Storage 权限。

### 导入库

In [None]:
import os
from datetime import datetime, timedelta, timezone

from google.cloud import aiplatform

创建一个云存储桶
创建一个存储桶，用于存储中间产物，如数据集。

In [None]:
# Cloud Storage bucket URI (gs://...) used for the model files and job outputs.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

为分阶段、环境和模型构件设置文件夹路径。

In [None]:
# Staging location handed to aiplatform.init() later in the notebook.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# The service account email has the form:
# '<service-account-name>@<project-id>.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create a service account with the `Vertex AI User` and `Storage Object Admin` roles.
# This service account is used when deploying the fine-tuned model.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

仅当您的存储桶尚不存在时，才运行以下单元格来创建 Cloud Storage 存储桶。

In [None]:
# Create the bucket in REGION under PROJECT_ID (skip this cell if it already exists).
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### 访问Llama 2预训练和微调模型
Meta的原始模型被转换为Hugging Face格式，用于微调和在Vertex AI中提供服务。

接受模型协议以访问模型：

1. 转到Google Cloud控制台中的Vertex AI > Model Garden页面
2. 搜索Llama 2
3. 查看在模型卡页面弹出的协议
4. 接受Llama 2的协议
5. 接受协议后，模型卡的文档选项卡上会显示包含Llama 2预训练和微调模型的Cloud Storage存储桶链接
6. 将Cloud Storage存储桶链接粘贴到下面，并将其分配给VERTEX_AI_MODEL_GARDEN_LLAMA2

In [None]:
# GCS path of the Llama 2 artifacts, taken from the documentation tab of the
# Llama 2 model card in Vertex AI Model Garden.
VERTEX_AI_MODEL_GARDEN_LLAMA2 = "<Bucket path from documentation tab of Llama 2 in Vertex Model Garden>"  # Shared once you accept the Llama 2 agreement in Vertex AI Model Garden.
# Folder name of the model inside the Model Garden bucket (source of the copy below).
VERTEX_MODEL_ID = "llama2-7b-hf"
# Hugging Face style model ID; used as the destination folder inside your bucket.
HF_MODEL_ID = "meta-llama/Llama-2-7b-hf"

In [None]:
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA2
), "Please click the agreement of Llama 2 in Vertex AI Model Garden, and get the GCS path of Llama 2 model artifacts."
print(
    "Copy Llama 2 model artifacts from",
    VERTEX_AI_MODEL_GARDEN_LLAMA2,
    "to ",
    f"{BUCKET_URI}/{HF_MODEL_ID}",
)

# Copy model files to your bucket
! gcloud storage cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA2/$VERTEX_MODEL_ID/* $BUCKET_URI/$HF_MODEL_ID

### 初始化 Python 的 Vertex AI SDK

为您的项目初始化 Python 的 Vertex AI SDK。

In [None]:
# Configure the SDK with the project, region and staging bucket defined above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

### 创建Artifact Registry代码库并设置自定义Docker镜像URI

In [None]:
# Name of the Artifact Registry Docker repository that will hold the training image.
REPOSITORY = "tpuv5e-training-repository-unique"

In [None]:
# Pieces of the training image URI: image name, regional registry host, and tag.
image_name_train = "llama2-7b-hf-lora-tuning-tpuv5e"
hostname = f"{REGION}-docker.pkg.dev"
tag = "latest"

In [None]:
# Register gcloud as a Docker credential helper
# Register gcloud as a Docker credential helper for the regional registry host,
# so the docker push later in the notebook can authenticate.
!gcloud auth configure-docker $REGION-docker.pkg.dev --quiet

In [None]:
# Run once to create the Docker repository; skip this cell to reuse an existing one.
!gcloud artifacts repositories create $REPOSITORY --repository-format=docker \
--location=$REGION --description="Vertex TPUv5e training repository"

In [None]:
# Full URI of the training image: HOST/PROJECT/REPOSITORY/IMAGE:TAG
PYTORCH_TRAIN_DOCKER_URI = (
    f"{hostname}/{PROJECT_ID}/{REPOSITORY}/{image_name_train}:{tag}"
)

### 定义常用函数

In [None]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Return ``prefix`` followed by a ``_YYYYMMDD_HHMMSS`` timestamp.

    The timestamp is taken from ``datetime.now()`` at call time, which gives
    training and deployment jobs in Vertex AI distinct display names.
    """
    timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp

构建Docker容器文件

创建trainer目录

In [None]:
import os

# Create the Docker build-context directory. exist_ok=True makes the cell safe
# to re-run and avoids the separate check-then-create step.
os.makedirs("trainer", exist_ok=True)

创建用于自定义容器的Dockerfile。这将安装Hugging Face transformers、datasets、trl 和 peft 用于微调。

In [None]:
%%writefile trainer/Dockerfile
# This Dockerfile fine-tunes the Llama 2 model using LoRA with PyTorch XLA.
# Nightly TPU VM docker image
FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_20240324

# Avoid interactive prompts from apt during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install basic libs
RUN apt-get update && apt-get -y upgrade && apt-get install -y --no-install-recommends \
        cmake \
        curl \
        wget \
        sudo \
        gnupg \
        libsm6 \
        libxext6 \
        libxrender-dev \
        lsb-release \
        ca-certificates \
        build-essential \
        git \
        libgl1

# Copy Apache license.
RUN wget https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/LICENSE

# Install required libs (pinned versions for the fine-tuning stack).
RUN pip install --upgrade pip
RUN pip install transformers==4.38.2 -U
RUN pip install datasets==2.18.0
RUN pip install trl==0.8.1 peft==0.10.0
RUN pip install accelerate==0.28.0
RUN pip install --upgrade google-cloud-storage

# Copy other licenses. Fetch from raw.githubusercontent.com so the plain license
# text is saved rather than the GitHub HTML page.
RUN wget -O MIT_LICENSE https://raw.githubusercontent.com/pytest-dev/pytest/main/LICENSE
RUN wget -O BSD_LICENSE https://raw.githubusercontent.com/pytorch/xla/master/LICENSE
RUN wget -O BSD-3_LICENSE https://raw.githubusercontent.com/pytorch/pytorch/main/LICENSE

# Copy the installed libtpu.so into /lib. An absolute search path is used so the
# step does not depend on the base image's working directory.
RUN find /usr/local/lib -name 'libtpu.so' -exec cp {} /lib \;

WORKDIR /
COPY train.py train.py
ENV PYTHONPATH=./

ENTRYPOINT ["python", "train.py"]

添加`__init__.py`文件

In [None]:
!touch trainer/__init__.py

添加train.py文件

这段代码来自于LoRA分布式微调代码，可以在这个示例中找到：https://ai.google.dev/gemma/docs/distributed_tuning

本笔记本使用Hugging Face的english_quotes数据集对Llama2模型进行微调。此外还添加了额外的逻辑来处理TPUv5e所需的TPU拓扑设置：https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config

In [None]:
%%writefile trainer/train.py
import os, sys
import argparse

import torch
import torch_xla
import torch_xla.core.xla_model as xm

from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer

from google.cloud import storage

# Switch the PyTorch/XLA runtime to SPMD mode. This runs at import time, i.e.
# before main() builds the model and the trainer.
import torch_xla.runtime as xr
xr.use_spmd()

# Command-line interface of the training container. The values are supplied
# through the `args` list of the Vertex AI custom job's container spec.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tpu_topology",
    help="Topology to use for the TPUv5e (2x2, 2x4, 4x4)",
    default="4x4",
    type=str
)
parser.add_argument(
    "--model_name",
    help="Llama2 model name (meta-llama/Llama-2-7b-hf, meta-llama/Llama-2-13b-hf)",
    default="meta-llama/Llama-2-7b-hf",
    type=str
)
parser.add_argument(
    "--bucket_name",
    help="The name of the bucket you copied the Llama2 model files to",
    required=True,
    type=str
)
parser.add_argument(
    "--output_folder",
    type=str,
    required=True,
    help="Output folder name",
)
parser.add_argument(
    "--checkpoint_directory",
    type=str,
    default="output_ckpt",
    help="Checkpoint Directory name",
)
parser.add_argument(
    "--epochs",
    type=int,
    default=10,
    help="Number of epochs to train",
)
# The help text here previously repeated the checkpoint help; this argument is
# the blob-name prefix under which the merged model and adapter are uploaded.
parser.add_argument(
    "--merged_model_folder",
    type=str,
    default="llama2-7b-hf/modelfiles",
    help="Folder inside the bucket to upload the merged model and adapter files to",
)
# Parsed at import time; main() reads the result from this module-level `args`.
args = parser.parse_args()

# URI scheme prefix that identifies a Cloud Storage path.
GCS_PREFIX = "gs://"


def is_gcs_path(input_path: str) -> bool:
    """Tell whether ``input_path`` begins with the ``gs://`` prefix."""
    return input_path.startswith(GCS_PREFIX)

def download_gcs_dir(gcs_dir: str, local_dir: str):
    """Download files in a GCS directory to a local directory.

    For example:
    download_gcs_dir(gs://bucket/foo, /tmp/bar)
    gs://bucket/foo/a -> /tmp/bar/a
    gs://bucket/foo/b/c -> /tmp/bar/b/c

    Only objects inside the directory are copied: a sibling such as
    gs://bucket/foobar/x is not treated as part of gs://bucket/foo.

    Arguments:
    gcs_dir: A string of directory path on GCS.
    local_dir: A string of local directory path.

    Raises:
    ValueError: If gcs_dir does not start with gs://.
    """
    if not is_gcs_path(gcs_dir):
        raise ValueError(f"{gcs_dir} is not a GCS path starting with gs://.")

    bucket_name = gcs_dir.split("/")[2]
    prefix = gcs_dir[len(GCS_PREFIX + bucket_name) :].strip("/")
    # list_blobs() matches on a plain string prefix, so "foo" would also match
    # "foobar/x". Terminate the prefix with "/" to stay inside the directory.
    # An empty prefix (the bucket root) is left untouched.
    if prefix:
        prefix += "/"

    client = storage.Client()
    for blob in client.list_blobs(bucket_name, prefix=prefix):
        # Names ending in "/" are directory placeholder objects, not files.
        if blob.name.endswith("/"):
            continue
        file_path = blob.name[len(prefix) :].strip("/")
        local_file_path = os.path.join(local_dir, file_path)
        # The parent is empty when local_dir is "" and the file sits at the top
        # level; os.makedirs("") would raise, so only create real parents.
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        blob.download_to_filename(local_file_path)
        print (f'download of {local_file_path} complete')
    print (f'Show all files in directory {os.listdir(local_dir)}')

def upload_directory_with_transfer_manager(bucket_name, source_directory, blob_name_prefix, workers=8):
    """Upload every file under ``source_directory``, subdirectories included.

    Blob names are built from ``blob_name_prefix`` plus each file's path
    relative to ``source_directory``. For example, file1 and file2 uploaded to
    my-bucket with blob_name_prefix=my-folder/a/ end up at
    gs://my-bucket/my-folder/a/file1 and gs://my-bucket/my-folder/a/file2.

    Args:
        bucket_name: Name of the destination bucket.
        source_directory: Local directory to upload; an empty string means the
            current working directory.
        blob_name_prefix: Prefix prepended to every uploaded blob name.
        workers: Forwarded as ``max_workers`` to
            ``transfer_manager.upload_many_from_filenames``.

    Per-file failures are reported with ``print`` only; nothing is raised.
    """
    from pathlib import Path

    from google.cloud.storage import Client, transfer_manager

    bucket = Client().bucket(bucket_name)

    # Walk the tree once, keep regular files only, and express each one as a
    # string path relative to the source directory.
    string_paths = [
        str(entry.relative_to(source_directory))
        for entry in Path(source_directory).rglob("*")
        if entry.is_file()
    ]

    print("Found {} files.".format(len(string_paths)))

    # Start the upload.
    print (f"source directory {source_directory}")
    results = transfer_manager.upload_many_from_filenames(
        bucket,
        string_paths,
        blob_name_prefix=blob_name_prefix,
        source_directory=source_directory,
        max_workers=workers,
    )

    # `results` lines up with `string_paths`: each entry is either None or the
    # exception raised for that file.
    for name, result in zip(string_paths, results):
        if not isinstance(result, Exception):
            print("Uploaded {} to {}/{}.".format(name, bucket.name, blob_name_prefix))
            continue
        print("Failed to upload {} due to exception: {}".format(name, result))

    
def main():
    """Fine-tune the model with LoRA, merge the adapter, and upload the results.

    Steps, in order:
      1. Parse ``args.tpu_topology`` ("AxB") and derive the batch size.
      2. Download the model files from gs://<bucket_name>/<model_name> into
         /tmp/modelfiles.
      3. Train with ``SFTTrainer`` on the "Abirate/english_quotes" dataset.
      4. Save the trained adapter, reload the base model, merge the adapter
         into it, and save the merged model together with the tokenizer.
      5. Upload the merged model and the adapter to the bucket under
         <merged_model_folder>/<model id>/<process index>/.
      6. Exit the process with status 0.

    Reads its configuration from the module-level ``args``.
    """
    # Topology comes in as "<x>x<y>", e.g. "4x4".
    x = args.tpu_topology.split("x")
    tpu_topology_x = int(x[0])
    tpu_topology_y = int(x[1])
    print (f'TPU topology is ({tpu_topology_x}, {tpu_topology_y})')
    print (f'Model name is {args.model_name}')

    # Set batch size to 8 for each chip
    BATCH_SIZE = 8 * tpu_topology_x * tpu_topology_y
    # For 16 chips or more, fix BATCH_SIZE at 128, since we run out of samples.
    # (8 * 16 is already 128, so this only changes the value above 16 chips.)
    if (tpu_topology_x * tpu_topology_y) >=16:
        BATCH_SIZE = 128

    # Set download directory to a temporary folder
    DL_DIR="/tmp/modelfiles"
    if not os.path.exists(DL_DIR):
        os.makedirs(DL_DIR)

    print ('Downloading data to temporary folder')
    download_gcs_dir (f"gs://{args.bucket_name}/{args.model_name}", DL_DIR)

    # Create output folders
    if not os.path.exists(f"/tmp/{args.output_folder}"):
        os.makedirs(f"/tmp/{args.output_folder}")
    if not os.path.exists(f"/tmp/{args.checkpoint_directory}"):
        os.makedirs(f"/tmp/{args.checkpoint_directory}")

    # NOTE(review): `device` is not referenced again in this function;
    # presumably the call is kept to initialise the XLA device — confirm.
    device = xm.xla_device()

    # Set tokenizer parallelism to false to avoid warnings
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(DL_DIR)
    print ('Loaded tokenizer')
    base_model = AutoModelForCausalLM.from_pretrained(DL_DIR, torch_dtype=torch.bfloat16)
    print ('Loaded base model')

    # Set LoRA configuration
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["k_proj", "v_proj"],
    )

    # Required when using Llama2, as the tokenizer has no padding
    tokenizer.pad_token = tokenizer.eos_token

    # Load the dataset and format it for training.
    data = load_dataset("Abirate/english_quotes", split="train")
    max_seq_length = 512
    print ('Loaded dataset')

    # Set up the FSDP config. To enable FSDP via SPMD, set xla_fsdp_v2 to True.
    fsdp_config = {"fsdp_transformer_layer_cls_to_wrap": [
            "LlamaDecoderLayer"
        ],
        "xla": True,
        "xla_fsdp_v2": True,
        "xla_fsdp_grad_ckpt": True}

    # Both directories live under /tmp inside the container.
    OUTPUT_DIR=f"/tmp/{args.output_folder}"
    CHECKPOINT_DIR=f"/tmp/{args.checkpoint_directory}"

    # Finally, set up the trainer and train the model.
    trainer = SFTTrainer(
        model=base_model,
        train_dataset=data,
        args=TrainingArguments(
            per_device_train_batch_size=BATCH_SIZE,  # This is actually the global batch size for SPMD.
            num_train_epochs=args.epochs,
            max_steps=-1,
            output_dir=OUTPUT_DIR,
            optim="adafactor",
            logging_steps=1,
            dataloader_drop_last = True,  # Required for SPMD.
            fsdp="full_shard",
            fsdp_config=fsdp_config,
        ),
        peft_config=lora_config,
        dataset_text_field="quote",
        max_seq_length=max_seq_length,
        packing=True,
    )

    # train
    trainer.train()

    # Local save locations for the adapter and the merged model.
    adapter_model_id = "adapter_model"
    adapter_path = f"{CHECKPOINT_DIR}/{adapter_model_id}"
    merged_model_id = "merged_model"
    merged_model_path = f"{CHECKPOINT_DIR}/{merged_model_id}"

    # Move the trained model to CPU and save it to adapter_path; it is loaded
    # back as a PEFT adapter just below.
    trainer.model.to('cpu').save_pretrained(adapter_path)

    # Reload a fresh base model, apply the saved adapter, merge it into the
    # base weights, then save the merged model and the tokenizer side by side.
    base_model = AutoModelForCausalLM.from_pretrained(DL_DIR, torch_dtype=torch.bfloat16)
    peft_model = PeftModel.from_pretrained(base_model, adapter_path)
    merged_model = peft_model.merge_and_unload()
    merged_model.save_pretrained(merged_model_path,safe_serialization=False)
    tokenizer.save_pretrained(merged_model_path)

    # Copy merged files to GCS folder. The process index is part of the prefix,
    # so each process uploads into its own sub-folder.
    OUTPUT_PREFIX=f"{args.merged_model_folder}/{merged_model_id}/{xr.process_index()}/"
    upload_directory_with_transfer_manager(bucket_name=args.bucket_name,source_directory=merged_model_path,
                                       blob_name_prefix=OUTPUT_PREFIX)
    print ('Uploaded merged model files')

    # copy adapter files to GCS folder
    OUTPUT_PREFIX=f"{args.merged_model_folder}/{adapter_model_id}/{xr.process_index()}/"
    upload_directory_with_transfer_manager(bucket_name=args.bucket_name,source_directory=adapter_path,
                                       blob_name_prefix=OUTPUT_PREFIX)
    print ('Uploaded adapter model files')

    # Terminate explicitly with a success status once everything is uploaded.
    print ('Exiting job')
    sys.exit(0)

# Run the training only when the file is executed as a script.
if __name__ == "__main__":
    main()

使用Vertex AI自定义训练作业进行微调

本节演示如何使用PEFT LoRA在Vertex AI自定义训练作业上微调和部署Llama2模型。 LoRA（Low-Rank Adaptation）是PEFT（参数高效微调）的一种方法，其中预训练模型的权重被冻结，并在微调过程中训练表示模型权重变化的秩分解矩阵。 有关LoRA的更多信息，请阅读以下出版物：[Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. Lora: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).

### 将docker配置为普通用户的运行权限

In [None]:
# Add the current user to the `docker` group.
!sudo usermod -a -G docker ${USER}

将目录切换到trainer目录以构建docker容器。

In [None]:
# Enter the build context that holds the Dockerfile and train.py.
%cd trainer

构建定制的Docker容器并推送到存储库。

In [None]:
# Build the training image from the current directory and tag it with the registry URI.
!docker build -t $PYTORCH_TRAIN_DOCKER_URI -f Dockerfile .

In [None]:
# Push the tagged image so the Vertex AI custom job can reference it by URI.
!docker push $PYTORCH_TRAIN_DOCKER_URI

返回上一级目录（笔记本所在的目录）

In [None]:
# Leave the trainer directory and go back to its parent.
%cd ..

设置GCS文件夹位置和作业配置设置

In [None]:
# Values forwarded as command-line arguments to train.py inside the container.
# The merged model (base model + fine-tuned LoRA adapter) and the adapter are
# uploaded to BUCKET_NAME under MERGED_MODEL_FOLDER.
BUCKET_NAME = BUCKET_URI.replace("gs://", "")  # bucket name without the scheme
OUTPUT_DIR_NAME = "output"
CHECKPOINT_DIR_NAME = "output_chk"
NUM_EPOCHS = 200
MERGED_MODEL_FOLDER = "llama2-7b-hf/modelfiles"

# Pick the machine type and topology to match the number of chips being used.
# Topologies of 2x2, 2x4, 4x4 = 4, 8, 16 chip settings and use quota from aiplatform.googleapis.com/custom_model_training_tpu_v5e
MACHINE_TYPE = "ct5lp-hightpu-4t"
TPU_TOPOLOGY = "4x4"

DISPLAY_NAME_PREFIX = f"llama2-7b-lora-train-{TPU_TOPOLOGY}"
# Custom job definition: a single worker pool running the training image.
tpuv5e_llama2_peft_job = {
    "display_name": get_job_name_with_datetime(DISPLAY_NAME_PREFIX),
    "job_spec": {
        "worker_pool_specs": [
            {
                "machine_spec": {
                    "machine_type": MACHINE_TYPE,
                    "tpu_topology": TPU_TOPOLOGY,
                },
                "replica_count": 1,
                "container_spec": {
                    "image_uri": PYTORCH_TRAIN_DOCKER_URI,
                    # Each entry maps to an argparse option declared in train.py.
                    "args": [
                        f"--tpu_topology={TPU_TOPOLOGY}",
                        f"--model_name={HF_MODEL_ID}",
                        f"--bucket_name={BUCKET_NAME}",
                        f"--output_folder={OUTPUT_DIR_NAME}",
                        f"--checkpoint_directory={CHECKPOINT_DIR_NAME}",
                        f"--epochs={NUM_EPOCHS}",
                        f"--merged_model_folder={MERGED_MODEL_FOLDER}",
                    ],
                },
            },
        ],
    },
}

# Display the job definition for review before submitting it.
tpuv5e_llama2_peft_job

创建作业客户端并运行作业

In [None]:
# Job service client pointed at the regional Vertex AI API endpoint.
job_client = aiplatform.gapic.JobServiceClient(
    client_options=dict(api_endpoint=f"{REGION}-aiplatform.googleapis.com")
)

In [None]:
# Submit the custom job. The response's `name` is the job resource name used by
# the status, logging and clean-up cells further down.
create_tpuv5e_llama2_peft_job_response = job_client.create_custom_job(
    parent="projects/{project}/locations/{location}".format(
        project=PROJECT_ID, location=REGION
    ),
    custom_job=tpuv5e_llama2_peft_job,
)
print(create_tpuv5e_llama2_peft_job_response)

检查作业进度
这可能需要20-60分钟或更长时间，具体取决于模型大小。多次运行此单元格以检查进度。

In [None]:
# Fetch the job's current description; re-run this cell to refresh it.
get_tpuv5e_llama2_peft_job_response = job_client.get_custom_job(
    name=create_tpuv5e_llama2_peft_job_response.name
)
get_tpuv5e_llama2_peft_job_response

#### 点击此单元格输出的控制台日志网址以查看您的日志

In [None]:
# The job ID is the last path segment of the job resource name.
job_id = create_tpuv5e_llama2_peft_job_response.name.rsplit("/", 1)[-1]

# Build the query window in UTC. The URL labels both timestamps with a "Z"
# (UTC) suffix, so naive local times would shift the window on any machine
# whose clock is not set to UTC.
now_utc = datetime.now(timezone.utc)
STARTDATE = (now_utc - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%S.%f")
ENDDATE = (now_utc + timedelta(days=0.1)).strftime("%Y-%m-%dT%H:%M:%S.%f")
print(
    f"https://console.cloud.google.com/logs/query;query=resource.labels.job_id=%22{job_id}%22;cursorTimestamp={ENDDATE}Z;startTime={STARTDATE}Z;endTime={ENDDATE}Z?project={PROJECT_ID}"
)

等待训练作业完成

In [None]:
import time

from google.cloud.aiplatform import gapic as aip

# Terminal states other than success. Checking only FAILED would leave the loop
# polling forever when the job is cancelled or expires.
UNSUCCESSFUL_TERMINAL_STATES = (
    aip.JobState.JOB_STATE_FAILED,
    aip.JobState.JOB_STATE_CANCELLED,
    aip.JobState.JOB_STATE_EXPIRED,
)

# Poll the job once a minute until it reaches a terminal state.
while True:
    response = job_client.get_custom_job(
        name=create_tpuv5e_llama2_peft_job_response.name
    )
    if response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print("Training has completed")
        break
    print(f"Training is not complete and is in state {response.state.name}")
    if response.state in UNSUCCESSFUL_TERMINAL_STATES:
        # RuntimeError is a subclass of Exception, so existing handlers still match.
        raise RuntimeError(
            f"Training Job Failed: job ended in state {response.state.name}"
        )
    time.sleep(60)

### 部署经过微调的模型
本部分将上传模型至模型注册表，并使用Hex-LLM部署模型，这是一个由谷歌云开发的基于XLA构建的高效大型语言模型服务解决方案

模型部署步骤将需要15-20分钟完成。

In [None]:
# Serving container image for Hex-LLM.
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:20240328_RC01"

# GCS folder path where the merged model files were saved in your bucket.
# MERGED_MODEL_FOLDER="llama2-7b-hf/modelfiles" was set during fine-tuning; the
# trailing "0" is the sub-folder written by the training process with index 0.
MERGED_MODEL_PATH = f"{MERGED_MODEL_FOLDER}/merged_model/0"
GCS_MODEL_PATH = f"{BUCKET_URI}/{MERGED_MODEL_PATH}"

DISPLAY_NAME_PREFIX = "llama2-7b-lora-deploy"  # @param {type:"string"}
JOB_NAME = get_job_name_with_datetime(DISPLAY_NAME_PREFIX)
# Display the resolved model path for verification.
GCS_MODEL_PATH

#### 检查您的GCS目录中的模型文件

您的输出应该显示一个类似以下的文件列表
```
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/config.json
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/generation_config.json
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/pytorch_model-00001-of-00003.bin
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/pytorch_model-00002-of-00003.bin
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/pytorch_model-00003-of-00003.bin
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/pytorch_model.bin.index.json
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/special_tokens_map.json
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/tokenizer.json
gs://<YOUR-BUCKET>/llama2-7b-hf/modelfiles/merged_model/0/tokenizer_config.json
```

In [None]:
# List the objects under the merged-model path to confirm the upload.
!gsutil ls $GCS_MODEL_PATH

定义用于部署模型的函数

In [None]:
from typing import Tuple


def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "ct5lp-hightpu-4t",
    max_num_batched_tokens: int = 11264,  # 11264
    tokens_pad_multiple: int = 1024,
    seqs_pad_multiple: int = 32,
    sync: bool = True,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Upload a model served by Hex-LLM and deploy it to a new Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            ``<model_name>-endpoint``.
        model_id: Value passed to the server's ``--model`` flag.
        service_account: Service account used for the deployment.
        machine_type: Machine type to deploy on. The character before the
            trailing "t" is read as the number of TPU chips.
        max_num_batched_tokens: Value of ``--max_num_batched_tokens``.
        tokens_pad_multiple: Value of ``--tokens_pad_multiple``.
        seqs_pad_multiple: Value of ``--seqs_pad_multiple``.
        sync: Forwarded to ``model.deploy``.

    Returns:
        The uploaded model and the endpoint it is being deployed to.
    """
    # The endpoint is created first, before any argument is derived.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # "ct5lp-hightpu-4t"[-2] == "4": the chip count drives tensor parallelism.
    tensor_parallel_size = int(machine_type[-2])

    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        "--log_level=INFO",
        f"--model={model_id}",
        "--load_format=pt",  # Note: Using Pytorch bin format for weights
        f"--tensor_parallel_size={tensor_parallel_size}",
        "--num_nodes=1",
        "--use_ray",
        "--batch_mode=continuous",
        f"--max_num_batched_tokens={max_num_batched_tokens}",
        f"--tokens_pad_multiple={tokens_pad_multiple}",
        f"--seqs_pad_multiple={seqs_pad_multiple}",
    ]

    container_env = dict(
        PJRT_DEVICE="TPU",
        RAY_DEDUP_LOGS="0",
        RAY_USAGE_STATS_ENABLED="0",
    )

    # Register the model together with its serving container settings.
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
    )

    # Deploy the registered model onto the endpoint created above.
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        deploy_request_timeout=1800,
        service_account=service_account,
        sync=sync,
    )
    return model, endpoint

部署模型到 Vertex
`deploy_model_hexllm` 函数将返回一个指向已添加到 Vertex AI 模型注册表的模型的引用，以及一个将部署模型的新端点。

In [None]:
print("Using model from: ", GCS_MODEL_PATH)
# sync=False: the deployment is not waited for here; a later cell calls
# endpoint.wait().
model, endpoint = deploy_model_hexllm(
    model_name=JOB_NAME,
    model_id=GCS_MODEL_PATH,
    service_account=SERVICE_ACCOUNT,
    sync=False,
)
print("endpoint_name:", endpoint.name)

部署模型后，检查日志。

In [None]:
# The endpoint ID is the last path segment of the endpoint name.
ENDPOINT_ID = endpoint.name.rsplit("/", 1)[-1]

# Build the query window in UTC. The URL labels both timestamps with a "Z"
# (UTC) suffix, so naive local times would shift the window on any machine
# whose clock is not set to UTC.
now_utc = datetime.now(timezone.utc)
STARTDATE = (now_utc - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%S.%f")
ENDDATE = (now_utc + timedelta(days=0.1)).strftime("%Y-%m-%dT%H:%M:%S.%f")
print(
    f"https://console.cloud.google.com/logs/query;query=resource.type%3D%22aiplatform.googleapis.com%2FEndpoint%22%20resource.labels.endpoint_id%3D%22{ENDPOINT_ID}%22%20resource.labels.location%3D%22{REGION}%22;startTime={STARTDATE}Z;endTime={ENDDATE}Z?project={PROJECT_ID}"
)

等待端点部署完成。

In [None]:
# Wait for the deployment that was started with sync=False.
endpoint.wait()

In [None]:
# (optional) When IS_TESTING is set, wait 15 minutes (900 s) while the model is
# downloaded and set up. `time` is imported by the training wait cell earlier.
if os.getenv("IS_TESTING"):
    time.sleep(900)

注意：整个部署过程可能需要30-40分钟甚至更长时间。部署步骤成功后（大约15-20分钟），微调模型会从训练所用的GCS存储桶中下载。因此，在模型部署步骤成功之后、运行下一步之前，还需要额外等待约15-20分钟（取决于模型大小）。否则，向端点发送请求时可能会看到`ServiceUnavailable: 503 502:Bad Gateway`错误。

### 一旦部署准备就绪，请发送预测请求

部署成功后，您可以向端点发送带有文本提示的请求。第一个请求需要一两分钟来完成模型预热。

示例：

```
Prompt: 提供一个不超过50个字符的90年代三部最好笑的喜剧电影的列表
Response:  1) 铁甲侠 2) 史酷比 3) 贝多芬的要求
```

In [None]:
PROMPT = (
    "Provide a list of the 3 best comedy movies in the 90s in 50 characters or less"
)

# One request instance: the prompt plus the generation parameters.
# NOTE(review): top_k is given as a float (1.0); confirm the serving container
# accepts that rather than an integer.
instances = [
    {
        "prompt": PROMPT,
        "max_tokens": 80,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 1.0,
    },
]

# Send the request to the deployed endpoint and print each prediction.
response = endpoint.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

清理

要清理本项目中使用的所有谷歌云资源，您可以删除用于教程的[谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除本教程中创建的各个资源。

In [None]:
# Delete the train job.
job_client.delete_custom_job(name=create_tpuv5e_llama2_peft_job_response.name)

# Undeploy model and delete endpoint.
endpoint.delete(force=True)

# Delete models.
model.delete()

import os

# Delete Cloud Storage objects that were created.
# The bucket is only removed when delete_bucket is set to True or IS_TESTING is
# set; the default leaves it in place.
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI