In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Vicuna

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_vicuna.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_vicuna.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_vicuna.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
在Vertex AI Workbench中打开
    </a>（建议使用Python-3 GPU笔记本）
  </td>
</table>

## 概述

本笔记本演示了如何使用预先构建的Vicuna进行本地推断，并使用[vLLM](https://github.com/vllm-project/vllm)部署预先构建的Vicuna。

### 目标

- 使用预构建的Vicuna进行本地推理
- 使用[vLLM](https://github.com/vllm-project/vllm)部署预构建的Vicuna，以提高服务吞吐量

| 模型 |
| :- |
| [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) |
| [lmsys/vicuna-7b-v1.5-16k](https://huggingface.co/lmsys/vicuna-7b-v1.5-16k) |
| [lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) |
| [lmsys/vicuna-13b-v1.5-16k](https://huggingface.co/lmsys/vicuna-13b-v1.5-16k) |

### 成本

本教程使用Google Cloud的计费组件：

* Vertex AI
* 云存储

了解有关[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[云存储定价](https://cloud.google.com/storage/pricing)的信息，并使用[价格计算器](https://cloud.google.com/products/calculator/)根据您的预期使用量生成费用估计。

## 在开始之前

**注意**: Jupyter使用以`!`开头的行作为shell命令，并将以`$`开头的Python变量插入这些命令中。

使用Vicuna进行本地推理需要GPU。

### 仅限Colab
如果您使用的是Colab，请运行以下命令；如果您使用的是Workbench，请跳过此部分。

In [None]:
! pip3 install transformers==4.31.0
! pip3 install sentencepiece==0.1.99
! pip3 install accelerate==0.21.0

import sys

if "google.colab" in sys.modules:
    ! pip3 install --upgrade google-cloud-aiplatform
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

### 设置 Google 云项目

1. [选择或创建一个Google云项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建一个账户时，您将获得300美元的免费信用额用于您的计算/存储成本。

2. [确保为您的项目启用了计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用Vertex AI API和Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component)。

4. [创建一个云存储存储桶](https://cloud.google.com/storage/docs/creating-buckets) 用于存储实验输出。

5. [创建一个服务账号](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) 并为其分配 `Vertex AI User` 和 `Storage Object Admin` 角色，以部署微调模型到 Vertex AI 端点。

为实验环境设置以下变量。指定的云存储桶（`BUCKET_URI`）应位于指定的区域（`REGION`）内。请注意，多区域存储桶（例如"us"）不会被视为与该多区域范围内的某个单一区域（例如"us-central1"）相匹配。

In [None]:
# Google Cloud project ID used for the gcloud configuration and Vertex AI.
PROJECT_ID = ""  # @param {type:"string"}

# The region you want to launch jobs in, e.g. "us-central1".
REGION = ""  # @param {type:"string"}

# The Cloud Storage bucket for storing experiment outputs.
# Must start with the gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

# Point the gcloud CLI at the selected project.
! gcloud config set project $PROJECT_ID

import os

# Staging location under BUCKET_URI; passed to aiplatform.init() as staging_bucket.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# The service account used for deploying the model to a Vertex AI endpoint.
# Its email address has the form:
#   <service-account-name>@<project-id>.iam.gserviceaccount.com
# To create one with the `Vertex AI User` and `Storage Object Admin` roles, see
# https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
SERVICE_ACCOUNT = ""  # @param {type:"string"}

### 初始化 Vertex AI API

In [None]:
from google.cloud import aiplatform

# Configure the Vertex AI SDK defaults (project, region, staging bucket) that
# the rest of the notebook relies on.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### 定义常量

In [None]:
# The pre-built vLLM serving docker image. deploy_model_vllm() uses it as the
# serving container image when uploading the model; the tag pins one build.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20231002_0916_RC00"

### 定义常见函数

In [None]:
from datetime import datetime

from google.cloud import aiplatform


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current date and time to `prefix`.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by a `_YYYYMMDD_HHMMSS` timestamp taken from
        `datetime.now()`.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    max_model_len: int = 4000,
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model to Vertex AI and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value passed to the vLLM server through `--model`.
        service_account: Service account the deployment runs as.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator attached to each serving node.
        accelerator_count: Number of accelerators; also used as vLLM's
            `--tensor-parallel-size`.
        max_model_len: Value passed to the vLLM server through
            `--max-model-len`.

    Returns:
        A `(model, endpoint)` tuple: the uploaded model and the endpoint it
        was deployed to.
    """
    # The endpoint is created before the model is uploaded, as in the original flow.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Command-line flags for the vLLM API server, in their original order.
    container_args = ["--host=0.0.0.0", "--port=7080"]
    container_args.append(f"--model={model_id}")
    container_args.append(f"--tensor-parallel-size={accelerator_count}")
    container_args.extend(
        [
            "--swap-space=16",
            "--gpu-memory-utilization=0.90",
            "--disable-log-stats",
        ]
    )
    container_args.append(f"--max-model-len={max_model_len}")

    # Environment variables set inside the serving container.
    container_env = {}
    container_env["MODEL_ID"] = "lmsys/vicuna"
    container_env["DEPLOY_SOURCE"] = "notebook"

    # Register the model with the pre-built vLLM image; requests are served on
    # port 7080 with /generate for predictions and /ping for health checks.
    upload_kwargs = dict(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=container_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    model = aiplatform.Model.upload(**upload_kwargs)

    # Deploy onto the requested hardware under the given service account.
    deploy_kwargs = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_kwargs)

    return model, endpoint

## 使用预构建的Vicuna在本地进行推理

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model ID of the checkpoint used for local inference.
prebuilt_model_id = "lmsys/vicuna-7b-v1.5"

# Load the fast tokenizer for the checkpoint.
# NOTE(review): trust_remote_code=True lets the model repository run its own
# code during loading — confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    prebuilt_model_id, use_fast=True, trust_remote_code=True
)

# Load the model weights directly onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    prebuilt_model_id, low_cpu_mem_usage=True, device_map="cuda"
)

prompt = "Q: What is the largest animal?\nA:"

# Tokenize the prompt and move the token IDs to the same device as the model.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate up to 32 new tokens and print the decoded text of the first sequence.
generation_output = model.generate(input_ids=input_ids, max_new_tokens=32)
print(tokenizer.decode(generation_output[0], skip_special_tokens=True))

## 使用vLLM部署预构建的Vicuna模型

本节使用[vLLM](https://github.com/vllm-project/vllm)将预构建的Vicuna模型部署到端点上。模型部署步骤大约需要15分钟完成。

vLLM是一个高度优化的LLM服务框架，可以显著提高服务吞吐量。您的QPS越高，使用vLLM获得的改进越多。

设置预构建的模型ID。

In [None]:
# Model ID to deploy; deploy_model_vllm() passes it to vLLM via the --model flag.
prebuilt_model_id = "lmsys/vicuna-7b-v1.5"  # @param ["lmsys/vicuna-7b-v1.5", "lmsys/vicuna-7b-v1.5-16k", "lmsys/vicuna-13b-v1.5", "lmsys/vicuna-13b-v1.5-16k"]

In [None]:
# Find the accelerators and regions supported by Vertex AI at:
#   https://cloud.google.com/vertex-ai/docs/predictions/configure-compute

# Context length passed to vLLM as --max-model-len; set it to the desired value.
max_model_len = 2000

# Hardware used to deploy the Vicuna model. Keep exactly one of the presets
# below active and leave the others commented out.

# Preset: one V100.
machine_type = "n1-highmem-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1  # for lmsys/vicuna-7b-v1.5

# Preset: two V100s.
# machine_type = "n1-highmem-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2  # for lmsys/vicuna-13b-v1.5

# Preset: one A100, with the context length raised for the 16k models.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1  # for lmsys/vicuna-7b-v1.5-16k, lmsys/vicuna-13b-v1.5-16k
# max_model_len = 16000

# Upload the model and deploy it to a new endpoint; the returned `model` and
# `endpoint` handles are used by the prediction and cleanup cells.
model, endpoint = deploy_model_vllm(
    model_name=get_job_name_with_datetime(prefix="vicuna-serve-vllm"),
    model_id=prebuilt_model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
)

请注意：在部署成功后，预建模型权重将从原始位置动态下载。因此，在上述模型部署步骤成功之后，需要额外等待5分钟**才能**运行下面的下一步骤。否则，当您发送请求到端点时，可能会看到 `ServiceUnavailable: 503 502:Bad Gateway` 错误。

一旦部署成功，您就可以使用文本提示向端点发送请求。vLLM支持的参数可以在[这里](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64)找到。

In [None]:
# A single request for the deployed vLLM server: the text prompt plus the
# sampling parameters (see the vLLM sampling parameter reference for meanings).
instance = {
    "prompt": "Q: What is the tallest animal?\nA:",
    "n": 1,
    "max_tokens": 100,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}
# Send the request to the endpoint and print the first prediction.
response = endpoint.predict(instances=[instance])
print(response.predictions[0])

## 清理资源

In [None]:
# Undeploy the models from the endpoint and delete the endpoint itself
# (force=True so deletion proceeds even though a model is still deployed).
endpoint.delete(force=True)

# Delete the uploaded model resource.
model.delete()