In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BigQuery DataFrames ML：药物名称生成

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
     在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
     在Vertex AI Workbench中打开
    </a>
  </td>                    
</table>

**_注意_**：此笔记本已在以下环境中进行了测试：

- Python 版本 = 3.9

## 概述

这篇笔记的目标是展示一个企业生成式人工智能的使用案例。营销用户可以提供关于一种新药品及其通用名称的信息，并获得针对该药品的市场定位品牌名称的想法。

了解更多关于[BigQuery DataFrames](https://cloud.google.com/bigquery/docs/dataframes-quickstart)。

### 目标

在本教程中，您将了解生成式人工智能（Generative AI）概念，如提示和少样本学习，以及如何使用 BigFrames ML 来简单执行这些任务，只需使用直观的 dataframe API。

执行的步骤包括：

1. 要求用户提供药品的通用名称和用法。
2. 使用 `bigframes` 查询超过 100,000 种药品的 FDA 数据集，根据品牌名称、通用名称和适应症和用法列进行筛选。
3. 过滤此数据集，找到可以用作提示调整中示例的典型品牌名称。
4. 创建一个提示，包括用户输入、通用说明、示例和反例，以便获得所需品牌名称。
5. 使用 `bigframes.ml.llm.PaLM2TextGenerator` 生成品牌名称的选择。

### 数据集

该笔记本使用可在[`bigquery-public-data.fda_drug`](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1sbigquery-public-data!2sfda_drug)处获取的[美国食品和药物管理局数据集](https://cloud.google.com/blog/topics/healthcare-life-sciences/fda-mystudies-comes-to-google-cloud)。

### 费用

本教程使用Google Cloud的收费组件：

* BigQuery（计算）
* BigQuery ML

了解[BigQuery计算定价](https://cloud.google.com/bigquery/pricing#analysis_pricing_models),
和[BigQuery ML定价](https://cloud.google.com/bigquery/pricing#bqml),
并使用[定价计算器](https://cloud.google.com/products/calculator/)
根据您的预期使用情况生成费用估算。

## 安装

安装以下必要的软件包以执行此笔记本。

In [None]:
!pip install -U --quiet bigframes

仅限 Colab：取消注释下面的单元格以重新启动内核。

In [None]:
# # Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### 导入库

In [None]:
import bigframes.pandas as bpd
from bigframes.ml.llm import PaLM2TextGenerator
from google.cloud import bigquery_connection_v1 as bq_connection
from IPython.display import Markdown

### 验证您的谷歌云帐户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关说明进行操作。

1. Vertex AI 工作台
* 什么都不要做，因为您已经进行了身份验证。

2. 本地的 JupyterLab 实例，取消注释并运行:

In [None]:
# ! gcloud auth login

3. 在Colab中取消注释并运行:

In [None]:
# from google.colab import auth

# auth.authenticate_user()

## 开始之前

### 设置您的Google Cloud项目

**无论您使用的是哪种笔记本环境，下面这些步骤都是必需的。**

1. [选择或创建一个Google Cloud项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建账户时，您将获得$300的免费信用额度用于计算/存储成本。

2. [确保您的项目已启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com)。

4. 如果您在本地运行此笔记本，您需要安装[Cloud SDK](https://cloud.google.com/sdk)。

#### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 查看支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Google Cloud project ID used throughout this notebook; replace the placeholder
# with your own project before running.
PROJECT_ID = "<your-project-id>"  # @param {type:"string"}

# Make PROJECT_ID the active project for the gcloud CLI.
! gcloud config set project {PROJECT_ID}

#### BigFrames 配置

接下来，我们将指定一个[BigQuery连接](https://cloud.google.com/bigquery/docs/working-with-connections)。如果您已经有一个连接，只需提供其名称并跳过后面的创建步骤即可。

In [None]:
# Fill in both values before running this cell.
LOCATION = "us"  # @param {type:"string"}
CONNECTION = "<your-connection>"  # @param {type:"string"}

# Dotted connection identifier: <project>.<location>.<connection>
connection_name = "{}.{}.{}".format(PROJECT_ID, LOCATION, CONNECTION)

我们将尝试使用提供的连接，如果不存在，则创建一个新的。我们还将打印所使用的服务账号。

In [None]:
# Build the connection-service client and the resource paths used below.
client = bq_connection.ConnectionServiceClient()
# Parent resource under which a new connection would be created.
new_conn_parent = f"projects/{PROJECT_ID}/locations/{LOCATION}"
# Full resource name of the connection identified by CONNECTION.
exists_conn_parent = (
    f"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}"
)
cloud_resource_properties = bq_connection.CloudResourceProperties({})

# First try to look up the connection named by CONNECTION.
try:
    # NOTE: despite its name, `request` holds the value returned by get_connection.
    request = client.get_connection(
        request=bq_connection.GetConnectionRequest(name=exists_conn_parent)
    )
    CONN_SERVICE_ACCOUNT = f"serviceAccount:{request.cloud_resource.service_account_id}"
# Any exception raised by the lookup (not only a missing connection) falls
# through to creating a new connection with the same ID.
except Exception:
    connection = bq_connection.types.Connection(
        {"friendly_name": CONNECTION, "cloud_resource": cloud_resource_properties}
    )
    request = bq_connection.CreateConnectionRequest(
        {
            "parent": new_conn_parent,
            "connection_id": CONNECTION,
            "connection": connection,
        }
    )
    response = client.create_connection(request)
    CONN_SERVICE_ACCOUNT = (
        f"serviceAccount:{response.cloud_resource.service_account_id}"
    )
# Grant the connection's service account three project-level roles via gcloud:
# bigquery.connectionUser, aiplatform.user and run.invoker.
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'

# Show which service account received the roles.
print(CONN_SERVICE_ACCOUNT)

#### 初始化 BigFrames 客户端

在这里，我们根据提供的参数设置项目配置。

In [None]:
# Configure the BigFrames global options with the project and location set earlier
# in this notebook.
bpd.options.bigquery.project = PROJECT_ID
bpd.options.bigquery.location = LOCATION

## 生成名称

让我们从输入通用名称和药物描述开始。

In [None]:
# Inputs for name generation. The trailing `# @param` comments are Colab form
# annotations and must stay on the same line as the assignment.
# Generic (non-brand) name of the drug to generate brand names for.
GENERIC_NAME = "Entropofloxacin"  # @param {type:"string"}
# Free-text indications and usage, interpolated into the prompts below.
USAGE = "Entropofloxacin is a fluoroquinolone antibiotic that is used to treat a variety of bacterial infections, including: pneumonia, streptococcus infections, salmonella infections, escherichia coli infections, and pseudomonas aeruginosa infections It is taken by mouth or by injection. The dosage and frequency of administration will vary depending on the type of infection being treated. It should be taken for the full course of treatment, even if symptoms improve after a few days. Stopping the medication early may increase the risk of the infection coming back."  # @param {type:"string"}
# Number of brand names requested in the zero-shot and few-shot prompts.
NUM_NAMES = 10  # @param {type:"integer"}
# Default temperature used by the predict helpers defined later in this notebook.
TEMPERATURE = 0.5  # @param {type: "number"}

我们现在可以创建一个提示字符串，并填充它的名称和描述。

In [None]:
# Assemble the zero-shot prompt from its four paragraphs, separated by blank lines.
zero_shot_paragraphs = [
    f"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format. Do not provide any additional explanation.",
    "Be creative with the brand names. Don't use English words directly; use variants or invented words.",
    f"The generic name is: {GENERIC_NAME}",
    f"The indications and usage are: {USAGE}.",
]
zero_shot_prompt = "\n\n".join(zero_shot_paragraphs)

print(zero_shot_prompt)

接下来，让我们创建一个辅助函数来使用我们的模型进行预测。它将接受一个字符串输入，并将其添加到一个临时的 BigFrames `DataFrame` 中。它还会返回从响应`DataFrame`中提取的字符串。

In [None]:
def predict(prompt: str, temperature: float = TEMPERATURE) -> str:
    """Send a single prompt to the notebook-level ``model`` and return its text.

    Args:
        prompt: Prompt text to send to the LLM.
        temperature: Temperature forwarded to ``model.predict``; defaults to the
            notebook-level ``TEMPERATURE``.

    Returns:
        The first value of the ``ml_generate_text_llm_result`` column of the
        prediction result.
    """
    # One-row BigFrames DataFrame holding the prompt. Named `prompt_df` so the
    # builtin `input` is not shadowed.
    prompt_df = bpd.DataFrame({"prompt": [prompt]})

    # Pass temperature by keyword so the call does not depend on the positional
    # order of predict()'s optional parameters.
    prediction = model.predict(prompt_df, temperature=temperature)
    return prediction.ml_generate_text_llm_result.iloc[0]

我们现在可以初始化模型，并对我们的提示做出响应！

In [None]:
# Get the global BigFrames session (configured through bpd.options earlier).
session = bpd.get_global_session()

# Create the text-generation model, bound to that session and to the BigQuery
# connection identified by connection_name.
model = PaLM2TextGenerator(session=session, connection_name=connection_name)

# Send the zero-shot prompt through the predict() helper.
response = predict(zero_shot_prompt)

# Bare last expression: the notebook renders the response text as Markdown.
Markdown(response)

我们开始得很不错！让我们看看能否继续改进我们的回应。

## 小样本学习

让我们尝试使用[小样本学习](https://paperswithcode.com/task/few-shot-learning)。 我们将提供一些示例，说明我们所寻找的内容以及我们的提示。

我们的提示将包含三个部分：
* 一般指示（例如，生成 $n$ 个品牌名称）
* 多个示例
* 有关我们想要生成名称的药物的信息

让我们走一遍如何构建这个提示的步骤。

我们的第一步将是定义提示中我们要提供多少示例。

In [None]:
# Number of example drugs to include in the few-shot prompt. The value is
# interpolated into the prompt prefix and used as the sample size when picking
# example rows from the dataset.

NUM_EXAMPLES = 3  # @param {type:"integer"}

接下来，让我们定义一个前缀，用于设置整体的上下文。

In [None]:
# Assemble the few-shot prefix: four paragraphs separated by blank lines,
# terminated by a single newline.
prefix_paragraphs = [
    f"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format, related to the drug at the bottom of this prompt.",
    "Be creative with the brand names. Don't use English words directly; use variants or invented words.",
    f"First, we will provide {NUM_EXAMPLES} examples to help with your thought process.",
    "Then, we will provide the generic name and usage for the drug we'd like you to generate brand names for.",
]
prefix_prompt = "\n\n".join(prefix_paragraphs) + "\n"

print(prefix_prompt)

我们接下来的步骤将是将示例包含在提示中。

我们将首先通过查询BigQuery公共数据集来检索示例的原始数据。

In [None]:
# The three drug-label columns this notebook works with.
label_columns = ["openfda_generic_name", "openfda_brand_name", "indications_and_usage"]

# Read those columns from the public FDA drug-label table, then discard rows
# with missing values and exact duplicate rows.
df = (
    bpd.read_gbq("bigquery-public-data.fda_drug.drug_label", col_order=label_columns)
    .dropna()
    .drop_duplicates()
)

# Preview the first rows
df.head()

让我们现在筛选结果，去掉不典型的名称。

In [None]:
# Keep only brand names that contain no space.
brand_has_no_space = df["openfda_brand_name"].str.find(" ") == -1
df = df[brand_has_no_space]

# Keep only brand names longer than 5 characters.
brand_is_long_enough = df["openfda_brand_name"].str.len() > 5
df = df[brand_is_long_enough]

# Keep only rows whose brand name differs from the generic name, ignoring case.
generic_lowered = df["openfda_generic_name"].str.lower()
brand_lowered = df["openfda_brand_name"].str.lower()
df = df[generic_lowered != brand_lowered]

让我们取 `NUM_EXAMPLES` 个样本放入提示中。

In [None]:
# Draw NUM_EXAMPLES rows with a fixed random_state, then download them into a
# local pandas DataFrame.
sampled_rows = df.sample(n=NUM_EXAMPLES, random_state=3)
df_examples = sampled_rows.to_pandas()

df_examples

现在让我们将数据转换为JSON结构，以便嵌入到提示中。为了保持一致性，我们会将每个示例品牌名称的首字母大写（其余字母小写）。

In [None]:
# One dict per sampled drug. Rows are read as plain (brand, generic, usage)
# tuples; the brand name is normalised with str.capitalize().
examples = [
    {"brand_name": brand.capitalize(), "generic_name": generic, "usage": indication}
    for brand, generic, indication in df_examples[
        ["openfda_brand_name", "openfda_generic_name", "indications_and_usage"]
    ].itertuples(index=False, name=None)
]

print(examples)

我们将为每个示例创建一个提示模板，并查看第一个。

In [None]:
# Render every example as a "Generic name / Usage / Brand name" stanza followed
# by a blank line, and concatenate the stanzas in order.
example_prompt = "".join(
    f"Generic name: {example['generic_name']}\nUsage: {example['usage']}\nBrand name: {example['brand_name']}\n\n"
    for example in examples
)

example_prompt

最后，我们可以为我们的提示创建一个后缀。 它将包含药物的通用名称，用途，最后请求品牌名称。

In [None]:
# Final stanza of the prompt: the target drug, ending with an open
# "Brand names:" line for the model to complete.
suffix_lines = [
    f"Generic name: {GENERIC_NAME}",
    f"Usage: {USAGE}",
    "Brand names:",
]
suffix_prompt = "\n".join(suffix_lines)

print(suffix_prompt)

让我们把这些部分组合成一个小样本提示。

In [None]:
# Concatenate the three parts in order: instructions, examples, target drug.
few_shot_prompt = "".join((prefix_prompt, example_prompt, suffix_prompt))

# Show the assembled prompt
print(few_shot_prompt)

现在，让我们把提示传递给LLM，并收到回复！

In [None]:
# Send the few-shot prompt through the predict() helper (default temperature).
response = predict(few_shot_prompt)

# Bare last expression: the notebook renders the response text as Markdown.
Markdown(response)

## 大批量生成

让我们把这些实验提升到下一个水平，通过大批量生成多个名字。我们将看看如何在规模上利用 BigFrames！

我们可以首先找到缺少品牌名称的药物。符合这一标准的药物大约有4,000种。我们在这份笔记本中设置一个限制为100。

In [None]:
# The three drug-label columns needed for bulk generation.
missing_columns = ["openfda_generic_name", "openfda_brand_name", "indications_and_usage"]

# Read those columns from the public FDA drug-label table and discard rows with
# missing values.
df_missing = bpd.read_gbq(
    "bigquery-public-data.fda_drug.drug_label", col_order=missing_columns
).dropna()

# Keep rows where the brand name is exactly the generic name, capped at
# 100 rows for demonstration purposes.
brand_equals_generic = (
    df_missing["openfda_generic_name"] == df_missing["openfda_brand_name"]
)
df_missing = df_missing[brand_equals_generic].head(100)

# Preview the first rows
df_missing.head()

我们将为每一行创建一个带有自定义提示的`prompt`列。

In [None]:
# Build one prompt per row: fixed instructions followed by that row's generic
# name and its indications/usage text.
# The trailing space after "drug." keeps the two instruction sentences from
# running together ("drug.Don't") once the literals are concatenated.
df_missing["prompt"] = (
    "Provide a unique and modern brand name related to this pharmaceutical drug. "
    + "Don't use English words directly; use variants or invented words. The generic name is: "
    + df_missing["openfda_generic_name"]
    + ". The indications and usage are: "
    + df_missing["indications_and_usage"]
    + "."
)

我们将创建一个新的辅助方法，`batch_predict()` 并查询 LLM。该任务可能需要几分钟才能执行。

In [None]:
def batch_predict(
    input: bpd.DataFrame, temperature: float = TEMPERATURE
) -> bpd.Series:
    """Run the notebook-level ``model`` over many prompts at once.

    Args:
        input: BigFrames object holding the prompts; the call in this cell passes
            the ``prompt`` column of ``df_missing``.
        temperature: Temperature forwarded to ``model.predict``; defaults to the
            notebook-level ``TEMPERATURE``.

    Returns:
        The ``ml_generate_text_llm_result`` column of the prediction result.
    """
    # Pass temperature by keyword so the call does not depend on the positional
    # order of predict()'s optional parameters.
    prediction = model.predict(input, temperature=temperature)
    return prediction.ml_generate_text_llm_result


# Generate a response for every prompt built from df_missing.
response = batch_predict(df_missing["prompt"])

让我们来检查一下我们的某个回应的结果！

In [None]:
# Index label of the row to inspect
k = 0

# Look up the prompt inputs and the generated response for label k.
# NOTE(review): the `.iloc[0]` after `[k]` suggests the label lookup returns a
# Series rather than a scalar here; confirm against the bigframes version in use.
prompt_generic = df_missing["openfda_generic_name"][k].iloc[0]
prompt_usage = df_missing["indications_and_usage"][k].iloc[0]
response_str = response[k].iloc[0]

# Print details. The second line shows the usage text, so it is labelled
# "Usage"; the generated brand name is the response on the last line.
print(f"Generic name: {prompt_generic}")
print(f"Usage: {prompt_usage}")
print(f"Response: {response_str}")

恭喜！您已经学会了如何使用生成式AI来启动创意过程。您还看到了BigFrames如何管理过程的每一步，包括收集数据、数据操作和查询LLM。

## 清理

要清理此项目中使用的所有Google Cloud资源，您可以[删除您用于教程的Google Cloud项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以运行剩余的单元格，以删除您在此教程中创建的各个资源。

In [None]:
# Delete the BigQuery connection identified by PROJECT_ID, LOCATION and CONNECTION.
# WARNING: this cell performs the deletion as soon as it is run.
from google.cloud import bigquery_connection_v1 as bq_connection

client = bq_connection.ConnectionServiceClient()
# Full resource name of the connection to delete.
CONNECTION_ID = f"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}"
client.delete_connection(name=CONNECTION_ID)
print(f"Deleted connection {CONNECTION_ID}.")