In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CDM Pricing Data的定价优化分析

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/workbench/pricing_optimization/pricing-optimization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/workbench/pricing_optimization/pricing-optimization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在GitHub上查看
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/workbench/pricing_optimization/pricing-optimization.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在Vertex AI Workbench中打开
    </a>
  </td>                                                                                               
</table>

## 目录
* [概述](#section-1)
* [目标](#section-2)
* [数据集](#section-3)
* [费用](#section-4)
* [创建一个BigQuery数据集](#section-5)
* [从Cloud Storage加载数据集](#section-6)
* [数据分析](#section-7)
* [为训练准备数据](#section-8)
* [使用BigQuery ML训练模型](#section-9)
* [从模型生成预测](#section-10)
* [解释结果以选择最佳价格](#section-11)
* [清理](#section-12)

## 概述
<a name="section-1"></a>

本笔记本演示了对[CDM定价数据](https://github.com/trifacta/trifacta-google-cloud/tree/main/design-pattern-pricing-optimization)进行定价优化分析，并使用Vertex AI Workbench托管的笔记本自动化工作流程。

*注：此笔记本文件是为在[Vertex AI Workbench托管的笔记本](https://console.cloud.google.com/vertex-ai/workbench/list/managed)实例中使用Python（本地）内核而开发的。本笔记本的某些组件可能在其他笔记本环境中无法运行。*

了解更多关于[Vertex AI Workbench](https://cloud.google.com/vertex-ai/docs/workbench/introduction)和[BigQuery ML](https://cloud.google.com/vertex-ai/docs/beginner/bqml#machine_learning_directly_in)。

### 目标
<a name="section-2"></a>

本笔记本的目标是使用BigQuery ML构建定价优化模型。

本教程使用以下Google Cloud ML服务和资源：

- Google Cloud Storage
- BigQuery

执行的步骤包括：

- 从Cloud Storage存储桶加载所需数据集。
- 分析数据集中的字段。
- 处理数据以构建模型。
- 在处理后的数据上构建BigQuery ML预测模型。
- 从BigQuery ML模型中获取预测值。
- 解释预测结果以确定最佳价格。
- 清理。

### 数据集
<a name="section-3"></a>

本笔记本中使用的数据集是[CDM定价数据集](https://github.com/trifacta/trifacta-google-cloud/blob/main/design-pattern-pricing-optimization/CDM_Pricing_large_table.csv)的一部分，其中包含指定日期的产品销售信息。

### 费用
<a name="section-4"></a>

该教程使用 Google Cloud 的以下计费组件：

- Vertex AI
- BigQuery
- Cloud Storage

了解关于 [Vertex AI 价格](https://cloud.google.com/vertex-ai/pricing), [BigQuery 价格](https://cloud.google.com/bigquery/pricing) 和 [Cloud Storage 价格](https://cloud.google.com/storage/pricing)，并使用 [价格计算器](https://cloud.google.com/products/calculator/) 根据您的预期使用量生成费用估算。

### 安装额外的软件包

In [None]:
! pip3 install --quiet --upgrade pandas-gbq 'google-cloud-bigquery[bqstorage,pandas]' seaborn fsspec gcsfs


### 仅限Colab：取消注释以下单元格以重新启动内核

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### 开始之前

#### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下方法：
- 运行 `gcloud config list`
- 运行 `gcloud projects list`
- 查阅支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# set the project id
! gcloud config set project $PROJECT_ID

#### 区域

您还可以更改 Vertex AI 使用的“REGION”变量。
了解有关[Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations)的更多信息。

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

### 对您的Google Cloud帐户进行身份验证

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关说明进行操作。

**1. Vertex AI Workbench**
- 无需操作，因为您已经验证过了。

**2. 本地JupyterLab实例，** 请取消注释并运行。

In [None]:
# ! gcloud auth login

**3. Colab，** 请取消注释并运行：

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. 服务账号或其他**
- 在这里查看所有身份验证选项：[Google Cloud Platform Jupyter Notebook身份验证指南](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/notebook_authentication_guide.ipynb)

### UUID

如果您正在参加实时教程会话，您可能会使用共享的测试帐户或项目。为了避免不同用户创建的资源出现名称冲突，请为每个会话实例创建一个uuid，并将其附加到您在本教程中创建的资源名称上。

In [None]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters in the identifier (default 8).

    Returns:
        A string of ``length`` characters, each drawn with replacement from
        ``a-z`` and ``0-9``.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Suffix appended to resource names created by this session.
UUID = generate_uuid()

### 导入必要的库并定义常量

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google.cloud import bigquery
from google.cloud.bigquery import Client

#### 设置BigQuery数据集ID和表ID

In [None]:
# BigQuery dataset ID; the session UUID is appended as a suffix.
DATASET = f"pricing_optimization_{UUID}"
# BigQuery table ID used to store the training data.
TRAINING_DATA_TABLE = "training_data_table"

## 创建一个BigQuery数据集
<a name="section-5"></a>

如果您正在使用***Vertex AI Workbench受管笔记本实例***，每个以“#@bigquery”开头的单元格将是一个SQL查询。如果您正在使用Vertex AI Workbench用户管理的笔记本实例或Colab，则它将是一个Markdown单元格。

#@bigquery
-- 在BigQuery中创建一个数据集

CREATE SCHEMA [your-dataset-id]
OPTIONS(
  location="us"
  )

In [None]:
# Construct a BigQuery client object.
client = Client(project=PROJECT_ID)

In [None]:
# Create the dataset in the "us" location.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE SCHEMA raises an
# "Already Exists" error when the cell is executed a second time.
query = """
CREATE SCHEMA IF NOT EXISTS {DATASET}
OPTIONS(
  location="us"
  )
""".format(
    DATASET=DATASET
)
query_job = client.query(query)
print(query_job.result())  # result() blocks until the DDL statement finishes

## 从云存储加载BigQuery表格
<a name="section-6"></a>

In [None]:
table_id_name = f"{PROJECT_ID}.{DATASET}.data"

In [None]:
table_id = "data"

In [None]:
# Cloud Storage location of the CSV file to load.
uri = "gs://cloud-samples-data/ai-platform-unified/datasets/tabular/cdm_pricing_large_table.csv"

job_config = bigquery.LoadJobConfig(
    # The source format defaults to CSV, so this argument is optional.
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # skip the first row of the file
    autodetect=True,  # let BigQuery detect the schema
)

# Start the load job (API request) and wait for it to complete.
load_job = client.load_table_from_uri(uri, table_id_name, job_config=job_config)
load_job.result()

# Fetch the destination table (API request) to report its row count.
destination_table = client.get_table(table_id_name)
print(f"Loaded {destination_table.num_rows} rows.")

你会根据这些数据建立一个预测模型，从而确定产品的最佳价格。对于这种类型的模型，你不会使用很多字段：只有销售和与价格相关的字段。针对当前的练习，重点关注以下字段：

- `Product_ID`
- `Customer_Hierarchy`
- `Fiscal_Date`
- `List_Price_Converged`
- `Invoiced_quantity_in_Pieces`
- `Net_Sales`

## 数据分析
<a name="section-7"></a>

首先，探索数据和分布情况。

#### 从数据框中选择所需的列。

In [None]:
# Column roles used throughout the analysis.
id_col = "Product_ID"
date_col = "Fiscal_Date"
categ_cols = ["Customer_Hierarchy"]
num_cols = ["List_Price_Converged", "Invoiced_quantity_in_Pieces", "Net_Sales"]

# All needed columns in order: identifier, date, categorical, numeric.
required_columns = [id_col, date_col, *categ_cols, *num_cols]
required_columns

创建一个表，只提取所需的列

In [None]:
# Copy only the columns needed for the analysis into a new table.
query = f"""
    CREATE OR REPLACE TABLE {DATASET}.required_columns AS
    ( SELECT Product_ID,Fiscal_Date,Customer_Hierarchy,List_Price_Converged,Invoiced_quantity_in_Pieces,Net_Sales FROM `{DATASET}.{table_id}` )
    
"""

# Submit the statement (API request) and wait for it to finish.
query_job = client.query(query)
print(query_job.result())

查看该表中存储的数据

In [None]:
# Read back every row of the required_columns table.
query = f"""
    SELECT * FROM {DATASET}.required_columns 
    
"""

# Submit the query (API request) and wait for it to finish.
query_job = client.query(query)
print(query_job.result())

In [None]:
query_job.to_dataframe()

检查数据框中的列类型和空值。

In [None]:
query_job.to_dataframe().info()

这些数据描述显示数据中没有空值。此外，日期字段“Fiscal_Date”被加载为对象类型。

#### 将日期字段的类型更改为date。

将Fiscal_Date数据类型从datetime更改为date，并将生成的整个数据存储在视图中。

In [None]:
# Define a view over required_columns in which Fiscal_Date is cast to DATE.
query = f"""
CREATE OR REPLACE VIEW {DATASET}.required_columns_final AS
(
SELECT Product_ID,Customer_Hierarchy,List_Price_Converged,Invoiced_quantity_in_Pieces,Net_Sales,CAST(DATE(Fiscal_Date) AS DATE) AS Fiscal_Date FROM {DATASET}.required_columns    
)
"""

# Submit the statement (API request) and wait for it to finish.
query_job = client.query(query)
print(query_job.result())

查看required_columns_final视图中的数据

In [None]:
# Read every row of the required_columns_final view.
query = f"""
SELECT * FROM {DATASET}.required_columns_final

"""

# Submit the query (API request) and wait for it to finish.
query_job = client.query(query)
print(query_job.result())

In [None]:
required_columns_final_df = query_job.to_dataframe()

In [None]:
required_columns_final_df

绘制分类字段的分布。

In [None]:
# One bar chart per categorical column, showing each value's share of the rows.
for categ_col in categ_cols:
    value_shares = required_columns_final_df[categ_col].value_counts(normalize=True)
    value_shares.plot(kind="bar")
    plt.title(categ_col)
    plt.show()

#### 绘制数值字段的分布。

In [None]:
# For each numeric column draw a box plot (left) and a histogram (right).
for num_col in num_cols:
    _, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 4))
    column_values = required_columns_final_df[num_col]
    column_values.plot(kind="box", ax=box_ax)
    column_values.plot(kind="hist", ax=hist_ax)
    box_ax.set_title(f"{num_col}-Boxplot")
    hist_ax.set_title(f"{num_col}-Histogram")
    plt.show()

检查Fiscal_Date列中的最大日期和最小日期。

In [None]:
# Print the latest date, then the earliest date, found in Fiscal_Date.
fiscal_dates = required_columns_final_df["Fiscal_Date"]
print(fiscal_dates.max())
print(fiscal_dates.min())

检查每个类别的产品分布。

In [None]:
# Count products per customer hierarchy: the inner query keeps one row per
# (Customer_Hierarchy, Product_ID) pair, the outer query counts those rows.
query = f"""
SELECT Customer_Hierarchy,COUNT(*) as count FROM (SELECT Customer_Hierarchy,Product_ID FROM {DATASET}.required_columns_final GROUP BY Customer_Hierarchy,Product_ID) GROUP BY  Customer_Hierarchy
"""
query_job = client.query(query)
print(query_job.result())

In [None]:
query_job.to_dataframe()

根据价格百分比变化检查订单的百分比变化。

您可以根据价格的百分比变化来检查订单中的百分比变化，遵循三个步骤。

**步骤1**。首先，您要创建一张表：产品的价格每发生一次变化，表中就新增一行，其中包含该产品在这一价格下的信息，例如在该价格下订购了多少件商品，以及与该价格关联的总净销售额。

In [None]:
# Build one row per (product, list price) with the total pieces ordered, the
# total net sales and the first date seen at that price. LAG() adds the same
# values from the product's previous price row, ordered by first_price_date.
#
# CREATE OR REPLACE keeps this cell re-runnable, like the notebook's other DDL
# cells: a plain CREATE TABLE raises "Already Exists" on a second run.
query = """
create or replace table {DATASET}.price_changes as (
select
       product_id,
       list_price_converged,
       total_ordered_pieces,
       total_net_sales,
       first_price_date,
       lag(list_price_converged) over(partition by product_id order by first_price_date asc) as previous_list,
       lag(total_ordered_pieces) over(partition by product_id order by first_price_date asc) as previous_total_ordered_pieces,
       lag(total_net_sales) over(partition by product_id order by first_price_date asc) as previous_total_net_sales,
       lag(first_price_date) over(partition by product_id order by first_price_date asc) as previous_first_price_date
       from (
           select
               product_id,list_price_converged,sum(invoiced_quantity_in_pieces) as total_ordered_pieces, sum(net_sales) as total_net_sales, min(fiscal_date) as first_price_date
           from `{DATASET}.required_columns_final` AS cdm_pricing
           group by 1,2
           order by 1, 2 asc
       )
);

""".format(
    DATASET=DATASET
)
query_job = client.query(query)
print(query_job.result())  # result() blocks until the table has been written

In [None]:
# Read the price_changes table ordered by product and first price date.
query = f"""
select * from {DATASET}.price_changes order by product_id, first_price_date 
"""
query_job = client.query(query)
print(query_job.result())

In [None]:
df_price_changes = query_job.to_dataframe()

In [None]:
df_price_changes

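**步骤2**。接下来，在临时表准备好后，您可以计算每个SKU相邻两次价格之间的价格变化百分比：

`(list_price_converged - previous_list) / nullif(previous_list, 0) * 100`

**步骤3**。然后，您可以计算每个SKU相应的总订购件数变化百分比：

`(total_ordered_pieces - previous_total_ordered_pieces) / nullif(previous_total_ordered_pieces, 0) * 100`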

In [None]:
# Add two percentage-change columns relative to the previous price row.
# NULLIF(..., 0) turns a zero denominator into NULL instead of dividing by zero.
query = f"""
select *,(list_price_converged-previous_list)/nullif(previous_list,0)*100 as price_change_perc,(total_ordered_pieces-previous_total_ordered_pieces)/nullif(previous_total_ordered_pieces,0)*100 as order_change_perc  from `{DATASET}.price_changes`
"""
query_job = client.query(query)
print(query_job.result())

现在您有一个数据框(df_for_plot)，其中包含price_change_perc和order_change_perc字段。

In [None]:
df_for_plot = query_job.to_dataframe()

In [None]:
# Order rows by product and then by first price date, and renumber the index
# to follow the new order.
df_for_plot = df_for_plot.sort_values(
    by=["product_id", "first_price_date"]
).reset_index(drop=True)

In [None]:
df_for_plot

最后，您可以通过分析价格改变后每次价格变动和已订购物品总数之间的关系来了解发生了什么。

In [None]:
# plot a scatterplot to visualize the changes
sns.scatterplot(
    x="price_change_perc",
    y="order_change_perc",
    data=df_for_plot,
    hue="product_id",
    legend=False,
)
plt.title("Percentage of change in price vs order")
plt.show()

在大多数产品中，当价格的百分比变化较小时，订单数量的百分比变化较大。这表明价格变化过大可能会影响订单数量。

**注意**：数据中似乎存在一些异常值，其百分比变化超过800。在当前练习中，不必手动处理这些异常值，因为您将创建的BigQuery ML时间序列模型会自动处理异常值。

## 为训练准备数据
<a name="section-8"></a>
#### 检查哪些`Product_ID`的订单数量最多。

创建一个视图，根据客户层次结构，存储每种产品的订单数量。

In [None]:
# Define a view holding the summed invoiced quantity of every product within
# each customer hierarchy.
query = f"""
CREATE OR REPLACE VIEW {DATASET}.total_orders AS
(
SELECT Customer_Hierarchy,Product_ID,SUM(Invoiced_quantity_in_Pieces) AS Invoiced_quantity_in_Pieces FROM {DATASET}.required_columns_final GROUP BY Customer_Hierarchy,Product_ID

)
"""
query_job = client.query(query)
print(query_job.result())

In [None]:
# Read every row of the total_orders view.
query = f"""
SELECT * FROM {DATASET}.total_orders"""
query_job = client.query(query)
print(query_job.result())

In [None]:
# Download the per-product totals, order them by Product_ID and renumber the
# index to follow the new order.
df_total_orders = (
    query_job.to_dataframe()
    .sort_values(by=["Product_ID"])
    .reset_index(drop=True)
)

In [None]:
df_total_orders

#### 选择每个客户层次结构中的顶级产品

以下是一个示例，展示如何找出每个客户层次结构中的顶级产品

示例：
假设首先总订单视图如下：

<table>
    <tr>
        <th>
            Customer_Hierarchy
        </th>
        <th> 
            Invoiced_quantity_in_Pieces
        </th>
        <th> 
            Product_ID
        </th>
    </tr>    
    <tr> 
        <td>食品</td>                 
        <td>200</td>                       
        <td>1</td> 
    </tr>
    <tr> 
        <td>纸张</td>                 
        <td>100</td>                       
        <td>2</td> 
    </tr>
    <tr> 
        <td>食品</td>                 
        <td>300</td>                       
        <td>3</td> 
    </tr>
    <tr> 
        <td>纸张</td>                 
        <td>400</td>                       
        <td>4</td> 
    </tr>
   
</table>    

首先我们按照Customer_Hierarchy对total_orders视图进行分区，并按照Invoiced_quantity_in_Pieces降序排序。
应用分区后变为：

<table>
    <tr>
        <th>
            Customer_Hierarchy
        </th>
        <th> 
            Invoiced_quantity_in_Pieces
        </th>
        <th> 
            Product_ID
        </th>
    </tr>    
    <tr> 
        <td>食品</td>                 
        <td>300</td>                       
        <td>3</td> 
    </tr>
    <tr> 
        <td>食品</td>                 
        <td>200</td>                       
        <td>1</td> 
    </tr>
    <tr>
        <td>纸张</td>
        <td>400</td>
        <td>4</td>
    </tr>
    <tr>
        <td>纸张</td>
        <td>100</td>
        <td>2</td>
    </tr>
</table>   

现在对于每个Customer_Hierarchy，Invoiced_quantity_in_Pieces将按降序排列。    
现在我们对上述表应用ROW_NUMBER函数
之后变为：

<table>
    <tr>
        <th>
            Customer_Hierarchy
        </th>
        <th> 
            Invoiced_quantity_in_Pieces
        </th>
        <th> 
            Product_ID
        </th>
        <th>
            rowNumber
        </th>    
    </tr>    
    <tr> 
        <td>食品</td>                 
        <td>300</td>                       
        <td>3</td>
        <td>1</td>
    </tr>
    <tr> 
        <td>食品</td>                 
        <td>200</td>                       
        <td>1</td>
        <td>2</td>
    </tr>
    <tr>
        <td>纸张</td>
        <td>400</td>
        <td>4</td>
        <td>1</td>
    </tr>
    <tr>
        <td>纸张</td>
        <td>100</td>
        <td>2</td>
        <td>2</td>
    </tr>
</table>   

（对于每个不同的Customer_Hierarchy，编号都从1重新开始）

In [None]:
# Number the products inside each customer hierarchy, starting at 1 for the
# product with the largest invoiced quantity.
query = f"""
SELECT 
  *,
  ROW_NUMBER() OVER(PARTITION BY Customer_Hierarchy ORDER BY Invoiced_quantity_in_Pieces DESC) rowNumber
  FROM {DATASET}.total_orders
"""
query_job = client.query(query)

In [None]:
query_job.to_dataframe()

正如您所看到的，以客户层次“Paper”为例，Invoiced_quantity_in_Pieces是按降序排列的，行号从1开始。

In [None]:
# Download the ranked result once and reuse it for both the mask and the
# selection; calling to_dataframe() twice would fetch the rows twice.
ranked_orders_df = query_job.to_dataframe()
ranked_orders_df.loc[ranked_orders_df["Customer_Hierarchy"] == "Paper"]

我们希望选择每个客户层次中发票数量最高的行，因此选择行号为1

In [None]:
# Keep only the row numbered 1 in each customer hierarchy, i.e. the product
# with the largest invoiced quantity.
query = f"""
 SELECT A.Product_ID, A.Customer_Hierarchy,A.Invoiced_quantity_in_Pieces
  FROM (
  SELECT 
  *,
  ROW_NUMBER() OVER(PARTITION BY Customer_Hierarchy ORDER BY Invoiced_quantity_in_Pieces DESC) rowNumber
  FROM {DATASET}.total_orders
  )A
  WHERE A.rowNumber =1;
"""
query_job = client.query(query)
print(query_job.result())

In [None]:
query_job.to_dataframe()

从上述结果中，您可以推断以下内容：

- 在**食品**类别下，**SKU 62**有最大订单量。
- 在**制造业**类别下，**SKU 17**有最大订单量。
- 在**纸品**类别下，**SKU 107**有最大订单量。
- 在**出版**类别下，**SKU 8**有最大订单量。
- 在**公用事业**类别下，**SKU 140**有最大订单量。

由于ID数量太多，且其中大多数只有少量记录，因此只考虑上述订单量最大的 `Product_ID`。

**注意**：`Invoiced_quantity_in_Pieces` 字段似乎是*浮点*类型，而不是本应该是*整数*类型。这可能是因为数据本身可能是平均值。

请检查这些“Product_ID”可用的各种价格。

首先，从required_columns_final视图中，我们只选择具有我们所需产品ID和客户层次结构的行。

In [None]:
# Rows of the cleaned view for product "SKU 62" in the "Food" hierarchy.
query = f"""
SELECT * FROM {DATASET}.required_columns_final WHERE Product_ID="SKU 62" AND Customer_Hierarchy="Food"
"""
query_job = client.query(query)
df_sku_62 = query_job.to_dataframe()
df_sku_62

然后我们统计这些`Product_ID`出现过的各种价格及其出现次数。

In [None]:
print(df_sku_62["List_Price_Converged"].value_counts())

In [None]:
# Rows of the cleaned view for product "SKU 17" in the "Manufacturing" hierarchy.
query = f"""
SELECT * FROM {DATASET}.required_columns_final WHERE Product_ID="SKU 17" AND Customer_Hierarchy="Manufacturing"
"""
query_job = client.query(query)
df_sku_17 = query_job.to_dataframe()
df_sku_17

In [None]:
print(df_sku_17["List_Price_Converged"].value_counts())

In [None]:
# Rows of the cleaned view for product "SKU 107" in the "Paper" hierarchy.
query = f"""
SELECT * FROM {DATASET}.required_columns_final WHERE Product_ID="SKU 107" AND Customer_Hierarchy="Paper"
"""
query_job = client.query(query)
df_sku_107 = query_job.to_dataframe()
df_sku_107

In [None]:
print(df_sku_107["List_Price_Converged"].value_counts())

In [None]:
# Rows of the cleaned view for product "SKU 8" in the "Publishing" hierarchy.
query = f"""
SELECT * FROM {DATASET}.required_columns_final WHERE Product_ID="SKU 8" AND Customer_Hierarchy="Publishing"
"""
query_job = client.query(query)
df_sku_8 = query_job.to_dataframe()
df_sku_8

In [None]:
print(df_sku_8["List_Price_Converged"].value_counts())

In [None]:
# Rows of the cleaned view for product "SKU 140" in the "Utilities" hierarchy.
query = f"""
SELECT * FROM {DATASET}.required_columns_final WHERE Product_ID="SKU 140" AND Customer_Hierarchy="Utilities"
"""
query_job = client.query(query)
df_sku_140 = query_job.to_dataframe()

In [None]:
print(df_sku_140["List_Price_Converged"].value_counts())

`SKU 8`（出版类别）和`SKU 17`（制造业类别）在整个数据中的不同价格都不超过两种，因此您可以排除它们，只用其他产品来建立预测模型。这里的想法是使用具有多种不同价格的产品的时间序列数据进行训练。

#### 将所有`Product_ID`的数据连接到一个数据框中，并删除重复记录。

In [None]:
# Columns kept for training.
training_columns = [
    "Product_ID",
    "Fiscal_Date",
    "Customer_Hierarchy",
    "List_Price_Converged",
    "Invoiced_quantity_in_Pieces",
]

# Stack the three selected products, keep the training columns, drop exact
# duplicate rows and renumber the index.
df_final = (
    pd.concat([df_sku_62, df_sku_107, df_sku_140])
    .loc[:, training_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_final

将数据保存到一个 BigQuery 表中。

In [None]:
bq_client = bigquery.Client(project=PROJECT_ID)

# Fully-qualified ID of the table that receives the training data.
training_table_ref = f"{PROJECT_ID}.{DATASET}.{TRAINING_DATA_TABLE}"

# Partial schema: every DataFrame column is still written to the table; the
# schema only assists with the data types of the listed columns.
partial_schema = [
    bigquery.SchemaField("Product_ID", bigquery.enums.SqlTypeNames.STRING),
    bigquery.SchemaField("Fiscal_Date", bigquery.enums.SqlTypeNames.DATE),
    bigquery.SchemaField("List_Price_Converged", bigquery.enums.SqlTypeNames.FLOAT),
    bigquery.SchemaField(
        "Invoiced_quantity_in_Pieces", bigquery.enums.SqlTypeNames.FLOAT
    ),
]

job_config = bigquery.LoadJobConfig(
    schema=partial_schema,
    # BigQuery appends loaded rows to an existing table by default;
    # WRITE_TRUNCATE replaces the table contents with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

# Save the DataFrame to a table in the created dataset (API request), then
# wait for the load job to complete.
job = bq_client.load_table_from_dataframe(
    df_final, training_table_ref, job_config=job_config
)
print(job.result())

## 使用BigQuery ML训练模型
<a name="section-9"></a>

使用BigQuery ML在这些数据上训练一个[Arima-Plus](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-time-series)模型。

#@bigquery
CREATE OR REPLACE MODEL [your-dataset-id].bqml_arima
OPTIONS
 (model_type = 'ARIMA_PLUS',
  time_series_timestamp_col = 'Fiscal_Date',
  time_series_data_col = 'Invoiced_quantity_in_Pieces',
  time_series_id_col = 'ID'
 ) AS
SELECT
 Fiscal_Date,
 CONCAT(Product_ID, "_", CAST(List_Price_Converged AS STRING)) AS ID,
 Invoiced_quantity_in_Pieces
FROM
 [your-dataset-id].training_data_table

In [None]:
# Train an ARIMA_PLUS model on the training table. The ID column built from
# "<Product_ID>_<List_Price_Converged>" is passed as the time-series id column.
query = f"""
create or replace model `{PROJECT_ID}.{DATASET}.bqml_arima`
options
 (model_type = 'ARIMA_PLUS',
  time_series_timestamp_col = 'Fiscal_Date',
  time_series_data_col = 'Invoiced_quantity_in_Pieces',
  time_series_id_col = 'ID'
 ) as
select
 Fiscal_Date,
 Concat(Product_ID,"_" ,Cast(List_Price_Converged as string)) as ID,
 Invoiced_quantity_in_Pieces
from
 `{DATASET}.{TRAINING_DATA_TABLE}`"""
query_job = client.query(query)
print(query_job.result())

## 从模型中生成预测
<a name="section-10"></a>

预测每个id未来30天的销售额，并保存到数据框中。

In [None]:
# BigQuery script: declare the horizon and confidence level, substitute them
# into an ML.FORECAST query with format(), and run it via EXECUTE IMMEDIATE.
query = f'''
DECLARE HORIZON STRING DEFAULT "30"; #number of values to forecast
DECLARE CONFIDENCE_LEVEL STRING DEFAULT "0.90"; ## required confidence level

EXECUTE IMMEDIATE format("""
    SELECT
      *
    FROM 
      ML.FORECAST(MODEL {DATASET}.bqml_arima, 
                  STRUCT(%s AS horizon, 
                         %s AS confidence_level)
                 )
    """,HORIZON,CONFIDENCE_LEVEL)'''
job = client.query(query)

# Download the forecast rows into a DataFrame and preview the first ones.
dfforecast = job.to_dataframe()
dfforecast.head()

## 解释结果以选择最佳价格
<a name="section-11"></a>

#### 计算预测期间的平均预测值。

In [None]:
# Mean forecast value per time-series ID, with ID kept as a regular column.
dfforecast_avg = dfforecast.groupby("ID", as_index=False)["forecast_value"].mean()

#### 从ID字段中提取ID和价格字段。

In [None]:
# ID has the form "<Product_ID>_<price>". Split on the LAST underscore only,
# so a product ID that itself contains "_" is not cut in the wrong place.
# The vectorized .str accessor replaces the two row-wise apply() calls.
id_parts = dfforecast_avg["ID"].str.rsplit("_", n=1, expand=True)
dfforecast_avg["Product_ID"] = id_parts[0]
dfforecast_avg["Price"] = id_parts[1]  # kept as a string, as before

#### 绘制产品价格与平均预测销售额的关系图。

In [None]:
# One bar chart per product: prices on the x-axis, bars ordered by the
# average forecast value.
for product_id in dfforecast_avg["Product_ID"].unique():
    product_rows = dfforecast_avg[dfforecast_avg["Product_ID"] == product_id]
    by_price = product_rows.set_index("Price").sort_values("forecast_value")
    by_price.plot(kind="bar")
    plt.title("Price vs. Average Sales for " + product_id)
    plt.show()

根据价格与平均预测订单的图表，可以看出：为了获得最大订单量，所考虑的每个`Product_ID`可以采用以下价格：

- SKU 107的价格范围可以在4.44 - 4.73单位之间
- SKU 140的价格可以是1.95单位
- SKU 62的价格可以是4.23单位

## 清理
<a name="section-12"></a>

要清理此项目中使用的所有谷歌云资源，您可以[删除用于本教程的谷歌云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。以下代码将删除整个数据集。

In [None]:
# Fully-qualified ID of the dataset to delete.
dataset_id = f"{PROJECT_ID}.{DATASET}"

# delete_contents=True deletes the dataset together with its contents;
# not_found_ok=True avoids an error if the dataset was already deleted.
client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)

print(f"Deleted dataset '{dataset_id}'.")