# Demand Forecasting with Vertex Forecast on Tabular Workflows

**Objectives**
* train with and forecast *Iowa liquor BigQuery public dataset*
* Use Tabular Workflows to orchestrate Vertex Forecast pipeline
* Track experiments
* Run model evalutaions for trained forecast models

**TODOs**
* `skip architecture search` in a retraining pipeline
* upload v2 of a model and its evals

In [39]:
# !pip3 install {USER_FLAG} google-cloud-aiplatform kfp google-cloud-pipeline-components --upgrade
# !pip3 install --no-cache-dir {USER_FLAG} PyYAML==5.3.1 

In [40]:
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.4.0
google_cloud_pipeline_components version: 2.3.0


## Load notebook config

> use the prefix defined in 00-env-setup

In [41]:
CREATE_NEW_ASSETS = True

In [42]:
# naming convention for all cloud resources
VERSION        = "v1"              # TODO
PREFIX         = f'forecast-refresh-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = forecast-refresh-v1


In [43]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# ! gcloud config set project $PROJECT_ID

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-gcs'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "forecast-refresh-v1"
VERSION                  = "v1"

BUCKET_NAME              = "forecast-refresh-v1-hybrid-vertex-gcs"
BUCKET_URI               = "gs://forecast-refresh-v1-hybrid-vertex-gcs"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://forecast-refresh-v1-hybrid-vertex-gcs/data"


VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"



In [44]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/


## Imports

In [45]:
EXPERIMENT_TAG     = "tide-prob-infer"
EXPERIMENT_VERSION = "v1"

EXPERIMENT_NAME = f"{EXPERIMENT_TAG}-{EXPERIMENT_VERSION}"

print(EXPERIMENT_NAME)

tide-prob-infer-v1


In [46]:
# Import required modules
import json
import datetime
from pprint import pprint
from typing import Any, Dict, List, Optional

from google.cloud import aiplatform, storage, bigquery

# from google_cloud_pipeline_components.types.artifact_types import VertexDataset
from google_cloud_pipeline_components.preview.automl.forecasting import \
    utils as automl_forecasting_utils


# Construct a BigQuery client object.
bq_client = bigquery.Client(project=PROJECT_ID)

aiplatform.init(
    experiment=EXPERIMENT_NAME, 
    project=PROJECT_ID, 
    location=REGION
)

import sys
sys.path.append("..")
from src import helpers

## Create BigQuery Dataset

In [47]:
BIGQUERY_DATASET_NAME = EXPERIMENT_NAME.replace("-","_")

if CREATE_NEW_ASSETS:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    ds.location = BQ_LOCATION
    ds = bq_client.create_dataset(dataset = ds, exists_ok = True)
    # print(ds.full_dataset_id)
else:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    
ds 

Dataset(DatasetReference('hybrid-vertex', 'tide_prob_infer_v1'))

## prepare train job

In [48]:
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
dataflow_subnetwork           = None 

# Specifies whether Dataflow workers use public IP addresses.
dataflow_use_public_ips       = True

NOW                           = datetime.datetime.now().strftime("%d %H:%M:%S.%f").replace(" ","").replace(":","_").replace(".","_")
ROOT_DIR                      = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/run-{NOW}"
time_column                   = "date"
time_series_identifier_column = "store_name"
target_column                 = "sale_dollars"
data_source_csv_filenames     = None

print(f"ROOT_DIR = {ROOT_DIR}")

ROOT_DIR = gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-prob-infer-v1/run-0302_32_26_686468


In [49]:
data_source_bigquery_table_path = (
    "bq://bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train"
)

training_fraction = 0.8
validation_fraction = 0.1
test_fraction = 0.1

predefined_split_key = None
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None

weight_column = None

features = [
    time_column,
    target_column,
    "city",
    "zip_code",
    "county",
]

available_at_forecast_columns = [time_column]
unavailable_at_forecast_columns = [target_column]
time_series_attribute_columns = ["city", "zip_code", "county"]

forecast_horizon = 50
context_window = 50

print(f"available_at_forecast_columns    = {available_at_forecast_columns}")
print(f"unavailable_at_forecast_columns  = {unavailable_at_forecast_columns}")
print(f"time_series_attribute_columns    = {time_series_attribute_columns}")

available_at_forecast_columns    = ['date']
unavailable_at_forecast_columns  = ['sale_dollars']
time_series_attribute_columns    = ['city', 'zip_code', 'county']


In [50]:
transformations = helpers.generate_transformation(auto_column_names=features)

print(f"transformations       = {transformations}\n")

transformations       = {'auto': ['date', 'sale_dollars', 'city', 'zip_code', 'county'], 'numeric': [], 'categorical': [], 'text': [], 'timestamp': []}



In [51]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI/

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/


# Vertex Forecast Training

**Optimization Objectives** ([docs](https://cloud.google.com/vertex-ai/docs/tabular-data/forecasting-parameters#optimization-objectives))

| Objective  | API                      | Use case |
| :--------: | :------------:           | :------------------------------------- |
| RMSE       | `minimize-rmse`          | Minimize root-mean-squared error (RMSE). Captures more extreme values accurately and is less biased when aggregating predictions.Default value. |
| MAE        | `minimize-mae`           | Minimize mean-absolute error (MAE). Views extreme values as outliers with less impact on model. |
| RMSLE      | `minimize-rmsle`         | Minimize root-mean-squared log error (RMSLE). Penalizes error on relative size rather than absolute value. Useful when both predicted and actual values can be large. |
| RMSPE      | `minimize-rmspe`         | Minimize root-mean-squared percentage error (RMSPE). Captures a large range of values accurately. Similar to RMSE, but relative to target magnitude. Useful when the range of values is large. |
| WAPE       | `minimize-wape-mae`      | Minimize the combination of weighted absolute percentage error (WAPE) and mean-absolute-error (MAE). Useful when the actual values are low. |
| QUANTILE   | `minimize-quantile-loss` | Minimize the scaled pinball loss of the defined quantiles to quantify uncertainty in estimates. Quantile predictions quantify the uncertainty of predictions. They measure the likelihood of a prediction being within a range. |


**TiDE on Vertex Tabluar Workflows**
* [src](https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/preview/automl/forecasting/utils.py#L413)

#### TODO

* add `with dsl.ParallelFor(LIST) as cw:` for parallel jobs with diff params (e.g., statmike [example](https://github.com/statmike/vertex-ai-mlops/blob/main/Applied%20Forecasting/Vertex%20AI%20Pipelines%20-%20Forecasting%20Tournament%20with%20Kubeflow%20Pipelines%20(KFP).ipynb)

In [52]:
# Number of weak models in the final ensemble model.
num_selected_trials           = 3
train_budget_milli_node_hours = 250

optimization_objective        = "minimize-wape-mae" # "minimize-quantile-loss" 

RUN_EVALUATION                = True

PROBABILISTIC_INFER           = True
QUANTILES                     = [0.05, 0.25, 0.50, 0.75, 0.95]

JOB_ID                        = f"{EXPERIMENT_NAME}-{NOW}".replace("_","-")

print(f"JOB_ID = {JOB_ID}")

JOB_ID = tide-prob-infer-v1-0302-32-26-686468


In [53]:
(
    template_path,
    parameter_values,
) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=ROOT_DIR,
    target_column=target_column,
    optimization_objective=optimization_objective,
    transformations=transformations,
    train_budget_milli_node_hours=train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    num_selected_trials=num_selected_trials,
    time_column=time_column,
    time_series_identifier_columns=[time_series_identifier_column],
    time_series_attribute_columns=time_series_attribute_columns,
    available_at_forecast_columns=available_at_forecast_columns,
    unavailable_at_forecast_columns=unavailable_at_forecast_columns,
    forecast_horizon=forecast_horizon,
    context_window=context_window,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
    run_evaluation=RUN_EVALUATION,                          # set True to eval on test/valid set
    evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}',
    enable_probabilistic_inference=PROBABILISTIC_INFER,
    ### quantile forecast
    quantiles=QUANTILES,
    
    ### hierarchical forecast
    # group_columns=XXXX,
    # group_total_weight=XXXX,
    # temporal_total_weight=XXXX,
    # group_temporal_total_weight=XXXX,
)

job = aiplatform.PipelineJob(
    display_name=JOB_ID,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=JOB_ID,
    pipeline_root=ROOT_DIR,
    parameter_values=parameter_values,
    enable_caching=False,
)

job.submit(
    experiment=EXPERIMENT_NAME,
    # sync=False,
    service_account=VERTEX_SA,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/tide-prob-infer-v1-0302-32-26-686468
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/tide-prob-infer-v1-0302-32-26-686468')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/tide-prob-infer-v1-0302-32-26-686468?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/tide-prob-infer-v1-0302-32-26-686468 to Experiment: tide-prob-infer-v1


In [54]:
pipeline_task_details = job.task_details

for task_deets in pipeline_task_details:
    print(task_deets.task_name)

exit-handler-1
model-batch-predict-2
training-configurator-and-validator
condition-4
condition-5
feature-attribution-2
automl-forecasting-stage-1-tuner
model-evaluation-import-2
split-materialized-data
get-prediction-image-uri-2
get-predictions-column-2
automl-tabular-finalizer
model-batch-explanation-2
string-not-empty
model-upload-2
condition-2
calculate-training-parameters-2
table-to-uri-2
set-optional-inputs
automl-forecasting-ensemble-2
model-evaluation-forecasting-2
tide-prob-infer-v1-0302-32-26-686468
feature-transform-engine
get-or-create-model-description-2
finalize-eval-quantile-parameters-2


### Get trained model

In [55]:
stage_1_tuner_task = helpers.get_task_detail(
    pipeline_task_details, "automl-forecasting-stage-1-tuner"
)
stage_1_tuning_result_artifact_uri = (
    stage_1_tuner_task.outputs["tuning_result_output"].artifacts[0].uri
)
print(f"stage_1_tuning_result_artifact_uri: {stage_1_tuning_result_artifact_uri}")

# get uploaded model
upload_model_task = helpers.get_task_detail(
    pipeline_task_details, "model-upload-2"
)

forecasting_mp_model_artifact = (
    upload_model_task.outputs["model"].artifacts[0]
)

forecasting_mp_model = aiplatform.Model(forecasting_mp_model_artifact.metadata['resourceName'])
print(f"forecasting_mp_model: {forecasting_mp_model}")

stage_1_tuning_result_artifact_uri: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-prob-infer-v1/run-0302_32_26_686468/934903580331/tide-prob-infer-v1-0302-32-26-686468/automl-forecasting-stage-1-tuner_-3896564892672458752/tuning_result_output
forecasting_mp_model: <google.cloud.aiplatform.models.Model object at 0x7fbc04f929e0> 
resource name: projects/934903580331/locations/us-central1/models/3100286339770220544


### Model Evaluations

In [88]:
if RUN_EVALUATION:
    forecast_EVALS = forecasting_mp_model.list_model_evaluations()

    for model_evaluation in forecast_EVALS:
        pprint(model_evaluation.to_dict())
        
else:
    print(f"Model evaluations were set to: {RUN_EVALUATION}")

{'createTime': '2024-01-03T03:46:23.564344Z',
 'displayName': 'Vertex Forecasting pipeline',
 'metadata': {'evaluation_dataset_path': ['bq://hybrid-vertex.vertex_feature_transform_engine_staging_us.vertex_ai_fte_split_output_test_staging_ide6c7363c89364430953edb41ebcf094a'],
              'evaluation_dataset_type': 'bigquery',
              'pipeline_job_id': '95635796261863424',
              'pipeline_job_resource_name': 'projects/934903580331/locations/us-central1/pipelineJobs/tide-prob-infer-v1-0302-32-26-686468'},
 'metrics': {'meanAbsoluteError': 4866.9663,
             'meanAbsolutePercentageError': 310.6815,
             'quantileMetrics': [{'observedQuantile': 0.058108253524157966,
                                  'quantile': 0.05,
                                  'scaledPinballLoss': 398.75778},
                                 {'observedQuantile': 0.1049714839126224,
                                  'quantile': 0.25,
                                  'scaledPinballLoss': 

In [89]:
if RUN_EVALUATION:
    # Get evaluations
    model_evaluations = forecasting_mp_model.list_model_evaluations()

    # Print the evaluation metrics
    for evaluation in model_evaluations:
        evaluation = evaluation.to_dict()
        print("Model's evaluation metrics from training:\n")
        metrics = evaluation["metrics"]
        for metric in metrics.keys():
            print(f"metric: {metric}, value: {metrics[metric]}\n")

Model's evaluation metrics from training:

metric: meanAbsoluteError, value: 4866.9663

metric: weightedAbsolutePercentageError, value: 57.549328

metric: rSquared, value: 0.36727983

metric: meanAbsolutePercentageError, value: 310.6815

metric: rootMeanSquaredPercentageError, value: 3845.579

metric: rootMeanSquaredError, value: 10987.61

metric: quantileMetrics, value: [{'scaledPinballLoss': 398.75778, 'quantile': 0.05, 'observedQuantile': 0.058108253524157966}, {'quantile': 0.25, 'scaledPinballLoss': 1649.602, 'observedQuantile': 0.1049714839126224}, {'observedQuantile': 0.3444528139459809, 'scaledPinballLoss': 2433.4834, 'quantile': 0.5}, {'observedQuantile': 0.6726030345421284, 'quantile': 0.75, 'scaledPinballLoss': 1928.8099}, {'scaledPinballLoss': 1240.8347, 'quantile': 0.95, 'observedQuantile': 0.8080813515549338}]

metric: rootMeanSquaredLogError, value: 1.0028754



## View prediction results

**Working with quantiles / prediction intervals**
* see this guide for details: [Example batch prediction output for a quantile-loss optimized model](https://cloud.google.com/vertex-ai/docs/tabular-data/tabular-workflows/forecasting-batch-predictions#example_batch_prediction_output_for_a_quantile-loss_optimized_model)
* `predicted_sales.quantile_values` will give the quantiles, i.e. `[0.1, 0.3, 0.5, 0.7, 0.9]`
* `predicted_sales.quantile_predictions` will be an array of the same length with matching predictions
* There is also a field `predicted_sales.value` which is just the prediction for the 0.5 quantile (median)


**Different statistics can be estimated from the quantiles, including statistics that minimize:**

* RMSE (weighted mean of quantile values)
* MAPE (median weighted by 1/value)
* MAE (median)

Use the BigQuery Python client to query the destination table and return results as a Pandas dataframe.

In [90]:
batch_pred_task = helpers.get_task_detail(
    pipeline_task_details, "model-batch-predict-2"
)

batch_pred_task_result_artifact_uri = (
    batch_pred_task.outputs["bigquery_output_table"].artifacts[0].uri
)

BP_PREDS_DATASET_NAME = ds.dataset_id
BP_PREDS_TABLE_NAME   = batch_pred_task_result_artifact_uri.split('/')[-1]

print(f"BP_PREDS_DATASET_NAME : {BP_PREDS_DATASET_NAME}")
print(f"BP_PREDS_TABLE_NAME   : {BP_PREDS_TABLE_NAME}")

BP_PREDS_DATASET_NAME : tide_prob_infer_v1
BP_PREDS_TABLE_NAME   : predictions_2024_01_02T19_20_11_716Z_405


In [91]:
batch_predict_bq_output_uri = "{}.{}".format(
    BP_PREDS_DATASET_NAME,
    BP_PREDS_TABLE_NAME
)

print(f"batch_predict_bq_output_uri : {batch_predict_bq_output_uri}")

batch_predict_bq_output_uri : tide_prob_infer_v1.predictions_2024_01_02T19_20_11_716Z_405


In [92]:
def _get_quantile_strings(quantile_list):
    
    cleaned_list = []
    
    for ele in quantile_list:
        if str(ele).startswith("0."):
            # cleaned_list.append(str(round(ele, 2)).replace("0.",""))
            cleaned_list.append(('{0:.2f}'.format(ele)).replace("0.",""))
                
        if str(ele).startswith("."):
            # cleaned_list.append(str(round(ele, 2)).replace(".",""))
            cleaned_list.append(('{0:.2f}'.format(ele)).replace("0.",""))
    
    return cleaned_list

cleaned_quantile_list = _get_quantile_strings(QUANTILES)

print(f"# of Quantiles        : {len(QUANTILES)}")
print(f"Quantiles             : {QUANTILES}")
print(f"cleaned_quantile_list : {cleaned_quantile_list}")

# of Quantiles        : 5
Quantiles             : [0.05, 0.25, 0.5, 0.75, 0.95]
cleaned_quantile_list : ['05', '25', '50', '75', '95']


In [94]:
quantile_string = ""
TARGET_COLUMN = "sale_dollars"

for i in range(0, len(cleaned_quantile_list)):
    quantile_string += f" predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET({i})] AS predicted_{TARGET_COLUMN}_p{cleaned_quantile_list[i]},"
    
# quantile_string


query = f"""
SELECT
 *EXCEPT(predicted_{TARGET_COLUMN}),
 predicted_{TARGET_COLUMN}.value AS predicted_sales_mean,
 {quantile_string}
FROM
 `{PROJECT_ID}.{batch_predict_bq_output_uri}`
 LIMIT 100
"""
print(query)


SELECT
 *EXCEPT(predicted_sale_dollars),
 predicted_sale_dollars.value AS predicted_sales_mean,
  predicted_sale_dollars.quantile_predictions[OFFSET(0)] AS predicted_sale_dollars_p05, predicted_sale_dollars.quantile_predictions[OFFSET(1)] AS predicted_sale_dollars_p25, predicted_sale_dollars.quantile_predictions[OFFSET(2)] AS predicted_sale_dollars_p50, predicted_sale_dollars.quantile_predictions[OFFSET(3)] AS predicted_sale_dollars_p75, predicted_sale_dollars.quantile_predictions[OFFSET(4)] AS predicted_sale_dollars_p95,
FROM
 `hybrid-vertex.tide_prob_infer_v1.predictions_2024_01_02T19_20_11_716Z_405`
 LIMIT 100



In [95]:
qs_eval = bq_client.query(query).to_dataframe()

qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")

qs_eval.head(3)

Unnamed: 0,HORIZON__sale_dollars,city,county,date,predicted_on_date,sale_dollars,store_name,window__ide6c7363c89364430953edb41ebcf094a,zip_code,predicted_sales_mean,predicted_sale_dollars_p05,predicted_sale_dollars_p25,predicted_sale_dollars_p50,predicted_sale_dollars_p75,predicted_sale_dollars_p95
0,1048.08,Adair,ADAIR,2020-12-18,2020-12-18,,KUM & GO #76 / ADAIR,True,50002.0,1213.5,593.0,923.0,1214.0,1541.0,2023.0
1,1718.4,Adair,ADAIR,2020-12-04,2020-12-04,,KUM & GO #76 / ADAIR,True,50002.0,1211.0,588.0,921.0,1212.0,1540.0,2107.0
2,1681.86,Adair,ADAIR,2020-12-11,2020-12-04,,KUM & GO #76 / ADAIR,True,50002.0,1184.5,550.0,872.0,1185.0,1573.0,2154.0


**Finished**