# Demand Forecasting with Vertex Forecast on Tabular Workflows

**Objectives**
* train with and forecast *Iowa liquor BigQuery public dataset*
* Use Tabular Workflows to orchestrate Vertex Forecast pipeline
* Track experiments
* Run model evalutaions for trained forecast models

**TODOs**
* `skip architecture search` in a retraining pipeline
* upload v2 of a model and its evals

In [1]:
# !pip3 install {USER_FLAG} google-cloud-aiplatform kfp google-cloud-pipeline-components --upgrade
# !pip3 install --no-cache-dir {USER_FLAG} PyYAML==5.3.1 

In [2]:
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.4.0
google_cloud_pipeline_components version: 2.3.0


## Load notebook config

> use the prefix defined in 00-env-setup

In [3]:
CREATE_NEW_ASSETS = True

In [4]:
# naming convention for all cloud resources
VERSION        = "v1"              # TODO
PREFIX         = f'forecast-refresh-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = forecast-refresh-v1


In [5]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# ! gcloud config set project $PROJECT_ID

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-gcs'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "forecast-refresh-v1"
VERSION                  = "v1"

BUCKET_NAME              = "forecast-refresh-v1-hybrid-vertex-gcs"
BUCKET_URI               = "gs://forecast-refresh-v1-hybrid-vertex-gcs"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://forecast-refresh-v1-hybrid-vertex-gcs/data"


VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"



In [6]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/


## Imports

In [7]:
EXPERIMENT_TAG     = "tide-twrkflow-eval"
EXPERIMENT_VERSION = "v1"

EXPERIMENT_NAME = f"{EXPERIMENT_TAG}-{EXPERIMENT_VERSION}"

print(EXPERIMENT_NAME)

tide-twrkflow-eval-v1


In [22]:
# Import required modules
import json
import datetime
from pprint import pprint
from typing import Any, Dict, List, Optional

from google.cloud import aiplatform, storage, bigquery

# from google_cloud_pipeline_components.types.artifact_types import VertexDataset
from google_cloud_pipeline_components.preview.automl.forecasting import \
    utils as automl_forecasting_utils


# Construct a BigQuery client object.
bq_client = bigquery.Client(project=PROJECT_ID)

aiplatform.init(
    experiment=EXPERIMENT_NAME, 
    project=PROJECT_ID, 
    location=REGION
)

import sys
sys.path.append("..")
from src import helpers

## Create BigQuery Dataset

In [10]:
BIGQUERY_DATASET_NAME = EXPERIMENT_NAME.replace("-","_")

if CREATE_NEW_ASSETS:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    ds.location = BQ_LOCATION
    ds = bq_client.create_dataset(dataset = ds, exists_ok = False)
    # print(ds.full_dataset_id)
else:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    
ds 
# ds.dataset_id
# ds.full_dataset_id

Dataset(DatasetReference('hybrid-vertex', 'tide_twrkflow_eval_v1'))

## prepare train job

In [13]:
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
dataflow_subnetwork           = None 

# Specifies whether Dataflow workers use public IP addresses.
dataflow_use_public_ips       = True

NOW                           = datetime.datetime.now().strftime("%d %H:%M:%S.%f").replace(" ","").replace(":","_").replace(".","_")
ROOT_DIR                      = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/run-{NOW}"
time_column                   = "date"
time_series_identifier_column = "store_name"
target_column                 = "sale_dollars"
data_source_csv_filenames     = None

print(f"ROOT_DIR = {ROOT_DIR}")

ROOT_DIR = gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-twrkflow-eval-v1/run-2907_22_44_752082


In [14]:
data_source_bigquery_table_path = (
    "bq://bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train"
)

training_fraction = 0.8
validation_fraction = 0.1
test_fraction = 0.1

predefined_split_key = None
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None

weight_column = None

features = [
    time_column,
    target_column,
    "city",
    "zip_code",
    "county",
]

available_at_forecast_columns = [time_column]
unavailable_at_forecast_columns = [target_column]
time_series_attribute_columns = ["city", "zip_code", "county"]

forecast_horizon = 150
context_window = 150

print(f"available_at_forecast_columns    = {available_at_forecast_columns}")
print(f"unavailable_at_forecast_columns  = {unavailable_at_forecast_columns}")
print(f"time_series_attribute_columns    = {time_series_attribute_columns}")

available_at_forecast_columns    = ['date']
unavailable_at_forecast_columns  = ['sale_dollars']
time_series_attribute_columns    = ['city', 'zip_code', 'county']


In [15]:
# transformations = helpers.generate_auto_transformation(features)
transformations = helpers.generate_transformation(auto_column_names=features)

# TRANSFORM_CONFIG_PATH = f"{ROOT_DIR}/transform_config_{NOW}.json"
# TRANSFORM_CONFIG_PATH = "gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/run-28ec73a7-646e-420b-b883-2aa16ea2e518/transform_config_40ac07bd-c92b-4914-beda-18a382062acd.json"

print(f"transformations       = {transformations}\n")
# print(f"TRANSFORM_CONFIG_PATH = {TRANSFORM_CONFIG_PATH}")

# helpers.write_to_gcs(TRANSFORM_CONFIG_PATH, json.dumps(transformations))

transformations       = {'auto': ['date', 'sale_dollars', 'city', 'zip_code', 'county'], 'numeric': [], 'categorical': [], 'text': [], 'timestamp': []}



In [16]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI/

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/


# Vertex Forecast Training

**Optimization Objectives** ([docs](https://cloud.google.com/vertex-ai/docs/tabular-data/forecasting-parameters#optimization-objectives))

| Objective  | API                      | Use case |
| :--------: | :------------:           | :------------------------------------- |
| RMSE       | `minimize-rmse`          | Minimize root-mean-squared error (RMSE). Captures more extreme values accurately and is less biased when aggregating predictions.Default value. |
| MAE        | `minimize-mae`           | Minimize mean-absolute error (MAE). Views extreme values as outliers with less impact on model. |
| RMSLE      | `minimize-rmsle`         | Minimize root-mean-squared log error (RMSLE). Penalizes error on relative size rather than absolute value. Useful when both predicted and actual values can be large. |
| RMSPE      | `minimize-rmspe`         | Minimize root-mean-squared percentage error (RMSPE). Captures a large range of values accurately. Similar to RMSE, but relative to target magnitude. Useful when the range of values is large. |
| WAPE       | `minimize-wape-mae`      | Minimize the combination of weighted absolute percentage error (WAPE) and mean-absolute-error (MAE). Useful when the actual values are low. |
| QUANTILE   | `minimize-quantile-loss` | Minimize the scaled pinball loss of the defined quantiles to quantify uncertainty in estimates. Quantile predictions quantify the uncertainty of predictions. They measure the likelihood of a prediction being within a range. |


**TiDE on Vertex Tabluar Workflows**
* [src](https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/preview/automl/forecasting/utils.py#L413)

#### TODO

* add `with dsl.ParallelFor(LIST) as cw:` for parallel jobs with diff params (e.g., statmike [example](https://github.com/statmike/vertex-ai-mlops/blob/main/Applied%20Forecasting/Vertex%20AI%20Pipelines%20-%20Forecasting%20Tournament%20with%20Kubeflow%20Pipelines%20(KFP).ipynb)

In [17]:
# Number of weak models in the final ensemble model.
num_selected_trials           = 5
train_budget_milli_node_hours = 500  # 30 minutes

optimization_objective        = "minimize-wape-mae" 

RUN_EVALUATION                = True

PROBABILISTIC_INFER           = False
# QUANTILES                     = [0.25, 0.5, 0.9] # [0.05, 0.25, 0.50, 0.75, 0.95]

JOB_ID                        = f"tide-{EXPERIMENT_NAME}"

print(f"JOB_ID = {JOB_ID}")

JOB_ID = tide-tide-twrkflow-eval-v1


## (1) TiDE - full AutoML train & eval

TiDE stands for "Time series Dense Encoder", which is a new model type in Vertex Forecasting and has the best training and inference performance while not sacrificing any model quality.

For more details, please see https://ai.googleblog.com/2023/04/recent-advances-in-deep-long-horizon.html

You will create a skip evaluation AutoML Forecasting pipeline with the following customizations:
- Limit the hyperparameter search space
- Change machine type and tuning / training parallelism

In [18]:
(
    template_path,
    parameter_values,
) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=ROOT_DIR,
    target_column=target_column,
    optimization_objective=optimization_objective,
    transformations=transformations,
    train_budget_milli_node_hours=train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    num_selected_trials=num_selected_trials,
    time_column=time_column,
    time_series_identifier_columns=[time_series_identifier_column],
    time_series_attribute_columns=time_series_attribute_columns,
    available_at_forecast_columns=available_at_forecast_columns,
    unavailable_at_forecast_columns=unavailable_at_forecast_columns,
    forecast_horizon=forecast_horizon,
    context_window=context_window,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
    run_evaluation=RUN_EVALUATION,                          # set True to eval on test/valid set
    evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}',
    enable_probabilistic_inference=PROBABILISTIC_INFER,
    
    ### quantile forecast
    # quantiles=QUANTILES,
    
    ### hierarchical forecast
    # group_columns=XXXX,
    # group_total_weight=XXXX,
    # temporal_total_weight=XXXX,
    # group_temporal_total_weight=XXXX,
)

# job_id = "tide-forecasting-probabilistic-inference-{}".format(uuid.uuid4())
job = aiplatform.PipelineJob(
    display_name=JOB_ID,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=JOB_ID,
    pipeline_root=ROOT_DIR,
    parameter_values=parameter_values,
    enable_caching=False,
)

# job.run(sync=False,experiment=EXPERIMENT_NAME)
job.submit(
    experiment=EXPERIMENT_NAME,
    # sync=False,
    service_account=VERTEX_SA,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/tide-tide-twrkflow-eval-v1
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/tide-tide-twrkflow-eval-v1')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/tide-tide-twrkflow-eval-v1?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/tide-tide-twrkflow-eval-v1 to Experiment: tide-twrkflow-eval-v1


In [60]:
# job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/prob-infer-forecast-refresh-v1-v1')

In [19]:
pipeline_task_details = job.task_details

for task_deets in pipeline_task_details:
    print(task_deets.task_name)

feature-transform-engine
calculate-training-parameters-2
get-predictions-column-2
exit-handler-1
model-upload-2
model-evaluation-import-2
string-not-empty
get-or-create-model-description-2
model-batch-predict-2
tide-tide-twrkflow-eval-v1
feature-attribution-2
condition-2
automl-forecasting-ensemble-2
condition-4
automl-forecasting-stage-1-tuner
finalize-eval-quantile-parameters-2
automl-tabular-finalizer
training-configurator-and-validator
get-prediction-image-uri-2
model-evaluation-forecasting-2
split-materialized-data
condition-5
model-batch-explanation-2
set-optional-inputs
table-to-uri-2


### Get trained model

In [20]:
stage_1_tuner_task = helpers.get_task_detail(
    pipeline_task_details, "automl-forecasting-stage-1-tuner"
)
stage_1_tuning_result_artifact_uri = (
    stage_1_tuner_task.outputs["tuning_result_output"].artifacts[0].uri
)
print(f"stage_1_tuning_result_artifact_uri: {stage_1_tuning_result_artifact_uri}")

# get uploaded model
upload_model_task = helpers.get_task_detail(
    pipeline_task_details, "model-upload-2"
)

forecasting_mp_model_artifact = (
    upload_model_task.outputs["model"].artifacts[0]
)

forecasting_mp_model = aiplatform.Model(forecasting_mp_model_artifact.metadata['resourceName'])
print(f"forecasting_mp_model: {forecasting_mp_model}")

stage_1_tuning_result_artifact_uri: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-twrkflow-eval-v1/run-2907_22_44_752082/934903580331/tide-tide-twrkflow-eval-v1/automl-forecasting-stage-1-tuner_206416678001573888/tuning_result_output
forecasting_mp_model: <google.cloud.aiplatform.models.Model object at 0x7f639b10d7e0> 
resource name: projects/934903580331/locations/us-central1/models/7688328460153913344


### Model Evaluations

In [23]:
if RUN_EVALUATION:
    forecast_EVALS = forecasting_mp_model.list_model_evaluations()

    for model_evaluation in forecast_EVALS:
        pprint(model_evaluation.to_dict())
        
else:
    print(f"Model evaluations were set to: {RUN_EVALUATION}")

{'createTime': '2023-12-29T08:52:40.959126Z',
 'displayName': 'Vertex Forecasting pipeline',
 'metadata': {'evaluation_dataset_path': ['bq://hybrid-vertex.vertex_feature_transform_engine_staging_us.vertex_ai_fte_split_output_test_staging_id83e65c1dc39241eca34c763520380490'],
              'evaluation_dataset_type': 'bigquery',
              'pipeline_job_id': '8418345082247184384',
              'pipeline_job_resource_name': 'projects/934903580331/locations/us-central1/pipelineJobs/tide-tide-twrkflow-eval-v1'},
 'metrics': {'meanAbsoluteError': 4118.861,
             'meanAbsolutePercentageError': 390.12265,
             'rSquared': 0.55656505,
             'rootMeanSquaredError': 9204.42,
             'rootMeanSquaredLogError': 0.9453542,
             'rootMeanSquaredPercentageError': 5200.477,
             'weightedAbsolutePercentageError': 48.703373},
 'metricsSchemaUri': 'gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml',
 'modelExplanation': {'mea

In [24]:
if RUN_EVALUATION:
    # Get evaluations
    model_evaluations = forecasting_mp_model.list_model_evaluations()

    # Print the evaluation metrics
    for evaluation in model_evaluations:
        evaluation = evaluation.to_dict()
        print("Model's evaluation metrics from training:\n")
        metrics = evaluation["metrics"]
        for metric in metrics.keys():
            print(f"metric: {metric}, value: {metrics[metric]}\n")

Model's evaluation metrics from training:

metric: rSquared, value: 0.55656505

metric: rootMeanSquaredError, value: 9204.42

metric: meanAbsoluteError, value: 4118.861

metric: weightedAbsolutePercentageError, value: 48.703373

metric: rootMeanSquaredPercentageError, value: 5200.477

metric: meanAbsolutePercentageError, value: 390.12265

metric: rootMeanSquaredLogError, value: 0.9453542



### (Optional) Log pipeline to Experiment Run

In [25]:
# def log_pipeline_job_sample(
#     experiment_name: str,
#     run_name: str,
#     pipeline_job: aiplatform.PipelineJob,
#     project: str,
#     location: str,
# ):
#     aiplatform.init(experiment=experiment_name, project=project, location=location)

#     aiplatform.start_run(run=run_name, resume=True)

#     aiplatform.log(pipeline_job=pipeline_job)

## (2) TiDE - skip architecture search

Instead of doing architecture search everytime, we can reuse the existing architecture search result. This could help:
1. reducing the variation of the output model
2. reducing training cost

The existing architecture search result is stored in the `tuning_result_output` output of the `automl-forecasting-stage-1-tuner` component. You can manually input it or get it programmatically.

**New Parameter**
* `stage_1_tuning_result_artifact_uri` (str): - (Optional) URI of the hyperparameter tuning result from a previous pipeline run.

#### TODO

1. First test passing just experiement name in pipeline job:

```
job.submit(
    experiment=EXPERIMENT_NAME,
)
```

2. Check experiments & model eval compare
3. Specify `EXPERIMENT_RUN_NAME` is output in (2) not right

In [26]:
JOB_ID   = f"tide-skip-arch-{EXPERIMENT_NAME}-v1"

NOW      = datetime.datetime.now().strftime("%d %H:%M:%S.%f").replace(" ","").replace(":","_").replace(".","_")
ROOT_DIR = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/run-{NOW}"

print(f"JOB_ID: {JOB_ID}")
print(f"ROOT_DIR: {ROOT_DIR}")

print(forecasting_mp_model)
print(stage_1_tuning_result_artifact_uri)

JOB_ID: tide-skip-arch-tide-twrkflow-eval-v1-v1
<google.cloud.aiplatform.models.Model object at 0x7f639b10d7e0> 
resource name: projects/934903580331/locations/us-central1/models/7688328460153913344
ROOT_DIR: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-twrkflow-eval-v1/run-2909_01_42_271952


In [29]:
# Number of weak models in the final ensemble model.
num_selected_trials = 5

train_budget_milli_node_hours = 250.0  # 15 minutes

(
    template_path,
    parameter_values,
) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=ROOT_DIR,
    target_column=target_column,
    optimization_objective=optimization_objective,
    transformations=transformations,
    train_budget_milli_node_hours=train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    num_selected_trials=num_selected_trials,
    time_column=time_column,
    time_series_identifier_columns=[time_series_identifier_column],
    time_series_attribute_columns=time_series_attribute_columns,
    available_at_forecast_columns=available_at_forecast_columns,
    unavailable_at_forecast_columns=unavailable_at_forecast_columns,
    forecast_horizon=forecast_horizon,
    context_window=context_window,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
    stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
    run_evaluation=RUN_EVALUATION,
    evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}',
    enable_probabilistic_inference=PROBABILISTIC_INFER,
)

# job_id = "tide-forecasting-skip-architecture-search-{}".format(uuid.uuid4())
job = aiplatform.PipelineJob(
    display_name=JOB_ID,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=JOB_ID,
    pipeline_root=ROOT_DIR,
    parameter_values=parameter_values,
    enable_caching=False,
)

# job.run(sync=False,experiment=EXPERIMENT_NAME)
job.submit(
    experiment=EXPERIMENT_NAME,
    # sync=False,
    service_account=VERTEX_SA,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/tide-skip-arch-tide-twrkflow-eval-v1-v1
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/tide-skip-arch-tide-twrkflow-eval-v1-v1')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/tide-skip-arch-tide-twrkflow-eval-v1-v1?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/tide-skip-arch-tide-twrkflow-eval-v1-v1 to Experiment: tide-twrkflow-eval-v1


In [49]:
stage_1_tuning_result_artifact_uri

'gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-twrkflow-eval-v1/run-2907_22_44_752082/934903580331/tide-tide-twrkflow-eval-v1/automl-forecasting-stage-1-tuner_206416678001573888/tuning_result_output'

In [32]:
skip_arch_search_pipeline_task_details = job.task_details

for task_deets in skip_arch_search_pipeline_task_details:
    print(task_deets.task_name)

automl-tabular-finalizer
automl-forecasting-ensemble
model-evaluation-forecasting
tide-skip-arch-tide-twrkflow-eval-v1-v1
get-or-create-model-description
feature-transform-engine
finalize-eval-quantile-parameters
set-optional-inputs
calculate-training-parameters
table-to-uri
condition-2
importer
string-not-empty
model-batch-explanation
training-configurator-and-validator
model-batch-predict
model-upload
condition-3
condition-4
feature-attribution
exit-handler-1
automl-forecasting-stage-2-tuner
model-evaluation-import
get-prediction-image-uri
split-materialized-data
get-predictions-column


### Get trained model

In [None]:
# get tuning stage task
stage_2_tuner_task = helpers.get_task_detail(
    skip_arch_search_pipeline_task_details, "automl-forecasting-stage-2-tuner"
)
stage_2_tuning_result_artifact_uri = stage_2_tuner_task.outputs["tuning_result_output"].artifacts[0].uri
print(f"stage-2 result URI     : \n{stage_2_tuning_result_artifact_uri}\n")

In [46]:
# get uploaded model
upload_model_task_v2 = helpers.get_task_detail(
    skip_arch_search_pipeline_task_details, "model-upload"
)
forecasting_mp_model_v2_artifact = upload_model_task_v2.outputs["model"].artifacts[0]

forecasting_mp_model_v2 = aiplatform.Model(forecasting_mp_model_v2_artifact.metadata['resourceName'])
print(f"forecasting_mp_model_v2 : \n{forecasting_mp_model_v2}")

stage-2 result URI     : 
gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-twrkflow-eval-v1/run-2909_01_42_271952/934903580331/tide-skip-arch-tide-twrkflow-eval-v1-v1/automl-forecasting-stage-2-tuner_5386119199431065600/tuning_result_output

forecasting_mp_model_v2 : 
<google.cloud.aiplatform.models.Model object at 0x7f639b1c0b20> 
resource name: projects/934903580331/locations/us-central1/models/3016406796710445056


In [52]:
# get values for stage-2 trials
for task_deets in skip_arch_search_pipeline_task_details:
    if task_deets.task_name == "tide-skip-arch-tide-twrkflow-eval-v1-v1":
        # break
        stage_2_parallel_trials = task_deets.execution.metadata.get(key="input:stage_2_num_parallel_trials")
        stage_2_worker_pool_spec = task_deets.execution.metadata.get(key="input:stage_2_trainer_worker_pool_specs_override")
    
print(f"stage_2_parallel_trials  : {stage_2_parallel_trials}")
print(f"stage_2_worker_pool_spec : {stage_2_worker_pool_spec}")

task_id: -6674520602667122688
task_name: "tide-skip-arch-tide-twrkflow-eval-v1-v1"
create_time {
  seconds: 1703840910
  nanos: 506927000
}
start_time {
  seconds: 1703840911
  nanos: 239967000
}
end_time {
  seconds: 1703845862
  nanos: 413734000
}
executor_detail {
}
state: SUCCEEDED
execution {
  name: "projects/934903580331/locations/us-central1/metadataStores/default/executions/15616573050056497927"
  display_name: "tide-skip-arch-tide-twrkflow-eval-v1-v1"
  state: COMPLETE
  etag: "1703845862230"
  create_time {
    seconds: 1703840910
    nanos: 962000000
  }
  update_time {
    seconds: 1703845862
    nanos: 230000000
  }
  schema_title: "system.Run"
  schema_version: "0.0.1"
  metadata {
    fields {
      key: "input:available_at_forecast_columns"
      value {
        list_value {
          values {
            string_value: "date"
          }
        }
      }
    }
    fields {
      key: "input:context_window"
      value {
        number_value: 150.0
      }
    }
    fi

### Model Evaluations

In [47]:
if RUN_EVALUATION:
    forecast_EVALS = forecasting_mp_model_v2.list_model_evaluations()

    for model_evaluation in forecast_EVALS:
        pprint(model_evaluation.to_dict())
        
else:
    print(f"Model evaluations were set to: {RUN_EVALUATION}")

{'createTime': '2023-12-29T10:28:31.692046Z',
 'displayName': 'Vertex Forecasting pipeline',
 'metadata': {'evaluation_dataset_path': ['bq://hybrid-vertex.vertex_feature_transform_engine_staging_us.vertex_ai_fte_split_output_test_staging_id5efc32f2a70e48bbbc24bef6e28b5331'],
              'evaluation_dataset_type': 'bigquery',
              'pipeline_job_id': '7487788809241755648',
              'pipeline_job_resource_name': 'projects/934903580331/locations/us-central1/pipelineJobs/tide-skip-arch-tide-twrkflow-eval-v1-v1'},
 'metrics': {'meanAbsoluteError': 4128.132,
             'meanAbsolutePercentageError': 405.4942,
             'rSquared': 0.5472558,
             'rootMeanSquaredError': 9180.089,
             'rootMeanSquaredLogError': 0.9531391,
             'rootMeanSquaredPercentageError': 5343.9043,
             'weightedAbsolutePercentageError': 48.812996},
 'metricsSchemaUri': 'gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml',
 'modelExplan

In [48]:
if RUN_EVALUATION:
    # Get evaluations
    model_evaluations = forecasting_mp_model_v2.list_model_evaluations()

    # Print the evaluation metrics
    for evaluation in model_evaluations:
        evaluation = evaluation.to_dict()
        print("Model's evaluation metrics from training:\n")
        metrics = evaluation["metrics"]
        for metric in metrics.keys():
            print(f"metric: {metric}, value: {metrics[metric]}\n")

Model's evaluation metrics from training:

metric: rootMeanSquaredPercentageError, value: 5343.9043

metric: rootMeanSquaredLogError, value: 0.9531391

metric: weightedAbsolutePercentageError, value: 48.812996

metric: meanAbsolutePercentageError, value: 405.4942

metric: rSquared, value: 0.5472558

metric: rootMeanSquaredError, value: 9180.089

metric: meanAbsoluteError, value: 4128.132



## (3) TiDE - comparing pipeline runs

* see similar [GitHub example](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/experiments/comparing_pipeline_runs.ipynb)

In [85]:
forecast_horizon = 150
context_window   = 150

COMPARE_VERSION = "v1"
EXPERIMENT_NAME = f"tide-tourny-{COMPARE_VERSION}"

NOW      = datetime.datetime.now().strftime("%d %H:%M:%S.%f").replace(" ","").replace(":","_").replace(".","_")
ROOT_DIR = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/run-{NOW}"

print(f"ROOT_DIR: {ROOT_DIR}")

ROOT_DIR: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/tide-tournament-v1/run-2918_35_09_635342


In [86]:
runs = [
    # {"optimization_objective": "minimize-rmse"},
    {"optimization_objective": "minimize-mae"},
    {"optimization_objective": "minimize-rmsle"},
    {"optimization_objective": "minimize-rmspe"},
    {"optimization_objective": "minimize-wape-mae"},
]

# runs = [
#     {"forecast_horizon": 75, "context_window": 75},
#     {"forecast_horizon": 75, "context_window": 100},
#     {"forecast_horizon": 100, "context_window": 100},
#     {"forecast_horizon": 100, "context_window": 100},
#     {"forecast_horizon": 125, "context_window": 100},
#     {"forecast_horizon": 125, "context_window": 100},
# ]

print(runs)

[{'optimization_objective': 'minimize-mae'}, {'optimization_objective': 'minimize-rmsle'}, {'optimization_objective': 'minimize-rmspe'}, {'optimization_objective': 'minimize-wape-mae'}]


In [89]:
for i, run in enumerate(runs):
    
    (
        template_path,
        parameter_values,
    ) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
        project=PROJECT_ID,
        location=REGION,
        root_dir=ROOT_DIR,
        target_column=target_column,
        optimization_objective=run['optimization_objective'],
        transformations=transformations,
        train_budget_milli_node_hours=train_budget_milli_node_hours,
        data_source_csv_filenames=data_source_csv_filenames,
        data_source_bigquery_table_path=data_source_bigquery_table_path,
        weight_column=weight_column,
        predefined_split_key=predefined_split_key,
        training_fraction=training_fraction,
        validation_fraction=validation_fraction,
        test_fraction=test_fraction,
        num_selected_trials=num_selected_trials,
        time_column=time_column,
        time_series_identifier_columns=[time_series_identifier_column],
        time_series_attribute_columns=time_series_attribute_columns,
        available_at_forecast_columns=available_at_forecast_columns,
        unavailable_at_forecast_columns=unavailable_at_forecast_columns,
        forecast_horizon=forecast_horizon,
        context_window=context_window,
        dataflow_subnetwork=dataflow_subnetwork,
        dataflow_use_public_ips=dataflow_use_public_ips,
        run_evaluation=RUN_EVALUATION,                          # set True to eval on test/valid set
        evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}',
        enable_probabilistic_inference=PROBABILISTIC_INFER,

        ### quantile forecast
        # quantiles=QUANTILES,

        ### hierarchical forecast
        # group_columns=XXXX,
        # group_total_weight=XXXX,
        # temporal_total_weight=XXXX,
        # group_temporal_total_weight=XXXX,
    )
    
    PIPE_RUN_NAME = f"{EXPERIMENT_NAME}-{run['optimization_objective']}-{i}"

    job = aiplatform.PipelineJob(
        display_name=PIPE_RUN_NAME,
        template_path=template_path,
        location=REGION,
        pipeline_root=ROOT_DIR,
        job_id=PIPE_RUN_NAME,
        parameter_values=parameter_values,
        enable_caching=True,
    )
    
    job.submit(
        experiment=EXPERIMENT_NAME,
        # sync=False,
        service_account=VERTEX_SA,
    )

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/tide-tournament-v1-minimize-mae-0
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/tide-tournament-v1-minimize-mae-0')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/tide-tournament-v1-minimize-mae-0?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/tide-tournament-v1-minimize-mae-0 to Experiment: tide-tournament-v1
Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/tide-tournament-v1-minimize-rmsle-1
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/tide-tournament-v1-minimize-rmsle-1')
View Pipeline Job:
https://console.cloud.google.c

In [None]:
# see state of all pipelineJob
vertex_ai.get_experiment_df(EXPERIMENT_NAME)

## (4) TiDE - challenger vs blessed

In [None]:
# from google.cloud.aiplatform import gapic

In [None]:
# blessed_eval = gapic.ModelEvaluation(
#     display_name="eval",
#     metrics_schema_uri="gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml",
#     metrics=metrics,
# )

#### IF Quantiles

In [89]:
def _get_quantile_strings(quantile_list):
    
    cleaned_list = []
    
    for ele in quantile_list:
        if str(ele).startswith("0."):
            # cleaned_list.append(str(round(ele, 2)).replace("0.",""))
            cleaned_list.append(('{0:.2f}'.format(ele)).replace("0.",""))
                
        if str(ele).startswith("."):
            # cleaned_list.append(str(round(ele, 2)).replace(".",""))
            cleaned_list.append(('{0:.2f}'.format(ele)).replace("0.",""))
    
    return cleaned_list

cleaned_quantile_list = _get_quantile_strings(QUANTILES)

print(f"# of Quantiles        : {len(QUANTILES)}")
print(f"Quantiles             : {QUANTILES}")
print(f"cleaned_quantile_list : {cleaned_quantile_list}")

# of Quantiles        : 3
Quantiles             : [0.25, 0.5, 0.9]
cleaned_quantile_list : ['25', '50', '90']


In [97]:
# quantile_string = ""

# for i in range(0, len(cleaned_quantile_list)):
#     quantile_string += f" predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET({i})] AS predicted_{TARGET_COLUMN}_p{cleaned_quantile_list[i]},"
    
# # quantile_string

# TARGET_COLUMN = "sale_dollars"

# query = f"""
# SELECT
#  *EXCEPT(predicted_{TARGET_COLUMN}),
#  predicted_{TARGET_COLUMN}.value AS predicted_sales_mean,
#  {quantile_string}
# FROM
#  `{cleaned_bq_output_uri}`
#  LIMIT 100
# """
# print(query)


SELECT
 *EXCEPT(predicted_sale_dollars),
 predicted_sale_dollars.value AS predicted_sales_mean,
  predicted_sale_dollars.quantile_predictions[OFFSET(0)] AS predicted_sale_dollars_p25, predicted_sale_dollars.quantile_predictions[OFFSET(1)] AS predicted_sale_dollars_p50, predicted_sale_dollars.quantile_predictions[OFFSET(2)] AS predicted_sale_dollars_p90,
FROM
 `hybrid-vertex.forecast_refresh_v1.predictions_2023_12_28T16_02_01_117Z_466`
 LIMIT 100



In [68]:
TARGET_COLUMN = "sale_dollars"

query = f"""
SELECT
 *EXCEPT(predicted_{TARGET_COLUMN}),
 predicted_{TARGET_COLUMN}.value AS predicted_sales_mean,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(0)] AS predicted_{TARGET_COLUMN}_p25,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(1)] AS predicted_{TARGET_COLUMN}_p50,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(2)] AS predicted_{TARGET_COLUMN}_p90,
FROM
 `{cleaned_bq_output_uri}`
 LIMIT 100
"""
print(query)


SELECT
 *EXCEPT(predicted_sale_dollars),
 predicted_sale_dollars.value AS predicted_sales_mean,
 predicted_sale_dollars.quantile_predictions[OFFSET(0)] AS predicted_sale_dollars_p25,
 predicted_sale_dollars.quantile_predictions[OFFSET(1)] AS predicted_sale_dollars_p50,
 predicted_sale_dollars.quantile_predictions[OFFSET(2)] AS predicted_sale_dollars_p90,
FROM
 `hybrid-vertex.forecast_refresh_v1.predictions_2023_12_28T16_02_01_117Z_466`
 LIMIT 100



In [None]:
# qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")
# qs_eval['predicted_sales_mean'].dtype

qs_eval = bq_client.query(query).to_dataframe()

qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")

In [97]:
qs_eval.head(3)

Unnamed: 0,city,county,date,predicted_on_date,sale_dollars,store_name,zip_code,predicted_sales_mean,predicted_sale_dollars_p25,predicted_sale_dollars_p50,predicted_sale_dollars_p90
0,Altoona,POLK,2021-04-01,2021-04-01,,Super Stop Liquor and Wine / Altoona,50009.0,3989.0,2169.0,3994.0,6723.0
1,Altoona,POLK,2021-04-09,2021-04-01,,Super Stop Liquor and Wine / Altoona,50009.0,3833.0,2431.0,3842.0,7467.0
2,Altoona,POLK,2021-04-16,2021-04-01,,Super Stop Liquor and Wine / Altoona,50009.0,4191.5,2522.0,4192.0,7512.0


In [None]:
# trained_forecast = aiplatform.Model(
#     model_name=BPJ_OUTPUT_DICT['model']
# )
# my_evaluation_job = trained_forecast.evaluate(
#     prediction_type="classification",
#     target_field_name="type",
#     data_source_uris=["gs://sdk-model-eval/my-prediction-data.csv"],
#     staging_bucket="gs://my-staging-bucket/eval_pipeline_root",
# )
# my_evaluation_job.wait()
# my_evaluation = my_evaluation_job.get_model_evaluation()
# my_evaluation.metrics

In [None]:
# # from google_cloud_pipeline_components.aiplatform import ModelBatchPredictOp
# from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp

# from google_cloud_pipeline_components.v1.model_evaluation import ModelEvaluationForecastingOp

# from google_cloud_pipeline_components.preview.model_evaluation import ModelEvaluationFeatureAttributionOp

# from google_cloud_pipeline_components._implementation.model_evaluation import (ModelImportEvaluationOp, TargetFieldDataRemoverOp)

# preview.model_evaluation.ModelEvaluationFeatureAttributionOp
# from google_cloud_pipeline_components.experimental.evaluation import (
#     # EvaluationDataSamplerOp, 
#     # GetVertexModelOp,
#     # ModelEvaluationForecastingOp, 
#     # ModelEvaluationFeatureAttributionOp,
#     # ModelImportEvaluationOp, 
#     # TargetFieldDataRemoverOp
# )