# Running Forecast Experiments with Vertex Tabular Workflows

**TODO**
* Add a separate notebook for evaluating **Algorithms**

```
* BQ_ARIMA+
* Prophet
* L2L, TFT, Seq2seq+, TiDE
```

### Vertex Pipeline console view

<img src='imgs/tabular_workflow_and_gcpc_overview.png'>

## Load notebook config

> use the prefix defined in 00-env-setup

In [1]:
# Toggle between creating fresh cloud assets (True) and reusing existing ones (False).
# In this notebook it is only referenced by the BigQuery / Vertex managed-dataset creation
# cells, which are currently commented out.
CREATE_NEW_ASSETS = True

In [2]:
# naming convention for all cloud resources
# NOTE: PREFIX must match the prefix defined in 00-env-setup, because the staging bucket name
# (and therefore the location of the saved notebook config) is derived from it.
VERSION        = "v1"              # TODO
PREFIX         = f'forecast-refresh-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = forecast-refresh-v1


In [3]:
# staging GCS
# The `!cmd` capture returns the command's output lines; the first line is used as the project id.
# NOTE(review): presumably gcloud emits only the project id here — confirm no warning line can precede it.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# ! gcloud config set project $PROJECT_ID

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-gcs'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the notebook config stored under the bucket's config/ path, print it, and execute it in
# this namespace. As the printed output shows, this (re)defines PROJECT_ID, REGION, BQ_LOCATION,
# VERTEX_SA, BUCKET_URI, etc. for the rest of the notebook.
# SECURITY NOTE: exec() runs whatever Python is stored in that object, so anyone with write access
# to the bucket can execute code in this kernel — keep write permissions on the bucket restricted.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "forecast-refresh-v1"
VERSION                  = "v1"

BUCKET_NAME              = "forecast-refresh-v1-hybrid-vertex-gcs"
BUCKET_URI               = "gs://forecast-refresh-v1-hybrid-vertex-gcs"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://forecast-refresh-v1-hybrid-vertex-gcs/data"


VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"



In [4]:
# Sanity check: list the top-level contents of the staging bucket
!gsutil ls $BUCKET_URI

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v1/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v2/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/


## Imports

In [5]:
# Import required modules
import json
import time
import datetime
from pprint import pprint
from typing import Any, Dict, List, Optional

from google.cloud import aiplatform, storage, bigquery

# from google_cloud_pipeline_components.types.artifact_types import VertexDataset
from google_cloud_pipeline_components.preview.automl.forecasting import \
    utils as automl_forecasting_utils

import sys
sys.path.append("..")
from src import helpers

In [6]:
# Point the Vertex AI SDK at the project / region loaded from the notebook config.
# No experiment is attached at this stage (the `experiment=` argument is intentionally omitted).
aiplatform.init(project=PROJECT_ID, location=REGION)

# BigQuery client bound to the same project.
bq_client = bigquery.Client(project=PROJECT_ID)

## Training constants

In [7]:
### data sources ###
data_source_bigquery_table_path = (
    "bq://bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train"
)

weight_column = None
predefined_split_key = None
data_source_csv_filenames = None

### data splits ###
# Use EITHER a predefined split column OR fractional splits, never both.
# (Previously the fractions were assigned unconditionally after the `if`, which silently
# undid the `None` values whenever `predefined_split_key` was set.)
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None
else:
    training_fraction = 0.8
    validation_fraction = 0.1
    test_fraction = 0.1

### data transformations ###
dataflow_subnetwork           = None # Dataflow's subnetwork name; empty == use default subnetwork
dataflow_use_public_ips       = True # Specifies whether Dataflow workers use public IP addresses

### features ###
time_series_identifier_column = "store_name"
target_column                 = "sale_dollars"
time_column                   = "date"

# Columns handed to the transformation generator (time + target + static attributes).
features = [
    time_column,
    target_column,
    "city",
    "zip_code",
    "county",
]
available_at_forecast_columns = [time_column]       # known for future timestamps
unavailable_at_forecast_columns = [target_column]   # only known historically
time_series_attribute_columns = ["city", "zip_code", "county"]

# feature transforms: every feature is placed in the 'auto' bucket (see printed output)
transformations = helpers.generate_transformation(auto_column_names=features)

print(f"available_at_forecast_columns   = {available_at_forecast_columns}")
print(f"unavailable_at_forecast_columns = {unavailable_at_forecast_columns}")
print(f"time_series_attribute_columns   = {time_series_attribute_columns}")
print(f"transformations                 = {transformations}\n")

available_at_forecast_columns   = ['date']
unavailable_at_forecast_columns = ['sale_dollars']
time_series_attribute_columns   = ['city', 'zip_code', 'county']
transformations                 = {'auto': ['date', 'sale_dollars', 'city', 'zip_code', 'county'], 'numeric': [], 'categorical': [], 'text': [], 'timestamp': []}



In [8]:
# List the top-level contents of the staging bucket (previous experiment folders live here)
!gsutil ls $BUCKET_URI/

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v1/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v2/
gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/


# Orchestrate experiment with Vertex Pipelines

1. For the first training iteration, **set the context window and the forecast horizon to the same value**, and set your training budget to at least 6 hours

2. Train (retrain) the model again, with the **same training budget, but double the size of the context window**, i.e., 2x the size of the forecast horizon

3. If evaluation metrics for the second model show substantial improvement, train the model again, **increasing the context window to 5 times the size of the forecast horizon**. 
> * Consider making a proportional increase to the training budget (if you trained for 10 hours in the first step, increase the training budget to 50 hours).

4. Continue increasing the context window until you are no longer seeing improved evaluation metrics or until you are satisfied with the results. Revert back to the lowest value of the context window that produced acceptable results.

## Config Vertex AI Experiment

In [9]:
# Experiment identity: "<tag>-<version>". The same name is reused (with hyphens swapped for
# underscores) for the BigQuery dataset and as the prefix of each model display name.
EXPERIMENT_VERSION = "v3"
EXPERIMENT_TAG     = "tide-cw-eval"
EXPERIMENT_NAME = f"{EXPERIMENT_TAG}-{EXPERIMENT_VERSION}"

# new experiment
# The run name is timestamped, so re-running this cell produces a new RUN_NAME and
# therefore new output paths below.
invoke_time       = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME          = f'run-{invoke_time}'

# Everything for this run lives under gs://<bucket>/<experiment>/<run>/
BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # Where the trained model will be saved and restored.

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")
print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR           : {LOG_DIR}")
print(f"ROOT_DIR          : {ROOT_DIR}")
print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

EXPERIMENT_NAME   : tide-cw-eval-v3
RUN_NAME          : run-20240109-185241

BASE_OUTPUT_DIR   : gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/run-20240109-185241
LOG_DIR           : gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/run-20240109-185241/logs
ROOT_DIR          : gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/run-20240109-185241/root
ARTIFACTS_DIR     : gs://forecast-refresh-v1-hybrid-vertex-gcs/tide-cw-eval-v3/run-20240109-185241/artifacts


### Create BigQuery Dataset

In [10]:
# Derive the BigQuery dataset name from the experiment name, swapping hyphens for underscores
# (BigQuery dataset IDs may only contain letters, digits and underscores).
BIGQUERY_DATASET_NAME = EXPERIMENT_NAME.replace("-","_")

print(f"BIGQUERY_DATASET_NAME   : {BIGQUERY_DATASET_NAME}")

BIGQUERY_DATASET_NAME   : tide_cw_eval_v3


In [11]:
# if CREATE_NEW_ASSETS:
#     ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
#     ds.location = BQ_LOCATION
#     ds = bq_client.create_dataset(dataset = ds, exists_ok = True)
#     # print(ds.full_dataset_id)
# else:
#     ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    
# ds 
# ds.dataset_id
# ds.full_dataset_id

### Create Vertex Managed Dataset

In [269]:
# if CREATE_NEW_ASSETS:
#     # Create a Vertex managed dataset artifact.
#     vertex_dataset = aiplatform.TimeSeriesDataset.create(bq_source=data_source_bigquery_table_path)
# else:
#     vertex_dataset = aiplatform.TimeSeriesDataset('projects/934903580331/locations/us-central1/datasets/1647689642478141440')

# vertex_ds_artifact_id = vertex_dataset.gca_resource.metadata_artifact.split("/")[-1]

# print(f"vertex_dataset: {vertex_dataset}")
# print(f"vertex_ds_artifact_id: {vertex_ds_artifact_id}")

## Create Custom Pipeline Steps

In [14]:
# Local directory that the %%writefile cells in this section write the KFP component modules into.
REPO_DOCKER_PATH_PREFIX = 'src'

# CAUTION: this wipes and recreates ./src on every run, so anything in that directory that is
# not regenerated by a %%writefile cell of this notebook is lost.
# NOTE(review): the pipeline cell imports `collect_eval_metrics` from `src`, but no visible cell
# writes that module — confirm where it comes from before relying on Restart & Run All.
! rm -rf $REPO_DOCKER_PATH_PREFIX
! mkdir $REPO_DOCKER_PATH_PREFIX
# !mkdir -p ./$REPO_DOCKER_PATH_PREFIX

In [15]:
from kfp import dsl
from kfp import compiler
from kfp import components

import kfp
from typing import NamedTuple, List
from kfp.dsl import (
    component, 
    Metrics
)
from google_cloud_pipeline_components.v1.dataset import TimeSeriesDatasetCreateOp
from google_cloud_pipeline_components._implementation.model import GetVertexModelOp
from google_cloud_pipeline_components.v1.wait_gcp_resources import \
    WaitGcpResourcesOp

### component: args_generate_string

> TODO: test this 

In [16]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/args_generate_string.py
import kfp
from typing import NamedTuple, List, Dict, Any, Union
from kfp.dsl import (
    component, 
    Metrics
)
@component(
  base_image='python:3.10',
)

def args_generate_string(
    # cw_values: List[int],
    # opt_objective: str,
    # experiment_name: str,
    experiment_list: List
) -> str:
    import logging
    import json
    
#     logging.info(f'NUM_EXPERIMENTS: {len(cw_values)}')
#     output_list = []
    
#     for cw in cw_values:
#         entry = {
#             "context_window" : str(cw),
#             "objective" : opt_objective,
#             "model_display_name" : f"{experiment_name}-{str(cw)}",
#         }
#         output_list.append(entry)
        
#     logging.info(f'output_list: {output_list}')
    
    return json.dumps(
        experiment_list
    )

Writing src/args_generate_string.py


### component: args_generate_ints

In [18]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/args_generate_ints.py
import kfp
from typing import NamedTuple, List, Dict, Any, Union
from kfp.dsl import (
    component, 
    Metrics
)
@component(
  base_image='python:3.10',
)

def args_generate_ints(
    experiment_dict: str,
) -> int:

    import json
    import logging
    
    entry_dump = json.loads(experiment_dict)
    logging.info(f'experiment_dict: {experiment_dict}')
    
    integer_value_cw = int(entry_dump['context_window'])
    
    # for model_version, (cw, _, _) in entry_dump.items():
    #     print(f"model_version: {model_version}")
    #     in_cw_value = int(entry_dump[model_version]['context_window'])
    
    return integer_value_cw

Writing src/args_generate_ints.py


### component: create_bq_dataset

In [19]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/create_bq_dataset.py
import kfp
from typing import NamedTuple
from kfp.dsl import (
    component, 
    Metrics
)
@component(
  base_image='python:3.10',
  packages_to_install=['google-cloud-bigquery==3.14.1'],
)
def create_bq_dataset(
    project: str,
    new_bq_dataset: str,
    bq_location: str
) -> NamedTuple('Outputs', [
    ('bq_dataset_name', str),
    ('bq_dataset_uri', str),
]):
    
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=project, location=bq_location) # bq_location)
    (
      bq_client.query(f'CREATE SCHEMA IF NOT EXISTS `{project}.{new_bq_dataset}`')
      .result()
    )
    
    return (
        f'{new_bq_dataset}',
        f'bq://{project}.{new_bq_dataset}',
    )

Writing src/create_bq_dataset.py


## Define experiment design

In [None]:
# --- evaluation / inference switches ---
RUN_EVALUATION = True
PROBABILISTIC_INFER = False

# --- training budget ---
NUM_SELECTED_TRIALS = 3               # number of weak models in the final ensemble model
TRAIN_BUDGET_MILLI_NODE_HRS = 250.0   # 250 milli node hours = 0.25 node hours (15 minutes)

stage_1_num_parallel_trials = 35      # number of parallel trials for stage 1
stage_2_num_parallel_trials = 35      # number of parallel trials for stage 2

# --- forecasting task ---
forecast_horizon = 14
context_window = 14

optimization_objective = "minimize-wape-mae"

# Context windows to compare: the base window followed by 2x, 4x and 6x multiples of it.
CW_VALUES = [context_window] + [
    int(context_window * multiple) for multiple in (2, 4, 6)
]

print(f"optimization_objective = {optimization_objective}")
print(f"forecast_horizon       = {forecast_horizon}")
print(f"context_window         = {context_window}")
print(f"CW_VALUES              = {CW_VALUES}")

In [102]:
# One single-key dict per candidate context window. The key is the display name the trained
# model will carry, which is how the logging section later matches a model to its parameters.
EXPERIMENT_LIST = []

for cw in CW_VALUES:
    entry = {}
    entry[f"{EXPERIMENT_NAME}-{str(cw)}"] = {
        "context_window" : str(cw),   # stored as a string; cast back to int when logged
        "objective" : optimization_objective,
        "model_display_name" : f"{EXPERIMENT_NAME}-{str(cw)}",
        "forecast_horizon" : forecast_horizon,
        "num_trials" : NUM_SELECTED_TRIALS,
        "train_node_hrs" : TRAIN_BUDGET_MILLI_NODE_HRS
    }
    EXPERIMENT_LIST += [entry]

pprint(EXPERIMENT_LIST)

optimization_objective = minimize-wape-mae
forecast_horizon       = 14
context_window         = 14
CW_VALUES              = [14, 28, 56, 84]
[{'tide-cw-eval-v3-14': {'context_window': '14',
                         'forecast_horizon': 14,
                         'model_display_name': 'tide-cw-eval-v3-14',
                         'num_trials': 3,
                         'objective': 'minimize-wape-mae',
                         'train_node_hrs': 250.0}},
 {'tide-cw-eval-v3-28': {'context_window': '28',
                         'forecast_horizon': 14,
                         'model_display_name': 'tide-cw-eval-v3-28',
                         'num_trials': 3,
                         'objective': 'minimize-wape-mae',
                         'train_node_hrs': 250.0}},
 {'tide-cw-eval-v3-56': {'context_window': '56',
                         'forecast_horizon': 14,
                         'model_display_name': 'tide-cw-eval-v3-56',
                         'num_trials': 3,
          

### Build pipeline

* `parallelism=i` == i executions to be scheduled at a time (consider resource quotas)

In [None]:
# Version tag appended to the pipeline name to tell pipeline revisions apart.
PIPE_VERSION = "v7"
# Used both as the compiled pipeline's name and as the PipelineJob display name.
# Underscores are replaced with hyphens.
# NOTE(review): presumably because pipeline names must not contain underscores — confirm.
DISPLAY_NAME = f"{EXPERIMENT_NAME}-{RUN_NAME}-{PIPE_VERSION}".replace("_","-")

print(f"DISPLAY_NAME: {DISPLAY_NAME}") 

In [23]:
from src import (
    create_bq_dataset,
    collect_eval_metrics,
    # args_generator_op,
    args_generate_ints,
    args_generate_string
)

# Machine-type overrides passed to both the stage-1 tuner and the stage-2 trainer of the
# forecasting workflow. The list is positional: chief, worker, ps, evaluator.
_worker_pool_specs_override = [
    {"machine_spec": {"machine_type": "n1-standard-16"}},  # override for TF chief node
    {},  # override for TF worker node, since it's not used, leave it empty
    {},  # override for TF ps node, since it's not used, leave it empty
    {"machine_spec": {"machine_type": "n1-standard-16"}},  # override for TF evaluator node
]

# Context-window experiment pipeline:
#   1. create the BigQuery dataset that receives the evaluated examples
#   2. create a Vertex managed time-series dataset from the BigQuery source table
#   3. generate one argument entry per context window
#   4. for each entry (2 at a time), run the TiDE tabular workflow with that context window
#
# NOTE: inside the loop, several values (PROJECT_ID, REGION, ROOT_DIR, target_column,
# transformations, forecast_horizon, split fractions, ...) are read from notebook globals when
# the pipeline is compiled; they are NOT taken from the pipeline parameters below, so changing
# them requires recompiling.
@dsl.pipeline(
    name=f"{DISPLAY_NAME}",
)
def cw_experiment_pipeline(
    project_id: str, 
    region: str, 
    bq_location: str,
    new_bq_dataset: str,
    cwvalues: List[int],
    experiment_name: str,
    bq_source_uri: str,
    optimization_objective: str,
    train_budget_milli_node_hours: float,
    num_selected_trials: int,
    # stage_1_num_parallel_trials: int,
    # stage_2_num_parallel_trials: int,
):
    
    # NOTE(review): `logging` is imported but not used in this function body.
    import logging
    
    # create BQ dataset
    create_train_dataset_op = (
        create_bq_dataset.create_bq_dataset(
            project=project_id,
            new_bq_dataset=new_bq_dataset,
            bq_location=bq_location,
        )
        .set_display_name("Create BQ Dataset")
        .set_caching_options(True)
    )
    
    # create the Vertex managed time-series dataset from the BigQuery source table
    time_series_dataset_create_op = (
        TimeSeriesDatasetCreateOp(
            display_name='train_ds_iowa_liquor',
            bq_source=bq_source_uri,
            project=project_id,
            location=region,
        )
        .set_display_name("Create Managed Dataset")
        .set_caching_options(True)
    )
    
    # build the list of per-experiment arguments that the loop below fans out over
    # NOTE(review): these keyword arguments must match the signature written to
    # src/args_generate_string.py — verify the component accepts cw_values / opt_objective /
    # experiment_name.
    args_generate_str_op = (
        args_generate_string.args_generate_string(
            cw_values=cwvalues,
            opt_objective=optimization_objective,
            experiment_name=experiment_name,
        )
        .set_display_name("Generate string args")
        .set_caching_options(False)
    )

    # one iteration per experiment entry; at most 2 iterations are scheduled at a time
    with dsl.ParallelFor(items=args_generate_str_op.output, parallelism=2) as item:
        
        # the entry carries the context window as a string; convert it to an int for the trainer
        args_generate_ints_op = (
            args_generate_ints.args_generate_ints(
                experiment_dict=item
            )
            .set_display_name("Generate integer args")
        )
        
        # TiDE tabular workflow config
        # `context_window` is deliberately not passed here (see the commented-out argument);
        # it is supplied when the loaded workflow component is called further down.
        (
            pipe_template_path,
            pipe_parameter_values,
        ) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
            project=PROJECT_ID,
            location=REGION,
            root_dir=ROOT_DIR,
            model_display_name=item.model_display_name,           # item.model_display_name, | MODEL_DISPLAY_NAME
            target_column=target_column,
            optimization_objective=item.objective,                # item.objective | optimization_objective
            transformations=transformations,
            train_budget_milli_node_hours=train_budget_milli_node_hours,
            data_source_csv_filenames=data_source_csv_filenames,
            data_source_bigquery_table_path=data_source_bigquery_table_path,
            weight_column=weight_column,
            predefined_split_key=predefined_split_key,
            training_fraction=training_fraction,
            validation_fraction=validation_fraction,
            test_fraction=test_fraction,
            num_selected_trials=num_selected_trials,
            time_column=time_column,
            time_series_identifier_columns=[time_series_identifier_column],
            time_series_attribute_columns=time_series_attribute_columns,
            available_at_forecast_columns=available_at_forecast_columns,
            unavailable_at_forecast_columns=unavailable_at_forecast_columns,
            forecast_horizon=forecast_horizon,
            # context_window=item.context_window,                     # item.context_window | cw_item
            dataflow_subnetwork=dataflow_subnetwork,
            dataflow_use_public_ips=dataflow_use_public_ips,
            run_evaluation=RUN_EVALUATION,    
            # evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}',
            enable_probabilistic_inference=PROBABILISTIC_INFER,
            # holiday_regions=['US','AE'],
            # stage_1_tuner_worker_pool_specs_override=_worker_pool_specs_override,
            # stage_2_trainer_worker_pool_specs_override=_worker_pool_specs_override,
        )
        # load pipeline component(s) from YAML
        forecasting_pipeline_yaml = components.load_component_from_file(pipe_template_path)
        
        # TiDE tabular workflow pipeline step(s)
        # The helper's parameter values are extended/overridden with outputs of upstream tasks:
        # the managed dataset, the BigQuery dataset URI for evaluated examples, the integer
        # context window, and the worker-pool overrides.
        forecast_train_op = (
            forecasting_pipeline_yaml(
                **pipe_parameter_values,
                vertex_dataset=time_series_dataset_create_op.outputs['dataset'],
                evaluated_examples_bigquery_path=create_train_dataset_op.outputs['bq_dataset_uri'],
                # model_display_name=MODEL_DISPLAY_NAME,
                # optimization_objective=item.objective,
                context_window=args_generate_ints_op.output,  #.outputs['cw_value'],
                stage_1_tuner_worker_pool_specs_override=_worker_pool_specs_override,
                stage_2_trainer_worker_pool_specs_override=_worker_pool_specs_override,
            )
            .set_display_name("VF Trainer")
            .set_caching_options(True)
        )

In [24]:
# Local file the compiled pipeline spec is written to.
PIPELINE_YAML_FILENAME = "pipeline.yaml"

# Delete any previously compiled spec first (presumably so a failed compile cannot leave a
# stale file that would then be submitted by mistake).
! rm -f $PIPELINE_YAML_FILENAME

# Compile the pipeline function into the local YAML spec.
compiler.Compiler().compile(
    pipeline_func=cw_experiment_pipeline, 
    package_path=PIPELINE_YAML_FILENAME
)

In [25]:
# Keep a copy of the compiled spec next to this run's other outputs in GCS.
# Note: the PipelineJob is created from the local file, not from this GCS copy.
PIPELINES_FILEPATH = f"{BASE_OUTPUT_DIR}/{PIPELINE_YAML_FILENAME}"

!gsutil cp $PIPELINE_YAML_FILENAME $PIPELINES_FILEPATH

Copying file://pipeline.yaml [Content-Type=application/octet-stream]...
/ [1 files][380.6 KiB/380.6 KiB]                                                
Operation completed over 1 objects/380.6 KiB.                                    


In [27]:
# Runtime values for the parameters declared by `cw_experiment_pipeline`.
pipeline_parameter_values = {
    'project_id' : PROJECT_ID,
    'region' : REGION,
    'bq_location' : BQ_LOCATION,
    'new_bq_dataset' : BIGQUERY_DATASET_NAME,
    'cwvalues' : CW_VALUES,
    'experiment_name' : EXPERIMENT_NAME,
    'bq_source_uri' : data_source_bigquery_table_path,
    'optimization_objective' : optimization_objective,
    "train_budget_milli_node_hours": TRAIN_BUDGET_MILLI_NODE_HRS,
    "num_selected_trials": NUM_SELECTED_TRIALS,
}

# Build the job from the locally compiled spec; artifacts go under this run's ROOT_DIR.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PIPELINE_YAML_FILENAME,
    pipeline_root=ROOT_DIR,
    parameter_values=pipeline_parameter_values,
    location=REGION,
    failure_policy='fast',
    enable_caching=True,
)

# Submit under the Vertex service account from the notebook config.
# (`experiment=` and `sync=` are intentionally left at their defaults.)
job.submit(service_account=VERTEX_SA)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/forecast-cw-experiment-v1-20240109185421
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/forecast-cw-experiment-v1-20240109185421')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/forecast-cw-experiment-v1-20240109185421?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/forecast-cw-experiment-v1-20240109185421 to Experiment: tide-cw-eval-v3


#### get pipeline task details 

In [206]:
# job.task_details

# pipeline_task_details = job.task_details

# for task_deets in pipeline_task_details:
#     print(task_deets.task_name)

# Log Model Evaluations to Vertex AI Experiments

**TODO**
* For each model in the experiment, add the time-series forecast for a specific slice of the data (e.g., a product or region)

In [223]:
# Re-initialise the SDK with the experiment attached, so the runs started in the logging
# section are recorded under EXPERIMENT_NAME. (The pipeline job itself is submitted without
# an experiment.)
aiplatform.init(project=PROJECT_ID, location=REGION, experiment=EXPERIMENT_NAME)

## Get trained models

In [224]:
# Task-level details of the submitted pipeline run. This used to be assigned only in a
# commented-out cell, so on a fresh kernel this cell failed with a NameError.
# NOTE(review): the model-upload tasks presumably only show up once the run has progressed that
# far — make sure the job has finished (e.g. `job.wait()`) before running this cell.
pipeline_task_details = job.task_details

# Keep only the tasks that uploaded a trained model (one per experiment entry).
task_list_tmp = [
    task_deets
    for task_deets in pipeline_task_details
    if task_deets.task_name == "model-upload-2"
]

print(f"Length of items: {len(task_list_tmp)}")

# example: resource name of the first uploaded model (nothing is shown when none was found)
task_list_tmp[0].outputs["model"].artifacts[0].metadata['resourceName'] if task_list_tmp else None

Length of items: 4


'projects/934903580331/locations/us-central1/models/3678295204445552640'

## Log experiment run

In [225]:
overall_list = [] 

# short metric key logged to the experiment -> key in the model evaluation's "metrics" payload
metric_key_map = {
    'rmsle': 'rootMeanSquaredLogError',
    'rmse': 'rootMeanSquaredError',
    'rmspe': 'rootMeanSquaredPercentageError',
    'r2': 'rSquared',
    'mape': 'meanAbsolutePercentageError',
    'wape': 'weightedAbsolutePercentageError',
    'mae': 'meanAbsoluteError',
}

# Log one experiment run per uploaded model: its evaluation metrics plus the parameters of
# the matching EXPERIMENT_LIST entry. After the loop, `model` still refers to the last model
# processed.
for i, upload_task in enumerate(task_list_tmp):

    print(f"i: {i}")

    # get trained model
    model_name = upload_task.outputs["model"].artifacts[0].metadata['resourceName']
    model = aiplatform.Model(model_name)
    trained_display_name = model.to_dict()['displayName']

    # get model eval metrics (if there are several evaluations, the last one listed wins)
    # Reset per model so a model without evaluations can never inherit the previous model's
    # metrics.
    metrics_dict = None
    model_evaluations = model.list_model_evaluations()
    for evaluation in model_evaluations:
        metrics_dict = evaluation.to_dict()["metrics"]

    if metrics_dict is None:
        print(f"No model evaluation found for {trained_display_name}; skipping\n")
        continue

    # create metrics_dict to log (fresh dict per model, values rounded to 2 decimals)
    log_metrics_dict = {
        short_key: round(metrics_dict[vertex_key], 2)
        for short_key, vertex_key in metric_key_map.items()
    }

    # get experiment params (fresh dict per model; entries are keyed by model display name)
    log_params_dict = {}
    for exp in EXPERIMENT_LIST:
        v = exp.get(trained_display_name)
        if v is not None:
            log_params_dict['cw']             = int(v['context_window'])
            log_params_dict['opt_obj']        = v['objective']
            log_params_dict['num_trials']     = int(v['num_trials'])
            log_params_dict['train_node_hrs'] = float(v['train_node_hrs'])
            log_params_dict['display_name']   = v['model_display_name']

    EXPERIMENT_RUN_NAME = trained_display_name
    print(f"Logging for experiment run: {EXPERIMENT_RUN_NAME}")

    # The context manager ends the run on exit, so no explicit aiplatform.end_run() is needed.
    with aiplatform.start_run(f'{trained_display_name}') as my_run:

        print(f"logging metrics...")
        my_run.log_metrics(log_metrics_dict)

        if log_params_dict:
            print(f"logging metaparams...\n")
            my_run.log_params(log_params_dict)
        else:
            print(f"no EXPERIMENT_LIST entry matches {trained_display_name}; params not logged\n")

    # brief pause between runs
    # NOTE(review): presumably to avoid hammering the metadata service — confirm it is still needed.
    time.sleep(3)

i: 0
Logging for experiment run: tide-cw-eval-v3-84
Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/tide-cw-eval-v3-tmp-jt-tide-cw-eval-v3-84 to Experiment: tide-cw-eval-v3-tmp-jt
logging metrics...
logging metaparams...

i: 1
Logging for experiment run: tide-cw-eval-v3-56
Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/tide-cw-eval-v3-tmp-jt-tide-cw-eval-v3-56 to Experiment: tide-cw-eval-v3-tmp-jt
logging metrics...
logging metaparams...

i: 2
Logging for experiment run: tide-cw-eval-v3-14
Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/tide-cw-eval-v3-tmp-jt-tide-cw-eval-v3-14 to Experiment: tide-cw-eval-v3-tmp-jt
logging metrics...
logging metaparams...

i: 3
Logging for experiment run: tide-cw-eval-v3-28
Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/tide-cw-eval-v3-tmp-jt-tide-cw-eval-v3-28 to Experiment: tide-cw-eval-v3-t

### Inspect model_eval object

In [157]:
# `model` is whichever model the logging loop processed last, so this prints the full
# evaluation payload(s) of that single model only.
forecast_EVALS = model.list_model_evaluations()

for model_evaluation in forecast_EVALS:
    pprint(model_evaluation.to_dict())

{'createTime': '2024-01-04T12:27:50.196401Z',
 'displayName': 'Vertex Forecasting pipeline',
 'metadata': {'evaluation_dataset_path': ['bq://hybrid-vertex.vertex_feature_transform_engine_staging_us.vertex_ai_fte_split_output_test_staging_idb200b234d5404c0da302ae412489a3bb'],
              'evaluation_dataset_type': 'bigquery',
              'pipeline_job_id': '333446967210278912',
              'pipeline_job_resource_name': 'projects/934903580331/locations/us-central1/pipelineJobs/forecast-cw-experiment-v1-20240104115742'},
 'metrics': {'meanAbsoluteError': 4006.6643,
             'meanAbsolutePercentageError': 428.84814,
             'rSquared': 0.55919313,
             'rootMeanSquaredError': 8907.6455,
             'rootMeanSquaredLogError': 0.9567208,
             'rootMeanSquaredPercentageError': 5700.1587,
             'weightedAbsolutePercentageError': 47.937183},
 'metricsSchemaUri': 'gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml',
 'modelEx

# Visualize with Data Studio

> TODO

The code block included in this section dynamically generates a Data Studio link that specifies the template, the location of the forecasts, and the query to generate the chart. The data is populated from the forecasts generated earlier.

You can inspect the template used [here](https://datastudio.google.com/c/u/0/reporting/067f70d2-8cd6-4a4c-a099-292acd1053e8). It was created by Google specifically for viewing forecasting predictions.

In [None]:
# def _sanitize_bq_uri(bq_uri: str):
#     if bq_uri.startswith("bq://"):
#         bq_uri = bq_uri[5:]
#     return bq_uri.replace(":", ".")


# def get_data_studio_link(
#     batch_prediction_bq_input_uri: str,
#     batch_prediction_bq_output_uri: str,
#     time_column: str,
#     time_series_identifier_column: str,
#     target_column: str,
# ):
#     """Creates a link that fills in the demo Data Studio template."""
#     batch_prediction_bq_input_uri = _sanitize_bq_uri(batch_prediction_bq_input_uri)
#     batch_prediction_bq_output_uri = _sanitize_bq_uri(batch_prediction_bq_output_uri)
#     query = f"""
#         SELECT
#           CAST(input.{time_column} as DATETIME) timestamp_col,
#           CAST(input.{time_series_identifier_column} as STRING) time_series_identifier_col,
#           CAST(input.{target_column} as NUMERIC) historical_values,
#           CAST(predicted_{target_column}.value as NUMERIC) predicted_values,
#         FROM `{batch_prediction_bq_input_uri}` input
#         LEFT JOIN `{batch_prediction_bq_output_uri}` output
#           ON
#             TIMESTAMP(input.{time_column}) = TIMESTAMP(output.{time_column})
#             AND CAST(input.{time_series_identifier_column} as STRING) = CAST(
#               output.{time_series_identifier_column} as STRING)
#     """
#     params = {
#         "templateId": "067f70d2-8cd6-4a4c-a099-292acd1053e8",
#         "ds0.connector": "BIG_QUERY",
#         "ds0.projectId": PROJECT_ID,
#         "ds0.billingProjectId": PROJECT_ID,
#         "ds0.type": "CUSTOM_QUERY",
#         "ds0.sql": query,
#     }
#     base_url = "https://datastudio.google.com/c/u/0/reporting"
#     url_params = urllib.parse.urlencode({"params": json.dumps(params)})
#     return f"{base_url}?{url_params}"

In [None]:
# actuals_table = f"{dataset_path}.actuals"
# query = f"""
#     CREATE OR REPLACE TABLE `{actuals_table}` AS
#     {base_data_query}
#     SELECT *
#     FROM base_data
#     WHERE split != 'TRAIN'
# """
# client.query(query).result()
# print(f"Created {actuals_table}.")

In [None]:
# print("Click the link below to view ARIMA predictions:")
# print(
#     get_data_studio_link(
#         batch_prediction_bq_input_uri=actuals_table,
#         batch_prediction_bq_output_uri=f"{dataset_path}.{pred_table}",
#         time_column=time_column,
#         time_series_identifier_column=time_series_identifier_column,
#         target_column=target_column,
#     )
# )