# Run multiple experiments with pipeline runs

* some ideas [from here](https://codelabs.developers.google.com/vertex_experiments_pipelines_intro#5)

In [1]:
PROJECT_ID = 'hybrid-vertex'  # <--- TODO: CHANGE THIS
LOCATION = 'us-central1'
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [2]:
# GCP Project Configuration:
# project where pipeline and vertex jobs are executed

GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
REGION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"REGION: {REGION}")

assert LOCATION, 'the value for this variable must be set'
assert PROJECT_ID, 'the value for this variable must be set'
# assert PROJECT_NUMBER, 'the value for this variable must be set'

%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
REGION: us-central1
env: GOOGLE_CLOUD_PROJECT=hybrid-vertex


In [3]:
# packages
from datetime import datetime
import json
import os
import time
from typing import Any, Callable, Dict, NamedTuple, Optional
import pandas as pd

# visualization
# from matplotlib import dates as mdates
# from matplotlib import pyplot as plt
# import seaborn as sns

# google cloud
from google.api_core import exceptions as google_exceptions
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.experimental import forecasting as gcc_aip_forecasting

import google.cloud.aiplatform as vertex_ai
from google.cloud import bigquery
from google.cloud import storage

# kfp
import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

print(f'vertex_ai SDK version: {vertex_ai.__version__}')
print(f'bigquery SDK version: {bigquery.__version__}')

vertex_ai SDK version: 1.21.0
bigquery SDK version: 2.34.4


### setup clients

In [4]:
bq_client = bigquery.Client(
    project=PROJECT_ID, 
    # credentials=credentials
)

storage_client = storage.Client(project=PROJECT_ID)

vertex_ai.init(
    project=PROJECT_ID,
    location=REGION
)

## Build Pipeline

In [5]:
!pwd

/home/jupyter/vertex-forecas-repo


In [6]:
from src import (
    create_bq_dataset,create_input_table_specs, get_eval_dataset_path_uri,
    create_combined_preds_table, create_forecast_input_table_specs, get_predict_table_path,
    model_batch_prediction_job, create_combined_preds_forecast_table, get_model_path,
    create_final_pred_table, args_generator_op_1
)

@kfp.v2.dsl.pipeline(
  name=PIPELINE_NAME
)
def pipeline(
    vertex_project: str,
    location: str,
    version: str,
    data_source_dataset: str,
    eval_destination_dataset: str,
    preprocess_dataset_us: str,
    model_version: str,
    model_display_name: str,
    context_window: str,
    forecast_horizon: str,
    budget_milli_node_hours: str,
    optimization_objective: str,

# TODO


    # create BQ dataset
    create_train_dataset_op = (
      create_bq_dataset.create_bq_dataset(
          project=vertex_project,
          vertex_dataset=data_source_dataset,
          new_bq_dataset=eval_destination_dataset,
          bq_location=location
      )
    )

    # ======================================
    # prep train jobs
    # ======================================

    create_input_table_specs_op = (
        create_input_table_specs.create_input_table_specs(
            products_table_uri=products_table_uri,
            activities_table_uri=activities_table_uri,
            locations_table_uri=locations_table_uri,
            time_granularity_unit=time_granularity_unit,
            time_granularity_quantity=time_granularity_quantity,
            # train_data_bq_source=train_data_bq_source,
        )
        .after(create_train_dataset_op)
    )

    forecasting_validation_op = (
        gcc_aip_forecasting.ForecastingValidationOp(
            input_tables=str(create_input_table_specs_op.outputs['input_table_specs']),
            validation_theme='FORECASTING_TRAINING',
      )
    )

    forecasting_preprocessing_op = (
      gcc_aip_forecasting.ForecastingPreprocessingOp(
          project=vertex_project,
          input_tables=str(create_input_table_specs_op.outputs['input_table_specs']),
          preprocessing_bigquery_dataset=data_source_dataset,
      )
      .after(forecasting_validation_op)
    )

    prepare_data_for_train_op = (
      gcc_aip_forecasting.ForecastingPrepareDataForTrainOp(
          input_tables=(
              str(create_input_table_specs_op.outputs['input_table_specs'])
          ),
          preprocess_metadata=(
              forecasting_preprocessing_op.outputs['preprocess_metadata']
          ),
          model_feature_columns=(
              str(create_input_table_specs_op.outputs['model_feature_columns'])
          )
      )
    )

    time_series_dataset_create_op = (
      gcc_aip.TimeSeriesDatasetCreateOp(
          display_name=f'train_ds_full_m5_{VERSION}',
          bq_source=prepare_data_for_train_op.outputs['preprocess_bq_uri'],
          project=vertex_project,
          location=location,
      )
    )

    mape_model_version = f'{VERSION}-seq2seq-mape'
    rmse_model_version = f'{VERSION}-seq2seq-rmse' 

    get_eval_dataset_path_uri_op = (
      get_eval_dataset_path_uri.get_eval_dataset_path_uri(
          project=vertex_project,
          eval_bq_dataset=create_train_dataset_op.outputs['bq_dataset_name'],
          model_1_table=mape_model_version,
          model_2_table=rmse_model_version,
      )
    )
    
    ## training
    train_model_op = (
      gcc_aip.AutoMLForecastingTrainingJobRunOp(
          display_name=f'train-{model_version}',
          model_display_name=model_display_name,
          model_labels={'model_override' : 'se2seq-hier'}, # model_override : se2seq-hier, tft
          # model_labels={'model_type' : 'l2l'},
          dataset=time_series_dataset_create_op.outputs['dataset'],
          context_window=context_window,
          forecast_horizon=forecast_horizon,
          budget_milli_node_hours=budget_milli_node_hours,
          project=vertex_project,
          location=location,
          export_evaluated_data_items=True,
          export_evaluated_data_items_bigquery_destination_uri=get_eval_dataset_path_uri_op.outputs['model_1_bigquery_table_uri'], # must be format:``bq://<project_id>:<dataset_id>:<table>``
          export_evaluated_data_items_override_destination=True,
          target_column=prepare_data_for_train_op.outputs['target_column'],
          time_column=prepare_data_for_train_op.outputs['time_column'],
          time_series_identifier_column=prepare_data_for_train_op.outputs['time_series_identifier_column'],
          time_series_attribute_columns=prepare_data_for_train_op.outputs['time_series_attribute_columns'],
          unavailable_at_forecast_columns=prepare_data_for_train_op.outputs['unavailable_at_forecast_columns'],
          available_at_forecast_columns=prepare_data_for_train_op.outputs['available_at_forecast_columns'],
          data_granularity_unit=prepare_data_for_train_op.outputs['data_granularity_unit'],
          data_granularity_count=prepare_data_for_train_op.outputs['data_granularity_count'],
          predefined_split_column_name= '', # prepare_data_for_train_op.outputs['predefined_split_column'],
          column_transformations=prepare_data_for_train_op.outputs['column_transformations'],
          weight_column=prepare_data_for_train_op.outputs['weight_column'],
          optimization_objective=optimization_objective,
          additional_experiments={
              'forecasting_model_type_override': 'seq2seq',
              'forecasting_hierarchical_group_column_names':'dept_id, cat_id'},
      )
    )

SyntaxError: invalid syntax (3391641767.py, line 42)