# Vertex Forecast - training with SDK

In [43]:
import os

GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
BQ_LOCATION='US'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"REGION: {REGION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
REGION: us-central1


In [2]:
import google.cloud.aiplatform as vertex_ai
from google.cloud import bigquery
from google.cloud import storage

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

In [44]:
bq_client = bigquery.Client(
    project=PROJECT_ID, 
    location=BQ_LOCATION
)

storage_client = storage.Client(project=PROJECT_ID)

vertex_ai.init(
    project=PROJECT_ID,
    location=LOCATION
)

In [3]:
# previously defined
BQ_DATASET="m5_us"
BQ_TABLE="sdk_train"
BQ_TABLE_PLAN="sdk_plan"

# new vars
EXPERIMENT="m5_nb3"
VERSION="v1"

## Create Vertex Managed Dataset 
* link to BigQuery Table

Reference for [`aiplatform.TimeSeriesDataset.create()`](https://googleapis.dev/python/aiplatform/latest/aiplatform.html#google.cloud.aiplatform.TimeSeriesDataset.create)

In [5]:
dataset = vertex_ai.TimeSeriesDataset.create(
    display_name = f'{EXPERIMENT}_{VERSION}', 
    bq_source = f'bq://{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_prepped',
    labels = {'experiment':f'{EXPERIMENT}'}
)

Creating TimeSeriesDataset
Create TimeSeriesDataset backing LRO: projects/934903580331/locations/us-central1/datasets/462153324456574976/operations/658077294274805760
TimeSeriesDataset created. Resource name: projects/934903580331/locations/us-central1/datasets/462153324456574976
To use this TimeSeriesDataset in another session:
ds = aiplatform.TimeSeriesDataset('projects/934903580331/locations/us-central1/datasets/462153324456574976')


In [None]:
# dataset = vertex_ai.TimeSeriesDataset('projects/715288179162/locations/us-central1/datasets/462153324456574976')

# Train Forecasting Model with AutoML

Reference for [`aiplatform.AutoMLForecastingTrainingJob`](https://googleapis.dev/python/aiplatform/latest/aiplatform.html#google.cloud.aiplatform.AutoMLForecastingTrainingJob)

## train job config

### column specs

In [22]:
dataset.column_names

['event_name_1',
 'year',
 'month',
 'dept_id',
 'wday',
 'snap_CA',
 'gross_quantity',
 'product_id',
 'date',
 'event_name_2',
 'timeseries_id',
 'splits',
 'cat_id',
 'event_type_1',
 'sell_price',
 'event_type_2',
 'state_id',
 'snap_WI',
 'snap_TX',
 'location_id',
 'weekday']

### define target, time, and ID

In [35]:
TARGET_COLUMN = 'gross_quantity'
TIME_COLUMN = 'date'
SERIES_COLUMN = 'timeseries_id'

### features which will be available at forecast (inference)

In [36]:
# column_specs = list(set(dataset.column_names) - set(['splits','timeseries_id']))
# column_specs = dict.fromkeys(column_specs, 'auto')

AVAILABLE_AT_FORECAST_COLS = list(set(dataset.column_names) - set(['splits','timeseries_id','gross_quantity','sell_price']))
AVAILABLE_AT_FORECAST_COLS

['event_name_1',
 'year',
 'event_type_1',
 'month',
 'dept_id',
 'event_type_2',
 'wday',
 'state_id',
 'snap_WI',
 'snap_CA',
 'snap_TX',
 'product_id',
 'date',
 'event_name_2',
 'location_id',
 'weekday',
 'cat_id']

### features which will be unavailable at forecast (inference)

In [37]:
UNAVAILABLE_AT_FORECAST_COLS=[
    TARGET_COLUMN,
    'sell_price'
]

Create a dictionary containing all features

In [38]:
COL_TRANSFORMS = {
    TIME_COLUMN:"timestamp",
    TARGET_COLUMN:"numeric",
    "product_id":"categorical",
    "location_id":"categorical",
    "weekday":"categorical",
    "event_name_1":"categorical",
    "year":"categorical",
    "event_type_1":"categorical",
    "month":"categorical",
    "dept_id":"categorical",
    "event_type_2":"categorical",
    "wday":"categorical",
    "state_id":"categorical",
    "snap_WI":"categorical",
    "snap_CA":"categorical",
    "snap_TX":"categorical",
    "event_name_2":"categorical",
    "cat_id":"categorical",
    "sell_price":"numeric"
}

### model config

**optimization_objective:**
* "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
* "minimize-mae" - Minimize mean-absolute error (MAE).
* "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
* "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE).
* "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE) and mean-absolute-error (MAE).
* "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles. (Set this objective to build quantile forecasts.)

In [39]:
# TODO - edit these

# forecast spec
FORECAST_GRANULARITY = 'DAY'
DATA_GRANULARITY_COUNT=1
FORECAST_HORIZON = 14
CONTEXT_WINDOW = 14
forecast_test_length = 14
forecast_val_length = 14

# model config
OPTIMIZATION_OBJECTIVE="minimize-rmse"

# job spec
MILLI_NODE_HRS=1000
HOLIDAY_REGIONS=['GLOBAL', 'NA', 'US']

# export eval set BQ destination
f"bq://{PROJECT_ID}:{BQ_DATASET}:{BQ_TABLE}_automl_{VERSION}",

print(f"EXPERIMENT: {EXPERIMENT}")
print(f"VERSION: {VERSION}")
print(f"OPTIMIZATION_OBJECTIVE: {OPTIMIZATION_OBJECTIVE}")
print(f"TARGET_COLUMN: {TARGET_COLUMN}")
print(f"TIME_COLUMN: {TIME_COLUMN}")
print(f"SERIES_COLUMN: {SERIES_COLUMN}")
print(f"AVAILABLE_AT_FORECAST_COLS: {AVAILABLE_AT_FORECAST_COLS}")
print(f"FORECAST_HORIZON: {FORECAST_HORIZON}")
print(f"FORECAST_GRANULARITY: {FORECAST_GRANULARITY.lower()}")
print(f"CONTEXT_WINDOW: {CONTEXT_WINDOW}")
print(f"TARGET_COLUMN: {TARGET_COLUMN}")
print(f"TARGET_COLUMN: {TARGET_COLUMN}")

EXPERIMENT: m5_nb3
VERSION: v1
OPTIMIZATION_OBJECTIVE: minimize-rmse
TARGET_COLUMN: gross_quantity
TIME_COLUMN: date
SERIES_COLUMN: timeseries_id
AVAILABLE_AT_FORECAST_COLS: ['event_name_1', 'year', 'event_type_1', 'month', 'dept_id', 'event_type_2', 'wday', 'state_id', 'snap_WI', 'snap_CA', 'snap_TX', 'product_id', 'date', 'event_name_2', 'location_id', 'weekday', 'cat_id']
FORECAST_HORIZON: 14
FORECAST_GRANULARITY: day
CONTEXT_WINDOW: 14
TARGET_COLUMN: gross_quantity
TARGET_COLUMN: gross_quantity


## create and submit job

In [31]:
forecast_job = vertex_ai.AutoMLForecastingTrainingJob(
    display_name = f'{EXPERIMENT}_{VERSION}_training',
    optimization_objective=OPTIMIZATION_OBJECTIVE,
    column_specs = COL_TRANSFORMS,
    labels = {'experiment':f'{EXPERIMENT}'}
)

In [32]:
forecast=forecast_job.run(
    dataset=dataset,
    target_column=TARGET_COLUMN,
    time_column=TIME_COLUMN,
    time_series_identifier_column=SERIES_COLUMN,
    unavailable_at_forecast_columns=UNAVAILABLE_AT_FORECAST_COLS,
    available_at_forecast_columns=AVAILABLE_AT_FORECAST_COLS,
    forecast_horizon=FORECAST_HORIZON,
    data_granularity_unit=FORECAST_GRANULARITY.lower(),
    data_granularity_count=DATA_GRANULARITY_COUNT,
    predefined_split_column_name="splits",
    context_window = CONTEXT_WINDOW,
    export_evaluated_data_items=True,
    export_evaluated_data_items_bigquery_destination_uri=f"bq://{PROJECT_ID}:{BQ_DATASET}:{BQ_TABLE}_automl",
    validation_options="fail-pipeline",
    budget_milli_node_hours = MILLI_NODE_HRS,
    model_display_name=f"{EXPERIMENT}_{BQ_TABLE}_{VERSION}",
    model_labels={'experiment':f'{EXPERIMENT}'},
    holiday_regions=HOLIDAY_REGIONS,
    sync=False
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7132435584676528128?project=934903580331
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/7132435584676528128 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/7132435584676528128 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/7132435584676528128 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/7132435584676528128 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/7132435584676528128 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/lo

In [60]:
FORECAST_MODEL_RSC_NAME = forecast.resource_name
print(f"FORECAST_MODEL_RSC_NAME: {FORECAST_MODEL_RSC_NAME}")

FORECAST_MODEL_RSC_NAME: projects/934903580331/locations/us-central1/models/7005333827212017664


## default Model Evaluation

In [111]:
forecast_EVALS = forecast.list_model_evaluations()

for model_evaluation in forecast_EVALS:
    print(model_evaluation.to_dict())

{'name': 'projects/934903580331/locations/us-central1/models/7005333827212017664@1/evaluations/6460567139166462004', 'metricsSchemaUri': 'gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml', 'metrics': {'rootMeanSquaredLogError': 0.5136268, 'meanAbsolutePercentageError': 319171650.0, 'rootMeanSquaredError': 2.0453851, 'weightedAbsolutePercentageError': 69.91979, 'meanAbsoluteError': 1.0452875, 'rootMeanSquaredPercentageError': 736705400.0, 'rSquared': 0.6968847}, 'createTime': '2023-03-29T02:59:47.699501Z', 'modelExplanation': {'meanAttributions': [{'featureAttributions': {'weekday': 0.03940948912202448, 'snap_WI': 0.020936330299017765, 'snap_CA': 0.018657082080329047, 'event_name_1': 0.0034677933750684176, 'month': 0.002100957425595409, 'cat_id': 0.0151225464475233, 'wday': 0.034578529526502484, 'snap_TX': 0.002699295153349477, 'event_type_2': 5.7893715215150484e-11, 'year': 0.008367055008976831, 'dept_id': 0.05254432222548768, 'event_type_1': 0.0018022

### retrieve default model evaluation metrics

In [77]:
# Drop any metrics the ARIMA pipeline doesn't support yet.
model_evaluation = list(forecast.list_model_evaluations())[0]
metrics_dict = {k: [v] for k, v in dict(model_evaluation.metrics).items()}
metrics_dict

{'rootMeanSquaredLogError': [0.5136268],
 'rSquared': [0.6968847],
 'meanAbsoluteError': [1.0452875],
 'meanAbsolutePercentageError': [319171650.0],
 'rootMeanSquaredError': [2.0453851],
 'rootMeanSquaredPercentageError': [736705400.0],
 'weightedAbsolutePercentageError': [69.91979]}

## log metrics to Vertex AI Experiments

In [96]:
from datetime import datetime

# create run name
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
EXPERIMENT_RUN_NAME = f"run-{TIMESTAMP}"

# log params and metrics to dicts
params = {}
params["budget_hrs"] = MILLI_NODE_HRS
params["horizon"] = FORECAST_HORIZON
params["context_window"] = CONTEXT_WINDOW

metrics = {}
metrics["MAE"] = metrics_dict['meanAbsoluteError'][0]
metrics["RMSE"] = metrics_dict['rootMeanSquaredError'][0]
metrics["MAPE"] = metrics_dict['meanAbsolutePercentageError'][0]
metrics["rSquared"] = metrics_dict['rSquared'][0]
metrics["RMSLE"] = metrics_dict['rootMeanSquaredLogError'][0]
metrics["WAPE"] = metrics_dict['weightedAbsolutePercentageError'][0]

# # Create and log experiment
vertex_ai.init(experiment=EXPERIMENT.replace("_","-"))

with vertex_ai.start_run(EXPERIMENT_RUN_NAME) as my_run:
    my_run.log_metrics(metrics)
    my_run.log_params(params)

    vertex_ai.end_run()

Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/m5-nb3-run-20230329055257 to Experiment: m5-nb3


INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/m5-nb3-run-20230329055257 to Experiment: m5-nb3


## Using The Results

In [46]:
query = f"""
WITH
    RAW AS (
        SELECT 
            DATE({TIME_COLUMN}) as {TIME_COLUMN}, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, 
            #splits, 
            {SERIES_COLUMN}, 
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_automl`
    ),
    LEAD AS (
        SELECT 
            *, 
            DATE_DIFF({TIME_COLUMN}, predicted_on_date, {FORECAST_GRANULARITY}) as prediction_lead_days
        FROM RAW
    ),
    LEFTSIDE AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            min(prediction_lead_days) as prediction_lead_days
        FROM LEAD
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    )
SELECT *
FROM LEFTSIDE
LEFT OUTER JOIN LEAD
USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)
"""
autoML = bq_client.query(query).to_dataframe()
autoML

# print to run in BigQuery console
# print(query)

Unnamed: 0,timeseries_id,date,prediction_lead_days,predicted_on_date,gross_quantity,predicted_gross_quantity
0,FOODS_3_681_CA_3,2016-05-10,0,2016-05-10,34,32.296787
1,HOUSEHOLD_1_465_TX_2,2016-05-10,0,2016-05-10,16,11.202044
2,FOODS_2_399_TX_3,2016-05-11,0,2016-05-11,9,0.371759
3,FOODS_3_413_WI_1,2016-05-11,0,2016-05-11,15,6.104545
4,FOODS_3_234_TX_2,2016-05-12,0,2016-05-12,24,16.150120
...,...,...,...,...,...,...
426855,FOODS_3_329_WI_1,2016-05-12,0,2016-05-12,8,10.064318
426856,FOODS_3_692_CA_2,2016-05-13,0,2016-05-13,8,3.398916
426857,HOUSEHOLD_1_459_WI_3,2016-05-13,0,2016-05-13,8,9.950679
426858,HOUSEHOLD_1_351_CA_2,2016-05-17,0,2016-05-17,8,3.783339


### Review Custom Metrics with SQL

Some common metrics for evaluating forecasting effectiveness are 
- MAPE, or Mean Absolute Percentage Error
    - $\textrm{MAPE} = \frac{1}{n}\sum{\frac{\mid(actual - forecast)\mid}{actual}}$
- MAE, or Mean Absolute Error
     - $\textrm{MAE} = \frac{1}{n}\sum{\mid(actual - forecast)\mid}$
- MAE divided by average demand so it yields a % like MAPE
    - $\textrm{pMAE} = \frac{\sum{\mid(actual - forecast)\mid}}{\sum{actual}}$
- MSE, or Mean Squared Error
    - $\textrm{MSE} = \frac{1}{n}\sum{(actual-forecast)^2}$
- RMSE, or Root Mean Squared Error
    - $\textrm{RMSE} = \sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}$
- RMSE divided by average demand so it yeilds a % like MAPE
    - $\textrm{pRMSE} = \frac{\sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}}{\frac{1}{n}\sum{actual}}$

It can be helpful to explicity caculate these to make comparison between datasets and models fair.  This section demonstration these calculation with SQL.

In [57]:
query = f"""
WITH
    FORECASTS AS (
        SELECT DATE({TIME_COLUMN}) as {TIME_COLUMN}, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, 
            #splits,
            {SERIES_COLUMN}, 
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_automl`
    ),
    LEAD_DAYS AS (
        SELECT *, DATE_DIFF({TIME_COLUMN}, predicted_on_date, {FORECAST_GRANULARITY}) as prediction_lead_days
        FROM FORECASTS
    ),
    LATEST AS (
        SELECT {SERIES_COLUMN}, {TIME_COLUMN}, min(prediction_lead_days) as prediction_lead_days
        FROM LEAD_DAYS
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM LATEST
        LEFT OUTER JOIN LEAD_DAYS
        USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)    
    )
SELECT {SERIES_COLUMN}, time_series_type, 
    AVG(SAFE_DIVIDE(ABS(diff), actual_value)) as MAPE,
    AVG(ABS(diff)) as MAE,
    SAFE_DIVIDE(SUM(ABS(diff)),SUM(actual_value)) as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SAFE_DIVIDE(SQRT( AVG( POW(diff, 2) ) ) , AVG(actual_value) ) as pRMSE
FROM DIFFS
GROUP BY {SERIES_COLUMN}, time_series_type
ORDER BY {SERIES_COLUMN}, time_series_type    
"""

# customMetrics = bq_client.query(query = query).to_dataframe()
# customMetrics

# print to run in BigQuery console
print(query)


WITH
    FORECASTS AS (
        SELECT DATE(date) as date, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST(gross_quantity as INT64) AS gross_quantity, 
            #splits,
            timeseries_id, 
            predicted_gross_quantity.value as predicted_gross_quantity
        FROM `hybrid-vertex.m5_us.sdk_train_automl`
    ),
    LEAD_DAYS AS (
        SELECT *, DATE_DIFF(date, predicted_on_date, DAY) as prediction_lead_days
        FROM FORECASTS
    ),
    LATEST AS (
        SELECT timeseries_id, date, min(prediction_lead_days) as prediction_lead_days
        FROM LEAD_DAYS
        GROUP BY timeseries_id, date
    ),
    DIFFS AS (
        SELECT 
            timeseries_id, 
            date, 
            'forecast' as time_series_type,
            predicted_gross_quantity as forecast_value,
            gross_quantity as actual_value,
            (gross_quantity - predicted_gross_quantity) as diff
        FROM LATEST
        LEFT OUTER JOIN LEAD_DAYS

Overall Metrics:

In [59]:
query = f"""
WITH
    FORECASTS AS (
        SELECT 
            DATE({TIME_COLUMN}) as {TIME_COLUMN}, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, 
            #splits, 
            {SERIES_COLUMN}, 
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_automl`
    ),
    LEAD_DAYS AS (
        SELECT 
            *, 
            DATE_DIFF({TIME_COLUMN}, 
            predicted_on_date, 
            {FORECAST_GRANULARITY}) as prediction_lead_days
        FROM FORECASTS
    ),
    LATEST AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            min(prediction_lead_days) as prediction_lead_days
        FROM LEAD_DAYS
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM LATEST
        LEFT OUTER JOIN LEAD_DAYS
        USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)    
    )
SELECT time_series_type, 
    AVG( SAFE_DIVIDE(ABS( diff ) , actual_value) ) as MAPE,
    AVG(ABS(diff)) as MAE,
    SAFE_DIVIDE(SUM( ABS( diff ) ) , SUM(actual_value) )  as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SAFE_DIVIDE(SQRT( AVG( POW(diff, 2) ) ) , AVG(actual_value) )  as pRMSE
FROM DIFFS
GROUP BY time_series_type
ORDER BY time_series_type    
"""
customMetricsOverall = bq_client.query(query = query).to_dataframe()
customMetricsOverall

# print to run in BigQuery console
# print(query)

Unnamed: 0,time_series_type,MAPE,MAE,pMAE,MSE,RMSE,pRMSE
0,forecast,0.577317,1.028049,0.696077,3.974805,1.993691,1.349898


In [106]:
# create run name
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
EXPERIMENT_RUN_NAME = f"run-{TIMESTAMP}"

# log params and metrics to dicts
params = {}
params["budget_hrs"] = MILLI_NODE_HRS
params["horizon"] = FORECAST_HORIZON
params["context_window"] = CONTEXT_WINDOW

metrics = {}
metrics["MAPE"] = customMetricsOverall['MAPE'][0]
metrics["MAE"] = customMetricsOverall['MAE'][0]
metrics["pMAE"] = customMetricsOverall['pMAE'][0]
metrics["MSE"] = customMetricsOverall['MSE'][0]
metrics["RMSE"] = customMetricsOverall['RMSE'][0]
metrics["pRMSE"] = customMetricsOverall['pRMSE'][0]

# # Create and log experiment
vertex_ai.init(experiment=EXPERIMENT.replace("_","-"))

with vertex_ai.start_run(EXPERIMENT_RUN_NAME) as my_run:
    my_run.log_metrics(metrics)
    my_run.log_params(params)

    vertex_ai.end_run()

Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/m5-nb3-v2-run-20230329055639 to Experiment: m5-nb3-v2


INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/m5-nb3-v2-run-20230329055639 to Experiment: m5-nb3-v2


## Get Forecasted Values for Future Horizon

In [66]:
## TODO
query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_automl_batch_input` AS
WITH
    DATELIST AS (
        SELECT *
        FROM (SELECT DISTINCT {SERIES_COLUMN} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_source`) A
        CROSS JOIN (SELECT * 
                    FROM UNNEST(GENERATE_DATE_ARRAY(
                                    DATE_SUB((SELECT MAX({TIME_COLUMN}) FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_prepped`), INTERVAL {CONTEXT_WINDOW-1} {FORECAST_GRANULARITY}),
                                    DATE_ADD((SELECT MAX({TIME_COLUMN}) FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_prepped`), INTERVAL {FORECAST_HORIZON} {FORECAST_GRANULARITY}),
                                    INTERVAL 1 {FORECAST_GRANULARITY}
                                )
                            ) AS {TIME_COLUMN}
                    ) B
    ),
    ADDTARGET AS (
        SELECT *
        FROM DATELIST
        LEFT OUTER JOIN (SELECT {SERIES_COLUMN}, {TIME_COLUMN}, {TARGET_COLUMN} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_source`)
        USING ({SERIES_COLUMN}, {TIME_COLUMN})
        ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    LOCF AS (
        SELECT 
          {SERIES_COLUMN}, 
          {TIME_COLUMN},
          LAST_VALUE({TARGET_COLUMN} IGNORE NULLS) OVER (PARTITION BY {SERIES_COLUMN} ORDER BY {TIME_COLUMN} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as {TARGET_COLUMN}
        FROM ADDTARGET
    )
SELECT 
    {SERIES_COLUMN}, 
    {TIME_COLUMN},
    CASE
        WHEN {TIME_COLUMN} > (SELECT MAX({TIME_COLUMN}) FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_source`) THEN NULL
        ELSE {TARGET_COLUMN}
    END AS {TARGET_COLUMN}
FROM LOCF
ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
# """
job = bq_client.query(query = query)
job.result()

# print to run in BigQuery console
# print(query)

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f4aac694990>

# Batch Prediction Job

In [108]:
# batchjob = vertex_ai.BatchPredictionJob.create(
batch_prediction_job = forecast.batch_predict(
    job_display_name = f'{EXPERIMENT}_automl_{CONTEXT_WINDOW}_{VERSION}',
    # model_name = forecast.resource_name,
    instances_format = 'bigquery',
    predictions_format = 'bigquery',
    bigquery_source = f"bq://{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_automl_batch_input",
    bigquery_destination_prefix = f"bq://{PROJECT_ID}.{BQ_DATASET}",
    sync=False
)

Creating BatchPredictionJob


INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob


BatchPredictionJob created. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008


To use this BatchPredictionJob in another session:


INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:


bpj = aiplatform.BatchPredictionJob('projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008')


INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008')


View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/732046457997099008?project=934903580331


INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/732046457997099008?project=934903580331


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


## Process Predicted Forecast

In [109]:
# batchjob.output_info.bigquery_output_table
batch_prediction_job.bigquery_output_table

AttributeError: 'BatchPredictionJob' object has no attribute 'bigquery_output_table'

BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_RUNNING


BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_SUCCEEDED


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008 current state:
JobState.JOB_STATE_SUCCEEDED


BatchPredictionJob run completed. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008


INFO:google.cloud.aiplatform.jobs:BatchPredictionJob run completed. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/732046457997099008


In [73]:
batch_predict_bq_output_uri = "{}.{}".format(
    batch_prediction_job.output_info.bigquery_output_dataset,
    batch_prediction_job.output_info.bigquery_output_table
)

batch_predict_bq_output_uri

'bq://hybrid-vertex.m5_us.predictions_2023_03_28T22_01_23_629Z_734'

In [71]:
query = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_automl_batch_output` AS
    SELECT {SERIES_COLUMN}, DATE({TIME_COLUMN}) as {TIME_COLUMN}, predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
    FROM `{PROJECT_ID}.{BQ_DATASET}.{batch_prediction_job.output_info.bigquery_output_table}`
"""
job = bq_client.query(query = query)
job.result()

# print to run in BigQuery console
# print(query)

NotFound: 404 Not found: Table hybrid-vertex:m5_us.predictions_2023_03_28T22_01_23_629Z_734 was not found in location US

Location: US
Job ID: 71a25984-1485-443e-b435-578f379d509e


In [None]:
query = f"""
    SELECT *
    FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_automl_batch_output`
    ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
"""
predict = bq_client.query(query = query).to_dataframe()
predict.head()

## Review Results

### Retrieve the Forecasting Data (raw series)

In [None]:

# TODO

# query = f"""
# SELECT start_station_name, date, splits, num_trips
# FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_PLAN}_prepped`
# ORDER by start_station_name, date
# """
# rawSeries = bigquery.query(query = query).to_dataframe()