# Vertex Forecast - training with SDK

In [1]:
import os

GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
REGION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"REGION: {REGION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
REGION: us-central1


In [2]:
import google.cloud.aiplatform as vertex_ai
from google.cloud import bigquery
from google.cloud import storage

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

In [3]:
# previously defined
BQ_DATASET="m5_us"
BQ_TABLE="sdk_train"
BQ_TABLE_PLAN="sdk_plan"

# new vars
EXPERIMENT="m5_nb3"
VERSION="v1"

## Create Vertex Managed Dataset 
* link to BigQuery Table

Reference for [`aiplatform.TimeSeriesDataset.create()`](https://googleapis.dev/python/aiplatform/latest/aiplatform.html#google.cloud.aiplatform.TimeSeriesDataset.create)

In [5]:
dataset = vertex_ai.TimeSeriesDataset.create(
    display_name = f'{EXPERIMENT}_{VERSION}', 
    bq_source = f'bq://{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_prepped',
    labels = {'experiment':f'{EXPERIMENT}'}
)

Creating TimeSeriesDataset
Create TimeSeriesDataset backing LRO: projects/934903580331/locations/us-central1/datasets/462153324456574976/operations/658077294274805760
TimeSeriesDataset created. Resource name: projects/934903580331/locations/us-central1/datasets/462153324456574976
To use this TimeSeriesDataset in another session:
ds = aiplatform.TimeSeriesDataset('projects/934903580331/locations/us-central1/datasets/462153324456574976')


In [None]:
# dataset = vertex_ai.TimeSeriesDataset('projects/715288179162/locations/us-central1/datasets/462153324456574976')

## Train Forecasting Model with AutoML

Reference for [`aiplatform.AutoMLForecastingTrainingJob`](https://googleapis.dev/python/aiplatform/latest/aiplatform.html#google.cloud.aiplatform.AutoMLForecastingTrainingJob)

In [9]:
dataset.column_names

In [18]:
AVAILABLE_AT_FORECAST_COLS = list(set(dataset.column_names) - set(['splits','timeseries_id','gross_quantity']))
AVAILABLE_AT_FORECAST_COLS

['event_name_1',
 'year',
 'event_type_1',
 'month',
 'dept_id',
 'sell_price',
 'event_type_2',
 'wday',
 'state_id',
 'snap_WI',
 'snap_CA',
 'snap_TX',
 'product_id',
 'date',
 'event_name_2',
 'location_id',
 'weekday',
 'cat_id']

In [7]:
column_specs = list(set(dataset.column_names) - set(['splits','timeseries_id']))
column_specs = dict.fromkeys(column_specs, 'auto')
column_specs

{'event_name_1': 'auto',
 'year': 'auto',
 'event_type_1': 'auto',
 'month': 'auto',
 'dept_id': 'auto',
 'sell_price': 'auto',
 'event_type_2': 'auto',
 'wday': 'auto',
 'state_id': 'auto',
 'snap_WI': 'auto',
 'snap_CA': 'auto',
 'snap_TX': 'auto',
 'gross_quantity': 'auto',
 'product_id': 'auto',
 'date': 'auto',
 'event_name_2': 'auto',
 'location_id': 'auto',
 'weekday': 'auto',
 'cat_id': 'auto'}

optimization_objective
* "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
* "minimize-mae" - Minimize mean-absolute error (MAE).
* "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
* "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE).
* "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE) and mean-absolute-error (MAE).
* "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles. (Set this objective to build quantile forecasts.)

In [None]:
# CUSTOMIZE
FORECAST_GRANULARITY = 'DAY'
FORECAST_HORIZON = 14
CONTEXT_WINDOW=14
forecast_test_length = 14
forecast_val_length = 14

TARGET_COLUMN = 'gross_quantity'
TIME_COLUMN = 'date'
SERIES_COLUMN = 'timeseries_id'

In [16]:
forecast_job = vertex_ai.AutoMLForecastingTrainingJob(
    display_name = f'{EXPERIMENT}_{VERSION}_training',
    optimization_objective = "minimize-rmse",
    column_specs = column_specs,
    labels = {'experiment':f'{EXPERIMENT}'}
)

In [None]:
forecast = forecast_job.run(
    dataset = dataset,
    target_column = TARGET_COLUMN,
    time_column = TIME_COLUMN,
    time_series_identifier_column = SERIES_COLUMN,
    unavailable_at_forecast_columns = [TARGET_COLUMN,],
    available_at_forecast_columns = AVAILABLE_AT_FORECAST_COLS,
    forecast_horizon = FORECAST_HORIZON,
    data_granularity_unit = FORECAST_GRANULARITY.lower(),
    data_granularity_count = 1,
    predefined_split_column_name = "splits",
    context_window = CONTEXT_WINDOW,
    export_evaluated_data_items = True,
    export_evaluated_data_items_bigquery_destination_uri = f"bq://{PROJECT_ID}:{BQ_DATASET}:{BQ_TABLE}_automl",
    validation_options = "fail-pipeline",
    budget_milli_node_hours = 1000,
    model_display_name = f"{EXPERIMENT}_{BQ_TABLE}_{VERSION}",
    model_labels = {'experiment':f'{EXPERIMENT}'},
    holiday_regions = ['GLOBAL', 'NA', 'US'],
    sync = True
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6795228562577162240?project=934903580331
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6795228562577162240 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6795228562577162240 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6795228562577162240 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6795228562577162240 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6795228562577162240 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/lo

## Using The Results

In [None]:
query = f"""
WITH
    RAW AS (
        SELECT DATE({TIME_COLUMN}) as {TIME_COLUMN}, DATE(predicted_on_date) as predicted_on_date, CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, splits, {SERIES_COLUMN}, predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_automl`
    ),
    LEAD AS (
        SELECT *, DATE_DIFF({TIME_COLUMN}, predicted_on_date, {forecast_granularity}) as prediction_lead_days
        FROM RAW
    ),
    LEFTSIDE AS (
        SELECT {SERIES_COLUMN}, {TIME_COLUMN}, min(prediction_lead_days) as prediction_lead_days
        FROM LEAD
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    )
SELECT *
FROM LEFTSIDE
LEFT OUTER JOIN LEAD
USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)
"""
autoML = bigquery.query(query = query).to_dataframe()
autoML

### Review Custom Metrics with SQL

Some common metrics for evaluating forecasting effectiveness are 
- MAPE, or Mean Absolute Percentage Error
    - $\textrm{MAPE} = \frac{1}{n}\sum{\frac{\mid(actual - forecast)\mid}{actual}}$
- MAE, or Mean Absolute Error
     - $\textrm{MAE} = \frac{1}{n}\sum{\mid(actual - forecast)\mid}$
- MAE divided by average demand so it yields a % like MAPE
    - $\textrm{pMAE} = \frac{\sum{\mid(actual - forecast)\mid}}{\sum{actual}}$
- MSE, or Mean Squared Error
    - $\textrm{MSE} = \frac{1}{n}\sum{(actual-forecast)^2}$
- RMSE, or Root Mean Squared Error
    - $\textrm{RMSE} = \sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}$
- RMSE divided by average demand so it yeilds a % like MAPE
    - $\textrm{pRMSE} = \frac{\sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}}{\frac{1}{n}\sum{actual}}$

It can be helpful to explicity caculate these to make comparison between datasets and models fair.  This section demonstration these calculation with SQL.

In [None]:
query = f"""
WITH
    FORECASTS AS (
        SELECT DATE({TIME_COLUMN}) as {TIME_COLUMN}, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, 
            splits,
            {SERIES_COLUMN}, 
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{DATANAME}.{NOTEBOOK}_automl`
    ),
    LEAD_DAYS AS (
        SELECT *, DATE_DIFF({TIME_COLUMN}, predicted_on_date, {FORECAST_GRANULARITY}) as prediction_lead_days
        FROM FORECASTS
    ),
    LATEST AS (
        SELECT {SERIES_COLUMN}, {TIME_COLUMN}, min(prediction_lead_days) as prediction_lead_days
        FROM LEAD_DAYS
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM LATEST
        LEFT OUTER JOIN LEAD_DAYS
        USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)    
    )
SELECT {SERIES_COLUMN}, time_series_type, 
    AVG(ABS(diff)/actual_value) as MAPE,
    AVG(ABS(diff)) as MAE,
    SUM(ABS(diff))/SUM(actual_value) as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SQRT(AVG(POW(diff, 2)))/AVG(actual_value) as pRMSE
FROM DIFFS
GROUP BY {SERIES_COLUMN}, time_series_type
ORDER BY {SERIES_COLUMN}, time_series_type    
"""
customMetrics = bigquery.query(query = query).to_dataframe()
customMetrics

Overall Metrics:

In [None]:
query = f"""
WITH
    FORECASTS AS (
        SELECT 
            DATE({TIME_COLUMN}) as {TIME_COLUMN}, 
            DATE(predicted_on_date) as predicted_on_date, 
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN}, 
            splits, 
            {SERIES_COLUMN}, 
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{PROJECT_ID}.{DATANAME}.{NOTEBOOK}_automl`
    ),
    LEAD_DAYS AS (
        SELECT *, DATE_DIFF({TIME_COLUMN}, predicted_on_date, {FORECAST_GRANULARITY}) as prediction_lead_days
        FROM FORECASTS
    ),
    LATEST AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            min(prediction_lead_days) as prediction_lead_days
        FROM LEAD_DAYS
        GROUP BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT 
            {SERIES_COLUMN}, 
            {TIME_COLUMN}, 
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM LATEST
        LEFT OUTER JOIN LEAD_DAYS
        USING ({SERIES_COLUMN}, {TIME_COLUMN}, prediction_lead_days)    
    )
SELECT time_series_type, 
    AVG(ABS(diff)/actual_value) as MAPE,
    AVG(ABS(diff)) as MAE,
    SUM(ABS(diff))/SUM(actual_value) as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SQRT(AVG(POW(diff, 2)))/AVG(actual_value) as pRMSE
FROM DIFFS
GROUP BY time_series_type
ORDER BY time_series_type    
"""
customMetricsOverall = bigquery.query(query = query).to_dataframe()
customMetricsOverall

## Get Forecasted Values for Future Horizon

In [None]:
## TODO

In [None]:
# CUSTOMIZE
FORECAST_GRANULARITY = 'DAY'
FORECAST_HORIZON = 14
CONTEXT_WINDOW=14
forecast_test_length = 14
forecast_val_length = 14

TARGET_COLUMN = 'gross_quantity'
TIME_COLUMN = 'date'
SERIES_COLUMN = 'timeseries_id'