# Vertex Forecast

In [1]:
import os

GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
BQ_LOCATION='US'

# TODO: Service Account address
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com' 

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


In [17]:
from google.cloud import aiplatform
from google.cloud import bigquery
from google.cloud import storage

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [18]:
# BigQuery client used by every SQL cell in this notebook.
bq = bigquery.Client(project=PROJECT_ID)
# Set the default project and location for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [6]:
# Second-resolution timestamp (YYYYmmddHHMMSS) appended to job display names.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [7]:
# Version tag; it prefixes SERIES and therefore the BigQuery dataset name.
VERSION='av1'

In [9]:
# Region used when building console links; kept in sync with LOCATION so the
# two cannot drift apart.
REGION = LOCATION
EXPERIMENT = 'forecasting-2'
SERIES = f'{VERSION}-forecasting'

# BigQuery source: dataset names cannot contain hyphens, so they are replaced.
BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace('-','_')
BQ_TABLE = 'forecasting-1_prepped'

viz_limit = 12

VERTEX_AI_MODEL_ID='vertexf'
MODEL_VERSION='v1'
XAI_FLAG="TRUE"

# Name of the Vertex AI Experiment that the evaluation metrics are logged to.
EXPERIMENT_NAME = f"nyc_{BQ_DATASET}"
print(f'EXPERIMENT_NAME: {EXPERIMENT_NAME}')

EXPERIMENT_NAME: nyc_av1_forecasting


In [10]:
# CUSTOMIZE: column roles in the prepared BigQuery table.
TARGET_COLUMN = 'num_trips'
TIME_COLUMN = 'starttime'
SERIES_COLUMN = 'start_station_name'
SPLIT_COLUMN = 'splits'
#COVARIATE_COLUMNS = ['avg_tripduration', 'pct_subscriber', 'ratio_gender', 'capacity'] # could be empty
# Covariates are split by role: ATTRIBUTES are passed to training as time series
# attribute columns, KNOWN as available-at-forecast columns, and UNKNOWN as
# unavailable-at-forecast columns. Any of the lists may be empty.
COVARIATE_COLUMNS_ATTRIBUTES = []
COVARIATE_COLUMNS_KNOWN = ['capacity']
COVARIATE_COLUMNS_UNKNOWN = ['avg_tripduration', 'pct_subscriber', 'ratio_gender']

# CUSTOMIZE: forecast settings; these must agree with how the data was prepared.
FORECAST_GRANULARITY = 'DAY' # the data preparation included preparing the data at this level
FORECAST_HORIZON_LENGTH = 14
FORECAST_TEST_LENGTH = 14 # the data preparation included setting this value for splits = TEST
FORECAST_VALIDATE_LENGTH = 14 # the data preparation included setting this value for splits = VALIDATE

In [11]:
# Find the date boundaries of the TRAIN / VALIDATE / TEST splits.
# SPLIT aggregates the min and max date per split value; TRAIN, VAL and TEST
# each pick the single row for their split, and the CROSS JOIN places the
# three one-row results side by side in one row.
# The split column is always referenced through SPLIT_COLUMN so that the
# SELECT, GROUP BY and WHERE clauses stay consistent if it is renamed.
query = f"""
    WITH
        SPLIT AS (
            SELECT {SPLIT_COLUMN}, min({TIME_COLUMN}) as mindate, max({TIME_COLUMN}) as maxdate
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
            GROUP BY {SPLIT_COLUMN}
        ),
        TRAIN AS (
            SELECT mindate as start_date
            FROM SPLIT
            WHERE {SPLIT_COLUMN} = 'TRAIN'
        ),
        VAL AS (
            SELECT mindate as val_start
            FROM SPLIT
            WHERE {SPLIT_COLUMN} = 'VALIDATE'
        ),
        TEST AS (
            SELECT mindate as test_start, maxdate as end_date
            FROM SPLIT
            WHERE {SPLIT_COLUMN} = 'TEST'
        )
    SELECT *
    FROM TRAIN
    CROSS JOIN VAL
    CROSS JOIN TEST
"""
keyDates = bq.query(query).to_dataframe()
keyDates

Unnamed: 0,start_date,val_start,test_start,end_date
0,2013-07-01,2016-09-03,2016-09-17,2016-09-30


In [12]:
# Pull the series identifier, time, split and target columns for every row,
# ordered by series and then by time, into a local DataFrame.
query = f"""
    SELECT {SERIES_COLUMN}, {TIME_COLUMN}, {SPLIT_COLUMN}, {TARGET_COLUMN}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
    ORDER by {SERIES_COLUMN}, {TIME_COLUMN}
"""
rawSeries = bq.query(query).to_dataframe()

### Vertex Managed Dataset

In [19]:
# Get-or-create the Vertex AI managed time series dataset for this series.
# A single server-side filtered list call replaces listing every dataset in the
# project and then listing again; the display name is quoted because it
# contains a hyphen.
matching_datasets = aiplatform.TimeSeriesDataset.list(filter = f'display_name="{SERIES}"')
if matching_datasets:
    # Reuse the first dataset that already carries this display name.
    dataset = matching_datasets[0]
else:
    dataset = aiplatform.TimeSeriesDataset.create(
        display_name = f'{SERIES}',
        bq_source = f'bq://{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}',
        labels = {'series' : f'{SERIES}', 'experiment' : f'{EXPERIMENT}'}
    )

print(f'Created/Retrieve Dataset: {dataset.display_name}')

Creating TimeSeriesDataset
Create TimeSeriesDataset backing LRO: projects/934903580331/locations/us-central1/datasets/9174410784069910528/operations/307700734214799360
TimeSeriesDataset created. Resource name: projects/934903580331/locations/us-central1/datasets/9174410784069910528
To use this TimeSeriesDataset in another session:
ds = aiplatform.TimeSeriesDataset('projects/934903580331/locations/us-central1/datasets/9174410784069910528')
Created/Retrieve Dataset: av1-forecasting


In [20]:
# Show the columns that the managed dataset exposes.
dataset.column_names

['starttime',
 'num_trips',
 'ratio_gender',
 'splits',
 'start_station_name',
 'capacity',
 'pct_subscriber',
 'avg_tripduration']

In [21]:
# Look for a previously trained model for this series/experiment pair so a new
# training run can be attached to it as an additional version.
modelmatch = aiplatform.Model.list(
    filter = f'display_name={SERIES}_{EXPERIMENT} AND labels.series={SERIES} AND labels.experiment={EXPERIMENT}'
)

if not modelmatch:
    # Nothing registered yet: an empty parent means a brand new model.
    print("This is the first training for this model")
    parent = ''
else:
    print("There is an existing model with versions: ", [str(m.version_id) for m in modelmatch])
    parent = modelmatch[0].resource_name

This is the first training for this model


## Vertex Forecast 

In [22]:
# Mark every dataset column except the split and series-identifier columns as
# 'auto'. The dataset's own column order is kept, so the resulting dict is the
# same on every run (a set difference gives an arbitrary order).
column_specs = {
    column: 'auto'
    for column in dataset.column_names
    if column not in (SPLIT_COLUMN, SERIES_COLUMN)
}
column_specs

{'starttime': 'auto',
 'num_trips': 'auto',
 'ratio_gender': 'auto',
 'capacity': 'auto',
 'pct_subscriber': 'auto',
 'avg_tripduration': 'auto'}

In [23]:
# Define the AutoML forecasting training job; nothing runs until `.run()` is called.
# The display name is SERIES, EXPERIMENT and TIMESTAMP joined with underscores.
forecasting_job = aiplatform.AutoMLForecastingTrainingJob(
    display_name="_".join([SERIES, EXPERIMENT, TIMESTAMP]),
    optimization_objective="minimize-rmse",
    column_specs=column_specs,
    labels=dict(series=SERIES, experiment=EXPERIMENT),
)

In [24]:
# Launch AutoML forecasting training and keep the resulting model in `forecast`.
forecast = forecasting_job.run(
    # -- data: the managed dataset and the role each column plays --
    dataset=dataset,
    target_column=TARGET_COLUMN,
    time_column=TIME_COLUMN,
    time_series_identifier_column=SERIES_COLUMN,
    time_series_attribute_columns=COVARIATE_COLUMNS_ATTRIBUTES,
    unavailable_at_forecast_columns=[TARGET_COLUMN, *COVARIATE_COLUMNS_UNKNOWN],
    available_at_forecast_columns=[TIME_COLUMN, *COVARIATE_COLUMNS_KNOWN],
    predefined_split_column_name=SPLIT_COLUMN,

    # -- forecast: horizon, granularity, history window and holiday effects --
    forecast_horizon=FORECAST_HORIZON_LENGTH,
    data_granularity_unit=FORECAST_GRANULARITY,
    data_granularity_count=1,
    context_window=28,
    holiday_regions=['GLOBAL', 'NA', 'US'],

    # -- hierarchy: no grouping columns are supplied --
    hierarchy_group_columns=[],
    hierarchy_group_total_weight=1.0,
    hierarchy_temporal_total_weight=2.0,
    hierarchy_group_temporal_total_weight=1.0,

    # -- output: export the evaluated items to BigQuery, overriding the destination --
    export_evaluated_data_items=True,
    export_evaluated_data_items_bigquery_destination_uri=f"bq://{BQ_PROJECT}:{BQ_DATASET}:{EXPERIMENT}_eval",
    export_evaluated_data_items_override_destination=True,

    # -- running: validation behaviour and training budget --
    validation_options="fail-pipeline",
    budget_milli_node_hours=1000,

    # -- model: registry name, labels, id and parent (for versioning) --
    model_display_name=f"{SERIES}_{EXPERIMENT}",
    model_labels=dict(series=SERIES, experiment=EXPERIMENT),
    model_id=f"model_{SERIES}_{EXPERIMENT}",
    parent_model=parent,
    is_default_version=True,

    # -- session: True waits and logs progress, False continues in the local session --
    sync=True,
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6100742383857565696?project=934903580331
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6100742383857565696 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6100742383857565696 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6100742383857565696 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6100742383857565696 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/locations/us-central1/trainingPipelines/6100742383857565696 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLForecastingTrainingJob projects/934903580331/lo

In [25]:
# Show the trained model's display name and full resource name.
forecast.display_name, forecast.resource_name


('av1-forecasting_forecasting-2',
 'projects/934903580331/locations/us-central1/models/model_av1-forecasting_forecasting-2')

In [26]:
# Show the model's `name`; it is used below to build the console URL.
forecast.name


'model_av1-forecasting_forecasting-2'

In [27]:
# Print a console URL built from REGION, the model name and PROJECT_ID.
print(f'Review the model in the Vertex AI Model Registry:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/models/{forecast.name}?project={PROJECT_ID}')


Review the model in the Vertex AI Model Registry:
https://console.cloud.google.com/vertex-ai/locations/us-central1/models/model_av1-forecasting_forecasting-2?project=hybrid-vertex


## Forecast

In [28]:
# Preview up to 20 rows of the evaluated-items table exported by training,
# ordered by series and then by time.
query = f"""
SELECT *
FROM `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_eval`
ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
LIMIT 20
"""
bq.query(query = query).to_dataframe()

Unnamed: 0,avg_tripduration,capacity,num_trips,pct_subscriber,predicted_num_trips,predicted_on_starttime,ratio_gender,start_station_name,starttime
0,1808.63197,36,269,0.423792,{'value': 199.2572021484375},2016-09-17,0.469945,Central Park North & Adam Clayton Powell Blvd,2016-09-17
1,2027.411765,36,272,0.367647,{'value': 193.52529907226562},2016-09-17,0.346535,Central Park North & Adam Clayton Powell Blvd,2016-09-18
2,2027.411765,36,272,0.367647,{'value': 202.67967224121094},2016-09-18,0.346535,Central Park North & Adam Clayton Powell Blvd,2016-09-18
3,1203.820513,36,39,0.74359,{'value': 156.79617309570312},2016-09-17,1.6,Central Park North & Adam Clayton Powell Blvd,2016-09-19
4,1203.820513,36,39,0.74359,{'value': 163.81826782226562},2016-09-18,1.6,Central Park North & Adam Clayton Powell Blvd,2016-09-19
5,1203.820513,36,39,0.74359,{'value': 166.6332550048828},2016-09-19,1.6,Central Park North & Adam Clayton Powell Blvd,2016-09-19
6,1750.691667,36,120,0.625,{'value': 149.94969177246094},2016-09-20,0.578947,Central Park North & Adam Clayton Powell Blvd,2016-09-20
7,1750.691667,36,120,0.625,{'value': 147.3031005859375},2016-09-18,0.578947,Central Park North & Adam Clayton Powell Blvd,2016-09-20
8,1750.691667,36,120,0.625,{'value': 155.94932556152344},2016-09-19,0.578947,Central Park North & Adam Clayton Powell Blvd,2016-09-20
9,1750.691667,36,120,0.625,{'value': 140.16575622558594},2016-09-17,0.578947,Central Park North & Adam Clayton Powell Blvd,2016-09-20


In [29]:
# Flatten the exported evaluation rows into actual vs. predicted values.
# Only rows where the predicted-on date equals the target date are kept, and
# the prediction is read from the `value` field of the predicted column.
query = f"""
SELECT
    DATE({TIME_COLUMN}) as {TIME_COLUMN},
    DATE(predicted_on_{TIME_COLUMN}) as predicted_on_{TIME_COLUMN},
    CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN},
    {SERIES_COLUMN},
    predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
FROM `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_eval`
WHERE {TIME_COLUMN} = predicted_on_{TIME_COLUMN}
ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
"""
test_predictions = bq.query(query = query).to_dataframe()
test_predictions

Unnamed: 0,starttime,predicted_on_starttime,num_trips,start_station_name,predicted_num_trips
0,2016-09-17,2016-09-17,269,Central Park North & Adam Clayton Powell Blvd,199.257202
1,2016-09-18,2016-09-18,272,Central Park North & Adam Clayton Powell Blvd,202.679672
2,2016-09-19,2016-09-19,39,Central Park North & Adam Clayton Powell Blvd,166.633255
3,2016-09-20,2016-09-20,120,Central Park North & Adam Clayton Powell Blvd,149.949692
4,2016-09-21,2016-09-21,164,Central Park North & Adam Clayton Powell Blvd,156.939285
...,...,...,...,...,...
154,2016-09-26,2016-09-26,102,W 82 St & Central Park West,81.955360
155,2016-09-27,2016-09-27,105,W 82 St & Central Park West,82.590263
156,2016-09-28,2016-09-28,72,W 82 St & Central Park West,82.395874
157,2016-09-29,2016-09-29,143,W 82 St & Central Park West,84.467598


### Review Custom Metrics with SQL

Some common metrics for evaluating forecasting effectiveness are 
- MAPE, or Mean Absolute Percentage Error
    - $\textrm{MAPE} = \frac{1}{n}\sum{\frac{\mid(actual - forecast)\mid}{actual}}$
- MAE, or Mean Absolute Error
     - $\textrm{MAE} = \frac{1}{n}\sum{\mid(actual - forecast)\mid}$
- MAE divided by average demand so it yields a % like MAPE
    - $\textrm{pMAE} = \frac{\sum{\mid(actual - forecast)\mid}}{\sum{actual}}$
- MSE, or Mean Squared Error
    - $\textrm{MSE} = \frac{1}{n}\sum{(actual-forecast)^2}$
- RMSE, or Root Mean Squared Error
    - $\textrm{RMSE} = \sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}$
- RMSE divided by average demand so it yields a % like MAPE
    - $\textrm{pRMSE} = \frac{\sqrt{\frac{1}{n}\sum{(actual-forecast)^2}}}{\frac{1}{n}\sum{actual}}$

It can be helpful to explicitly calculate these to make comparisons between datasets and models fair.  This section demonstrates these calculations with SQL.

In [30]:
# Per-series forecast accuracy metrics computed in SQL.
#   FORECASTS: actual vs. predicted values, keeping rows where the predicted-on
#              date equals the target date.
#   DIFFS:     signed error (actual - forecast) per series and date.
# The final SELECT aggregates MAPE, MAE, pMAE, MSE, RMSE and pRMSE per series.
# SAFE_DIVIDE returns NULL instead of failing when the denominator is zero.
# The series column is referenced through SERIES_COLUMN everywhere so the
# SELECT list always matches the GROUP BY clause.
query = f"""
WITH
    FORECASTS AS (
        SELECT
            DATE({TIME_COLUMN}) as {TIME_COLUMN},
            DATE(predicted_on_{TIME_COLUMN}) as predicted_on_{TIME_COLUMN},
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN},
            {SERIES_COLUMN},
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_eval`
        WHERE {TIME_COLUMN} = predicted_on_{TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT
            {SERIES_COLUMN},
            {TIME_COLUMN},
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM FORECASTS
    )
SELECT
    {SERIES_COLUMN},
    time_series_type,
    AVG(SAFE_DIVIDE(ABS(diff), actual_value)) as MAPE,
    AVG(ABS(diff)) as MAE,
    SAFE_DIVIDE(SUM(ABS(diff)), SUM(actual_value)) as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SAFE_DIVIDE(SQRT(AVG(POW(diff, 2))), AVG(actual_value)) as pRMSE
FROM DIFFS
GROUP BY
    {SERIES_COLUMN},
    time_series_type
ORDER BY
    {SERIES_COLUMN},
    time_series_type
"""
customMetrics = bq.query(query = query).to_dataframe()
customMetrics

Unnamed: 0,start_station_name,time_series_type,MAPE,MAE,pMAE,MSE,RMSE,pRMSE
0,Central Park North & Adam Clayton Powell Blvd,forecast,0.755271,53.154532,0.342459,4333.022591,65.825699,0.424096
1,Central Park S & 6 Ave,forecast,0.589436,90.740997,0.278042,15334.753666,123.833572,0.379442
2,Central Park W & W 96 St,forecast,0.536494,25.080069,0.250622,1093.053066,33.061353,0.330378
3,Central Park West & W 100 St,forecast,0.945172,16.359788,0.412679,411.222784,20.278629,0.511533
4,Central Park West & W 102 St,forecast,0.503995,10.967216,0.21656,256.900025,16.028101,0.316493
5,Central Park West & W 68 St,forecast,0.514474,46.537876,0.307181,3129.939455,55.945862,0.36928
6,Central Park West & W 72 St,forecast,0.733304,57.436839,0.324643,5558.024003,74.552156,0.421382
7,Central Park West & W 76 St,forecast,0.346405,24.73112,0.225855,1054.374078,32.471127,0.29654
8,Central Park West & W 85 St,forecast,1.268336,55.593222,0.442219,4999.384815,70.706328,0.562437
9,Grand Army Plaza & Central Park S,forecast,0.642928,66.054038,0.30375,7022.415667,83.799855,0.385355


In [31]:
# Overall forecast accuracy metrics across all series, computed in SQL.
# This uses the same FORECASTS and DIFFS steps as the per-series metrics, but
# aggregates over every series at once by grouping on time_series_type only.
# SAFE_DIVIDE returns NULL instead of failing when the denominator is zero.
query = f"""
WITH
    FORECASTS AS (
        SELECT
            DATE({TIME_COLUMN}) as {TIME_COLUMN},
            DATE(predicted_on_{TIME_COLUMN}) as predicted_on_{TIME_COLUMN},
            CAST({TARGET_COLUMN} as INT64) AS {TARGET_COLUMN},
            {SERIES_COLUMN},
            predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_eval`
        WHERE {TIME_COLUMN} = predicted_on_{TIME_COLUMN}
    ),
    DIFFS AS (
        SELECT
            {SERIES_COLUMN},
            {TIME_COLUMN},
            'forecast' as time_series_type,
            predicted_{TARGET_COLUMN} as forecast_value,
            {TARGET_COLUMN} as actual_value,
            ({TARGET_COLUMN} - predicted_{TARGET_COLUMN}) as diff
        FROM FORECASTS
    )
SELECT
    time_series_type,
    AVG(SAFE_DIVIDE(ABS(diff), actual_value)) as MAPE,
    AVG(ABS(diff)) as MAE,
    SAFE_DIVIDE(SUM(ABS(diff)), SUM(actual_value)) as pMAE,
    AVG(POW(diff, 2)) as MSE,
    SQRT(AVG(POW(diff, 2))) as RMSE,
    SAFE_DIVIDE(SQRT(AVG(POW(diff, 2))), AVG(actual_value)) as pRMSE
FROM DIFFS
GROUP BY
    time_series_type
ORDER BY
    time_series_type
"""
customMetricsOverall = bq.query(query = query).to_dataframe()
customMetricsOverall

Unnamed: 0,time_series_type,MAPE,MAE,pMAE,MSE,RMSE,pRMSE
0,forecast,0.6162,40.746007,0.304232,3690.265429,60.747555,0.453574


## Log Vertex Experiments

In [32]:
# Fetch the evaluations attached to the trained model and print each one as a dict.
forecast_EVALS = forecast.list_model_evaluations()

for model_evaluation in forecast_EVALS:
    print(model_evaluation.to_dict())

{'name': 'projects/934903580331/locations/us-central1/models/model_av1-forecasting_forecasting-2@1/evaluations/3600617584516480006', 'metricsSchemaUri': 'gs://google-cloud-aiplatform/schema/modelevaluation/forecasting_metrics_1.0.0.yaml', 'metrics': {'meanAbsolutePercentageError': 72.893425, 'rSquared': 0.6412784, 'meanAbsoluteError': 42.348526, 'rootMeanSquaredPercentageError': 143.56311, 'rootMeanSquaredLogError': 0.61139524, 'rootMeanSquaredError': 62.8196, 'weightedAbsolutePercentageError': 35.457237}, 'createTime': '2023-05-15T14:38:30.631704Z', 'modelExplanation': {'meanAttributions': [{'featureAttributions': {'pct_subscriber': 1.7733573418158997, 'ratio_gender': 1.5669398472113016, 'avg_tripduration': 2.9821595671682655, 'num_trips': 60.53625331531873, 'starttime': 12.314080295315042, 'capacity': 8.234795519015494}}]}}


In [33]:
# Take the first evaluation of the model and wrap each of its metric values in
# a single-element list, keyed by metric name.
model_evaluation = list(forecast.list_model_evaluations())[0]
metrics_dict = {}
for metric_name, metric_value in dict(model_evaluation.metrics).items():
    metrics_dict[metric_name] = [metric_value]
metrics_dict

{'meanAbsoluteError': [42.348526],
 'rSquared': [0.6412784],
 'weightedAbsolutePercentageError': [35.457237],
 'rootMeanSquaredPercentageError': [143.56311],
 'meanAbsolutePercentageError': [72.893425],
 'rootMeanSquaredError': [62.8196],
 'rootMeanSquaredLogError': [0.61139524]}

In [36]:
from datetime import datetime

# Create a run name; a fresh timestamp gives a new run each time the cell runs.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
EXPERIMENT_RUN_NAME = f"run-{TIMESTAMP}"

# Parameters recorded with the run, taken from the notebook's configuration.
params = {
    "horizon": FORECAST_HORIZON_LENGTH,
    "granularity": FORECAST_GRANULARITY,
}

# Metrics reported by the model evaluation; each value in metrics_dict is a
# single-element list, hence the [0].
metrics = {
    "MAE": metrics_dict['meanAbsoluteError'][0],
    "RMSE": metrics_dict['rootMeanSquaredError'][0],
    "MAPE": metrics_dict['meanAbsolutePercentageError'][0],
    "rSquared": metrics_dict['rSquared'][0],
    "RMSLE": metrics_dict['rootMeanSquaredLogError'][0],
    "WAPE": metrics_dict['weightedAbsolutePercentageError'][0],
}

# Create and log the experiment run. The SDK is imported in this notebook as
# `aiplatform`; no `vertex_ai` alias is defined. Underscores in
# EXPERIMENT_NAME are replaced with hyphens for the experiment name.
aiplatform.init(experiment=EXPERIMENT_NAME.replace("_","-"))

# The context manager ends the run on exit, so no explicit end_run() is needed.
with aiplatform.start_run(EXPERIMENT_RUN_NAME) as my_run:
    my_run.log_metrics(metrics)
    my_run.log_params(params)

Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/nyc-av1-forecasting-run-20230515144023 to Experiment: nyc-av1-forecasting


In [None]:
# Build the batch-prediction input table: for every series, one row per day
# covering the last `context_window` days of history plus the forecast horizon.
context_window = 28

# query_a: per-covariate "last observation carried forward" expressions (LOCF CTE).
# query_b: per-covariate expressions for the final SELECT.
query_a = ""
query_b = ""
for v in COVARIATE_COLUMNS_KNOWN + COVARIATE_COLUMNS_UNKNOWN + COVARIATE_COLUMNS_ATTRIBUTES:
    query_a += f""",
            LAST_VALUE({v} IGNORE NULLS) OVER (PARTITION BY {SERIES_COLUMN} ORDER BY {TIME_COLUMN} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as {v}"""
    if v in COVARIATE_COLUMNS_UNKNOWN:
        # Unknown at forecast time: blank the value for dates beyond the last observed date.
        query_b += f""",
        CASE WHEN {TIME_COLUMN} > (SELECT MAX({TIME_COLUMN}) FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`) THEN NULL ELSE {v} END AS {v}"""
    else:
        # Known-at-forecast covariates and static attributes keep their
        # carried-forward value across the horizon so the model receives them.
        query_b += f""",
        {v}"""

# DATELIST:  every series crossed with every date in the window.
# ADDTARGET: attaches the observed rows where they exist.
# LOCF:      carries the last non-null target and covariate values forward.
# The final SELECT blanks the target for horizon dates (the values to predict).
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_horizon_input` AS
WITH
    DATELIST AS (
        SELECT *
        FROM (SELECT DISTINCT {SERIES_COLUMN} FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`) A
        CROSS JOIN (SELECT * 
                    FROM UNNEST(GENERATE_DATE_ARRAY(
                                    DATE_SUB((SELECT MAX({TIME_COLUMN}) FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`), INTERVAL {context_window-1} DAY),
                                    DATE_ADD((SELECT MAX({TIME_COLUMN}) FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`), INTERVAL {FORECAST_HORIZON_LENGTH} DAY),
                                    INTERVAL 1 DAY
                                )
                            ) AS {TIME_COLUMN}
                    ) B
    ),
    ADDTARGET AS (
        SELECT *
        FROM DATELIST
        LEFT OUTER JOIN (SELECT * FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`)
        USING ({SERIES_COLUMN}, {TIME_COLUMN})
        ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
    ),
    LOCF AS (
        SELECT {SERIES_COLUMN}, {TIME_COLUMN},
        LAST_VALUE({TARGET_COLUMN} IGNORE NULLS) OVER (PARTITION BY {SERIES_COLUMN} ORDER BY {TIME_COLUMN} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as {TARGET_COLUMN}
        {query_a}
        FROM ADDTARGET
    )
SELECT {SERIES_COLUMN}, {TIME_COLUMN},
    CASE
        WHEN {TIME_COLUMN} > (SELECT MAX({TIME_COLUMN}) FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`) THEN NULL
        ELSE {TARGET_COLUMN}
    END AS {TARGET_COLUMN}
    {query_b}
FROM LOCF
ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
"""
job = bq.query(query = query)
job.result()

## Batch Prediction Job

In [None]:
# Run batch prediction with the trained model: read the horizon input table and
# write the results under the same BigQuery dataset. sync=True is passed so the
# call is synchronous.
batchjob = forecast.batch_predict(
    job_display_name = f'{SERIES}_{EXPERIMENT}_{TIMESTAMP}',
    bigquery_source = f"bq://{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_horizon_input",
    bigquery_destination_prefix = f"bq://{BQ_PROJECT}.{BQ_DATASET}",
    sync = True
)

In [None]:
# Show the name of the BigQuery table the batch job wrote its predictions to.
batchjob.output_info.bigquery_output_table


In [None]:
# Copy the batch job's output into a stable `<EXPERIMENT>_horizon_output` table,
# keeping the series, the date and the `value` field of the prediction.
query = f"""
    CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_horizon_output` AS
    SELECT {SERIES_COLUMN}, DATE({TIME_COLUMN}) as {TIME_COLUMN}, predicted_{TARGET_COLUMN}.value as predicted_{TARGET_COLUMN}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{batchjob.output_info.bigquery_output_table}`
"""
job = bq.query(query = query)
job.result()

In [None]:
# Load the horizon predictions, ordered by series and date, and preview the first rows.
query = f"""
    SELECT *
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{EXPERIMENT}_horizon_output`
    ORDER BY {SERIES_COLUMN}, {TIME_COLUMN}
"""
predict = bq.query(query = query).to_dataframe()
predict.head()