## Data prep - M5

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


In [36]:
VERSION='v01'

REGION = 'us-central1'
EXPERIMENT = 'm5_bqarima'
SERIES = f'applied-forecasting_{VERSION}'

BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace('-','_')
BQ_TABLE = EXPERIMENT

BQ_SOURCE = 'hybrid-vertex.m5_us.combined_full_train'
# BQ_SOURCE = 'hybrid-vertex.m5_us.combined_small20k_train'
viz_limit = 12

## packages & clients

In [8]:
from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

from google.cloud import aiplatform as vertex_ai

In [9]:
bq = bigquery.Client(project=PROJECT_ID)

vertex_ai.init(
    project=PROJECT_ID, 
    location=REGION,
    # credentials=credentials
)

## create BigQuery Dataset

In [10]:
ds = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET}")
ds.location = 'us' #REGION
ds.labels = {'notebook': f"{EXPERIMENT}"}
ds = bq.create_dataset(dataset = ds, exists_ok = True)
ds.full_dataset_id

'hybrid-vertex:applied_forecasting_v01'

In [48]:
# CUSTOMIZE THIS CELL
TARGET_COLUMN = 'gross_quantity'
TIME_COLUMN = 'date'
SERIES_COLUMN = 'timeseries_id'
COVARIATE_COLUMNS = [
        'product_id',
        'location_id',
        'gross_quantity',
        # 'date',
        'weekday',
        'wday',
        'month',
        'year',
        'event_name_1',
        'event_type_1',
        'event_name_2',
        'event_type_2',
        'snap_CA',
        'snap_TX',
        'snap_WI',
        'dept_id',
        'cat_id',
        'state_id',
]

# CUSTOMIZE
forecast_granularity = 'WEEK'
forecast_horizon = 14
forecast_test_length = 14
forecast_val_length = 14

In [38]:
query = f"""
    SELECT 
        MIN({TIME_COLUMN}) as start_date,
        DATE_SUB(MAX({TIME_COLUMN}), INTERVAL {forecast_test_length+forecast_val_length}-1 {forecast_granularity}) as val_start,
        DATE_SUB(MAX({TIME_COLUMN}), INTERVAL {forecast_test_length}-1 {forecast_granularity}) as test_start,
        MAX({TIME_COLUMN}) as end_date
    FROM `{BQ_SOURCE}`  
"""
keyDates = bq.query(query).to_dataframe()
keyDates

Unnamed: 0,start_date,val_start,test_start,end_date
0,2011-01-29,2015-11-15,2016-02-21,2016-05-22


In [39]:
train_start = keyDates['start_date']
test_start = keyDates['test_start']
val_start = keyDates['val_start']
train_start = keyDates['start_date']

In [40]:
query = f"""
    CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prepped` AS
    SELECT *,
       CASE
           WHEN {TIME_COLUMN} > DATE_SUB((SELECT MAX({TIME_COLUMN}) FROM `{BQ_SOURCE}`), INTERVAL {forecast_test_length} {forecast_granularity}) THEN "TEST"
           WHEN {TIME_COLUMN} > DATE_SUB((SELECT MAX({TIME_COLUMN}) FROM `{BQ_SOURCE}`), INTERVAL {forecast_test_length}+{forecast_val_length} {forecast_granularity}) THEN "VALIDATE"
           ELSE "TRAIN"
       END AS splits
    FROM `{BQ_SOURCE}`
"""
job = bq.query(query)
job.result()
(job.ended-job.started).total_seconds()

8.444

### reveiw forecast data

In [41]:
query = f"""
    SELECT 
        {SERIES_COLUMN},
        COUNTIF(splits='TRAIN') as TRAIN,
        COUNTIF(splits='VALIDATE') as VALIDATE,
        COUNTIF(splits='TEST') as TEST,
        sum({TARGET_COLUMN}) as {TARGET_COLUMN}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prepped`
    GROUP BY {SERIES_COLUMN}
    ORDER BY {SERIES_COLUMN}
"""
splitCounts = bq.query(query).to_dataframe()
splitCounts.head(viz_limit)

Unnamed: 0,timeseries_id,TRAIN,VALIDATE,TEST,gross_quantity
0,FOODS_1_001_CA_1,1745,98,98,1526
1,FOODS_1_001_CA_2,1745,98,98,2232
2,FOODS_1_001_CA_3,1745,98,98,2329
3,FOODS_1_001_CA_4,1745,98,98,694
4,FOODS_1_001_TX_1,1745,98,98,1145
5,FOODS_1_001_TX_2,1745,98,98,1120
6,FOODS_1_001_TX_3,1738,98,98,797
7,FOODS_1_001_WI_1,1745,98,98,1095
8,FOODS_1_001_WI_2,1745,98,98,897
9,FOODS_1_001_WI_3,1745,98,98,569


In [42]:
splitCounts[['TRAIN','VALIDATE','TEST']].sum()

TRAIN       40907476
VALIDATE     2986181
TEST         2988020
dtype: int64

In [43]:
splitCounts[['TRAIN','VALIDATE','TEST']].sum().sum()

46881677

In [44]:
query = f"""
    WITH
        SPLIT AS (
            SELECT splits, min({TIME_COLUMN}) as mindate, max({TIME_COLUMN}) as maxdate
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prepped`
            GROUP BY splits
        ),
        TRAIN AS (
            SELECT mindate as start_date
            FROM SPLIT
            WHERE splits ='TRAIN'
        ),
        VAL AS (
            SELECT mindate as val_start
            FROM SPLIT
            WHERE splits = 'VALIDATE'
        ),
        TEST AS (
            SELECT mindate as test_start, maxdate as end_date
            FROM SPLIT
            WHERE splits = 'TEST'
        )
    SELECT * EXCEPT(pos) FROM
    (SELECT *, ROW_NUMBER() OVER() pos FROM TRAIN)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM VAL)
    USING (pos)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM TEST)
    USING (pos)
"""
keyDates = bq.query(query).to_dataframe()
keyDates

Unnamed: 0,start_date,val_start,test_start,end_date
0,2011-01-29,2015-11-09,2016-02-15,2016-05-22


In [None]:
query = f"""
    SELECT 
        {SERIES_COLUMN}, {TIME_COLUMN}, {TARGET_COLUMN}, splits,
        {', '.join(COVARIATE_COLUMNS)}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prepped`
    ORDER by {SERIES_COLUMN}, {TIME_COLUMN}
"""
rawSeries = bq.query(query).to_dataframe()