In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


In [5]:
# --- Experiment identity ---------------------------------------------------
REGION = "us-central1"
EXPERIMENT = "control_group1"
SERIES = "mlr_2"  # a previous run used 'causal_impact_4'

# --- Model options ---------------------------------------------------------
MODEL_VERSION = "v1b"
XAI_FLAG = "TRUE"  # interpolated verbatim into the CREATE MODEL options

# --- BigQuery destination (hyphens in SERIES are swapped for underscores) --
BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace("-", "_")
BQ_TABLE = EXPERIMENT

# --- BigQuery sources ------------------------------------------------------
BQ_SOURCE1 = "bigquery-public-data.new_york.citibike_trips"
BQ_SOURCE2 = "bigquery-public-data.new_york.citibike_stations"
BQ_TABLE_COPY = "hybrid-vertex.causal_impact_4.control_group1_grp_b"

viz_limit = 12

EXPERIMENT_NAME = "nyc_{}_{}".format(BQ_DATASET, MODEL_VERSION)
print("EXPERIMENT_NAME: " + EXPERIMENT_NAME)

In [6]:
from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

from google.cloud import aiplatform as vertex_ai

# BigQuery client bound to the notebook's project; every SQL call below goes
# through this handle.
bq = bigquery.Client(project=PROJECT_ID)

# Point the Vertex AI SDK at the same project and region. No explicit
# credentials are passed.
vertex_ai.init(project=PROJECT_ID, location=REGION)

In [7]:
# CUSTOMIZE: roles of the columns in the working table
TIME_COLUMN = "starttime"
TARGET_COLUMN = "num_trips"
SERIES_COLUMN = "start_station_name"

# Additional feature columns fed to the model; may be an empty list.
COVARIATE_COLUMNS = ["avg_tripduration", "pct_subscriber", "ratio_gender", "capacity"]

# Working table name for group B (group A would be "control_group1_grp_a").
BQ_TABLE_GROUP_B = "control_group1_grp_b"

# Id under which the model is registered in Vertex AI.
VERTEX_AI_MODEL_ID = "v1_mlr_b"

## BigQuery Datasets

In [8]:
# Build the statement that copies every row of BQ_TABLE_COPY into this
# experiment's dataset as BQ_TABLE_GROUP_B. CREATE OR REPLACE overwrites any
# existing table with that name. The statement is only printed here; the next
# cell submits it.
query = f"""
    CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}` AS (
        SELECT 
            * 
        FROM `{BQ_TABLE_COPY}`
    );
"""
print(query)
# Reference only (group A table name, not used by this cell):
# `hybrid-vertex.causal_impact_4.control_group1_grp_a`


    CREATE OR REPLACE TABLE `hybrid-vertex.mlr_2.control_group1_grp_b` AS (
        SELECT 
            * 
        FROM `hybrid-vertex.causal_impact_4.control_group1_grp_b`
    );



In [9]:
# Submit the statement built in the previous cell, block until it completes,
# then report the final job state and its run time in seconds.
job = bq.query(query)
job.result()
elapsed_seconds = (job.ended - job.started).total_seconds()
print(job.state, elapsed_seconds)

DONE 1.45


### Key Dates

In [10]:
# Derive the boundary dates of each data split as a one-row DataFrame with
# columns start_date, val_start, test_start, end_date.
#
# How the SQL works:
#   * SPLIT  - min/max of TIME_COLUMN for every value of `splits`.
#   * TRAIN / VAL / TEST - pick the relevant boundary from SPLIT for one split.
#   * The final SELECT tags each CTE with ROW_NUMBER() OVER() as `pos`
#     and joins on it, which places the CTEs side by side in one row; `pos` is
#     then dropped with EXCEPT.
#
# NOTE: the joins are inner joins, so if any of the three split values has no
# rows in the table the result is empty rather than partially filled.
query = f"""
    WITH
        SPLIT AS (
            SELECT splits, min({TIME_COLUMN}) as mindate, max({TIME_COLUMN}) as maxdate
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
            GROUP BY splits
        ),
        TRAIN AS (
            SELECT mindate as start_date
            FROM SPLIT
            WHERE splits ='TRAIN'
        ),
        VAL AS (
            SELECT mindate as val_start
            FROM SPLIT
            WHERE splits = 'VALIDATE'
        ),
        TEST AS (
            SELECT mindate as test_start, maxdate as end_date
            FROM SPLIT
            WHERE splits = 'TEST'
        )
    SELECT * EXCEPT(pos) FROM
    (SELECT *, ROW_NUMBER() OVER() pos FROM TRAIN)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM VAL)
    USING (pos)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM TEST)
    USING (pos)
"""
# Run the query and display the resulting frame as the cell output.
keyDates = bq.query(query).to_dataframe()
keyDates

Unnamed: 0,start_date,val_start,test_start,end_date
0,2013-07-01,2016-05-14,2016-07-23,2016-09-30


In [11]:
# Pull the working table into pandas: series id, time, target, split label and
# all covariates, ordered by series and then by time.
query = f"""
    SELECT {SERIES_COLUMN}, {TIME_COLUMN}, {TARGET_COLUMN}, splits,
        {', '.join(COVARIATE_COLUMNS)}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
    ORDER by {SERIES_COLUMN}, {TIME_COLUMN}
"""
# Downloads the full result set into memory as a DataFrame.
rawSeries = bq.query(query).to_dataframe()

In [12]:
# Inspect the last 50 rows of the ordered extract.
rawSeries.tail(50)

Unnamed: 0,start_station_name,starttime,num_trips,splits,avg_tripduration,pct_subscriber,ratio_gender,capacity
4189,Marcy Ave & Lafayette Ave,2016-06-25,9,VALIDATE,952.222222,0.777778,0.8,23
4190,Marcy Ave & Lafayette Ave,2016-06-26,13,VALIDATE,1302.615385,0.692308,0.857143,23
4191,Marcy Ave & Lafayette Ave,2016-06-28,4,VALIDATE,779.75,1.0,0.0,23
4192,Marcy Ave & Lafayette Ave,2016-06-30,10,VALIDATE,1281.1,0.9,4.0,23
4193,Marcy Ave & Lafayette Ave,2016-07-02,17,VALIDATE,1179.647059,0.764706,0.545455,23
4194,Marcy Ave & Lafayette Ave,2016-07-03,4,VALIDATE,1033.25,0.75,0.333333,23
4195,Marcy Ave & Lafayette Ave,2016-07-04,6,VALIDATE,908.833333,0.833333,2.0,23
4196,Marcy Ave & Lafayette Ave,2016-07-05,6,VALIDATE,1491.333333,1.0,2.0,23
4197,Marcy Ave & Lafayette Ave,2016-07-06,7,VALIDATE,1033.571429,0.714286,1.333333,23
4198,Marcy Ave & Lafayette Ave,2016-07-07,7,VALIDATE,1180.0,1.0,0.166667,23


## Train MLR - Group B

In [21]:
# CUSTOMIZE: forecasting settings
forecast_granularity = "DAY"   # time step of the series
forecast_horizon = 7           # alternative considered: 14
forecast_test_length = 14
# forecast_val_length = 14     # currently disabled

In [22]:
# Build the CREATE MODEL statement for a BigQuery ML linear regression
# (model_type = 'linear_reg') that predicts TARGET_COLUMN.
#
#   * model_registry / vertex_ai_model_id / vertex_ai_model_version_aliases
#     register the model under VERTEX_AI_MODEL_ID with the aliases
#     MODEL_VERSION and 'experimental'.
#   * enable_global_explain is filled from XAI_FLAG.
#   * Training rows are those whose `splits` value is TRAIN or VALIDATE; the
#     series column is not part of the training SELECT.
#
# NOTE(review): TIME_COLUMN is passed as an input feature. The ML.FEATURE_INFO
# output further down lists it with a category_count and no min/max, which
# suggests it is being treated as a categorical value - confirm this is
# intended for a linear regression.
query = f"""
    CREATE OR REPLACE MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`
    OPTIONS
      (model_type = 'linear_reg',
       input_label_cols = ['{TARGET_COLUMN}'],
        model_registry="vertex_ai", 
        vertex_ai_model_id='{VERTEX_AI_MODEL_ID}',
        vertex_ai_model_version_aliases=['{MODEL_VERSION}', 'experimental'],
        enable_global_explain={XAI_FLAG}
      ) AS
    SELECT {TIME_COLUMN}, {TARGET_COLUMN},
        {', '.join(COVARIATE_COLUMNS)}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
    WHERE splits in ('TRAIN','VALIDATE')
"""
# Only printed here; the following cell submits it.
print(query)


    CREATE OR REPLACE MODEL `hybrid-vertex.mlr_2.control_group1_grp_b_mlr_v1b`
    OPTIONS
      (model_type = 'linear_reg',
       input_label_cols = ['num_trips'],
        model_registry="vertex_ai", 
        vertex_ai_model_id='v1_mlr_b',
        vertex_ai_model_version_aliases=['v1b', 'experimental'],
        enable_global_explain=TRUE
      ) AS
    SELECT starttime, num_trips,
        avg_tripduration, pct_subscriber, ratio_gender, capacity
    FROM `hybrid-vertex.mlr_2.control_group1_grp_b`
    WHERE splits in ('TRAIN','VALIDATE')



In [23]:
# Submit the CREATE MODEL statement built in the previous cell, wait for
# training to finish, then report the job state and its run time in seconds.
job = bq.query(query)
job.result()
elapsed_seconds = (job.ended - job.started).total_seconds()
print(job.state, elapsed_seconds)

DONE 19.595


### Review Input Features

In [25]:
# Fetch ML.FEATURE_INFO for the trained model (per-input statistics such as
# min, max, mean and category_count, as shown in the output below).
query = f"""
    SELECT *
    FROM ML.FEATURE_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`)
"""
featureInfo = bq.query(query).to_dataframe()
# Show the first rows as the cell output.
featureInfo.head()

Unnamed: 0,input,min,max,mean,median,stddev,category_count,null_count,dimension
0,starttime,,,,,,1091.0,0,
1,avg_tripduration,82.0,597575.0,1162.228073,683.5,11269.24911,,0,
2,pct_subscriber,0.0,1.0,0.920442,0.958333,0.111391,,0,
3,ratio_gender,0.0,23.0,2.395952,2.0,1.945128,,0,
4,capacity,0.0,91.0,41.275732,39.0,26.557293,,0,


In [26]:
# Fetch ML.TRAINING_INFO for the trained model (one row per training
# iteration with loss, eval_loss, learning_rate and duration_ms, as shown in
# the output below).
query = f"""
    SELECT *
    FROM ML.TRAINING_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`)
"""
trainingInfo = bq.query(query).to_dataframe()
# Show the first rows as the cell output.
trainingInfo.head()

Unnamed: 0,training_run,iteration,loss,eval_loss,learning_rate,duration_ms
0,0,2,3522.164596,7131.593358,0.8,2010
1,0,1,4720.867827,7195.197657,0.4,2144
2,0,0,9003.033524,9885.400199,0.2,1884


## Forecast Evaluation

In [27]:
# Evaluate the trained model with ML.EVALUATE on the rows whose `splits` value
# is TEST, i.e. rows excluded from the training SELECT. The evaluation input
# uses the same columns as training: time, target and the covariates.
query = f"""
    SELECT *
    FROM ML.EVALUATE(
        MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`,
        (
            SELECT {TIME_COLUMN}, {TARGET_COLUMN},
                {', '.join(COVARIATE_COLUMNS)}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
            WHERE splits = 'TEST'
        )
    )
"""
metrics = bq.query(query).to_dataframe()
# Display the single-row metrics frame as the cell output.
metrics

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,43.741885,3964.791828,2.056306,29.21676,0.038572,0.282381


## Forecast Test Set

In [28]:
# Score the TEST split with the trained model and load the predictions into a
# DataFrame. ML.PREDICT returns the input columns plus predicted_<target>.
#
# The select list is assembled in Python rather than inside the SQL text.
# COVARIATE_COLUMNS is allowed to be empty, and with the covariates on their
# own line between two commas an empty list would render ", ," and make the
# statement invalid. Joining one flat list avoids that while keeping the same
# column order: time, target, covariates, series.
predict_columns = ', '.join(
    [TIME_COLUMN, TARGET_COLUMN, *COVARIATE_COLUMNS, SERIES_COLUMN]
)
query = f"""
    SELECT *
    FROM ML.PREDICT(
        MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`,
        (
            SELECT {predict_columns}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
            WHERE splits = 'TEST'
        )
    )
"""
forecast = bq.query(query).to_dataframe()
# Display the predictions as the cell output.
forecast

Unnamed: 0,predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name
0,154.324684,2016-08-20,556,1217.298561,0.593525,0.566197,91,Lafayette St & E 8 St
1,144.196412,2016-08-11,93,585.118280,0.935484,2.444444,91,Lafayette St & E 8 St
2,145.468451,2016-08-12,108,680.472222,0.916667,1.842105,91,Lafayette St & E 8 St
3,144.172371,2016-08-16,190,688.705263,0.957895,2.114754,91,Lafayette St & E 8 St
4,155.335819,2016-08-13,470,1161.431915,0.546809,0.577181,91,Lafayette St & E 8 St
...,...,...,...,...,...,...,...,...
250,4.742709,2016-09-23,42,805.547619,0.952381,2.000000,27,Lafayette Ave & Classon Ave
251,5.291113,2016-08-04,42,735.547619,0.952381,1.625000,27,Cumberland St & Lafayette Ave
252,6.833876,2016-08-24,42,856.047619,0.928571,0.909091,27,Cumberland St & Lafayette Ave
253,3.549017,2016-08-05,42,929.357143,0.952381,2.818182,27,Cumberland St & Lafayette Ave


In [29]:
# CUSTOMIZE
# Persist the TEST-split predictions to `<BQ_TABLE_GROUP_B>_pred_Test`
# (CREATE OR REPLACE overwrites a previous run's table).
#
# The select list is assembled in Python rather than inside the SQL text.
# COVARIATE_COLUMNS is allowed to be empty, and with the covariates on their
# own line between two commas an empty list would render ", ," and make the
# statement invalid. Joining one flat list avoids that while keeping the same
# column order: time, target, covariates, series.
predict_columns = ', '.join(
    [TIME_COLUMN, TARGET_COLUMN, *COVARIATE_COLUMNS, SERIES_COLUMN]
)
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_pred_Test` AS (
    SELECT *
    FROM ML.PREDICT(
        MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_mlr_{MODEL_VERSION}`,
        (
            SELECT {predict_columns}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}`
            WHERE splits = 'TEST'
        )
    )
)
"""
# Submit, wait for completion, and show the job's run time in seconds as the
# cell output.
job = bq.query(query=query)
job.result()
(job.ended - job.started).total_seconds()

1.456

In [30]:
# Echo the dataset id that the tables and model above were written to.
BQ_DATASET

'mlr_2'

# Register Model in Vertex AI

In [24]:
# Open the Vertex AI Model Registry entry for VERTEX_AI_MODEL_ID, the same id
# that was passed as `vertex_ai_model_id` in the CREATE MODEL statement.
# Project and location are not passed here; vertex_ai.init() was called with
# both earlier in the notebook.
registry = vertex_ai.models.ModelRegistry(VERTEX_AI_MODEL_ID)

In [31]:
# List every registered version of the model and print a one-line summary
# (version id, creation time, aliases) for each.
versions = registry.list_versions()

for version in versions:
    version_id = version.version_id
    version_aliases = version.version_aliases
    # Round-trip through the epoch value so the creation time is rendered as a
    # naive local datetime before formatting.
    created_epoch = version.version_create_time.timestamp()
    version_created_time = datetime.fromtimestamp(created_epoch).strftime(
        "%m/%d/%Y %H:%M:%S"
    )
    summary = f"Model version {version_id} was created at {version_created_time} with aliases {version_aliases}"
    print(summary)

Getting versions for projects/hybrid-vertex/locations/us-central1/models/v1_mlr_b
Model version 1 was created at 05/14/2023 23:49:19 with aliases ['v1b', 'experimental', 'default']


In [33]:
# Fetch the model for the version identified by MODEL_VERSION. That value is
# one of the aliases attached through `vertex_ai_model_version_aliases` in the
# CREATE MODEL statement, so it is used here as an alias, not a numeric
# version id.
model = registry.get_model(version=MODEL_VERSION)
# Printing shows the SDK object and its resource name.
print(model)

<google.cloud.aiplatform.models.Model object at 0x7ff8dfd93a50> 
resource name: projects/934903580331/locations/us-central1/models/v1_mlr_b
