In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


In [2]:
# --- Experiment identity -----------------------------------------------------
REGION = "us-central1"
EXPERIMENT = "control_group1"
SERIES = "mlr_1"  # alternative value: 'causal_impact_4'

# --- Model settings ----------------------------------------------------------
MODEL_VERSION = "v3"
# Kept as a string: it is interpolated verbatim into the CREATE MODEL SQL as
# the value of `enable_global_explain`.
XAI_FLAG = "TRUE"

# --- BigQuery destination ----------------------------------------------------
BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace("-", "_")  # dataset name is SERIES with '-' swapped for '_'
BQ_TABLE = EXPERIMENT

# --- BigQuery sources --------------------------------------------------------
BQ_SOURCE1 = "bigquery-public-data.new_york.citibike_trips"
BQ_SOURCE2 = "bigquery-public-data.new_york.citibike_stations"
# Existing table whose rows are copied into the new dataset by a later cell.
BQ_TABLE_COPY = "hybrid-vertex.causal_impact_4.control_group1_grp_a"

viz_limit = 12

In [3]:
from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

from google.cloud import aiplatform as vertex_ai

# Point the Vertex AI SDK at the project/region configured above; no explicit
# credentials object is passed.
vertex_ai.init(project=PROJECT_ID, location=REGION)

# BigQuery client used by every query cell in this notebook.
bq = bigquery.Client(project=PROJECT_ID)

In [4]:
# CUSTOMIZE: column roles in the source table
TARGET_COLUMN = "num_trips"            # label the model predicts
TIME_COLUMN = "starttime"              # date column
SERIES_COLUMN = "start_station_name"   # series identifier

# Extra feature columns. The original note says this list could be empty —
# TODO confirm every downstream query tolerates an empty list.
COVARIATE_COLUMNS = ["avg_tripduration", "pct_subscriber", "ratio_gender", "capacity"]

# Table (inside BQ_DATASET) that the queries read from.
BQ_TABLE_GROUP_A = "control_group1_grp_a"
# BQ_TABLE_GROUP_B="control_group1_grp_b"

# Model ID passed to BigQuery ML as `vertex_ai_model_id`.
VERTEX_AI_MODEL_ID = "v1_mlr_a"

## New BQ dataset

In [6]:
# Describe the target dataset locally: location is the literal 'us' (not
# REGION), and the dataset is labelled with the experiment name.
ds = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET}")
ds.location = "us"
ds.labels = {"notebook": f"{EXPERIMENT}"}

# exists_ok=True: an already-existing dataset is not treated as an error.
ds = bq.create_dataset(dataset=ds, exists_ok=True)

print(f" ds.dataset_id: {ds.dataset_id}")
print(f" ds.full_dataset_id: {ds.full_dataset_id}")

 ds.dataset_id: mlr_1
 ds.full_dataset_id: hybrid-vertex:mlr_1


In [7]:
# Build the DDL that (re)creates `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
# as a full copy (SELECT *) of the table named by BQ_TABLE_COPY.
# The statement is only printed here; the next cell submits it.
query = f"""
    CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}` AS (
        SELECT 
            * 
        FROM `{BQ_TABLE_COPY}`
    );
"""
print(query)
# BQ_TABLE_COPY is set to `hybrid-vertex.causal_impact_4.control_group1_grp_a`


    CREATE OR REPLACE TABLE `hybrid-vertex.mlr_1.control_group1_grp_a` AS (
        SELECT 
            * 
        FROM `hybrid-vertex.causal_impact_4.control_group1_grp_a`
    );



In [8]:
# Submit the SQL currently held in `query` and wait for it to finish.
job = bq.query(query)
job.result()

# Report the final job state and its run time in seconds.
job_duration = job.ended - job.started
print(job.state, job_duration.total_seconds())

DONE 1.513


## Key Dates

In [5]:
# Derive the date boundaries of the TRAIN / VALIDATE / TEST splits.
#   SPLIT            - min and max of TIME_COLUMN for each value of `splits`
#   TRAIN, VAL, TEST - pick start_date, val_start, and test_start/end_date from SPLIT
# The final SELECT tags each CTE with a ROW_NUMBER() `pos` and joins on it so
# the values land side by side; this presumes each CTE returns exactly one row
# (i.e. every split label is present in the table) — TODO confirm.
query = f"""
    WITH
        SPLIT AS (
            SELECT splits, min({TIME_COLUMN}) as mindate, max({TIME_COLUMN}) as maxdate
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
            GROUP BY splits
        ),
        TRAIN AS (
            SELECT mindate as start_date
            FROM SPLIT
            WHERE splits ='TRAIN'
        ),
        VAL AS (
            SELECT mindate as val_start
            FROM SPLIT
            WHERE splits = 'VALIDATE'
        ),
        TEST AS (
            SELECT mindate as test_start, maxdate as end_date
            FROM SPLIT
            WHERE splits = 'TEST'
        )
    SELECT * EXCEPT(pos) FROM
    (SELECT *, ROW_NUMBER() OVER() pos FROM TRAIN)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM VAL)
    USING (pos)
    JOIN (SELECT *, ROW_NUMBER() OVER() pos FROM TEST)
    USING (pos)
"""
# Result columns: start_date, val_start, test_start, end_date.
keyDates = bq.query(query).to_dataframe()
keyDates

Unnamed: 0,start_date,val_start,test_start,end_date
0,2013-07-01,2016-05-14,2016-07-23,2016-09-30


In [6]:
# Pull the series, time, target, split label and covariate columns for every
# row, ordered by series then time, into a DataFrame.
# NOTE(review): `rawSeries` is not referenced by any later cell visible in this
# part of the notebook — confirm it is still needed.
query = f"""
    SELECT {SERIES_COLUMN}, {TIME_COLUMN}, {TARGET_COLUMN}, splits,
        {', '.join(COVARIATE_COLUMNS)}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
    ORDER by {SERIES_COLUMN}, {TIME_COLUMN}
"""
rawSeries = bq.query(query).to_dataframe()

## Train MLR - Group A

In [7]:
# CUSTOMIZE: forecasting parameters.
# NOTE(review): none of these are referenced by the cells visible in this part
# of the notebook — confirm they are consumed elsewhere.
forecast_granularity = "DAY"
forecast_horizon = 7         # alternative value: 14
forecast_test_length = 14
# forecast_val_length = 14

## What does a bug look like?

In [8]:
# Build the CREATE MODEL statement for a BigQuery ML linear regression:
#   - label column: TARGET_COLUMN
#   - features: TIME_COLUMN plus COVARIATE_COLUMNS (SERIES_COLUMN is not selected)
#   - rows: only those whose `splits` is TRAIN or VALIDATE
#   - registered in Vertex AI under VERTEX_AI_MODEL_ID with aliases
#     MODEL_VERSION and 'experimental'
#   - global explainability switched by XAI_FLAG
# The statement is only printed here; the next cell submits it.
# NOTE(review): TIME_COLUMN is passed as a raw feature. The saved
# ML.FEATURE_INFO output in this notebook reports a category_count (and no
# min/max) for `starttime`, which suggests it is treated as a categorical
# feature; TEST-split dates would then be unseen categories. Confirm this is
# intended — it may explain the negative test r2_score.
query = f"""
    CREATE OR REPLACE MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`
    OPTIONS
      (
        model_type = 'linear_reg',
        input_label_cols = ['{TARGET_COLUMN}'],
        model_registry="vertex_ai", 
        vertex_ai_model_id='{VERTEX_AI_MODEL_ID}',
        vertex_ai_model_version_aliases=['{MODEL_VERSION}', 'experimental'],
        enable_global_explain={XAI_FLAG}
      ) AS
    SELECT {TIME_COLUMN}, {TARGET_COLUMN},
        {', '.join(COVARIATE_COLUMNS)}
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
    WHERE splits in ('TRAIN','VALIDATE')
"""
print(query)


    CREATE OR REPLACE MODEL `hybrid-vertex.mlr_1.control_group1_grp_a_mlr_v2`
    OPTIONS
      (
        model_type = 'linear_reg',
        input_label_cols = ['num_trips'],
        model_registry="vertex_ai", 
        vertex_ai_model_id='v1_mlr_a',
        vertex_ai_model_version_aliases=['v2', 'experimental'],
        enable_global_explain=TRUE
      ) AS
    SELECT starttime, num_trips,
        avg_tripduration, pct_subscriber, ratio_gender, capacity
    FROM `hybrid-vertex.mlr_1.control_group1_grp_a`
    WHERE splits in ('TRAIN','VALIDATE')



In [9]:
# Submit the SQL currently held in `query` (the CREATE MODEL statement when
# cells run in order) and wait for it to finish.
job = bq.query(query)
job.result()

# Report the final job state and its run time in seconds.
job_duration = job.ended - job.started
print(job.state, job_duration.total_seconds())

DONE 33.268


### Review Input Features

In [10]:
# Fetch ML.FEATURE_INFO for the trained model (one row per input feature) and
# preview the first rows.
query = f"""
    SELECT *
    FROM ML.FEATURE_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`)
"""
featureInfo = bq.query(query).to_dataframe()
featureInfo.head()

Unnamed: 0,input,min,max,mean,median,stddev,category_count,null_count,dimension
0,starttime,,,,,,1089.0,0,
1,avg_tripduration,116.666667,45776.5,817.984338,680.1875,1404.576554,,0,
2,pct_subscriber,0.0,1.0,0.921349,0.96,0.110237,,0,
3,ratio_gender,0.0,20.0,2.47887,2.083333,2.000844,,0,
4,capacity,0.0,91.0,40.583178,39.0,26.488621,,0,


In [12]:
# Fetch ML.TRAINING_INFO for the trained model (per-iteration loss, eval loss,
# learning rate and duration) and preview the first rows.
query = f"""
    SELECT *
    FROM ML.TRAINING_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`)
"""
trainingInfo = bq.query(query).to_dataframe()
trainingInfo.head()

Unnamed: 0,training_run,iteration,loss,eval_loss,learning_rate,duration_ms
0,0,5,3101.960828,6550.015469,0.8,2189
1,0,4,3123.625647,6601.246628,0.8,2075
2,0,3,3185.819015,6727.225416,0.8,4318
3,0,2,3382.700092,6983.005701,0.8,2149
4,0,1,4482.891787,7293.532606,0.4,2069


## Forecast Evaluation

In [13]:
# Evaluate the trained model with ML.EVALUATE on the held-out rows
# (splits = 'TEST'), supplying the same columns that were used for training.
query = f"""
    SELECT *
    FROM ML.EVALUATE(
        MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`,
        (
            SELECT {TIME_COLUMN}, {TARGET_COLUMN},
                {', '.join(COVARIATE_COLUMNS)}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
            WHERE splits = 'TEST'
        )
    )
"""
metrics = bq.query(query).to_dataframe()
metrics

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,45.043066,3550.440725,4.042181,31.721676,-0.587681,-0.074246


## Explainability

* Only applies if `XAI_FLAG` is set to `TRUE`

### local XAI

In [28]:
# Build the DDL that materialises ML.EXPLAIN_PREDICT output for the TEST rows
# into `{BQ_TABLE_GROUP_A}_xai_{MODEL_VERSION}`, keeping the top 3 feature
# attributions per row (top_k_features = 3). SERIES_COLUMN is passed through
# alongside the model inputs.
# The statement is only printed here; the next cell submits it.
query = f"""
    CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_xai_{MODEL_VERSION}` AS (
    SELECT
      *
    FROM
      ML.EXPLAIN_PREDICT(
          MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`,
          (
            SELECT 
                {TIME_COLUMN}, 
                {TARGET_COLUMN},
                {', '.join(COVARIATE_COLUMNS)},
                {SERIES_COLUMN}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}` 
            WHERE splits = 'TEST'
          ),
          STRUCT(3 as top_k_features)
        )
    );
"""
print(query)
# Disabled: loading the result into a DataFrame does not apply to this DDL cell.
# overview = bq.query(query).to_dataframe()
# overview.head(2)


    CREATE OR REPLACE TABLE `hybrid-vertex.mlr_1.control_group1_grp_a_xai_v2` AS (
    SELECT
      *
    FROM
      ML.EXPLAIN_PREDICT(
          MODEL `hybrid-vertex.mlr_1.control_group1_grp_a_mlr_v2`,
          (
            SELECT 
                starttime, 
                num_trips,
                avg_tripduration, pct_subscriber, ratio_gender, capacity,
                start_station_name
            FROM `hybrid-vertex.mlr_1.control_group1_grp_a` 
            WHERE splits = 'TEST'
          ),
          STRUCT(3 as top_k_features)
        )
    );



In [29]:
# Submit the SQL currently held in `query` (the EXPLAIN_PREDICT table DDL when
# cells run in order) and wait for it to finish.
job = bq.query(query)
job.result()

# Report the final job state and its run time in seconds.
job_duration = job.ended - job.started
print(job.state, job_duration.total_seconds())

DONE 1.365


### global XAI

In [22]:
# Fetch the model-level feature attributions via ML.GLOBAL_EXPLAIN.
# The model is created with enable_global_explain={XAI_FLAG}, so this
# presumably requires XAI_FLAG to be "TRUE" — TODO confirm.
query = f"""
    SELECT
      *
    FROM
      ML.GLOBAL_EXPLAIN(
          MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`
    )
"""
# print(query)
overview = bq.query(query).to_dataframe()
overview

Unnamed: 0,feature,attribution
0,capacity,53.368157
1,starttime,40.251806
2,ratio_gender,2.727573
3,pct_subscriber,0.282598
4,avg_tripduration,0.16612


In [None]:
# job = bq.query(query)
# job.result()
# print(job.state, (job.ended-job.started).total_seconds())

## Forecast Test Set

In [23]:
# Run ML.PREDICT and load the predictions (plus the pass-through input
# columns, including SERIES_COLUMN) into a DataFrame.
# NOTE(review): this query has no `WHERE splits = ...` filter, so it scores
# every row of the table rather than only the TEST split — confirm that is
# intended given the section heading.
query = f"""
    SELECT *
    FROM ML.PREDICT(
        MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`,
        (
            SELECT 
                {TIME_COLUMN}, 
                {TARGET_COLUMN},
                {', '.join(COVARIATE_COLUMNS)},
                {SERIES_COLUMN}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
        )
        )
"""
forecast = bq.query(query).to_dataframe()
forecast
# print(query)

Unnamed: 0,predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name
0,137.164834,2014-12-29,123,668.040650,0.934959,3.392857,91,Lafayette St & E 8 St
1,253.029117,2016-05-30,231,687.766234,0.878788,1.625000,91,Lafayette St & E 8 St
2,156.428962,2015-01-10,69,451.289855,1.000000,3.600000,91,Lafayette St & E 8 St
3,187.795808,2014-09-14,310,657.487097,1.000000,2.647059,91,Lafayette St & E 8 St
4,176.431228,2015-12-20,152,533.980263,0.953947,2.234043,91,Lafayette St & E 8 St
...,...,...,...,...,...,...,...,...
4254,21.582573,2016-07-11,43,1156.209302,0.837209,1.047619,0,Lafayette Ave & Fort Greene Pl
4255,10.896531,2015-05-09,43,1302.023256,0.767442,1.866667,0,Lafayette Ave & Fort Greene Pl
4256,-36.459672,2016-06-21,43,671.697674,0.976744,2.307692,0,Lafayette Ave & Fort Greene Pl
4257,-8.844810,2016-06-01,43,788.581395,0.953488,1.529412,0,Lafayette Ave & Fort Greene Pl


In [24]:
# CUSTOMIZE
# Materialise ML.PREDICT output for the TEST rows into
# `{BQ_TABLE_GROUP_A}_pred_Test`, then wait for the job and display its run
# time in seconds.
# NOTE(review): unlike the `_xai_{MODEL_VERSION}` table, this table name does
# not include MODEL_VERSION, so each model version overwrites the same table —
# confirm that is intended.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_pred_Test` AS (
    SELECT * FROM ML.PREDICT(
            MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_mlr_{MODEL_VERSION}`,
            (
                SELECT
                {TIME_COLUMN}, 
                {TARGET_COLUMN},
                {', '.join(COVARIATE_COLUMNS)},
                {SERIES_COLUMN}
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}`
                WHERE splits = 'TEST'
            )
            )
)
"""
job = bq.query(query = query)
job.result()
(job.ended-job.started).total_seconds()

1.454

# Register Model in Vertex AI

In [25]:
# Open the Vertex AI Model Registry entry for the model ID in
# `VERTEX_AI_MODEL_ID` — the same ID the CREATE MODEL statement passes as
# `vertex_ai_model_id`.
registry = vertex_ai.models.ModelRegistry(VERTEX_AI_MODEL_ID)

In [26]:
# List every version registered under this model and print when each one was
# created and which aliases point at it.
versions = registry.list_versions()

for version in versions:
    # Round-trip through a POSIX timestamp so the creation time becomes a
    # naive datetime in the kernel's local timezone before formatting.
    created_at = datetime.fromtimestamp(version.version_create_time.timestamp())
    version_id, version_aliases = version.version_id, version.version_aliases
    version_created_time = created_at.strftime("%m/%d/%Y %H:%M:%S")
    print(
        f"Model version {version_id} was created at {version_created_time} with aliases {version_aliases}",
    )

Getting versions for projects/hybrid-vertex/locations/us-central1/models/v1_mlr_a
Model version 1 was created at 05/04/2023 12:35:12 with aliases ['default']
Model version 2 was created at 05/04/2023 13:18:24 with aliases ['v2', 'experimental']


In [27]:
# Get the model by explicit version ID "1" and print the returned object.
model = registry.get_model(version="1")
print(model)

<google.cloud.aiplatform.models.Model object at 0x7fc512d1dc50> 
resource name: projects/934903580331/locations/us-central1/models/v1_mlr_a


In [27]:
# Get the model version by alias. Use the configured MODEL_VERSION — the alias
# the CREATE MODEL statement registers via `vertex_ai_model_version_aliases` —
# rather than a hard-coded "v2", which silently points at an older version once
# MODEL_VERSION is bumped.
model = registry.get_model(version=MODEL_VERSION)
print(model)

<google.cloud.aiplatform.models.Model object at 0x7fc9c40359d0> 
resource name: projects/934903580331/locations/us-central1/models/v1_mlr_a
