In [45]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


In [46]:
# --- experiment identifiers ---
REGION = "us-central1"
SERIES, EXPERIMENT = "causal_impact_3", "control_group1"

# --- BigQuery destination: project / dataset / table ---
# The dataset name is the series name with every hyphen turned into an underscore.
BQ_PROJECT = PROJECT_ID
BQ_DATASET = "_".join(SERIES.split("-"))
BQ_TABLE = EXPERIMENT

# --- public source tables (NYC Citi Bike trips and stations) ---
BQ_SOURCE1 = "bigquery-public-data.new_york.citibike_trips"
BQ_SOURCE2 = "bigquery-public-data.new_york.citibike_stations"

# Limit used by visualisation code (not referenced in the cells shown here).
viz_limit = 12

### Packages & client SDK setup

In [47]:
from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta

from google.cloud import aiplatform as vertex_ai

# BigQuery client bound to the notebook's project; used by the query cells below.
bq = bigquery.Client(project=PROJECT_ID)

# Initialise the Vertex AI SDK for the same project and region.
# No explicit `credentials` argument is passed.
vertex_ai.init(project=PROJECT_ID, location=REGION)

## Test Set Predictions 

In [49]:
# CUSTOMIZE
# Column roles in the prediction tables.
TARGET_COLUMN = 'num_trips'              # value being forecast
TIME_COLUMN = 'starttime'                # timestamp column
SERIES_COLUMN = 'start_station_name'     # one time series per station
COVARIATE_COLUMNS = [
    'avg_tripduration', 
    'pct_subscriber', 
    'ratio_gender', 
    # 'capacity'
] # could be empty

# Derive the per-group table names from the experiment table name instead of
# repeating the literal, so changing EXPERIMENT / BQ_TABLE keeps these in sync.
BQ_TABLE_GROUP_A = f"{BQ_TABLE}_grp_a"
BQ_TABLE_GROUP_B = f"{BQ_TABLE}_grp_b"

# Fully qualified BigQuery tables holding each group's test-set predictions.
GROUP_A_PREDS_BQ_URI = f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_pred_Test'
GROUP_B_PREDS_BQ_URI = f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_pred_Test'

print(f"GROUP_A_PREDS_BQ_URI: {GROUP_A_PREDS_BQ_URI}")
print(f"GROUP_B_PREDS_BQ_URI: {GROUP_B_PREDS_BQ_URI}")

GROUP_A_PREDS_BQ_URI: hybrid-vertex.causal_impact_3.control_group1_grp_a_pred_Test
GROUP_B_PREDS_BQ_URI: hybrid-vertex.causal_impact_3.control_group1_grp_b_pred_Test


In [50]:
# Load group A's test-set predictions, ordered by the configured time column.
# Column names come from TIME_COLUMN / TARGET_COLUMN so this cell follows the
# "CUSTOMIZE" cell instead of hard-coding 'starttime' / 'num_trips'.
query = f"""
    SELECT * 
    FROM `{GROUP_A_PREDS_BQ_URI}`
    ORDER BY {TIME_COLUMN} ASC;
"""

groupa_test_preds = (
    bq.query(query=query)
    .to_dataframe()
    # Prefix the prediction column with the group so A and B can coexist later.
    .rename(columns={f'predicted_{TARGET_COLUMN}': f'a_predicted_{TARGET_COLUMN}'})
    # Residual = actual - predicted; appended as the last column.
    .assign(
        a_residuals=lambda df: df[TARGET_COLUMN] - df[f'a_predicted_{TARGET_COLUMN}']
    )
)

print(f"Shape: {groupa_test_preds.shape}")
groupa_test_preds.head(10)

Shape: (1295, 8)


Unnamed: 0,a_predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,start_station_name,a_residuals
0,-2957.157774,2016-07-23,212,813.721698,0.872642,1.494118,Broadway & E 22 St,3169.157774
1,-2967.845218,2016-07-23,113,1041.672566,0.716814,1.132075,Broadway & W 55 St,3080.845218
2,-2967.200808,2016-07-23,136,1843.742647,0.727941,1.060606,W 84 St & Broadway,3103.200808
3,-2968.247497,2016-07-23,31,861.451613,0.709677,0.9375,Reade St & Broadway,2999.247497
4,-2949.098991,2016-07-23,10,741.0,1.0,4.0,Broadway & Whipple St,2959.098991
5,-2960.446862,2016-07-23,34,632.5,0.823529,1.266667,Broadway & Roebling St,2994.446862
6,-2966.720583,2016-07-23,123,1036.398374,0.731707,0.808824,W Broadway & Spring St,3089.720583
7,-2958.72996,2016-07-23,13,443.538462,0.846154,0.857143,Boerum St & Broadway,2971.72996
8,-2966.321338,2016-07-23,61,748.377049,0.737705,1.033333,Broadway & Berry St,3027.321338
9,-2977.510764,2016-07-23,66,1065.166667,0.575758,0.885714,Broadway & W 41 St,3043.510764


In [33]:
# Load group B's test-set predictions, ordered by the configured time column.
# Column names come from TIME_COLUMN / TARGET_COLUMN so this cell follows the
# "CUSTOMIZE" cell instead of hard-coding 'starttime' / 'num_trips'.
query = f"""
    SELECT * 
    FROM `{GROUP_B_PREDS_BQ_URI}`
    ORDER BY {TIME_COLUMN} ASC;
"""

groupb_test_preds = (
    bq.query(query=query)
    .to_dataframe()
    # Prefix the prediction column with the group so A and B can coexist later.
    .rename(columns={f'predicted_{TARGET_COLUMN}': f'b_predicted_{TARGET_COLUMN}'})
    # Residual = actual - predicted; appended as the last column.
    .assign(
        b_residuals=lambda df: df[TARGET_COLUMN] - df[f'b_predicted_{TARGET_COLUMN}']
    )
)

print(f"Shape: {groupb_test_preds.shape}")
groupb_test_preds.head(10)

Shape: (267, 9)


Unnamed: 0,b_predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name,b_residuals
0,13.428862,2016-09-17,298,873.449664,0.832215,1.504202,,Broadway & E 22 St,284.571138
1,3.698592,2016-09-17,67,1060.716418,0.701493,1.791667,,Broadway & W 39 St,63.301408
2,3.375005,2016-09-17,131,1305.030534,0.679389,1.046875,,Broadway & W 55 St,127.624995
3,11.239987,2016-09-17,97,921.597938,0.804124,1.621622,,E 11 St & Broadway,85.760013
4,10.05345,2016-09-17,109,921.522936,0.779817,1.319149,,W 78 St & Broadway,98.94655
5,-31.674911,2016-09-17,25,773.36,0.88,1.083333,19.0,Boerum St & Broadway,56.674911
6,-20.431837,2016-09-17,25,568.72,0.88,0.785714,27.0,Broadway & Whipple St,45.431837
7,-16.363075,2016-09-17,147,843.564626,0.891156,1.94,31.0,W 67 St & Broadway,163.363075
8,-19.191555,2016-09-17,85,2013.847059,0.776471,1.073171,34.0,Broadway & Berry St,104.191555
9,-11.128003,2016-09-17,151,890.86755,0.761589,1.359375,41.0,Broadway & W 29 St,162.128003


### Sum predictions per station

In [41]:
# Total predicted trips per start station for group A.
# Result columns: start_station_name, a_pred_sum.
a_df = (
    groupa_test_preds
    .groupby('start_station_name')['a_predicted_num_trips']
    .sum()
    .rename('a_pred_sum')
    .reset_index()
)

a_df.head(3)

Unnamed: 0,start_station_name,a_pred_sum
0,Boerum St & Broadway,4195.914201
1,Broadway & Battery Pl,11530.971892
2,Broadway & Berry St,8548.403265


In [42]:
# Total predicted trips per start station for group B.
# Result columns: start_station_name, b_pred_sum.
b_df = (
    groupb_test_preds
    .groupby('start_station_name')['b_predicted_num_trips']
    .sum()
    .rename('b_pred_sum')
    .reset_index()
)

b_df.head(3)

Unnamed: 0,start_station_name,b_pred_sum
0,Boerum St & Broadway,-367.518657
1,Broadway & Battery Pl,-33.503853
2,Broadway & Berry St,-117.810246


### Merge groups & calculate the incremental difference (B − A)

In [44]:
# Left-join group B's per-station totals onto group A's (every group A station
# is kept), then compute B minus A as the incremental value.
# NOTE(review): the recorded outputs above show different first dates and row
# counts for the two prediction tables -- confirm both sums cover comparable
# periods before interpreting `incremental_b`.
new_df = (
    a_df
    .merge(b_df, how='left', on='start_station_name')
    .assign(incremental_b=lambda merged: merged['b_pred_sum'] - merged['a_pred_sum'])
)

new_df

Unnamed: 0,start_station_name,a_pred_sum,b_pred_sum,incremental_b
0,Boerum St & Broadway,4195.914201,-367.518657,-4563.432858
1,Broadway & Battery Pl,11530.971892,-33.503853,-11564.475746
2,Broadway & Berry St,8548.403265,-117.810246,-8666.21351
3,Broadway & E 14 St,12273.747481,581.402222,-11692.345259
4,Broadway & E 22 St,7278.863541,160.414371,-7118.44917
5,Broadway & Roebling St,15576.42358,-93.050994,-15669.474574
6,Broadway & W 24 St,10135.377638,118.363974,-10017.013664
7,Broadway & W 29 St,7178.811482,-53.616826,-7232.428308
8,Broadway & W 32 St,13087.107593,78.36815,-13008.739443
9,Broadway & W 36 St,10352.345257,289.749371,-10062.595886
