In [4]:
# !pip install scikit-uplift catboost
# pip install pycausalimpact
# pip install -U DoubleML

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1
REGION: us-central1
VERTEX_SA: jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com


In [2]:
REGION = 'us-central1'
EXPERIMENT = 'control_group1'
SERIES = 'causal_impact_4'

BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace('-','_')
BQ_TABLE = EXPERIMENT

BQ_SOURCE1 = 'bigquery-public-data.new_york.citibike_trips'
BQ_SOURCE2 = 'bigquery-public-data.new_york.citibike_stations'

viz_limit = 12

### packages & client SDK

In [3]:
from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta
import seaborn as sns

# Causal impact
from causalimpact import CausalImpact

# from uplift_utils import 
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds

from google.cloud import aiplatform as vertex_ai

bq = bigquery.Client(project=PROJECT_ID)

vertex_ai.init(
    project=PROJECT_ID, 
    location=REGION,
    # credentials=credentials
)

## Test Set Predictions 

In [4]:
# CUSTOMIZE
TARGET_COLUMN = 'num_trips'
TIME_COLUMN = 'starttime'
SERIES_COLUMN = 'start_station_name'
COVARIATE_COLUMNS = [
    'avg_tripduration', 
    'pct_subscriber', 
    'ratio_gender', 
    'capacity'
] # could be empty

BQ_TABLE_GROUP_A="control_group1_grp_a"
BQ_TABLE_GROUP_B="control_group1_grp_b"

GROUP_A_PREDS_BQ_URI=f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_A}_pred_Test'
GROUP_B_PREDS_BQ_URI=f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_GROUP_B}_pred_Test'

print(f"GROUP_A_PREDS_BQ_URI: {GROUP_A_PREDS_BQ_URI}")
print(f"GROUP_B_PREDS_BQ_URI: {GROUP_B_PREDS_BQ_URI}")

GROUP_A_PREDS_BQ_URI: hybrid-vertex.causal_impact_4.control_group1_grp_a_pred_Test
GROUP_B_PREDS_BQ_URI: hybrid-vertex.causal_impact_4.control_group1_grp_b_pred_Test


In [5]:
query = f"""
    SELECT * 
    FROM `{GROUP_A_PREDS_BQ_URI}`
    ORDER BY starttime ASC;
"""
groupa_test_preds = bq.query(query = query).to_dataframe()

groupa_test_preds['residuals'] = groupa_test_preds['num_trips'] - groupa_test_preds['predicted_num_trips']

groupa_test_preds = groupa_test_preds.rename(columns={'predicted_num_trips': 'a_predicted_num_trips', 'residuals': 'a_residuals'})
# groupa_test_preds.columns = 'a_' + groupa_test_preds.columns

print(f"Shape: {groupa_test_preds.shape}")
groupa_test_preds.head(10)

Shape: (208, 9)


Unnamed: 0,a_predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name,a_residuals
0,-12.221727,2016-07-23,8,867.75,0.875,3.0,23,Marcy Ave & Lafayette Ave,20.221727
1,-5.718968,2016-07-23,28,738.0,0.928571,4.6,27,Lafayette Ave & Classon Ave,33.718968
2,29.382119,2016-07-23,19,759.736842,0.842105,2.166667,39,Lafayette Ave & St James Pl,-10.382119
3,62.312957,2016-07-23,44,786.636364,0.727273,0.833333,51,Clermont Ave & Lafayette Ave,-18.312957
4,-8.922878,2016-07-24,18,968.777778,0.944444,1.25,23,Marcy Ave & Lafayette Ave,26.922878
5,30.613588,2016-07-24,26,631.115385,0.846154,1.6,39,Lafayette Ave & St James Pl,-4.613588
6,-22.154098,2016-07-25,11,12164.818182,0.909091,4.5,23,Marcy Ave & Lafayette Ave,33.154098
7,-1.082576,2016-07-25,24,1065.833333,0.833333,2.428571,27,Lafayette Ave & Classon Ave,25.082576
8,-7.983626,2016-07-26,20,837.9,0.85,1.0,23,Marcy Ave & Lafayette Ave,27.983626
9,-5.964148,2016-07-26,34,660.382353,0.970588,4.666667,27,Cumberland St & Lafayette Ave,39.964148


In [6]:
query = f"""
    SELECT * 
    FROM `{GROUP_B_PREDS_BQ_URI}`
    ORDER BY starttime ASC;
"""
groupb_test_preds = bq.query(query = query).to_dataframe()

groupb_test_preds['residuals'] = groupb_test_preds['num_trips'] - groupb_test_preds['predicted_num_trips']

groupb_test_preds = groupb_test_preds.rename(columns={'predicted_num_trips': 'b_predicted_num_trips', 'residuals': 'b_residuals'})
# groupb_test_preds.columns = 'b_' + groupb_test_preds.columns

print(f"Shape: {groupb_test_preds.shape}")
groupb_test_preds.head(10)

Shape: (255, 9)


Unnamed: 0,b_predicted_num_trips,starttime,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name,b_residuals
0,-52.023356,2016-07-23,32,822.65625,0.875,1.666667,0,Lafayette Ave & Fort Greene Pl,84.023356
1,6.26119,2016-07-23,45,612.688889,0.933333,1.25,27,Cumberland St & Lafayette Ave,38.73881
2,-49.030197,2016-07-24,39,1334.333333,0.794872,0.772727,0,Lafayette Ave & Fort Greene Pl,88.030197
3,5.785577,2016-07-24,23,783.391304,0.913043,1.875,27,Lafayette Ave & Classon Ave,17.214423
4,5.987348,2016-07-24,26,537.5,0.923077,1.6,27,Cumberland St & Lafayette Ave,20.012652
5,63.452991,2016-07-24,19,607.368421,0.736842,0.9,51,Clermont Ave & Lafayette Ave,-44.452991
6,-51.130414,2016-07-25,31,1037.16129,0.83871,1.583333,0,Lafayette Ave & Fort Greene Pl,82.130414
7,5.322646,2016-07-25,28,801.107143,0.892857,2.5,27,Cumberland St & Lafayette Ave,22.677354
8,28.164222,2016-07-25,21,793.333333,1.0,3.2,39,Lafayette Ave & St James Pl,-7.164222
9,57.676542,2016-07-25,25,630.64,0.92,2.125,51,Clermont Ave & Lafayette Ave,-32.676542


### group by station name

In [7]:
a_df = groupa_test_preds.groupby('start_station_name').agg(
    a_pred_sum=pd.NamedAgg(column="a_predicted_num_trips", aggfunc="sum")
).reset_index()

a_df.head(3)

Unnamed: 0,start_station_name,a_pred_sum
0,Clermont Ave & Lafayette Ave,1785.538084
1,Cumberland St & Lafayette Ave,-24.806472
2,Lafayette Ave & Classon Ave,-18.876421


In [8]:
b_df = groupb_test_preds.groupby('start_station_name').agg(
    b_pred_sum=pd.NamedAgg(column="b_predicted_num_trips", aggfunc="sum")
).reset_index()

b_df.head(3)

Unnamed: 0,start_station_name,b_pred_sum
0,Clermont Ave & Lafayette Ave,2339.678094
1,Cumberland St & Lafayette Ave,193.382301
2,Lafayette Ave & Classon Ave,167.054322


### merge & calculate incremental difference

In [9]:
new_df = pd.merge(a_df, b_df,  how='left', left_on='start_station_name', right_on ='start_station_name')

new_df['incremental_b'] = new_df['b_pred_sum'] - new_df['a_pred_sum']

new_df

Unnamed: 0,start_station_name,a_pred_sum,b_pred_sum,incremental_b
0,Clermont Ave & Lafayette Ave,1785.538084,2339.678094,554.14001
1,Cumberland St & Lafayette Ave,-24.806472,193.382301,218.188773
2,Lafayette Ave & Classon Ave,-18.876421,167.054322,185.930743
3,Lafayette Ave & Fort Greene Pl,-2145.398416,-1970.228071,175.170345
4,Lafayette Ave & St James Pl,963.41111,1207.991062,244.579952
5,Lafayette St & E 8 St,795.12188,1173.510689,378.388809
6,Lafayette St & Jersey St,899.289054,1747.289176,848.000121
7,Marcy Ave & Lafayette Ave,-392.276526,-59.367348,332.909177


### Visualize series

In [30]:
test_b_df=groupb_test_preds.set_index('starttime')
test_a_df=groupa_test_preds.set_index('starttime')

test_a_df['control_grp'] = 0
test_b_df['control_grp'] = 1

test_a_df=test_a_df.rename(columns={"a_predicted_num_trips":"predicted_num_trips"})
test_b_df=test_b_df.rename(columns={"b_predicted_num_trips":"predicted_num_trips"})

print(f"test_b_df shape: {test_b_df.shape}")
print(f"test_a_df shape: {test_a_df.shape}")

test_b_df shape: (255, 9)
test_a_df shape: (208, 9)


In [31]:
test_a_df

Unnamed: 0_level_0,predicted_num_trips,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name,a_residuals,control_grp
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-07-23,-12.221727,8,867.750000,0.875000,3.000000,23,Marcy Ave & Lafayette Ave,20.221727,0
2016-07-23,-5.718968,28,738.000000,0.928571,4.600000,27,Lafayette Ave & Classon Ave,33.718968,0
2016-07-23,29.382119,19,759.736842,0.842105,2.166667,39,Lafayette Ave & St James Pl,-10.382119,0
2016-07-23,62.312957,44,786.636364,0.727273,0.833333,51,Clermont Ave & Lafayette Ave,-18.312957,0
2016-07-24,-8.922878,18,968.777778,0.944444,1.250000,23,Marcy Ave & Lafayette Ave,26.922878,0
...,...,...,...,...,...,...,...,...,...
2016-09-28,-66.498403,53,672.962264,0.905660,1.650000,0,Lafayette Ave & Fort Greene Pl,119.498403,0
2016-09-29,-66.915263,49,794.204082,0.959184,1.722222,0,Lafayette Ave & Fort Greene Pl,115.915263,0
2016-09-29,59.737487,29,652.413793,1.000000,1.636364,51,Clermont Ave & Lafayette Ave,-30.737487,0
2016-09-29,79.934955,225,646.822222,0.955556,2.750000,60,Lafayette St & Jersey St,145.065045,0


In [32]:
frames = [test_b_df,test_a_df]

combined_df = pd.concat(frames)
combined_df

Unnamed: 0_level_0,predicted_num_trips,num_trips,avg_tripduration,pct_subscriber,ratio_gender,capacity,start_station_name,b_residuals,control_grp,a_residuals
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-07-23,-52.023356,32,822.656250,0.875000,1.666667,0,Lafayette Ave & Fort Greene Pl,84.023356,1,
2016-07-23,6.261190,45,612.688889,0.933333,1.250000,27,Cumberland St & Lafayette Ave,38.73881,1,
2016-07-24,-49.030197,39,1334.333333,0.794872,0.772727,0,Lafayette Ave & Fort Greene Pl,88.030197,1,
2016-07-24,5.785577,23,783.391304,0.913043,1.875000,27,Lafayette Ave & Classon Ave,17.214423,1,
2016-07-24,5.987348,26,537.500000,0.923077,1.600000,27,Cumberland St & Lafayette Ave,20.012652,1,
...,...,...,...,...,...,...,...,...,...,...
2016-09-28,-66.498403,53,672.962264,0.905660,1.650000,0,Lafayette Ave & Fort Greene Pl,,0,119.498403
2016-09-29,-66.915263,49,794.204082,0.959184,1.722222,0,Lafayette Ave & Fort Greene Pl,,0,115.915263
2016-09-29,59.737487,29,652.413793,1.000000,1.636364,51,Clermont Ave & Lafayette Ave,,0,-30.737487
2016-09-29,79.934955,225,646.822222,0.955556,2.750000,60,Lafayette St & Jersey St,,0,145.065045


In [39]:
# from sklift.metrics import uplift_at_k

# # k = 10%
# k = 0.1 

# y_val = combined_df['num_trips']
# preds = combined_df['predicted_num_trips']
# treatment = combined_df['control_grp']
# # grp_a_preds = test_a_df['a_predicted_num_trips']
# # grp_b_preds = test_b_df['b_predicted_num_trips']


# # strategy='overall' sort by uplift treatment and control together
# uplift_overall = uplift_at_k(y_val.values, preds.values, treatment.values, strategy='overall', k=k)

# # strategy='by_group' sort by uplift treatment and control separately
# uplift_bygroup = uplift_at_k(y_val.values, preds.values, treatment.values, strategy='by_group', k=k)


# print(f"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)")
# print(f"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)")

In [41]:
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel

# y_val = combined_df['num_trips']
# preds = combined_df['predicted_num_trips']
# treatment = combined_df['control_grp']

grp_a_preds = test_a_df['predicted_num_trips']
grp_b_preds = test_b_df['predicted_num_trips']

plot_uplift_preds(trmnt_preds=test_b_df, ctrl_preds=grp_a_preds);

ValueError: Found input variables with inconsistent numbers of samples: [255, 208]

### raw differences

In [None]:
# Calculate the pre-daily average
pre_daily_avg = df['y'][:300].mean()

# Calculate the post-daily average
post_daily_avg = df['y'][300:].mean()

# Print out the results
print(f'The pre-treatment daily average is {pre_daily_avg}.')
print(f'The post-treatment daily average is {post_daily_avg}.')
print(f'The raw difference between the pre and the post treatment is {post_daily_avg - pre_daily_avg}.')