In [125]:
import pandas as pd
import numpy as np
import networkx as nx
from dowhy import CausalModel

In [85]:
df = pd.read_csv('data/new_dataset.csv')
df = df.drop(columns=['order_ships', 'first_3_events'])

In [86]:
df.head(3)

Unnamed: 0,num_journeys,max_journey,order_shipped,discover,approved_credit,first_purchase,account_activitation,downpayment_received,downpayment_cleared,max_milestone,number_accounts,one_more_journey,most_repeated_event,average_length_seq
0,1,27,False,True,True,False,False,False,False,1.0,1,False,browse_products,24.0
1,1,5,False,True,True,False,False,False,False,1.0,1,False,application_web_approved,3.0
2,1,30,True,True,True,True,True,True,True,6.0,1,False,view_cart,30.0


In [None]:
pd.read_csv('data/Event Definitions.csv')

In [110]:
# Establish a naive causal graph, can add more in the future

'''
graph = nx.DiGraph([('discover', 'first_purchase'), ('first_purchase', 'account_activation'), ('account_activation', 'downpayment_received'), ('downpayment_received', 'downpayment_cleared'), ('downpayment_cleared', 'order_shipped')])
graph_gml = ''.join(nx.generate_gml(graph))
'''

model = CausalModel(
    data=df,
    treatment='discover',
    outcome='order_shipped',
    common_causes=['num_journeys', 'max_journey', 'max_milestone', 'number_accounts', 'one_more_journey', 'most_repeated_event', 'average_length_seq'],
    effect_modifiers=['first_purchase', 'account_activation', 'downpayment_received', 'downpayment_cleared']
    # graph=graph_gml
)

In [111]:
identified_estimand = model.identify_effect()

In [112]:
estimate = model.estimate_effect(identified_estimand=identified_estimand, method_name='backdoor.linear_regression')

linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': ['first_purchase', 'account_activation', 'downpayment_received', 'downpayment_cleared']}


In [113]:
estimate.interpret()

Increasing the treatment variable(s) [discover] from 0 to 1 causes an increase of -0.0003121286570691717 in the expected value of the outcome [order_shipped], over the data distribution/population represented by the dataset.


In [114]:
estimate.test_stat_significance()

{'p_value': array([4.899196e-10])}

In [None]:
estimate.get_confidence_intervals()

(−0.00152676624930032, 0.00114034686083087)

In [None]:
refute_res = model.refute_estimate(estimand=identified_estimand, estimate=estimate, method_name='random_common_cause', show_progress_bar=True)

In [123]:
print(refute_res)

Refute: Add a random common cause
Estimated effect:-0.0003121286570691717
New effect:-0.00031223762868127223
p value:0.8999999999999999



P value for refute_res > 0.05, meaning that linear regression is reasonably robust to refutation