In [43]:
import pandas as pd
import numpy as np
from utils import *
import warnings

In [5]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# raised by later cells -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [6]:
# Load the raw event sample and the event-definition lookup table
# (both paths are relative to the notebook's working directory).
data = pd.read_csv('smaller_sample.csv')
events = pd.read_csv('event_definitions.csv')
# NOTE(review): stale, disabled line -- `df` is not defined at this point in the notebook.
# df = df.drop(columns=['order_ships', 'first_3_events'])

In [25]:
def data_cleaner(og_df, defs):
    """Select, de-duplicate and enrich the raw event log.

    Parameters
    ----------
    og_df : pandas.DataFrame
        Raw events. Must contain the columns ``customer_id``, ``account_id``,
        ``ed_id``, ``event_name``, ``event_timestamp``,
        ``journey_steps_until_end``, ``milestone_number`` and ``journey_id``.
        It is not modified.
    defs : pandas.DataFrame
        Event definitions. Must contain ``event_name`` and ``stage``.

    Returns
    -------
    pandas.DataFrame
        A new frame with missing ``milestone_number`` filled with 0, duplicate
        events removed, ``journey_steps_until_end`` passed through
        ``correct_sequences``, ``event_timestamp`` parsed to datetime, a
        ``stage`` column joined in from ``defs``, and ``account_id`` /
        ``customer_id`` replaced by the output of ``remove_if``.
    """
    kept_columns = [
        'customer_id',
        'account_id',
        'ed_id',
        'event_name',
        'event_timestamp',
        'journey_steps_until_end',
        'milestone_number',
        'journey_id',
    ]
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a selection of og_df (the pattern behind SettingWithCopyWarning).
    df = og_df[kept_columns].copy()

    df['milestone_number'] = df['milestone_number'].fillna(0)

    # The same event can appear more than once; keep the first occurrence.
    df = df.drop_duplicates(subset=['customer_id', 'account_id', 'ed_id', 'event_name', 'event_timestamp'])
    df = df.reset_index(drop=True)

    # correct_sequences comes from utils; presumably it repairs the step
    # counter -- TODO confirm against utils.
    df['journey_steps_until_end'] = correct_sequences(df['journey_steps_until_end'])

    df['event_timestamp'] = pd.to_datetime(df['event_timestamp'])

    # Attach the stage of each event.
    # NOTE(review): if defs contains repeated event_name values this left merge
    # duplicates event rows -- confirm defs is unique per event_name.
    df_stages = defs[['event_name', 'stage']]
    df = pd.merge(df, df_stages, on='event_name', how='left')

    # remove_if comes from utils; its exact semantics are not visible here.
    df['account_id'] = remove_if(df, 'account_id')
    df['customer_id'] = remove_if(df, 'customer_id')

    return df


def get_time_since_last_event(cust_df, n=10):
    """Return the seconds between consecutive events for the first ``n`` rows.

    Differences are taken within each (``customer_id``, ``journey_id``) group
    of ``event_timestamp``; the first event of every group gets 0. When fewer
    than ``n`` rows are available the result is right-padded with zeros.

    Parameters
    ----------
    cust_df : pandas.DataFrame
        Events with ``customer_id``, ``journey_id`` and a datetime
        ``event_timestamp`` column.
    n : int, default 10
        Number of leading rows to use.

    Returns
    -------
    numpy.ndarray
        One entry per row considered, followed by zero padding.
    """
    first_rows = cust_df.head(n)
    gaps = first_rows.groupby(['customer_id', 'journey_id'])['event_timestamp'].diff()
    # A group's first event has no predecessor (NaT); treat that as a zero gap.
    seconds = gaps.fillna(pd.Timedelta(seconds=0)).dt.total_seconds().tolist()
    padding = [0] * (n - len(seconds))
    return np.array(seconds + padding)


def classification_dataset(data, event_defs, n_events = 10):
    """Build the per-customer feature table used for modelling.

    The raw events are cleaned with ``data_cleaner``, ``promotion_created``
    events are discarded, and the remaining events are grouped by
    ``customer_id``. ``group_by_approach`` (from utils) produces the feature
    rows, and ``get_first_n_events`` (from utils) supplies the values stored
    in the ``first_<n_events>_events`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw event log, as accepted by ``data_cleaner``.
    event_defs : pandas.DataFrame
        Event definitions, as accepted by ``data_cleaner``.
    n_events : int, default 10
        Passed to ``get_first_n_events`` as ``n`` and used in the column name.

    Returns
    -------
    pandas.DataFrame
        Output of the grouped ``group_by_approach`` call without its ``index``
        column, plus the ``first_<n_events>_events`` column.
    """
    cleaned = data_cleaner(data, event_defs)

    # Discard promotion_created events and renumber the remaining rows.
    is_promotion = cleaned['event_name'] == 'promotion_created'
    events_df = cleaned[~is_promotion].reset_index(drop=True)

    per_customer = events_df.groupby('customer_id')

    new_df = per_customer.apply(group_by_approach)
    new_df = new_df.drop(columns=['index'])

    # Assigned positionally: relies on both grouped calls yielding customers
    # in the same order.
    first_events = list(per_customer.apply(get_first_n_events, n = n_events))
    new_df['first_' + str(n_events) + '_events'] = first_events

    # Disabled feature, kept for reference:
    # x = list(df.groupby('customer_id').apply(get_time_since_last_event, n = n_events))
    # new_df['time_since_last_event'] = x

    return new_df

In [None]:
# Number of leading events kept per customer, and the name of the column
# that classification_dataset creates for them.
number_events_fixed = 15
col_name = f'first_{number_events_fixed}_events'

df = classification_dataset(data, events, n_events=number_events_fixed)
# Display only: shows the frame with alphabetically ordered columns; df itself is unchanged.
df.reindex(columns=sorted(df.columns))

In [None]:
# The first element of every index entry is taken as the customer id; capture
# it before the index is discarded, then store it as a regular column.
# NOTE: not re-runnable -- after the reset the index entries are plain integers.
cust_ids = [index_entry[0] for index_entry in df.index]
df = df.reset_index(drop=True)
df['customer_id'] = cust_ids

In [33]:
# List the columns currently present in df.
df.columns

Index(['num_journeys', 'max_journey', 'discover', 'number_accounts',
       'one_more_journey', 'most_repeated_event', 'average_length_seq',
       'approved_credit', 'first_purchase', 'account_activitation',
       'downpayment_received', 'downpayment_cleared', 'order_ships',
       'max_milestone', 'has_prospecting', 'has_pre_application',
       'initial_device', 'time_in_discover', 'time_in_apply',
       'first_15_events', 'customer_id'],
      dtype='object')

In [37]:
# The outcome studied is "downpayment received", since this leads to orders
# being shipped; the columns below are therefore removed.
df = df.drop(columns=[
    'order_ships',
    'max_milestone',
    'downpayment_cleared',
    'first_purchase',
    'customer_id',
])

In [46]:
# Remove the first-N-events column. col_name is built from number_events_fixed
# in an earlier cell, so this stays correct if the number of events changes
# (the previous hard-coded 'first_15_events' would not).
df = df.drop(columns=col_name)

In [53]:
# Rescale both duration columns by 1/3600 (presumably seconds -> hours) to
# prevent large numbers.
# NOTE: not idempotent -- re-running this cell divides the values again.
df['time_in_apply'] = df['time_in_apply'] / 3600
df['time_in_discover'] = df['time_in_discover'] / 3600

# Using Causal ML

In [100]:
from causalml.inference.meta import LRSRegressor, MLPTRegressor, XGBTRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor, ElasticNet, LogisticRegression

# Keep only rows where initial_device is known.
df = df[df['initial_device'].notna()]

# Treatment and outcome as integers; every remaining column is a covariate.
treatment = df['discover'].astype(int)
target = df['downpayment_received'].astype(int)
X = df.drop(columns=['discover', 'downpayment_received'])

nn_regressor = MLPTRegressor(
    hidden_layer_sizes=(50, 50),
    learning_rate_init=.05,
    early_stopping=True,
    random_state=2024,
)
(treatment_effects,
 lower_bound,
 upper_bound) = nn_regressor.estimate_ate(X=X, treatment=treatment, y=target)
print(f'Average Treatment Effect: {treatment_effects[0]:.5f} \n Lower Bound: {lower_bound[0]:.5f}, Upper Bound: {upper_bound[0]:.5f}')

Average Treatment Effect: -0.00108 
 Lower Bound: -0.00403, Upper Bound: 0.00188


In [101]:
# Same covariates, treatment and outcome as the previous cell, estimated with
# the XGBoost-based regressor.
xgb_regressor = XGBTRegressor(random_state=2024)
(treatment_effects,
 lower_bound,
 upper_bound) = xgb_regressor.estimate_ate(X=X, treatment=treatment, y=target)
print(f'Average Treatment Effect: {treatment_effects[0]:.5f} \n Lower Bound: {lower_bound[0]:.5f}, Upper Bound: {upper_bound[0]:.5f}')

Average Treatment Effect: -0.03264 
 Lower Bound: -0.03496, Upper Bound: -0.03032


**TODO:** try some other regressors and run a placebo test.

# Using DoWhy

In [None]:
from dowhy import CausalModel

In [110]:
# Establish a naive causal graph, can add more in the future

# The bare triple-quoted string below is a disabled explicit-graph definition;
# it is evaluated as a string expression and discarded.
'''
graph = nx.DiGraph([('discover', 'first_purchase'), ('first_purchase', 'account_activation'), ('account_activation', 'downpayment_received'), ('downpayment_received', 'downpayment_cleared'), ('downpayment_cleared', 'order_shipped')])
graph_gml = ''.join(nx.generate_gml(graph))
'''

# NOTE(review): this cell uses the columns 'order_shipped', 'account_activation',
# 'max_milestone', 'first_purchase' and 'downpayment_cleared'. The column listing
# printed earlier in this notebook shows 'order_ships' and 'account_activitation'
# instead, and an earlier cell drops 'order_ships', 'max_milestone',
# 'downpayment_cleared' and 'first_purchase' from df. The recorded outputs use the
# names given here, so this cell was presumably run against a different state of
# df -- confirm, and make the cell reproducible from a fresh kernel.
# NOTE(review): the effect_modifiers are the intermediate steps of the disabled
# graph above (on the path from 'discover' to 'order_shipped'); conditioning on
# them may absorb part of the effect -- confirm this is intended.
model = CausalModel(
    data=df,
    treatment='discover',
    outcome='order_shipped',
    common_causes=['num_journeys', 'max_journey', 'max_milestone', 'number_accounts', 'one_more_journey', 'most_repeated_event', 'average_length_seq'],
    effect_modifiers=['first_purchase', 'account_activation', 'downpayment_received', 'downpayment_cleared']
    # graph=graph_gml
)

In [111]:
# Identify the estimand implied by the model specification above.
identified_estimand = model.identify_effect()

In [112]:
# Estimate the effect with the 'backdoor.linear_regression' method.
estimate = model.estimate_effect(identified_estimand=identified_estimand, method_name='backdoor.linear_regression')

linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': ['first_purchase', 'account_activation', 'downpayment_received', 'downpayment_cleared']}


In [113]:
# Plain-language summary of the estimated effect.
estimate.interpret()

Increasing the treatment variable(s) [discover] from 0 to 1 causes an increase of -0.0003121286570691717 in the expected value of the outcome [order_shipped], over the data distribution/population represented by the dataset.


In [114]:
# Statistical significance of the estimate.
# NOTE(review): the recorded p-value (~4.9e-10) indicates significance, while the
# confidence interval recorded for the next cell spans zero -- these look
# inconsistent; confirm before drawing conclusions.
estimate.test_stat_significance()

{'p_value': array([4.899196e-10])}

In [None]:
# Confidence interval for the estimated effect.
estimate.get_confidence_intervals()

(−0.00152676624930032, 0.00114034686083087)

In [None]:
# Refutation check using the 'random_common_cause' refuter; the recorded output
# reports the original effect, the new effect and a p-value.
refute_res = model.refute_estimate(estimand=identified_estimand, estimate=estimate, method_name='random_common_cause', show_progress_bar=True)

In [123]:
# Show the refutation summary.
print(refute_res)

Refute: Add a random common cause
Estimated effect:-0.0003121286570691717
New effect:-0.00031223762868127223
p value:0.8999999999999999



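The refutation p-value (0.90) is greater than 0.05, so adding a random common cause does not significantly change the estimated effect; the linear-regression estimate is reasonably robust to this particular refuter.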