# Import Packages

In [36]:
import doubleml as dml
from doubleml.datasets import make_irm_data
import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, make_scorer, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse.linalg import lobpcg
import patsy
# import causalml.inference.meta as cml

# Load Data

In [37]:
df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0).drop_duplicates()

df = df[df["first_data_year"] >= 2021]

In [20]:
nulls = [col for col, val in df.isnull().any().to_dict().items() if val == True]
print(nulls)

[]


# Setup Data

In [39]:
cols_to_drop = ["policy_nr_hashed", "last_data_year", "first_data_year", "control_group", 'count', 'first_datapoint_year', 'last_datapoint_year']
df = df[[col for col in df.columns.to_list() if (col not in cols_to_drop)]]

In [40]:
categorical_features = []
continuous_features = []
binary_features = []

# Define a threshold for the maximum number of unique values for a categorical column
max_unique_values_for_categorical = 10

# Iterate through each column to determine if it's categorical, continuous, or binary
for column in df.columns:
    unique_values = df[column].nunique()
    if unique_values == 2:
        # If exactly 2 unique values, treat column as binary
        binary_features.append(column)
    elif (df[column].dtype == 'object' or unique_values <= max_unique_values_for_categorical) and unique_values > 2:
        # If object type or up to the threshold of unique values (and more than 2), treat as categorical
        categorical_features.append(column)
    else:
        # Otherwise, treat as continuous
        continuous_features.append(column)

categorical_features = [col for col in categorical_features if col != "nr_years"]
continuous_features = continuous_features + ['nr_years']

print(f'Binary Features: {binary_features}')
print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')

df = pd.get_dummies(df, columns=categorical_features)

Binary Features: ['churn', 'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal', 'last_wa-extra', 'fake_alarm', 'policyholder_change', 'n_last_vs_peak', 'lpa']
Categorical Features: ['last_brand', 'last_type', 'last_fuel_type', 'last_postcode', 'last_product', 'nr_cars', 'max_nr_coverages', 'last_nr_coverages']
Continuous Features: ['welcome_discount', 'first_premium', 'last_premium', 'first_split', 'last_split', 'last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', 'last_weight', 'accident_years', 'last_change_premium_abs', 'last_change_premium_perc', 'last_vs_first_split', 'cum_change_premium_abs', 'cum_change_premium_perc', 'nr_years']


# Setup Models

In [7]:
best_d = {'colsample_bytree': 0.5651430631040584, 'learning_rate': 0.05024033157100756, 'max_depth': 70.0, 'min_child_samples': 33.0, 'min_data_in_leaf': 5.0, 'min_split_gain': 0.0024206836721644767, 'n_estimators': 54.0, 'num_leaves': 185.0, 'reg_alpha': 0.19913197144824663, 'reg_lambda': 0.19906785062440704, 'subsample': 0.9121630873508754, 'subsample_freq': 26.0}

best_params_d = {
    'max_depth': int(best_d['max_depth']),
    'n_estimators': int(best_d['n_estimators']),
    'num_leaves': int(best_d['num_leaves']),
    'min_child_samples': int(best_d['min_child_samples']),
    'colsample_bytree': best_d['colsample_bytree'],
    'subsample': best_d['subsample'],
    'subsample_freq': int(best_d['subsample_freq']),
    'reg_alpha': best_d['reg_alpha'],
    'reg_lambda': best_d['reg_lambda'],
    'min_split_gain': best_d['min_split_gain'],
    'learning_rate': best_d['learning_rate'],
    'min_data_in_leaf': int(best_d['min_data_in_leaf']),
}

ml_d = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    # is_unbalance=True,
    **best_params_d
)

In [8]:
best_y = {'colsample_bytree': 0.2983935721861137, 'learning_rate': 0.04740706929909022, 'max_depth': 59.0, 'min_child_samples': 18.0, 'min_data_in_leaf': 13.0, 'min_split_gain': 0.3863623673164322, 'n_estimators': 74.0, 'num_leaves': 54.0, 'reg_alpha': 0.1198683978345154, 'reg_lambda': 0.18168767473399486, 'subsample': 0.9841777438197711, 'subsample_freq': 27.0}

best_params_y = {
    'max_depth': int(best_y['max_depth']),
    'n_estimators': int(best_y['n_estimators']),
    'num_leaves': int(best_y['num_leaves']),
    'min_child_samples': int(best_y['min_child_samples']),
    'colsample_bytree': best_y['colsample_bytree'],
    'subsample': best_y['subsample'],
    'subsample_freq': int(best_y['subsample_freq']),
    'reg_alpha': best_y['reg_alpha'],
    'reg_lambda': best_y['reg_lambda'],
    'min_split_gain': best_y['min_split_gain'],
    'learning_rate': best_y['learning_rate'],
    'min_data_in_leaf': int(best_y['min_data_in_leaf']),
}

ml_y = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    # is_unbalance=True,
    **best_params_y
)

# Setup DML

In [41]:
def perform_doubleml(df):
    obj_dml_data = dml.DoubleMLData(df, 'churn', 'welcome_discount_bin')
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_y, ml_d)
    dml_irm_obj.fit()
    return dml_irm_obj

In [107]:
def split_data(df, num_splits):

    df_nothing = df[df["welcome_discount"] == 0].copy()
    df_others = df[df['welcome_discount'] != 0].copy()
    
    df_others['split'] = pd.qcut(df_others['welcome_discount'], q=num_splits)
    split_names = df_others['split'].unique()

    split_dfs = {label: pd.concat([df_nothing, df_others[df_others['split'] == label]]).drop("split", axis=1) for label in split_names}
    
    return dict(sorted(split_dfs.items()))

In [109]:
split_dfs = split_data(df, 3)

for k, v in split_dfs.items():
    print(k)
    display(v)

(0.00852, 0.16]


Unnamed: 0_level_0,welcome_discount,churn,first_premium,last_premium,first_split,last_split,last_customer_age,last_accident_free_years,last_car_value,last_age_car,...,max_nr_coverages_1,max_nr_coverages_2,max_nr_coverages_3,max_nr_coverages_4,max_nr_coverages_5,last_nr_coverages_1,last_nr_coverages_2,last_nr_coverages_3,last_nr_coverages_4,last_nr_coverages_5
policy_nr_hashed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0WK709p,0.00,0,4178.496,4178.496,0.829849,0.829849,36,0,478004.8,29,...,False,True,False,False,False,False,True,False,False,False
0WK70kZ,0.00,0,4171.776,4171.776,0.829575,0.829575,47,16,236454.4,21,...,False,True,False,False,False,False,True,False,False,False
0WK72B5,0.00,0,3792.768,3792.768,0.812544,0.812544,61,0,266918.4,1,...,False,False,True,False,False,False,False,True,False,False
0WK72gq,0.00,0,3810.240,3810.240,0.813404,0.813404,59,20,584584.0,18,...,False,False,True,False,False,False,False,True,False,False
0WK73z5,0.00,0,8104.320,8104.320,0.912769,0.912769,24,2,120400.0,15,...,False,False,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zPmyPLW,0.06,0,5757.696,17110.464,0.885854,0.958448,35,-2,377496.0,13,...,False,False,True,False,False,False,False,True,False,False
zPmyQXl,0.02,0,11781.504,11898.432,0.941821,0.940246,53,32,1056305.6,2,...,False,False,True,False,False,False,False,True,False,False
zPmymWB,0.02,1,2948.736,2881.536,0.767548,0.753265,65,3,163116.8,14,...,False,True,False,False,False,False,True,False,False,False
zPmyqJx,0.10,0,4506.432,4714.752,1.000000,1.000000,65,4,176444.8,18,...,True,False,False,False,False,True,False,False,False,False


(0.16, 0.243]


Unnamed: 0_level_0,welcome_discount,churn,first_premium,last_premium,first_split,last_split,last_customer_age,last_accident_free_years,last_car_value,last_age_car,...,max_nr_coverages_1,max_nr_coverages_2,max_nr_coverages_3,max_nr_coverages_4,max_nr_coverages_5,last_nr_coverages_1,last_nr_coverages_2,last_nr_coverages_3,last_nr_coverages_4,last_nr_coverages_5
policy_nr_hashed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0WK709p,0.00000,0,4178.496,4178.496,0.829849,0.829849,36,0,478004.8,29,...,False,True,False,False,False,False,True,False,False,False
0WK70kZ,0.00000,0,4171.776,4171.776,0.829575,0.829575,47,16,236454.4,21,...,False,True,False,False,False,False,True,False,False,False
0WK72B5,0.00000,0,3792.768,3792.768,0.812544,0.812544,61,0,266918.4,1,...,False,False,True,False,False,False,False,True,False,False
0WK72gq,0.00000,0,3810.240,3810.240,0.813404,0.813404,59,20,584584.0,18,...,False,False,True,False,False,False,False,True,False,False
0WK73z5,0.00000,0,8104.320,8104.320,0.912769,0.912769,24,2,120400.0,15,...,False,False,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zPmlkgV,0.18000,0,3763.200,3763.200,1.000000,1.000000,28,10,197198.4,5,...,False,True,False,False,False,False,True,False,False,False
zPmlpJW,0.18000,0,10824.576,10824.576,0.904644,0.904644,24,3,212968.0,11,...,False,False,True,False,False,False,False,True,False,False
zPmlvwl,0.23000,0,3884.160,3884.160,0.855017,0.855017,65,26,301168.0,8,...,False,False,True,False,False,False,False,True,False,False
zPmlyqx,0.19000,1,2956.800,2956.800,1.000000,1.000000,66,9,133168.0,9,...,False,True,False,False,False,False,True,False,False,False


(0.243, 0.3]


Unnamed: 0_level_0,welcome_discount,churn,first_premium,last_premium,first_split,last_split,last_customer_age,last_accident_free_years,last_car_value,last_age_car,...,max_nr_coverages_1,max_nr_coverages_2,max_nr_coverages_3,max_nr_coverages_4,max_nr_coverages_5,last_nr_coverages_1,last_nr_coverages_2,last_nr_coverages_3,last_nr_coverages_4,last_nr_coverages_5
policy_nr_hashed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0WK709p,0.00,0,4178.496,4178.496,0.829849,0.829849,36,0,478004.8,29,...,False,True,False,False,False,False,True,False,False,False
0WK70kZ,0.00,0,4171.776,4171.776,0.829575,0.829575,47,16,236454.4,21,...,False,True,False,False,False,False,True,False,False,False
0WK72B5,0.00,0,3792.768,3792.768,0.812544,0.812544,61,0,266918.4,1,...,False,False,True,False,False,False,False,True,False,False
0WK72gq,0.00,0,3810.240,3810.240,0.813404,0.813404,59,20,584584.0,18,...,False,False,True,False,False,False,False,True,False,False
0WK73z5,0.00,0,8104.320,8104.320,0.912769,0.912769,24,2,120400.0,15,...,False,False,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zPm3zdl,0.30,1,6611.136,6611.136,0.945924,0.945924,24,6,324419.2,7,...,False,False,True,False,False,False,False,True,False,False
zPmV2yo,0.30,0,5760.272,8310.064,0.913651,0.911937,75,13,289464.0,3,...,False,False,True,False,False,False,False,True,False,False
zPmZm3K,0.28,0,13067.712,13067.712,0.932634,0.932634,33,5,537846.4,4,...,False,False,False,True,False,False,False,False,True,False
zPmlZNK,0.29,0,3393.600,3393.600,0.736634,0.736634,51,15,130312.0,10,...,False,False,False,True,False,False,False,False,True,False


In [33]:
df_high.loc[:, "welcome_discount_bin"] = (1 - np.floor(df_high["welcome_discount"])).astype(int)
df_med.loc[:, "welcome_discount_bin"] = (1 - np.floor(df_med["welcome_discount"])).astype(int)
df_low.loc[:, "welcome_discount_bin"] = (1 - np.floor(df_low["welcome_discount"])).astype(int)

# df_test = df.sample(frac = 0.3, random_state = 0)
# df_train = df[~df.index.isin(df_test.index.to_list())]

In [35]:
dml_low = perform_doubleml(df_low)
dml_med = perform_doubleml(df_med)
dml_high = perform_doubleml(df_high)

display(dml_low.summary)
display(dml_med.summary)
display(dml_high.summary)

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount_bin,0.076234,0.001707,44.647215,0.0,0.072887,0.07958


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount_bin,0.056781,0.001632,34.802517,2.2281180000000002e-265,0.053583,0.059978


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount_bin,0.187293,0.001822,102.780894,0.0,0.183721,0.190865
