# Import Packages

In [16]:
import doubleml as dml
from doubleml.datasets import make_irm_data
import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, make_scorer, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse.linalg import lobpcg
# import causalml.inference.meta as cml

# Load Data

In [18]:
# Read the prepared dataset (first CSV column becomes the index), remove exact
# duplicate rows, then keep only rows whose first_data_year is 2021 or later.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["first_data_year"] >= 2021]
)

In [19]:
# Names of all columns that contain at least one missing value.
nulls = df.columns[df.isnull().any()].to_list()
print(nulls)

[]


# Setup Data

In [20]:
# Columns removed from the frame before feature typing and modelling.
cols_to_drop = ["policy_nr_hashed", "last_data_year", "first_data_year", "first_datapoint_year", "last_datapoint_year"]
# Keep every column whose name is not listed above; names that are absent are simply ignored.
df = df.loc[:, ~df.columns.isin(cols_to_drop)]

In [21]:
# Non-object columns with more than 2 and at most this many distinct values count as categorical.
max_unique_values_for_categorical = 10

binary_features = []
categorical_features = []
continuous_features = []

# Put every column into exactly one bucket, based on its number of distinct
# non-null values and on whether it has object dtype.
for column, unique_values in df.nunique().items():
    if unique_values == 2:
        # Exactly two distinct values: binary.
        binary_features.append(column)
    elif unique_values > 2 and (
        df[column].dtype == 'object' or unique_values <= max_unique_values_for_categorical
    ):
        # Object dtype, or few enough distinct values: categorical.
        categorical_features.append(column)
    else:
        # Everything else (including columns with fewer than two distinct values): continuous.
        continuous_features.append(column)

print(f'Binary Features: {binary_features}')
print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')

# One-hot encode the categorical columns; binary and continuous columns are left untouched.
df = pd.get_dummies(df, columns=categorical_features)

Binary Features: ['churn', 'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal', 'last_wa-extra', 'fake_alarm', 'policyholder_change', 'n_last_vs_peak', 'lpa']
Categorical Features: ['count', 'control_group', 'last_brand', 'last_type', 'last_fuel_type', 'last_postcode', 'last_product', 'nr_cars', 'max_nr_coverages', 'last_nr_coverages']
Continuous Features: ['welcome_discount', 'first_premium', 'last_premium', 'first_split', 'last_split', 'last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', 'last_weight', 'accident_years', 'last_change_premium_abs', 'last_change_premium_perc', 'last_vs_first_split', 'cum_change_premium_abs', 'cum_change_premium_perc']


# Setup Models

In [22]:
# Tuned hyperparameters for the treatment (propensity) model, stored as floats.
# NOTE(review): presumably the raw result of the hyperopt search — confirm.
best_d = {'colsample_bytree': 0.5651430631040584, 'learning_rate': 0.05024033157100756, 'max_depth': 70.0, 'min_child_samples': 33.0, 'min_data_in_leaf': 5.0, 'min_split_gain': 0.0024206836721644767, 'n_estimators': 54.0, 'num_leaves': 185.0, 'reg_alpha': 0.19913197144824663, 'reg_lambda': 0.19906785062440704, 'subsample': 0.9121630873508754, 'subsample_freq': 26.0}

# Cast the count-like parameters to int; LightGBM expects integers for them.
best_params_d = {
    'max_depth': int(best_d['max_depth']),
    'n_estimators': int(best_d['n_estimators']),
    'num_leaves': int(best_d['num_leaves']),
    # `min_child_samples` and `min_data_in_leaf` are aliases of ONE LightGBM
    # parameter. When both are passed, LightGBM keeps the primary name
    # (`min_data_in_leaf`) and silently drops the alias (the warning is hidden
    # by verbosity=-1). Pass the effective value once, under the scikit-learn
    # name, so the configuration states what is actually used.
    'min_child_samples': int(best_d['min_data_in_leaf']),
    'colsample_bytree': best_d['colsample_bytree'],
    'subsample': best_d['subsample'],
    'subsample_freq': int(best_d['subsample_freq']),
    'reg_alpha': best_d['reg_alpha'],
    'reg_lambda': best_d['reg_lambda'],
    'min_split_gain': best_d['min_split_gain'],
    'learning_rate': best_d['learning_rate'],
}

# Binary classifier used as the treatment learner (passed as `ml_m` to DoubleMLIRM).
ml_d = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    # is_unbalance=True,
    **best_params_d
)

In [23]:
# Tuned hyperparameters for the outcome model, stored as floats.
# NOTE(review): presumably the raw result of the hyperopt search — confirm.
best_y = {'colsample_bytree': 0.2983935721861137, 'learning_rate': 0.04740706929909022, 'max_depth': 59.0, 'min_child_samples': 18.0, 'min_data_in_leaf': 13.0, 'min_split_gain': 0.3863623673164322, 'n_estimators': 74.0, 'num_leaves': 54.0, 'reg_alpha': 0.1198683978345154, 'reg_lambda': 0.18168767473399486, 'subsample': 0.9841777438197711, 'subsample_freq': 27.0}

# Cast the count-like parameters to int; LightGBM expects integers for them.
best_params_y = {
    'max_depth': int(best_y['max_depth']),
    'n_estimators': int(best_y['n_estimators']),
    'num_leaves': int(best_y['num_leaves']),
    # `min_child_samples` and `min_data_in_leaf` are aliases of ONE LightGBM
    # parameter. When both are passed, LightGBM keeps the primary name
    # (`min_data_in_leaf`) and silently drops the alias (the warning is hidden
    # by verbosity=-1). Pass the effective value once, under the scikit-learn
    # name, so the configuration states what is actually used.
    'min_child_samples': int(best_y['min_data_in_leaf']),
    'colsample_bytree': best_y['colsample_bytree'],
    'subsample': best_y['subsample'],
    'subsample_freq': int(best_y['subsample_freq']),
    'reg_alpha': best_y['reg_alpha'],
    'reg_lambda': best_y['reg_lambda'],
    'min_split_gain': best_y['min_split_gain'],
    'learning_rate': best_y['learning_rate'],
}

# Binary classifier used as the outcome learner (passed as `ml_g` to DoubleMLIRM).
ml_y = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    # is_unbalance=True,
    **best_params_y
)

# Setup DML

In [24]:
def _discount_band(frame, upper, lower=None):
    """Build a binary-treatment sample for one welcome-discount band.

    Keeps rows where ``welcome_discount == 1`` plus rows whose
    ``welcome_discount`` lies in ``(lower, upper]`` (only ``<= upper`` when
    ``lower`` is None), then recodes the column as ``1 - floor(value)``:
    a value of exactly 1 becomes 0 and values in [0, 1) become 1.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing a numeric ``welcome_discount`` column.
    upper : float
        Inclusive upper bound of the band.
    lower : float, optional
        Exclusive lower bound of the band; no lower bound when None.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows with an integer 0/1
        ``welcome_discount`` column. ``frame`` itself is not modified.
    """
    discount = frame["welcome_discount"]
    in_band = discount <= upper
    if lower is not None:
        in_band &= discount > lower
    # Explicit copy: assigning into a boolean-filtered slice raises
    # SettingWithCopyWarning, and an in-place `.loc[:, col] = ...` can keep the
    # original float dtype despite the `.astype(int)`.
    subset = frame.loc[(discount == 1) | in_band].copy()
    subset["welcome_discount"] = (1 - np.floor(subset["welcome_discount"])).astype(int)
    return subset


# NOTE(review): the high/med/low names suggest that smaller welcome_discount
# values are larger discounts and that 1 means no discount — confirm against
# the data dictionary.
df_high = _discount_band(df, upper=0.75)
df_med = _discount_band(df, upper=0.85, lower=0.75)
df_low = _discount_band(df, upper=0.95, lower=0.85)

# df_test = df.sample(frac = 0.3, random_state = 0)
# df_train = df[~df.index.isin(df_test.index.to_list())]

In [25]:
# Outcome: 'churn'; treatment: binarized 'welcome_discount'. No x_cols are
# given, so DoubleMLData uses every remaining column of df_high as a covariate.
obj_dml_data = dml.DoubleMLData(df_high, 'churn', 'welcome_discount')

# DoubleML draws its cross-fitting folds when the model object is created,
# using NumPy's global random state. Seed it first so the sample split, and
# therefore the estimate, is reproducible across Restart & Run All.
np.random.seed(42)
dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_y, ml_d)

dml_irm_obj.fit()

<doubleml.double_ml_irm.DoubleMLIRM at 0x16e15b550>

In [27]:
# Render the fitted model's summary table (coefficient, standard error,
# t statistic, p-value and 95% confidence bounds).
irm_summary = dml_irm_obj.summary
display(irm_summary)

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.1895,0.005843,32.431382,9.914665000000001e-231,0.178048,0.200953
