# Load Packages

In [1]:
import functions as f
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import numpy as np

# Load Data

In [8]:
# Modelling data: drop duplicate rows, keep rows whose `first_data_year` is
# 2021 or later, and index the frame by the hashed policy number.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["first_data_year"] >= 2021]
    .set_index("policy_nr_hashed")
)

# Display labels for the one-hot cluster columns: cluster_0 -> "Group 1", ...,
# cluster_4 -> "Group 5".
group_names = {f"cluster_{i}": f"Group {i + 1}" for i in range(5)}

# Customer segments: index by hashed policy number, one-hot encode the
# `cluster` column, then rename the dummy columns to their display labels.
segments = (
    pd.read_csv("../customer_segmentation/segments.csv", index_col=0)
    .set_index("policy_nr_hashed")
    .pipe(pd.get_dummies, columns=["cluster"])
    .rename(columns=group_names)
)

# Run Double ML

In [16]:
# Run the project's double-ML routine on the prepared frame. The call returns
# four objects; `double_mls` and `splits` are looked up with the same keys by
# the reporting cell below.
first_stage_1, first_stage_2, double_mls, splits = f.global_run(
    df,
    splits=3,
    cols_to_drop_manual=["last_type"],
    iters=10,
    log=False,
    intermediary_scores=False,
)

Running Split 1...
100%|██████████| 10/10 [01:15<00:00,  7.59s/trial, best loss: 0.0774080825664907]
100%|██████████| 10/10 [01:13<00:00,  7.36s/trial, best loss: 0.07769213357931415]
Done!!
Running Split 2...
100%|██████████| 10/10 [01:12<00:00,  7.26s/trial, best loss: 0.09793157534649799]
100%|██████████| 10/10 [01:06<00:00,  6.64s/trial, best loss: 0.03640826625713246]
Done!!
Running Split 3...
100%|██████████| 10/10 [01:12<00:00,  7.20s/trial, best loss: 0.10374604452568315]
100%|██████████| 10/10 [01:00<00:00,  6.07s/trial, best loss: 0.026389950513679837]
Done!!


In [28]:
# For each fitted model: print its key, show the overall summary, then show
# the group-level (GATE) summary computed on the segment dummies.
for split_label, dml_model in double_mls.items():
    print(split_label)
    display(dml_model.summary)

    # Restrict the segment dummies to the policies present in this split so
    # the groups line up row-for-row with the model's data.
    policy_ids = splits[split_label].index.tolist()
    gate_result = dml_model.gate(groups=segments.loc[policy_ids])
    display(gate_result.summary)

    # Optional sensitivity checks (currently disabled):
    # gate_result.sensitivity_analysis(cf_y=0.00898, cf_d=0.14166, rho=0.4833)
    # print(gate_result.sensitivity_summary)
    # dml_model.sensitivity_plot()

(0.00852, 0.162]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.073636,0.013387,5.500576,3.785519e-08,0.047398,0.099874


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.02816,0.030814,0.91386,0.3608001,-0.032238,0.088557
Group 2,0.059877,0.027707,2.161104,0.03069769,0.00557,0.114184
Group 3,0.142133,0.027213,5.222962,1.776364e-07,0.088794,0.195473
Group 4,0.061288,0.026802,2.286743,0.02222001,0.008755,0.113821
Group 5,0.053985,0.045515,1.186089,0.2355995,-0.035227,0.143197


(0.162, 0.243]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.111807,0.010473,10.675489,1.3255719999999998e-26,0.09128,0.132335


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.082116,0.024068,3.411882,0.0006462783,0.034942,0.12929
Group 2,0.151894,0.021814,6.963107,3.418713e-12,0.109137,0.194651
Group 3,0.102355,0.02114,4.841687,1.29583e-06,0.060918,0.143791
Group 4,0.108095,0.021771,4.965154,6.91413e-07,0.065423,0.150767
Group 5,0.10756,0.032592,3.300172,0.0009677352,0.043677,0.171443


(0.243, 0.3]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.195147,0.009481,20.584043,3.814927e-94,0.176566,0.213729


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.158674,0.021042,7.540954,4.841561e-14,0.117431,0.199918
Group 2,0.217008,0.019583,11.081191,1.833611e-28,0.178623,0.255393
Group 3,0.199076,0.019271,10.3306,5.825162e-25,0.161305,0.236848
Group 4,0.20351,0.019239,10.578094,4.335014e-26,0.165801,0.24122
Group 5,0.186132,0.034,5.474461,4.435243e-08,0.11949,0.252775


In [5]:
# NOTE: this entire cell is commented out and does not execute.
# As written, for each model in `double_mls` it would:
#   1. take every column of the matching split except 'welcome_discount' and
#      'churn' as a benchmarking feature,
#   2. call `sensitivity_benchmark` once per feature, in parallel via joblib,
#   3. collect the cf_y / cf_d / rho values reported for 'welcome_discount',
#   4. run `sensitivity_analysis` and plot with cf_y / cf_d as benchmarks.
# NOTE(review): the three trailing print lines sit outside the loop, so if this
# is re-enabled they would only summarise the lists built for the last model.
# `rhos` is collected but never added to `benchmark_dict`.
# for k_, v_ in double_mls.items():

#     print(k_)
#     display(v_.summary)
#     features = [col for col in splits[k_].columns if col not in ['welcome_discount', 'churn']]
#     benchmark_sensitivities = {}

#     def process_feature(feature):
#         return feature, v_.sensitivity_benchmark(benchmarking_set=[feature])

#     results = Parallel(n_jobs=-1)(delayed(process_feature)(feature) for feature in features)

#     for feature, result in results:
#         benchmark_sensitivities[feature] = result

#     cf_y_lst = []
#     cf_d_lst = []
#     names = []
#     rhos = []

#     for k, v in benchmark_sensitivities.items():
#         cf_y_lst.append(v.loc["welcome_discount", "cf_y"])
#         cf_d_lst.append(v.loc["welcome_discount", "cf_d"])
#         rhos.append(v.loc["welcome_discount", "rho"])
#         names.append(k)

#     benchmark_dict = {
#         "cf_y" : cf_y_lst,
#         "cf_d" : cf_d_lst,
#         "name" : names
#     }

#     v_.sensitivity_analysis(cf_y=0.04, cf_d=0.03)
#     v_.sensitivity_plot(benchmarks=benchmark_dict)

# print(f"Max cf_y: {np.max(cf_y_lst)}")
# print(f"Max cf_d: {np.max(cf_d_lst)}")
# print(f"Max rho: {np.max([np.abs(rho) for rho in rhos if np.abs(rho) != 1.0])}")