# Load Packages

In [1]:
import functions as f
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import numpy as np

# Load Data

In [2]:
# Readable names for the dummy columns created from `cluster` below
# (presumably the cluster ids are 0-3, giving `cluster_0` ... `cluster_3` -- confirm).
group_names = {
    "cluster_0": "Value Seekers",
    "cluster_1": "High-Income Customers",
    "cluster_2": "Basic Coverage",
    "cluster_3": "Rural Customers",
}

# Prepared data: drop exact duplicate rows, keep rows whose
# `first_data_year` is 2021 or later, and index by the hashed policy number.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["first_data_year"] >= 2021]
    .set_index("policy_nr_hashed")
)

# Segment assignments: index by the same key, one-hot encode `cluster`,
# then swap the generated column labels for the readable names above.
segments = (
    pd.read_csv("../customer_segmentation/segments.csv", index_col=0)
    .set_index("policy_nr_hashed")
    .pipe(pd.get_dummies, columns=["cluster"])
    .rename(columns=group_names)
)

# Run Double ML

In [4]:
# Run the project pipeline from `functions.py` on the prepared frame.
# `double_mls` and `splits` are consumed by the results cell below as
# mappings that share the same keys.
# NOTE(review): the stored output of this cell ends in KeyboardInterrupt, so
# the run must complete before the downstream cells can be reproduced.
first_stage_1, first_stage_2, double_mls, splits = f.global_run(
    df,
    splits=3,
    cols_to_drop_manual=["last_type"],
    iters=50,
    log=False,
    intermediary_scores=False,
)

Running Split 1...
100%|██████████| 50/50 [05:58<00:00,  7.17s/trial, best loss: 0.08575677711590667]
 66%|██████▌   | 33/50 [05:14<02:42,  9.54s/trial, best loss: 0.07504635333829834]


KeyboardInterrupt: 

In [10]:
# For each fitted model: print its key, show the overall summary, then show
# the group-level summary returned by `.gate(...)` for the customer segments.
for split_label, dml_model in double_mls.items():
    print(split_label)
    display(dml_model.summary)

    # Select the segment indicator rows for exactly the policies in this
    # split, in the split's own row order.
    split_policy_ids = splits[split_label].index.to_list()
    split_segments = segments.loc[split_policy_ids]

    group_effects = dml_model.gate(groups=split_segments)
    display(group_effects.summary)

    # Sensitivity analysis, currently disabled:
    # dml_model.sensitivity_analysis(cf_y=0.00898, cf_d=0.14166, rho=0.4833)
    # print(dml_model.sensitivity_summary)
    # dml_model.sensitivity_plot()

(0.00852, 0.162]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.06946,0.013342,5.206208,1.927385e-07,0.043311,0.095609


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.092091,0.02293,4.016192,5.9e-05,0.047147,0.137036
Group 2,0.067402,0.025171,2.677745,0.007417,0.018065,0.116738
Group 3,0.073296,0.040561,1.807063,0.070766,-0.006206,0.152799
Group 4,0.041915,0.025571,1.639174,0.101191,-0.008206,0.092036


(0.162, 0.243]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.119195,0.011391,10.463965,1.264508e-25,0.096869,0.141521


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.103119,0.019509,5.285668,1.263969e-07,0.06488,0.141358
Group 2,0.163976,0.021747,7.540266,4.864762e-14,0.121351,0.206601
Group 3,0.067604,0.032029,2.11068,0.03481066,0.004824,0.130384
Group 4,0.118108,0.022409,5.270554,1.372476e-07,0.074185,0.162031


(0.243, 0.3]


Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
welcome_discount,0.197846,0.009759,20.272401,2.25386e-91,0.178718,0.216974


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Group 1,0.198216,0.016637,11.914349,1.2486610000000002e-32,0.165607,0.230825
Group 2,0.193226,0.018416,10.492163,1.075704e-25,0.157129,0.229323
Group 3,0.159151,0.031116,5.114681,3.168451e-07,0.09816,0.220141
Group 4,0.215837,0.018556,11.631707,3.4911510000000003e-31,0.179466,0.252208


In [5]:
# NOTE: every line in this cell is commented out, so the cell executes nothing.
# The disabled code benchmarks each fitted model's sensitivity one feature at a
# time (in parallel via joblib), collects the `cf_y` / `cf_d` / `rho` values for
# `welcome_discount`, and passes them as benchmarks to the sensitivity plot.
# NOTE(review): if this is re-enabled, the three trailing print lines sit
# outside the outer loop and would only see the lists from the last model, and
# `rhos` is collected but never added to `benchmark_dict` -- confirm intent.
# for k_, v_ in double_mls.items():

#     print(k_)
#     display(v_.summary)
#     features = [col for col in splits[k_].columns if col not in ['welcome_discount', 'churn']]
#     benchmark_sensitivities = {}

#     def process_feature(feature):
#         return feature, v_.sensitivity_benchmark(benchmarking_set=[feature])

#     results = Parallel(n_jobs=-1)(delayed(process_feature)(feature) for feature in features)

#     for feature, result in results:
#         benchmark_sensitivities[feature] = result

#     cf_y_lst = []
#     cf_d_lst = []
#     names = []
#     rhos = []

#     for k, v in benchmark_sensitivities.items():
#         cf_y_lst.append(v.loc["welcome_discount", "cf_y"])
#         cf_d_lst.append(v.loc["welcome_discount", "cf_d"])
#         rhos.append(v.loc["welcome_discount", "rho"])
#         names.append(k)

#     benchmark_dict = {
#         "cf_y" : cf_y_lst,
#         "cf_d" : cf_d_lst,
#         "name" : names
#     }

#     v_.sensitivity_analysis(cf_y=0.04, cf_d=0.03)
#     v_.sensitivity_plot(benchmarks=benchmark_dict)

# print(f"Max cf_y: {np.max(cf_y_lst)}")
# print(f"Max cf_d: {np.max(cf_d_lst)}")
# print(f"Max rho: {np.max([np.abs(rho) for rho in rhos if np.abs(rho) != 1.0])}")