# Import Packages

In [None]:
import doubleml as dml
from doubleml.datasets import make_irm_data
import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, make_scorer, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse.linalg import lobpcg

# Load Data

In [None]:
# Read the prepared dataset, using the first CSV column as the index.
df = pd.read_csv("../data/prepped_data.csv", index_col=0, low_memory=False)

# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Keep only rows whose first data year is 2021 or later.
df = df.loc[df["first_data_year"] >= 2021]

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = df.isnull().any()
nulls = has_missing[has_missing].index.tolist()
print(nulls)

# Setup Data

In [None]:
# Columns removed before modelling. Names that are not present in the
# frame are skipped silently.
cols_to_drop = [
    "policy_nr_hashed",
    "last_data_year",
    "first_data_year",
    "control_group",
    'count',
    'first_datapoint_year',
    'last_datapoint_year',
]
df = df.drop(columns=cols_to_drop, errors="ignore")

In [None]:
# Sort every column of df into one of three buckets based on its dtype and
# its number of distinct values, then one-hot encode the categorical bucket.
categorical_features = []
continuous_features = []
binary_features = []

# Non-object columns with at most this many distinct values are treated as
# categorical (object columns with more than 2 values always are).
max_unique_values_for_categorical = 5

for column in df.columns:
    unique_values = df[column].nunique()
    if unique_values == 2:
        # Exactly 2 distinct values: binary
        binary_features.append(column)
    elif unique_values > 2 and (
        df[column].dtype == 'object'
        or unique_values <= max_unique_values_for_categorical
    ):
        # Object dtype, or few distinct values (but more than 2): categorical
        categorical_features.append(column)
    else:
        # Everything else (including columns with fewer than 2 distinct
        # values): continuous
        continuous_features.append(column)

# nr_years is always reported as continuous, whatever the heuristic decided.
# It is appended only when missing so that it is not listed twice when the
# loop above already classified it as continuous.
categorical_features = [col for col in categorical_features if col != "nr_years"]
if "nr_years" not in continuous_features:
    continuous_features.append("nr_years")

print(f'Binary Features: {binary_features}')
print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')

# One-hot encode the categorical columns as 0/1 integer indicator columns.
df = pd.get_dummies(df, columns=categorical_features, dtype="int")

# Setup Models

In [None]:
# Tuned LightGBM hyperparameters for the classifier stored in ml_d. Every
# value is stored as a float, so the integer-valued ones are cast to int below.
best_d = {
    'colsample_bytree': 0.5651430631040584,
    'learning_rate': 0.05024033157100756,
    'max_depth': 70.0,
    'min_child_samples': 33.0,
    'min_data_in_leaf': 5.0,
    'min_split_gain': 0.0024206836721644767,
    'n_estimators': 54.0,
    'num_leaves': 185.0,
    'reg_alpha': 0.19913197144824663,
    'reg_lambda': 0.19906785062440704,
    'subsample': 0.9121630873508754,
    'subsample_freq': 26.0,
}

# (parameter name, cast to int?) pairs, in the order they are passed on.
# NOTE(review): min_child_samples and min_data_in_leaf look like aliases of
# the same LightGBM parameter, so only one of the two values is presumably
# effective -- confirm against the LightGBM parameter docs.
best_params_d = {
    name: int(best_d[name]) if as_int else best_d[name]
    for name, as_int in (
        ('max_depth', True),
        ('n_estimators', True),
        ('num_leaves', True),
        ('min_child_samples', True),
        ('colsample_bytree', False),
        ('subsample', False),
        ('subsample_freq', True),
        ('reg_alpha', False),
        ('reg_lambda', False),
        ('min_split_gain', False),
        ('learning_rate', False),
        ('min_data_in_leaf', True),
    )
}

# Binary classifier built from the tuned parameters (is_unbalance=True is
# currently not enabled).
ml_d = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    **best_params_d,
)

In [None]:
# Tuned LightGBM hyperparameters for the classifier stored in ml_y. Every
# value is stored as a float, so the integer-valued ones are cast to int below.
best_y = {
    'colsample_bytree': 0.2983935721861137,
    'learning_rate': 0.04740706929909022,
    'max_depth': 59.0,
    'min_child_samples': 18.0,
    'min_data_in_leaf': 13.0,
    'min_split_gain': 0.3863623673164322,
    'n_estimators': 74.0,
    'num_leaves': 54.0,
    'reg_alpha': 0.1198683978345154,
    'reg_lambda': 0.18168767473399486,
    'subsample': 0.9841777438197711,
    'subsample_freq': 27.0,
}

# (parameter name, cast to int?) pairs, in the order they are passed on.
# NOTE(review): min_child_samples and min_data_in_leaf look like aliases of
# the same LightGBM parameter, so only one of the two values is presumably
# effective -- confirm against the LightGBM parameter docs.
best_params_y = {
    name: int(best_y[name]) if as_int else best_y[name]
    for name, as_int in (
        ('max_depth', True),
        ('n_estimators', True),
        ('num_leaves', True),
        ('min_child_samples', True),
        ('colsample_bytree', False),
        ('subsample', False),
        ('subsample_freq', True),
        ('reg_alpha', False),
        ('reg_lambda', False),
        ('min_split_gain', False),
        ('learning_rate', False),
        ('min_data_in_leaf', True),
    )
}

# Binary classifier built from the tuned parameters (is_unbalance=True is
# currently not enabled).
ml_y = lgb.LGBMClassifier(
    objective='binary',
    force_row_wise=True,
    verbosity=-1,
    **best_params_y,
)

# Setup DML

In [None]:
def perform_single_doubleml(df, exclude_cols=("welcome_discount",)):
    """Fit a DoubleML interactive regression model (IRM) on one data frame.

    The outcome column is 'churn' and the binary treatment column is
    'welcome_discount_bin'. The module-level classifiers ml_y and ml_d are
    passed to DoubleMLIRM as its two learners.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'churn', 'welcome_discount_bin' and the covariates.
    exclude_cols : iterable of str, optional
        Columns present in df that must not be used as covariates. Defaults
        to the raw 'welcome_discount' column: the binary treatment is derived
        from it, so using it as a covariate would leak the treatment into the
        nuisance models. Names that are not in df are ignored.

    Returns
    -------
    doubleml.DoubleMLIRM
        The fitted model object.
    """
    y_col = 'churn'
    d_col = 'welcome_discount_bin'

    # Without an explicit x_cols, DoubleMLData uses every column that is not
    # the outcome or the treatment as a covariate, which would include the
    # raw discount amount.
    not_covariates = {y_col, d_col, *exclude_cols}
    x_cols = [col for col in df.columns if col not in not_covariates]

    obj_dml_data = dml.DoubleMLData(df, y_col, d_col, x_cols=x_cols)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_y, ml_d, score="ATE", weights=None)
    dml_irm_obj.fit()
    return dml_irm_obj

In [None]:
def split_data(df, num_splits):
    """Split the discounted rows into quantile bins of 'welcome_discount'.

    Rows with a discount of exactly 0 are included in every returned frame.
    The remaining rows are cut into num_splits quantile bins with pd.qcut,
    and each bin is combined with the zero-discount rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'welcome_discount' column.
    num_splits : int
        Number of quantile bins passed to pd.qcut.

    Returns
    -------
    dict
        Maps each bin label to its combined frame (zero-discount rows first,
        then the rows of that bin), ordered by bin label.
    """
    discount = df["welcome_discount"]
    untreated = df[discount == 0].copy()
    treated = df[discount != 0].copy()

    # Temporary helper column holding the quantile bin of each treated row.
    treated["split"] = pd.qcut(treated["welcome_discount"], q=num_splits)

    frames_by_bin = {}
    for bin_label in treated["split"].unique():
        rows_in_bin = treated[treated["split"] == bin_label]
        combined = pd.concat([untreated, rows_in_bin])
        frames_by_bin[bin_label] = combined.drop(columns="split")

    # Return the entries ordered by bin label.
    return {bin_label: frames_by_bin[bin_label] for bin_label in sorted(frames_by_bin)}

In [None]:
def run_doubleml(split_dfs):
    """Fit one DoubleML model per split.

    Each frame in split_dfs is modified in place: a 'welcome_discount_bin'
    column is added, holding 'welcome_discount' rounded up to the next
    integer. The frame is then passed to perform_single_doubleml.

    Parameters
    ----------
    split_dfs : dict
        Maps a split label to a DataFrame with a 'welcome_discount' column.

    Returns
    -------
    dict
        Maps each split label to the value returned by
        perform_single_doubleml for that split.
    """

    def _binarise_and_fit(frame):
        # Written onto the caller's frame on purpose: the summary cell reads
        # this column back from split_dfs.
        frame["welcome_discount_bin"] = np.ceil(frame["welcome_discount"]).astype(int)
        return perform_single_doubleml(frame)

    return {label: _binarise_and_fit(frame) for label, frame in split_dfs.items()}

In [None]:
# Three quantile-based discount splits, each combined with the zero-discount rows.
split_dfs = split_data(df, num_splits=3)

# One fitted DoubleML model per split, keyed by the split label.
double_mls = run_doubleml(split_dfs=split_dfs)

In [None]:
# Column-name prefixes of the covariates summarised for the treated rows of
# each split.
# NOTE(review): "nr_year" also matches "nr_years" because it is used as a
# prefix -- confirm this spelling is intended.
columns_clustering = ['last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', "last_postcode", "last_fuel_type", "nr_year", "last_premium", 'last_sales_channel']

# The alternatives are grouped so that '^' anchors every prefix. Without the
# group, only the first name is anchored and the others match anywhere in a
# column name.
regex_pattern = '^(?:' + '|'.join(columns_clustering) + ')'

summary_stats = ["count", "mean", "std", "min", "max"]

for k, v in double_mls.items():
    print(k)
    display(v.summary)
    # print(v.sensitivity_analysis().sensitivity_summary)

    # Descriptive statistics of the selected covariates for the rows of this
    # split that have welcome_discount_bin == 1.
    split_df = split_dfs[k]
    treated_rows = split_df[split_df["welcome_discount_bin"] == 1]
    display(treated_rows.filter(regex=regex_pattern).describe().T[summary_stats])

In [None]:
# Sensitivity analysis and plot for the model of the last split in double_mls.
last_model = list(double_mls.values())[-1]
last_model.sensitivity_analysis()
last_model.sensitivity_plot()