In [None]:
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, median_absolute_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import econml.dml as dml
from sklearn.model_selection import train_test_split
from pandas import get_dummies

# Short alias for econml's CausalForestDML estimator class.
dml_causal = dml.CausalForestDML

# Load the prepared CSV (its first column becomes the index), keep only the
# rows whose first_data_year is 2021 or later, and then drop the two year
# columns together with control_group.
df = (
    pd.read_csv("../data/prepped_data.csv", index_col=0, low_memory=False)
    .loc[lambda frame: frame['first_data_year'] >= 2021]
    .drop(columns=['last_data_year', 'first_data_year', 'control_group'])
)
print(df.columns)

In [None]:
# Every column of df ends up in exactly one of these three lists.
binary_features = []
categorical_features = []
continuous_features = []

# A non-object column counts as categorical only if it has at most this many
# distinct values.
max_unique_values_for_categorical = 10

# Sort each column into a bucket based on its number of distinct values
# (nunique() ignores missing values) and its dtype.
for col_name in df.columns:
    col_values = df[col_name]
    n_distinct = col_values.nunique()
    if n_distinct == 2:
        # Exactly two distinct values -> binary, whatever the dtype.
        bucket = binary_features
    elif n_distinct > 2 and (col_values.dtype == 'object' or n_distinct <= max_unique_values_for_categorical):
        # More than two values and either object-typed or low-cardinality -> categorical.
        bucket = categorical_features
    else:
        # Everything else is continuous; this also catches columns with fewer
        # than two distinct values.
        bucket = continuous_features
    bucket.append(col_name)

# Disabled manual override that would move years_since_last_car_change from
# the categorical list to the continuous list:
# categorical_features = [col for col in categorical_features if col != "years_since_last_car_change"]
# continuous_features = continuous_features + ["years_since_last_car_change"]

print(f'Binary Features: {binary_features}')
print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')

# One-hot encode the categorical columns; binary and continuous columns are
# passed through unchanged.
df = pd.get_dummies(df, columns=categorical_features)




In [None]:
# Outcome (y), treatment (T) and covariates (X) for the causal model.
y = df['churn']
T = df['welcome_discount']
X = df.drop(columns=['churn', 'welcome_discount'])

# Hold out 30% of the rows. The treatment is split in the same call as X and y
# so that T_train / T_test refer to exactly the same rows as X_train / X_test;
# train_test_split applies one shuffle (fixed by random_state) to every array
# it is given, so X_train, X_test, y_train and y_test are unaffected by
# passing T as well.
X_train, X_test, y_train, y_test, T_train, T_test = train_test_split(
    X, y, T, test_size=0.3, random_state=0
)

In [None]:
def _xgb_params(best):
    """Return a copy of a hyperparameter dict ready to unpack into an XGBoost estimator.

    The dicts below store every value as a float; ``n_estimators`` and
    ``max_depth`` are cast to ``int`` here and all other entries are passed
    through unchanged.
    """
    params = dict(best)
    for int_key in ('n_estimators', 'max_depth'):
        params[int_key] = int(params[int_key])
    return params


# Hyperparameters for the outcome model (classifier on churn).
best_first =  {'colsample_bytree': 0.6186073247946325, 'gamma': 0.16809946864097097, 'learning_rate': 0.1201301843520244, 'max_depth': 4.0, 'min_child_weight': 11.0, 'n_estimators': 141.0, 'reg_alpha': 0.9947688530022228, 'reg_lambda': 0.9032451749885752, 'subsample': 0.9954225737300325}

model_first = XGBClassifier(
    **_xgb_params(best_first),
    objective='binary:logistic',
    tree_method='hist',
    enable_categorical=True,
)

# Fit the outcome model on the training rows.
model_first.fit(X_train, y_train)

# Hyperparameters for the treatment model (regressor).
best_second = {'colsample_bytree': 0.8368650515536827, 'gamma': 0.0004689774751606146, 'learning_rate': 0.10657701450178336, 'max_depth': 8.0, 'min_child_weight': 10.0, 'n_estimators': 150.0, 'reg_alpha': 0.3708478046128461, 'reg_lambda': 0.6426896127366873, 'subsample': 0.96392422815337}

model_second = XGBRegressor(
    **_xgb_params(best_second),
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical=True,
)

# model_second is handed to the causal forest as model_t (the treatment
# model), so it is fitted against the treatment rather than the outcome.
# The treatment values for the training rows are looked up by index label.
# NOTE(review): this assumes the index labels of df are unique - confirm.
model_second.fit(X_train, T.loc[X_train.index])

In [None]:
# Covariates actually used by the causal forest: columns containing missing
# values are dropped once, and the same matrix is then used for both fitting
# and effect estimation so the feature set cannot differ between the two.
X_cf = X.dropna(axis=1)

# NOTE(review): model_first is a classifier passed as model_y and model_second
# is a regressor passed as model_t. Depending on the installed econml version
# and on whether welcome_discount is discrete, the estimator may need
# discrete_outcome=True / discrete_treatment=True - confirm against the econml docs.
cf = dml_causal(
    model_y=model_first,
    model_t=model_second,
    n_estimators=100,     # Number of trees in the forest
    min_samples_leaf=10,  # Minimum number of samples required to be at a leaf node
    max_depth=None,       # No limit on tree depth
    criterion='het',      # Splitting criterion
    random_state=0,       # Fixed seed so repeated runs give the same forest
)

# Fit on the full data set (no additional controls W).
cf.fit(Y=y, T=T, X=X_cf, W=None)

# Estimate the per-row treatment effect on the same covariates used for fitting.
effects = cf.effect(X_cf)

effects