In [None]:
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm


In [None]:
# Load the prepared dataset, keep only rows whose first_data_year is 2021 or
# later, then drop the columns that are not used in the rest of this notebook.
df = (
    pd.read_csv("../data/prepped_data.csv", index_col=0, low_memory=False)
    .loc[lambda frame: frame['first_data_year'] >= 2021]
    .drop(
        columns=[
            'policy_nr_hashed',
            'last_data_year',
            'first_data_year',
            'control_group',
            'churn',
        ]
    )
)


# Feature type identification

In [None]:
# Bucket every column of df into binary / categorical / continuous, based on
# its number of distinct values (nunique() ignores NaN by default) and dtype.
categorical_features = []
continuous_features = []
binary_features = []

# Define a threshold for the maximum number of unique values for a categorical column
max_unique_values_for_categorical = 10

# Columns that the heuristic below may bucket as categorical but that must be
# treated as continuous.
forced_continuous = ['years_since_last_car_change']

# Iterate through each column to determine if it's categorical, continuous, or binary
for column in df.columns:
    unique_values = df[column].nunique()
    if unique_values == 2:
        # If exactly 2 unique values, treat column as binary
        binary_features.append(column)
    elif unique_values > 2 and (
        df[column].dtype == 'object'
        or unique_values <= max_unique_values_for_categorical
    ):
        # If object type or up to the threshold of unique values (and more than 2), treat as categorical
        categorical_features.append(column)
    else:
        # Otherwise (including columns with fewer than 2 unique values), treat as continuous
        continuous_features.append(column)

# Apply the manual overrides BEFORE printing, so the printed lists match the
# lists used downstream. The membership check avoids the ValueError that
# list.remove() raises when the heuristic did not bucket the column as
# categorical in the first place.
for column in forced_continuous:
    if column in categorical_features:
        categorical_features.remove(column)
        continuous_features.append(column)

print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')


# Splitting Train/Test

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
X = df.drop(columns=['welcome_discount'])
y = df['welcome_discount']

# Cast every categorical feature to pandas' "category" dtype in one call.
X = X.astype({cat: "category" for cat in categorical_features})

# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

y_test.shape


In [None]:
# # Assuming y_train is your training target variable
# number_of_positive_instances = sum(y_train == 1)
# number_of_negative_instances = sum(y_train == 0)

# # Calculate the scale_pos_weight value
# scale_pos_weight_value =  number_of_negative_instances /  number_of_positive_instances 

# print(f"Suggested scale_pos_weight value: {scale_pos_weight_value}")


# Tuning Hyperparameters & Training the model

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Define the search space
# Note: hp.quniform samples floats (e.g. 4.0), so integer-valued
# hyperparameters are cast with int() wherever a model is built.
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'min_child_weight': hp.quniform('min_child_weight', 8, 20, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1)
}

def objective(space):
    """Score one sampled hyperparameter configuration for hyperopt.

    Parameters
    ----------
    space : dict
        One concrete sample drawn from the search space: hyperparameter
        name -> sampled value.

    Returns
    -------
    dict
        ``{'loss': <mean 5-fold CV MSE on X_train/y_train>, 'status': STATUS_OK}``.
        Lower loss is better, which is what ``fmin`` minimizes.
    """
    model = XGBRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        subsample=space['subsample'],
        gamma=space['gamma'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=space['min_child_weight'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        objective='reg:squarederror',
        enable_categorical = True,
        tree_method='hist'
    )

    # Using cross-validation for evaluation
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

    # The score is negative MSE, we return its negative value for minimization
    return {'loss': -score, 'status': STATUS_OK}

# Run the algorithm
# rstate seeds the TPE sampler so that Restart & Run All reproduces the same
# sequence of trials and therefore the same `best`. A numpy Generator is the
# form expected by hyperopt >= 0.2.7.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(0),
)

print("Best parameters:", best)


# Evaluation metrics & Optimal hyperparameters

In [None]:
#Best parameters auc: {'colsample_bytree': 0.729287347150897, 'gamma': 0.13865973186925346, 'learning_rate': 0.04065563148086127, 'max_depth': 4.0, 'min_child_weight': 9.0, 'n_estimators': 120.0, 'reg_alpha': 0.17293900654376804, 'reg_lambda': 0.43602874597504154, 'subsample': 0.9099295847018875}

#Best parameters f1: {'colsample_bytree': 0.6363920729021093, 'gamma': 0.07599537381660419, 'learning_rate': 0.04269086278215556, 'max_depth': 9.0, 'min_child_weight': 10.0, 'n_estimators': 92.0, 'reg_alpha': 0.3268718059713048, 'reg_lambda': 0.19347992629476463, 'subsample': 0.9178288043568541}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, make_scorer, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_validate

# Use the best parameters
# (`best` holds floats for the quniform parameters, hence the int() casts.)
model = XGBRegressor(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
    subsample=best['subsample'],
    gamma=best['gamma'],
    colsample_bytree=best['colsample_bytree'],
    min_child_weight = best['min_child_weight'],
    # scale_pos_weight=scale_pos_weight_value,
    reg_alpha=best['reg_alpha'],
    reg_lambda=best['reg_lambda'],
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical = True
)

# Fit the model
model.fit(X_train, y_train)

def mae_prob(y_true, y_pred_probs):
    """Return the mean absolute error between y_true and y_pred_probs.

    Thin wrapper around sklearn's mean_absolute_error.
    NOTE(review): the parameter name suggests probabilities, but the model
    here is an XGBRegressor, so these are plain regression predictions.
    """
    return mean_absolute_error(y_true, y_pred_probs)

def medae_prob(y_true, y_pred_probs):
    """Return the median absolute error between y_true and y_pred_probs.

    Thin wrapper around sklearn's median_absolute_error.
    """
    return median_absolute_error(y_true, y_pred_probs)

# make_scorer is used with its defaults, so the scores are the raw (positive)
# error values; these scorers are used for reporting only, not for model selection.
mae_prob_scorer = make_scorer(mae_prob)
medae_prob_scorer = make_scorer(medae_prob)

# logloss_score = cross_val_score(model, X, y, scoring= 'neg_log_loss')
# brier_score = cross_val_score(model, X, y, cv = 5, scoring = 'neg_brier_score')

# Evaluate both metrics in a single cross-validation pass: the model is fitted
# once per fold (5 fits) instead of once per fold per metric (10 fits).
# NOTE(review): this CV runs on the full X, y, which includes the rows the
# hyperparameters were tuned on; the held-out test metrics below are the
# independent check.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring={'mae': mae_prob_scorer, 'medae': medae_prob_scorer},
)
scores_mae = cv_results['test_mae']
scores_medae = cv_results['test_medae']

# Score the model fitted on X_train against the held-out test split, which
# was not used during tuning or fitting.
y_test_pred = model.predict(X_test)
test_mae = mae_prob(y_test, y_test_pred)
test_medae = medae_prob(y_test, y_test_pred)


# print('CV Average logloss: {0:0.4f}'.format(np.mean(logloss_score)))
print('CV mean_absolute_error: {0:0.4f}'.format(np.mean(scores_mae)))
# print('CV Average  brier score: {0:0.4f}'.format(np.mean(brier_score)))
print('CV median_absolute_error: {0:0.4f}'.format(np.mean(scores_medae)))
print('Test mean_absolute_error: {0:0.4f}'.format(test_mae))
print('Test median_absolute_error: {0:0.4f}'.format(test_medae))