# Import Packages

In [54]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, average_precision_score, make_scorer, precision_recall_curve, roc_curve, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# Load Data

In [2]:
# Read the prepared data set, drop exact duplicate rows, and keep only the
# policies whose welcome_discount equals 1.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["welcome_discount"] == 1]
)

# Setup Model

In [3]:
# List the columns available after filtering, as a reference for the feature setup below.
print(df.columns)

Index(['policy_nr_hashed', 'welcome_discount', 'last_data_year',
       'first_data_year', 'churn', 'control_group', 'first_premium',
       'last_premium', 'first_split', 'last_split', 'last_customer_age',
       'last_accident_free_years', 'last_car_value', 'last_age_car',
       'last_brand', 'last_type', 'last_weight', 'last_fuel_type',
       'last_postcode', 'last_product', 'last_allrisk basis',
       'last_allrisk compleet', 'last_allrisk royaal', 'last_wa-extra',
       'nr_cars', 'fake_alarm', 'policyholder_change', 'max_nr_coverages',
       'last_nr_coverages', 'last_trend_nr_coverages', 'accident_years',
       'last_year_car_change', 'last_change_premium_abs',
       'last_change_premium_perc', 'years_since_last_car_change',
       'n_last_vs_peak', 'last_vs_first_split', 'lpa',
       'cum_change_premium_abs', 'cum_change_premium_perc'],
      dtype='object')


In [4]:
# Maximum number of unique values a column may have and still be treated as categorical
max_unique_values_for_categorical = 10

categorical_features, continuous_features, binary_features = [], [], []

# Sort every column into one bucket based on its nunique() count and dtype:
#   exactly 2 unique values                          -> binary
#   more than 2 and (object dtype or <= threshold)   -> categorical
#   everything else                                  -> continuous
for column, unique_values in df.nunique().items():
    is_object = df[column].dtype == 'object'
    if unique_values == 2:
        binary_features.append(column)
    elif unique_values > 2 and (is_object or unique_values <= max_unique_values_for_categorical):
        categorical_features.append(column)
    else:
        continuous_features.append(column)

# Manual override: years_since_last_car_change is always handled as continuous.
categorical_features = list(
    filter(lambda name: name != "years_since_last_car_change", categorical_features)
)
continuous_features.append("years_since_last_car_change")

print(f'Binary Features: {binary_features}')
print(f'Categorical Features: {categorical_features}')
print(f'Continuous Features: {continuous_features}')

# Cast the categorical columns to pandas' category dtype.
df = df.astype({name: "category" for name in categorical_features})

Binary Features: ['churn', 'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal', 'last_wa-extra', 'fake_alarm', 'policyholder_change', 'lpa']
Categorical Features: ['policy_nr_hashed', 'last_data_year', 'control_group', 'last_brand', 'last_type', 'last_fuel_type', 'last_postcode', 'last_product', 'nr_cars', 'max_nr_coverages', 'last_nr_coverages', 'last_trend_nr_coverages', 'last_year_car_change', 'n_last_vs_peak']
Continuous Features: ['welcome_discount', 'first_data_year', 'first_premium', 'last_premium', 'first_split', 'last_split', 'last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', 'last_weight', 'accident_years', 'last_change_premium_abs', 'last_change_premium_perc', 'last_vs_first_split', 'cum_change_premium_abs', 'cum_change_premium_perc', 'years_since_last_car_change']


In [5]:
# cols_to_drop_manual = ["first_split", "first_premium", "nr_cars", "last_type", "last_brand", 'last_weight', 'n_last_vs_peak', 'last_fuel_type', 'last_trend_nr_coverages', 'last_change_premium_abs', 'last_change_premium_perc', 'max_nr_coverages', 'last_nr_coverages',]

# Any column whose name STARTS WITH one of these strings is excluded from the features
# (prefix match, not exact match).
cols_to_drop = ["churn", "policy_nr_hashed", "last_data_year", "first_data_year", "control_group"]
drop_prefixes = tuple(cols_to_drop)
selected_columns = [col for col in df.columns if not col.startswith(drop_prefixes)]

# Feature matrix and churn target.
X = df[selected_columns]
y = df['churn']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Custom Metric

In [6]:
# Per-case weight applied by weighted_cost to each confusion-matrix cell; the weighted
# counts are summed into a cost.
# NOTE(review): earlier inline notes called TP/FP important, yet both weigh 0, while TN
# (a correct prediction) is charged the same 100.0 as FN — confirm these values are intended.
weights = {
    'TP': 0,  # true positives contribute nothing to the cost
    'FP': 0,  # false positives currently contribute nothing to the cost
    'FN': 100.0,   # each false negative adds 100 to the cost
    'TN': 100.0  # each true negative also adds 100 to the cost — TODO confirm
}

def weighted_cost(y_true, y_scores, weights, threshold=0.5):
    """Return the negated weighted cost of a set of binary predictions.

    The predictions are turned into hard 0/1 labels first, so the function
    accepts either class labels or positive-class probabilities (which is what
    a scorer built with ``needs_proba=True`` passes in). Counts are computed
    directly instead of unpacking a confusion matrix, so a fold that contains
    only one class yields zeros for the missing cells rather than an error.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels; the positive class is assumed to be encoded as 1.
    y_scores : array-like of shape (n_samples,)
        Predicted labels (0/1) or predicted probabilities of the positive class.
    weights : dict
        Per-case weights under the keys 'TP', 'FP', 'FN' and 'TN'.
    threshold : float, default 0.5
        A score strictly greater than this value counts as a positive prediction.

    Returns
    -------
    float
        Minus the total weighted cost, so that larger values are better.
    """
    actual_pos = np.asarray(y_true) == 1
    predicted_pos = np.asarray(y_scores, dtype=float) > threshold

    # Confusion-matrix cells, counted explicitly.
    tp = np.count_nonzero(actual_pos & predicted_pos)
    fp = np.count_nonzero(~actual_pos & predicted_pos)
    fn = np.count_nonzero(actual_pos & ~predicted_pos)
    tn = np.count_nonzero(~actual_pos & ~predicted_pos)

    # Total weighted cost (a plain sum; no normalisation is applied).
    wghtd_cost = (
        tp * weights['TP']
        + fp * weights['FP']
        + fn * weights['FN']
        + tn * weights['TN']
    )

    return -wghtd_cost

# Scorer wrapper around weighted_cost: it is fed predicted probabilities and
# forwards the module-level `weights` dict as a keyword argument.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` — confirm against the installed version before upgrading.
weighted_cost_scorer = make_scorer(
    weighted_cost,
    needs_proba=True,
    weights=weights,
)

# Model Selection

In [40]:
# Hyperopt search space for the LightGBM classifier.
#
# `min_data_in_leaf` is no longer sampled: LightGBM documents it as the same setting as
# `min_child_samples`, so sampling both left one search dimension without any effect on
# the fitted model.
# NOTE(review): the range kept here is the one originally declared for min_child_samples
# (7-100); the removed min_data_in_leaf entry used 1-21 — confirm which range is intended.
space = {
    'max_depth': hp.uniformint('max_depth', 50, 100),
    'n_estimators': hp.uniformint('n_estimators', 50, 200),
    'num_leaves': hp.uniformint('num_leaves', 2, 200),
    'min_child_samples': hp.uniformint('min_child_samples', 7, 100),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.25, 1),
    'subsample': hp.uniform('subsample', 0.25, 1),
    'subsample_freq': hp.uniformint('subsample_freq', 1, 100),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.2),
    'reg_lambda': hp.uniform('reg_lambda', 0, 0.2),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.5),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
}

def objective(params):
    """Hyperopt objective: mean 5-fold Brier score of a LightGBM model on the training split.

    `params` is one sampled point of the search space and is forwarded verbatim to
    LGBMClassifier. The returned loss is the mean of the fold scores under
    "neg_brier_score" with its sign flipped, so minimising it minimises the Brier score.
    """
    model = lgb.LGBMClassifier(objective='binary', force_row_wise=True, verbosity=-1, **params)
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_brier_score")
    mean_neg_brier = fold_scores.mean()
    return {'loss': -mean_neg_brier, 'status': STATUS_OK}

# Number of hyperopt evaluations and the container that records every trial.
n_iter = 50
trials = Trials()

# Run the TPE search with a fixed seed.
# NOTE(review): newer hyperopt releases may expect a numpy Generator
# (np.random.default_rng) for `rstate` — confirm against the installed version.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter,
    trials=trials,
    rstate=np.random.RandomState(0),
)

# The stored loss is the sign-flipped CV score, so flip it back for reporting.
best_loss = trials.best_trial['result']['loss']
print("Best Score is: ", -best_loss)
print("Best Parameters: ", best)

  0%|          | 0/50 [00:04<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [43]:
# Hard-coded hyperparameters; this assignment replaces whatever the search cell left in `best`.
# Presumably copied from an earlier completed search — verify before relying on it.
best = {
    'colsample_bytree': 0.7989227943227654,
    'learning_rate': 0.017679237161356855,
    'max_depth': 52.0,
    'min_child_samples': 8.0,
    'min_data_in_leaf': 21.0,
    'min_split_gain': 0.2859207858932968,
    'n_estimators': 197.0,
    'num_leaves': 192.0,
    'reg_alpha': 0.1325013951739118,
    'reg_lambda': 0.16453662079071193,
    'subsample': 0.9956021256933638,
    'subsample_freq': 85.0,
}

# Order in which the parameters are handed to the estimator.
_param_order = (
    'max_depth', 'n_estimators', 'num_leaves', 'min_child_samples',
    'colsample_bytree', 'subsample', 'subsample_freq', 'reg_alpha',
    'reg_lambda', 'min_split_gain', 'learning_rate', 'min_data_in_leaf',
)
# These entries are stored as floats in `best` and are cast back to int.
_integer_params = {
    'max_depth', 'n_estimators', 'num_leaves',
    'min_child_samples', 'subsample_freq', 'min_data_in_leaf',
}

best_params = {
    name: int(best[name]) if name in _integer_params else best[name]
    for name in _param_order
}

# Final estimator configured with the selected hyperparameters (not fitted yet).
lgbm_best = lgb.LGBMClassifier(objective='binary', force_row_wise=True, verbosity=-1, **best_params)

# Validation

In [55]:
def mae_prob(y_true, y_pred_probs):
    """Mean absolute error between the true labels and the predicted probabilities."""
    error = mean_absolute_error(y_true, y_pred_probs)
    return error


def medae_prob(y_true, y_pred_probs):
    """Median absolute error between the true labels and the predicted probabilities."""
    error = median_absolute_error(y_true, y_pred_probs)
    return error


# Scorer versions of both metrics, evaluated on predicted probabilities. make_scorer's
# default sign is kept, so the resulting scores are the (positive) errors themselves.
mae_prob_scorer, medae_prob_scorer = (
    make_scorer(metric, needs_proba=True) for metric in (mae_prob, medae_prob)
)

In [56]:
# Evaluate the tuned model with 5-fold cross-validation on all four metrics in ONE pass.
# Previously each metric triggered its own cross_val_score call, refitting the same model
# on the same folds four times; cross_validate fits once per fold and applies every scorer.
cv_scoring = {
    'brier': 'neg_brier_score',
    'log_loss': 'neg_log_loss',
    'mae': mae_prob_scorer,
    'medae': medae_prob_scorer,
}
cv_results = cross_validate(lgbm_best, X, y, cv=5, scoring=cv_scoring)

# Per-fold score arrays, under the same names the notebook used before.
scores_brier = cv_results['test_brier']
scores_log_loss = cv_results['test_log_loss']
scores_mae = cv_results['test_mae']
scores_medae = cv_results['test_medae']

# The built-in "neg_*" scorers return negated losses, hence the sign flip; the two custom
# scorers already return positive errors.
print('CV Average Brier score: {0:0.4f}'.format(-np.mean(scores_brier)))
print('CV Average Log Loss: {0:0.4f}'.format(-np.mean(scores_log_loss)))
print('CV Average Root Brier score: {0:0.4f}'.format(np.sqrt(-np.mean(scores_brier))))
print('CV MAE: {0:0.4f}'.format(np.mean(scores_mae)))
print('CV MedAE: {0:0.4f}'.format(np.mean(scores_medae)))

CV Average Brier score: 0.0964
CV Average Log Loss: 0.3240
CV Average Root Brier score: 0.3105
CV MAE: 0.1987
CV MAE: 0.0925


In [50]:
# Fit the tuned model on the training split and predict on the held-out split.
lgbm_best.fit(X_train, y_train)

# Keep only the second predict_proba column (presumably the churn = 1 class — verify
# against lgbm_best.classes_).
test_probabilities = lgbm_best.predict_proba(X_test)
probabilities = test_probabilities[:, 1]

# Show the predicted probabilities followed by the true labels.
display(probabilities, y_test)

array([0.03631447, 0.90287361, 0.06589497, ..., 0.16936156, 0.07543912,
       0.02345877])

62054    0
56228    1
79688    0
30544    0
32448    0
        ..
33760    0
12980    0
45211    0
679      0
43960    0
Name: churn, Length: 22262, dtype: int64