# Import Packages

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, AgglomerativeClustering, HDBSCAN, Birch, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np
from itertools import combinations, chain
from tqdm import tqdm
import matplotlib.pyplot as plt
import kmedoids
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

# Load Data

In [None]:
# Read the prepared data, drop exact duplicate rows, keep only rows whose
# first_data_year is 2021 or later, and cap the result at the first 1000 rows.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["first_data_year"] >= 2021]
    .head(1000)
)

# Previously saved segment assignments, de-duplicated the same way.
segments = (
    pd.read_csv("./segments.csv", low_memory=False, index_col=0)
    .drop_duplicates()
)

In [None]:
# Columns excluded from the feature set used for clustering and classification.
cols_to_drop_init = [
    "welcome_discount",
    "policy_nr_hashed",
    "control_group",
    "churn",
    "last_data_year",
    "first_datapoint_year",
    "last_datapoint_year",
    "first_data_year",
    'last_type',
    'lpa',
    'count',
    'cluster',
]
excluded_cols = set(cols_to_drop_init)
cols_to_keep = [col for col in df.columns if col not in excluded_cols]

# Feature-only view of the data.
df_filt = df.loc[:, cols_to_keep]

# Same features joined to the rows of `segments` via policy_nr_hashed; the
# join key is dropped again once the merge is done.
df_filt_preapplied = (
    df[cols_to_keep + ["policy_nr_hashed"]]
    .merge(segments, on='policy_nr_hashed', how='inner')
    .drop(columns="policy_nr_hashed")
)

# Pairwise Gower distances between the rows of df_filt.
dist_matrix = gower.gower_matrix(df_filt)
# dist_matrix = pd.read_csv("../data/gower_matrix.csv").to_numpy()

# Apply Clustering

In [None]:
def kmeans_cluster(dist_matrix, n):
    """Cluster with MiniBatchKMeans and score the resulting partition.

    MiniBatchKMeans is fitted directly on ``dist_matrix``, i.e. every row of
    the matrix is used as the feature vector of that sample.

    Parameters
    ----------
    dist_matrix : array-like
        Pairwise distance matrix (assumed square with a zero diagonal, as
        produced by ``gower.gower_matrix``).
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(sh_score, db_score, ch_score, cluster)``: silhouette,
        Davies-Bouldin and Calinski-Harabasz scores followed by the fitted
        estimator.
    """
    cluster = MiniBatchKMeans(n_clusters=n, random_state=0, n_init='auto').fit(dist_matrix)
    labels = cluster.labels_

    # dist_matrix already holds pairwise distances. Without
    # metric="precomputed" the silhouette would instead be computed from
    # Euclidean distances between the rows of the matrix.
    sh_score = silhouette_score(dist_matrix, labels, metric="precomputed")
    # These two indices only accept feature vectors, so they keep treating
    # the rows of the matrix as features.
    db_score = davies_bouldin_score(dist_matrix, labels)
    ch_score = calinski_harabasz_score(dist_matrix, labels)

    return sh_score, db_score, ch_score, cluster

def kmedoids_cluster(dist_matrix, n):
    """Cluster with FasterPAM k-medoids and score the resulting partition.

    Parameters
    ----------
    dist_matrix : array-like
        Pairwise distance matrix (assumed square with a zero diagonal, as
        produced by ``gower.gower_matrix``). NOTE(review): KMedoids is given
        no explicit ``metric``; presumably the package default treats the
        input as precomputed dissimilarities -- confirm.
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(sh_score, db_score, ch_score, cluster)``: silhouette,
        Davies-Bouldin and Calinski-Harabasz scores followed by the fitted
        estimator.
    """
    cluster = kmedoids.KMedoids(n, method='fasterpam', init='build', random_state=0).fit(dist_matrix)
    labels = cluster.labels_

    # dist_matrix already holds pairwise distances. Without
    # metric="precomputed" the silhouette would instead be computed from
    # Euclidean distances between the rows of the matrix.
    sh_score = silhouette_score(dist_matrix, labels, metric="precomputed")
    # These two indices only accept feature vectors, so they keep treating
    # the rows of the matrix as features.
    db_score = davies_bouldin_score(dist_matrix, labels)
    ch_score = calinski_harabasz_score(dist_matrix, labels)

    return sh_score, db_score, ch_score, cluster

def spectral_cluster(dist_matrix, n):
    """Cluster with spectral clustering and score the resulting partition.

    Parameters
    ----------
    dist_matrix : array-like
        Pairwise distance matrix (assumed square, non-negative, with a zero
        diagonal, as produced by ``gower.gower_matrix``).
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(sh_score, db_score, ch_score, cluster_labels)``: silhouette,
        Davies-Bouldin and Calinski-Harabasz scores followed by the label
        array (not the estimator).
    """
    distances = np.asarray(dist_matrix)

    # affinity="precomputed" expects a similarity matrix (larger = closer).
    # Feeding raw distances would link the most dissimilar samples most
    # strongly, so convert to a similarity in [0, 1] first.
    max_dist = distances.max()
    if max_dist > 0:
        affinity = 1.0 - distances / max_dist
    else:
        # All samples coincide: every pair is maximally similar.
        affinity = np.ones_like(distances, dtype=float)

    # random_state is fixed for reproducibility, matching the other
    # clustering helpers in this notebook.
    cluster_labels = SpectralClustering(
        n_clusters=n,
        n_init=100,
        assign_labels='discretize',
        affinity="precomputed",
        random_state=0,
    ).fit_predict(affinity)

    # The matrix already holds pairwise distances. Without
    # metric="precomputed" the silhouette would instead be computed from
    # Euclidean distances between the rows of the matrix.
    sh_score = silhouette_score(dist_matrix, cluster_labels, metric="precomputed")
    # These two indices only accept feature vectors, so they keep treating
    # the rows of the matrix as features.
    db_score = davies_bouldin_score(dist_matrix, cluster_labels)
    ch_score = calinski_harabasz_score(dist_matrix, cluster_labels)

    return sh_score, db_score, ch_score, cluster_labels

def hiererchichal_cluster(dist_matrix, n):
    """Cluster with complete-linkage agglomerative clustering and score it.

    Parameters
    ----------
    dist_matrix : array-like
        Pairwise distance matrix (assumed square with a zero diagonal, as
        produced by ``gower.gower_matrix``); passed to the estimator as
        precomputed distances.
    n : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(sh_score, db_score, ch_score, cluster_labels)``: silhouette,
        Davies-Bouldin and Calinski-Harabasz scores followed by the label
        array (not the estimator).
    """
    cluster_labels = AgglomerativeClustering(
        n_clusters=n,
        linkage='complete',
        metric="precomputed",
    ).fit_predict(dist_matrix)

    # The clustering used the matrix as precomputed distances, so the
    # silhouette must too. Without metric="precomputed" it would be computed
    # from Euclidean distances between the rows of the matrix.
    sh_score = silhouette_score(dist_matrix, cluster_labels, metric="precomputed")
    # These two indices only accept feature vectors, so they keep treating
    # the rows of the matrix as features.
    db_score = davies_bouldin_score(dist_matrix, cluster_labels)
    ch_score = calinski_harabasz_score(dist_matrix, cluster_labels)

    return sh_score, db_score, ch_score, cluster_labels

In [None]:
# Fit one clustering configuration and report its three quality scores.
# Swap in one of the commented alternatives to try a different algorithm.
# sh_score, db_score, ch_score, cluster = kmeans_cluster(dist_matrix, 5)
# sh_score, db_score, ch_score, cluster = spectral_cluster(dist_matrix, 5)
sh_score, db_score, ch_score, cluster = kmedoids_cluster(dist_matrix, 4)

score_report = (
    ("Silhouette Score", sh_score),
    ("Davies Bouldin", db_score),
    ("Calinski Harabasz Score", ch_score),
)
for score_name, score_value in score_report:
    print(f"{score_name}: {np.round(score_value, 3)}")

In [None]:
# Sweep the number of clusters from 2 to 10 with hierarchical clustering and
# record the three quality scores for each setting.
sh_list = []
db_list = []
ch_list = []
n_list = []

for n in tqdm(range(2, 11)):
    sh_score, db_score, ch_score, cluster = hiererchichal_cluster(dist_matrix, n)

    sh_list.append(sh_score)
    db_list.append(db_score)
    ch_list.append(ch_score)
    n_list.append(n)

# Best silhouette over the whole sweep: take the max of sh_list, not of
# sh_score, which only holds the scalar from the last iteration.
print(np.max(sh_list))

In [None]:
# plt.plot(n_list, sh_list, marker ='.', color='cornflowerblue')
# plt.xticks(n_list)
# plt.xlabel("Number of Clusters")
# plt.ylabel("Silhouette Score")
# plt.axvline(4, linestyle='--', color='coral', label='Constraint')
# plt.savefig('../plots/segments_sh.png', dpi=100)
# plt.show()

In [None]:
# Non-null value count of every column within each pre-assigned cluster.
display(
    df_filt_preapplied
    .groupby("cluster")
    .count()
)

In [None]:
# sh_score, db_score, ch_score, cluster = kmedoids_cluster(dist_matrix, 5)

# One mean aggregation per non-object column of df_filt.
numeric_cols = [col for col in df_filt.columns if df_filt[col].dtype != 'object']
mean_agg = {col: pd.NamedAgg(column=col, aggfunc='mean') for col in numeric_cols}

# Columns left out of the per-cluster profile.
profile_exclusions = [
    "last_postcode",
    "perc_western_ppl",
    "perc_nld_ppl",
    "perc_others_ppl",
    "last_allrisk royaal",
    "last_allrisk compleet",
    "last_vs_first_split",
    "last_wa-extra",
    "policyholder_change",
    "n_last_vs_peak",
    "fake_alarm",
    "last_allrisk basis",
    "last_split",
    "max_nr_coverages",
    "nr_years",
    "cum_change_premium_abs",
    "cum_change_premium_perc",
    "pc4",
    "last_premium",
]

# Per-cluster means, minus the excluded columns.
cluster_means = df_filt_preapplied.groupby("cluster").agg(**mean_agg)
df_clust = cluster_means.drop(columns=profile_exclusions)

# Standardise each column across the clusters so that the profiles can be
# compared on a common scale.
np_clust_expl = StandardScaler().fit_transform(df_clust)

def top_5_columns_with_values(row, n=10, threshold=0.8):
    """Return the strongest entries of ``row`` as a ``{column: value}`` dict.

    Despite the historical name, up to ``n`` entries (10 by default) are
    considered, ranked by absolute value, and only those whose absolute value
    reaches ``threshold`` are kept.

    Parameters
    ----------
    row : pandas.Series
        One row of values, indexed by column name.
    n : int, default 10
        Maximum number of entries to consider.
    threshold : float, default 0.8
        Minimum absolute value an entry needs in order to be reported.

    Returns
    -------
    dict
        Column name -> signed value rounded to two decimals, ordered from the
        largest absolute value downwards. Empty if nothing reaches the
        threshold.
    """
    # Rank by magnitude so strongly negative entries count as much as
    # strongly positive ones.
    strongest = row.abs().nlargest(n)
    strongest = strongest[strongest >= threshold]
    # Report the signed values, not the magnitudes used for ranking.
    return {col: np.round(row[col], 2) for col in strongest.index}

# Put the standardised values back into a labelled frame (one row per cluster).
df_clust_expl = pd.DataFrame(np_clust_expl, columns=df_clust.columns, index=df_clust.index)

# For every cluster, collect the columns that stand out the most.
top_5_per_row = df_clust_expl.apply(top_5_columns_with_values, axis=1).tolist()

# Print one positional index and its dict per cluster, separated by blank lines.
for i, standout_columns in enumerate(top_5_per_row):
    print(i, ":", standout_columns)
    print("")

# Classifier

In [None]:
# Show the filtered feature frame built earlier in the notebook.
display(df_filt)

In [None]:
# Build the classifier inputs from the frame that actually carries the segment
# labels. "cluster" is listed in cols_to_drop_init, so df_filt has no such
# column and df_filt['cluster'] raises KeyError; the labels only exist in
# df_filt_preapplied, where they were merged in from `segments`.
feature_cols = [col for col in df_filt.columns if col != "cluster"]

# Taking X and y from the same frame keeps features and labels row-aligned.
# .copy() gives X its own data so the dtype conversion below does not write
# into a slice of another frame (SettingWithCopyWarning).
X = df_filt_preapplied[feature_cols].copy()
y = df_filt_preapplied["cluster"]

# Convert text columns to pandas categoricals so LightGBM can treat them as
# categorical features.
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].astype("category")

# Run model selection
# Hyperopt search space for the LightGBM classifier: integer-valued
# parameters use uniformint, continuous ones use uniform.
# NOTE(review): in LightGBM `min_child_samples` and `min_data_in_leaf` are
# aliases of the same parameter, so searching both is redundant -- confirm
# which one should stay (later cells read both keys from the result).
space = dict(
    max_depth=hp.uniformint('max_depth', 50, 100),
    n_estimators=hp.uniformint('n_estimators', 50, 200),
    num_leaves=hp.uniformint('num_leaves', 2, 200),
    min_child_samples=hp.uniformint('min_child_samples', 7, 100),
    colsample_bytree=hp.uniform('colsample_bytree', 0.25, 1),
    subsample=hp.uniform('subsample', 0.25, 1),
    subsample_freq=hp.uniformint('subsample_freq', 1, 100),
    reg_alpha=hp.uniform('reg_alpha', 0, 0.2),
    reg_lambda=hp.uniform('reg_lambda', 0, 0.2),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.5),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.1),
    min_data_in_leaf=hp.uniformint('min_data_in_leaf', 1, 21),
)

def objective(params):
    """Hyperopt objective for one LightGBM hyper-parameter sample.

    Builds a multiclass LGBMClassifier from ``params``, evaluates it on the
    notebook-level ``X`` / ``y`` with 5-fold cross-validation using macro F1,
    and returns the negated mean score as the loss (hyperopt minimises).

    Parameters
    ----------
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier`` sampled from the space.

    Returns
    -------
    dict
        ``{'loss': -mean_f1_macro, 'status': STATUS_OK}``.
    """
    fixed_settings = {
        'objective': 'multiclass',
        'force_row_wise': True,
        'verbosity': -1,
        'random_state': 0,
        # 'is_unbalance': True,
    }
    model = lgb.LGBMClassifier(**fixed_settings, **params)
    fold_scores = cross_val_score(model, X, y, cv=5, scoring="f1_macro")
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Run a seeded TPE search over `space` for n_iter evaluations.
n_iter = 10
trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter,
    trials=trials,
    rstate=np.random.default_rng(seed=0),
)

# The loss is the negated score, so flip the sign back for reporting.
best_loss = trials.best_trial['result']['loss']
print("Best Score is: ", -best_loss)
print("Best Parameters: ", best)

In [None]:
# Save best model
# Rebuild the winning parameter set from `best`, casting the integer-valued
# parameters back to int and passing the continuous ones through unchanged.
integer_params = {
    'max_depth',
    'n_estimators',
    'num_leaves',
    'min_child_samples',
    'subsample_freq',
    'min_data_in_leaf',
}
param_order = (
    'max_depth',
    'n_estimators',
    'num_leaves',
    'min_child_samples',
    'colsample_bytree',
    'subsample',
    'subsample_freq',
    'reg_alpha',
    'reg_lambda',
    'min_split_gain',
    'learning_rate',
    'min_data_in_leaf',
)
best_params = {
    name: int(best[name]) if name in integer_params else best[name]
    for name in param_order
}

# Classifier configured with the tuned parameters plus the same fixed
# settings that were used during the search.
lgbm_settings = {
    'objective': 'multiclass',
    'force_row_wise': True,
    'verbosity': -1,
    'random_state': 0,
    # 'is_unbalance': True,
}
lgbm_best = lgb.LGBMClassifier(**lgbm_settings, **best_params)

# Mean 5-fold cross-validated score for each reported metric.
reported_metrics = (
    ("Accuracy", "accuracy"),
    ("F1 Macro", "f1_macro"),
    ("F1 Micro", "f1_micro"),
)
for metric_label, scorer_name in reported_metrics:
    fold_scores = cross_val_score(lgbm_best, X, y, cv=5, scoring=scorer_name)
    print(f"{metric_label}: {np.mean(fold_scores)}")

In [None]:
# Hold out 30% of the rows (fixed seed) to inspect per-class errors.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# fit() returns the estimator itself, so the name can be rebound to the result.
lgbm_best = lgbm_best.fit(X_train, y_train)
y_pred = lgbm_best.predict(X_test)

# First argument is the true labels, second the predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)