# Import Packages

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, AgglomerativeClustering, HDBSCAN, Birch, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import kmedoids

plt.style.use('ggplot')

# Load Data

In [None]:
# Read the prepared data set and the segments table; exact duplicate rows are discarded.
df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
df = df.drop_duplicates()
segments = pd.read_csv("./segments.csv", low_memory=False, index_col=0)
segments = segments.drop_duplicates()

# Keep only rows whose first data year is 2021 or later. The .head(1000) cap keeps
# run times short; remove it to process the full data (expect roughly 45 minutes).
is_recent = df["first_data_year"] >= 2021
df = df[is_recent].head(1000)

In [None]:
# Columns that are excluded from the clustering features.
cols_to_drop_init = [
    "welcome_discount",
    "policy_nr_hashed",
    "control_group",
    "churn",
    "last_data_year",
    "first_datapoint_year",
    "last_datapoint_year",
    "first_data_year",
    'last_type',
    'lpa',
    'count',
    'cluster',
]
excluded_cols = set(cols_to_drop_init)
# Preserve the original column order of df.
cols_to_keep = [col for col in df.columns if col not in excluded_cols]

# Feature frame used to compute the distance matrix.
df_filt = df[cols_to_keep]

# Same features joined with the segments table; the policy key is only needed
# for the join and is dropped again afterwards.
features_with_key = df[cols_to_keep + ["policy_nr_hashed"]]
df_filt_preapplied = pd.merge(features_with_key, segments, on='policy_nr_hashed', how='inner')
df_filt_preapplied = df_filt_preapplied.drop("policy_nr_hashed", axis=1)

# Pairwise Gower distances between all rows of the feature frame.
dist_matrix = gower.gower_matrix(df_filt)
# Alternative: load a previously saved matrix instead of recomputing it.
# dist_matrix = pd.read_csv("../data/gower_matrix.csv").to_numpy()

# Apply Clustering

## Clustering Functions

In [None]:
def kmeans_cluster(dist_matrix, n):
    """Run mini-batch k-means on ``dist_matrix`` and score the partition.

    The matrix is handed to scikit-learn unchanged, so the clusterer and all
    three scores treat each row as an ordinary feature vector.

    Args:
        dist_matrix: 2-D array-like passed to ``MiniBatchKMeans.fit``.
        n: Number of clusters.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz, model)`` where
        ``model`` is the fitted ``MiniBatchKMeans`` instance.
    """
    model = MiniBatchKMeans(n_clusters=n, random_state=0, n_init='auto')
    model.fit(dist_matrix)
    labels = model.labels_

    # Evaluate the three internal validity indices in the same order as before.
    scorers = (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    sh_score, db_score, ch_score = [scorer(dist_matrix, labels) for scorer in scorers]

    return sh_score, db_score, ch_score, model

def kmedoids_cluster(dist_matrix, n):
    """Cluster a precomputed distance matrix with FasterPAM k-medoids and score it.

    Args:
        dist_matrix: Square pairwise distance matrix. It is passed straight to
            ``KMedoids.fit`` and relies on the estimator's default metric
            (``precomputed`` in the ``kmedoids`` package - confirm for the
            installed version).
        n: Number of clusters (medoids).

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz, model)`` where
        ``model`` is the fitted ``kmedoids.KMedoids`` instance.
    """
    cluster = kmedoids.KMedoids(n, method='fasterpam', init='build', random_state=0).fit(dist_matrix)
    labels = cluster.labels_

    # The input already holds pairwise distances, so the silhouette must use
    # them directly; the default metric would instead compute Euclidean
    # distances between rows of the distance matrix.
    sh_score = silhouette_score(dist_matrix, labels, metric="precomputed")
    # NOTE: these two indices have no precomputed-distance mode in scikit-learn;
    # they treat every row of the matrix as a feature vector.
    db_score = davies_bouldin_score(dist_matrix, labels)
    ch_score = calinski_harabasz_score(dist_matrix, labels)

    return sh_score, db_score, ch_score, cluster

def spectral_cluster(dist_matrix, n, delta=None):
    """Spectral clustering on a precomputed distance matrix, with quality scores.

    ``SpectralClustering(affinity="precomputed")`` expects a *similarity*
    matrix (larger means more alike), so the distances are first converted
    with a Gaussian kernel instead of being passed in directly.

    Args:
        dist_matrix: Square pairwise distance matrix (0 means identical).
        n: Number of clusters.
        delta: Width of the Gaussian kernel used for the distance-to-affinity
            conversion. Defaults to the standard deviation of the distances,
            which is only a heuristic; tune it if the clusters look off.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz, labels)`` where
        ``labels`` is the array of cluster labels.
    """
    distances = np.asarray(dist_matrix, dtype=float)
    if delta is None:
        delta = distances.std()
        if delta == 0:
            # All distances equal: any positive width gives a constant affinity.
            delta = 1.0
    # Distance -> similarity: identical points get 1, far points approach 0.
    affinity = np.exp(-distances ** 2 / (2.0 * delta ** 2))

    # random_state fixed for reproducibility, matching the other clusterers.
    cluster_labels = SpectralClustering(
        n_clusters=n,
        n_init=100,
        assign_labels='discretize',
        affinity="precomputed",
        random_state=0,
    ).fit_predict(affinity)

    # The silhouette is evaluated on the original distances, not the affinity.
    sh_score = silhouette_score(dist_matrix, cluster_labels, metric="precomputed")
    # NOTE: these two indices have no precomputed-distance mode in scikit-learn;
    # they treat every row of the matrix as a feature vector.
    db_score = davies_bouldin_score(dist_matrix, cluster_labels)
    ch_score = calinski_harabasz_score(dist_matrix, cluster_labels)

    return sh_score, db_score, ch_score, cluster_labels

def hiererchichal_cluster(dist_matrix, n):
    """Complete-linkage agglomerative clustering on a precomputed distance matrix.

    Args:
        dist_matrix: Square pairwise distance matrix.
        n: Number of clusters to cut the hierarchy into.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz, labels)`` where
        ``labels`` is the array of cluster labels.
    """
    model = AgglomerativeClustering(n_clusters=n, linkage='complete', metric="precomputed")
    cluster_labels = model.fit_predict(dist_matrix)

    # Score the silhouette on the same precomputed distances the clustering
    # used; the default metric would compute Euclidean distances between rows.
    sh_score = silhouette_score(dist_matrix, cluster_labels, metric="precomputed")
    # NOTE: these two indices have no precomputed-distance mode in scikit-learn;
    # they treat every row of the matrix as a feature vector.
    db_score = davies_bouldin_score(dist_matrix, cluster_labels)
    ch_score = calinski_harabasz_score(dist_matrix, cluster_labels)

    return sh_score, db_score, ch_score, cluster_labels

## Get the clusters

In [None]:
# Pick one clustering method for a single run; the alternatives are kept for quick switching.
# sh_score, db_score, ch_score, cluster = kmeans_cluster(dist_matrix, 5)
# sh_score, db_score, ch_score, cluster = spectral_cluster(dist_matrix, 5)
single_run = kmedoids_cluster(dist_matrix, 4)
sh_score, db_score, ch_score, cluster = single_run

# Report the three validity indices, one per line, rounded to three decimals.
print(
    f"Silhouette Score: {np.round(sh_score, 3)}",
    f"Davies Bouldin: {np.round(db_score, 3)}",
    f"Calinski Harabasz Score: {np.round(ch_score, 3)}",
    sep="\n",
)

## Compute the data for the graphs used to select the best number of clusters

In [None]:
# Scores collected for every candidate number of clusters.
sh_list = []
db_list = []
ch_list = []
n_list = []

# Sweep k = 2..10 with k-medoids and record the three validity indices.
for n in tqdm(range(2, 11)):
    sh_score, db_score, ch_score, cluster = kmedoids_cluster(dist_matrix, n)

    sh_list.append(sh_score)
    db_list.append(db_score)
    ch_list.append(ch_score)
    n_list.append(n)

# Best silhouette over the whole sweep (the scalar sh_score only holds the
# value from the last iteration).
print(np.max(sh_list))

## Graph plotting

In [None]:
# plt.plot(n_list, sh_list, marker ='.', color='cornflowerblue')
# plt.xticks(n_list)
# plt.xlabel("Number of Clusters")
# plt.ylabel("Silhouette Score")
# plt.axvline(4, linestyle='--', color='coral', label='Constraint')
# plt.savefig('../plots/segments_sh.png', dpi=100)
# plt.show()

## Number of customers per cluster

In [None]:
# Non-null row count of every column within each cluster.
rows_per_cluster = df_filt_preapplied.groupby("cluster").count()
display(rows_per_cluster)

## Find the characteristics of each cluster

In [None]:
# One named "mean" aggregation per non-object column of the feature frame.
mean_agg = {}
for col in df_filt.columns:
    if df_filt[col].dtype != 'object':
        mean_agg[col] = pd.NamedAgg(column=col, aggfunc='mean')

# Columns left out of the per-cluster profile.
profile_cols_to_drop = [
    "last_postcode",
    "perc_western_ppl",
    "perc_nld_ppl",
    "perc_others_ppl",
    "last_allrisk royaal",
    "last_allrisk compleet",
    "last_vs_first_split",
    "last_wa-extra",
    "policyholder_change",
    "n_last_vs_peak",
    "fake_alarm",
    "last_allrisk basis",
    "last_split",
    "max_nr_coverages",
    "nr_years",
    "cum_change_premium_abs",
    "cum_change_premium_perc",
    "pc4",
    "last_premium",
]

# Per-cluster means, then remove the columns that are not part of the profile.
cluster_means = df_filt_preapplied.groupby("cluster").agg(**mean_agg)
df_clust = cluster_means.drop(profile_cols_to_drop, axis=1)

# Standardise every column across the cluster rows so features are comparable.
np_clust_expl = StandardScaler().fit_transform(df_clust)

def top_5_columns_with_values(row, n_largest=10, threshold=0.8):
    """Return the most extreme entries of ``row`` as a ``{column: value}`` dict.

    Despite the historical name, the ``n_largest`` (default 10) entries with
    the largest absolute value are considered, and of those only the ones
    whose absolute value reaches ``threshold`` are kept.

    Args:
        row: ``pandas.Series`` of numeric values indexed by column name.
        n_largest: How many of the largest-magnitude entries to consider.
        threshold: Minimum absolute value an entry needs to be reported.

    Returns:
        Dict mapping column name to the signed value rounded to two decimals,
        ordered from largest to smallest magnitude.
    """
    strongest = row.abs().nlargest(n_largest)
    return {
        col: np.round(row[col], 2)
        for col, magnitude in strongest.items()
        if magnitude >= threshold
    }

# Wrap the scaled array in a frame again so rows carry their cluster label
# and columns their feature name.
df_clust_expl = pd.DataFrame(np_clust_expl, columns=df_clust.columns, index=df_clust.index)

# One dict of stand-out features per cluster row.
top_5_per_row = df_clust_expl.apply(top_5_columns_with_values, axis=1).tolist()

# Print against the real cluster label from the index; the row position only
# matches it when the labels happen to be 0..k-1.
for cluster_label, standout_features in zip(df_clust_expl.index, top_5_per_row):
    print(cluster_label, ":", standout_features)
    print("")