# Import Packages

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, AgglomerativeClustering, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Load Data

In [2]:
df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0).drop_duplicates()

df = df[df["first_data_year"] >= 2021].head(5000)

# Apply Clustering

In [12]:
columns_clustering = ['last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', "last_postcode", "last_fuel_type", "nr_years", "last_premium", 'last_sales_channel', 'median_income_household', 'density', 'pc4']

dist_matrix = gower.gower_matrix(df[columns_clustering])

cluster = KMeans(n_clusters=3, random_state=0, n_init='auto').fit(dist_matrix)
# cluster = HDBSCAN(min_cluster_size=3, min_samples=int(len(dist_matrix)/10)).fit(dist_matrix)

df["cluster"] = cluster.labels_

display(
    df
    .groupby("cluster")
    .agg(
        count=pd.NamedAgg(column="churn", aggfunc="count"),
        income=pd.NamedAgg(column="median_income_household", aggfunc="mean"),
        perc_low_income=pd.NamedAgg(column="perc_low_income", aggfunc="mean"),
        perc_high_income=pd.NamedAgg(column="perc_high_income", aggfunc="mean"),
        density=pd.NamedAgg(column="density", aggfunc="mean"),
        household_size=pd.NamedAgg(column="household_size", aggfunc="mean"),
        welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="mean"),
        churn=pd.NamedAgg(column="churn", aggfunc="mean"),
        last_customer_age=pd.NamedAgg(column="last_customer_age", aggfunc="mean"),
        last_car_value=pd.NamedAgg(column="last_car_value", aggfunc="mean"),
        perc_others_ppl=pd.NamedAgg(column="perc_others_ppl", aggfunc="mean"),
        perc_nld_ppl=pd.NamedAgg(column="perc_nld_ppl", aggfunc="mean"),
    )
)

print(silhouette_score(dist_matrix, df["cluster"]))
print(davies_bouldin_score(dist_matrix, df["cluster"]))
print(calinski_harabasz_score(dist_matrix, df["cluster"]))

Unnamed: 0_level_0,count,income,perc_low_income,perc_high_income,density,household_size,welcome_discount,churn,last_customer_age,last_car_value,perc_others_ppl,perc_nld_ppl
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1328,32862.274096,38.090286,22.21762,2139.276355,2.249021,0.045929,0.098645,49.112199,330176.050602,20.504518,69.99247
1,2333,32034.462066,37.531247,19.642563,1133.444921,2.226447,0.030093,0.089156,51.24775,345164.494814,5.949421,83.600514
2,1339,31065.944735,42.108738,18.787528,2303.209858,2.172591,0.122119,0.203883,46.902166,483544.567588,22.285288,66.295743


0.26249254
1.2927198181904045
2261.5231449291737
