# Import Packages

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, AgglomerativeClustering, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score



# Load Data

In [2]:
df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0).drop_duplicates()

df = df[df["first_data_year"] >= 2021].head(5000)

# Apply Clustering

In [13]:
columns_clustering = ['last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', "last_postcode", "last_fuel_type", "nr_years", "last_premium", 'last_sales_channel', 'median_income_household', 'density', 'pc4']

dist_matrix = gower.gower_matrix(df)

# cluster = KMeans(n_clusters=3, random_state=0, n_init='auto').fit(dist_matrix)
cluster = SpectralClustering(n_clusters=3, eigen_solver='lobpcg', gamma=0.1, n_components=2, n_neighbors=5, n_init=20).fit(dist_matrix)

df["cluster"] = cluster.labels_

display(
    df
    .groupby("cluster")
    .agg(
        count=pd.NamedAgg(column="churn", aggfunc="count"),
        income=pd.NamedAgg(column="median_income_household", aggfunc="mean"),
        perc_low_income=pd.NamedAgg(column="perc_low_income", aggfunc="mean"),
        perc_high_income=pd.NamedAgg(column="perc_high_income", aggfunc="mean"),
        density=pd.NamedAgg(column="density", aggfunc="mean"),
        household_size=pd.NamedAgg(column="household_size", aggfunc="mean"),
        welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="mean"),
        churn=pd.NamedAgg(column="churn", aggfunc="mean"),
        last_customer_age=pd.NamedAgg(column="last_customer_age", aggfunc="mean"),
        last_car_value=pd.NamedAgg(column="last_car_value", aggfunc="mean"),
        perc_others_ppl=pd.NamedAgg(column="perc_others_ppl", aggfunc="mean"),
        perc_nld_ppl=pd.NamedAgg(column="perc_nld_ppl", aggfunc="mean"),
    )
)

print(silhouette_score(dist_matrix, df["cluster"]))
print(davies_bouldin_score(dist_matrix, df["cluster"]))
print(calinski_harabasz_score(dist_matrix, df["cluster"]))



Unnamed: 0_level_0,count,income,perc_low_income,perc_high_income,density,household_size,welcome_discount,churn,last_customer_age,last_car_value,perc_others_ppl,perc_nld_ppl
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,210,27789.047619,51.198571,14.497143,4082.109524,2.105714,0.160102,0.295238,41.2,395091.093333,54.047619,32.52381
1,4440,32465.945946,37.435833,20.734775,1433.574099,2.234752,0.047392,0.101577,50.336036,375400.020901,9.921171,80.101351
2,350,28543.714286,50.174571,15.373714,3848.491429,2.073143,0.144789,0.282857,44.114286,404181.472,44.428571,40.8


0.33471826
1.4351504769544992
1249.9726628181218
