In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn_extra.cluster import KMedoids

In [84]:
df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0).drop_duplicates()

df = df[df["first_data_year"] >= 2021].head(5000)

columns_to_keep = [
    'first_premium',
    'last_premium', 'first_split', 'last_split', 'last_customer_age',
    'last_accident_free_years', 'last_car_value', 'last_age_car',
    'last_weight', 'last_fuel_type', 'last_postcode', 'last_product',
    'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal',
    'last_wa-extra', 'last_sales_channel', 'nr_cars', 'fake_alarm',
    'policyholder_change', 'max_nr_coverages', 'last_nr_coverages',
    'accident_years', 'n_last_vs_peak', 'last_vs_first_split', 'lpa',
    'cum_change_premium_abs', 'cum_change_premium_perc', 
    # 'pc4', 'nr_years',
    # 'nr_ppl', 'nr_households', 'household_size', 'nr_homes', 'house_worth',
    # 'median_income_household', 'perc_low_income', 'perc_high_income',
    # 'ppl_social_help', 'density'
]

# Filter the DataFrame to keep only the specified columns
df = df[columns_to_keep]

#columns_clustering = ['last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', "last_postcode", "last_fuel_type", "nr_years", "last_premium", 'last_sales_channel', 'pc4', 'median_income_household', 'density', 'perc_others_ppl']

In [85]:
dist_matrix = gower.gower_matrix(df)


# K-Medoids

In [86]:
kmedoids = KMedoids(n_clusters=3, metric='precomputed', random_state=0)
kmedoids.fit(dist_matrix)

# Output the cluster labels
print("Cluster labels:", kmedoids.labels_)

df["cluster"] = kmedoids.labels_


Cluster labels: [1 0 2 ... 0 2 0]


In [91]:
display(
    df
    .groupby("cluster")
    .agg(
        # income=pd.NamedAgg(column="median_income_household", aggfunc="mean"),
        # perc_low_income=pd.NamedAgg(column="perc_low_income", aggfunc="mean"),
        # perc_high_income=pd.NamedAgg(column="perc_high_income", aggfunc="mean"),
        # density=pd.NamedAgg(column="density", aggfunc="mean"),
        # household_size=pd.NamedAgg(column="household_size", aggfunc="mean"),
       # welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="mean"),
       # churn=pd.NamedAgg(column="churn", aggfunc="mean"),
        last_customer_age=pd.NamedAgg(column="last_customer_age", aggfunc="mean"), 
        count=pd.NamedAgg(column="last_customer_age", aggfunc="count"), 
        premium = pd.NamedAgg(column='last_premium', aggfunc = 'mean'),
        last_accident_free_years =pd.NamedAgg(column="last_accident_free_years", aggfunc="mean"),
        last_car_value=pd.NamedAgg(column="last_car_value", aggfunc="mean"),
        # perc_others_ppl=pd.NamedAgg(column="perc_others_ppl", aggfunc="mean"),
    )
)

Unnamed: 0_level_0,last_customer_age,count,premium,last_accident_free_years,last_car_value
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,50.304842,2375,6874.609186,11.961684,390096.608337
1,51.164021,1134,6888.860543,12.902998,412700.918519
2,47.008719,1491,6332.351023,8.305164,333149.866667


In [89]:
from sklearn.metrics import silhouette_score



# Compute the silhouette score
score = silhouette_score(dist_matrix, kmedoids.labels_, metric = 'precomputed')

print("Silhouette Score:", score)


Silhouette Score: 0.104034714


# Spectral Clustering

In [55]:
from sklearn.cluster import SpectralClustering
import numpy as np
beta = 1.0
similarity_matrix = np.exp(-beta * dist_matrix ** 2)

# Perform Spectral Clustering using the similarity matrix as affinity
n_clusters = 3  # Set the number of clusters you wish to find
clustering = SpectralClustering(n_clusters=n_clusters, affinity='rbf', n_init=100, assign_labels='discretize')
cluster_labels = clustering.fit_predict(dist_matrix)

# Output the cluster labels
print("Cluster labels:", cluster_labels)
df["cluster"] = cluster_labels

display(
    df
    .groupby("cluster")
    .agg(
        income=pd.NamedAgg(column="median_income_household", aggfunc="mean"),
        count=pd.NamedAgg(column="median_income_household", aggfunc="count"),
        perc_low_income=pd.NamedAgg(column="perc_low_income", aggfunc="mean"),
        perc_high_income=pd.NamedAgg(column="perc_high_income", aggfunc="mean"),
        density=pd.NamedAgg(column="density", aggfunc="mean"),
        household_size=pd.NamedAgg(column="household_size", aggfunc="mean"),
        welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="mean"),
        churn=pd.NamedAgg(column="churn", aggfunc="mean"),
        last_customer_age=pd.NamedAgg(column="last_customer_age", aggfunc="mean"), 
        last_accident_free_years =pd.NamedAgg(column="last_accident_free_years", aggfunc="mean"),
        last_car_value=pd.NamedAgg(column="last_car_value", aggfunc="mean"), 
        last_premium=pd.NamedAgg(column="last_premium", aggfunc="mean"),
        # perc_others_ppl=pd.NamedAgg(column="perc_others_ppl", aggfunc="mean"),
    )
)

# Compute the silhouette score
score = silhouette_score(dist_matrix, cluster_labels, metric = 'precomputed')

print("Silhouette Score:", score) 



Cluster labels: [2 0 2 ... 2 0 0]


Unnamed: 0_level_0,income,count,perc_low_income,perc_high_income,density,household_size,welcome_discount,churn,last_customer_age,last_accident_free_years,last_car_value,last_premium
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,32404.307432,2368,37.648184,20.714696,1428.357686,2.234079,0.079454,0.114443,48.169764,10.574324,351466.391216,6326.985946
1,28136.283186,565,50.882124,14.878053,4081.40354,2.078584,0.138366,0.269027,42.723894,6.60708,391334.561416,9378.08589
2,32580.745041,2067,37.072375,20.817175,1393.778423,2.237736,0.013737,0.091437,52.916788,12.893566,405337.34688,6434.339042


Silhouette Score: 0.15516564


# K-Prototypes

In [57]:
from kmodes.kprototypes import KPrototypes


X = df.values
categorical = [df.columns.get_loc(c) for c in df.select_dtypes(['category','object']).columns]


# Initialize the K-Prototypes model
kproto = KPrototypes(n_clusters=3, verbose=2, max_iter=20)

# Fit the model
clusters = kproto.fit_predict(X, categorical=categorical)

# Output the cluster for each instance
print("Cluster assignments:", clusters)

# Cluster centroids
print("Cluster centroids:")
print(kproto.cluster_centroids_)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/20, moves: 1893, ncost: 119202722057996.2
Run: 1, iteration: 2/20, moves: 936, ncost: 101956614334890.53
Run: 1, iteration: 3/20, moves: 515, ncost: 95394800573663.7
Run: 1, iteration: 4/20, moves: 285, ncost: 93514730075675.23
Run: 1, iteration: 5/20, moves: 162, ncost: 92736566643379.12
Run: 1, iteration: 6/20, moves: 116, ncost: 92208758581953.81
Run: 1, iteration: 7/20, moves: 88, ncost: 91895184772713.3
Run: 1, iteration: 8/20, moves: 39, ncost: 91835860093476.44
Run: 1, iteration: 9/20, moves: 14, ncost: 91825470144948.36
Run: 1, iteration: 10/20, moves: 10, ncost: 91820878992535.03
Run: 1, iteration: 11/20, moves: 8, ncost: 91818745692132.81
Run: 1, iteration: 12/20, moves: 9, ncost: 91816526243783.8
Run: 1, iteration: 13/20, moves: 5, ncost: 91815731340101.17
Run: 1, iteration: 14/20, moves: 5, ncost: 91

In [61]:
df["cluster"] = clusters


display(
    df
    .groupby("cluster")
    .agg(
        income=pd.NamedAgg(column="median_income_household", aggfunc="mean"),
        count=pd.NamedAgg(column="median_income_household", aggfunc="count"),
        perc_low_income=pd.NamedAgg(column="perc_low_income", aggfunc="mean"),
        perc_high_income=pd.NamedAgg(column="perc_high_income", aggfunc="mean"),
        density=pd.NamedAgg(column="density", aggfunc="mean"),
        household_size=pd.NamedAgg(column="household_size", aggfunc="mean"),
        welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="mean"),
        churn=pd.NamedAgg(column="churn", aggfunc="mean"),
        last_customer_age=pd.NamedAgg(column="last_customer_age", aggfunc="mean"), 
        last_accident_free_years =pd.NamedAgg(column="last_accident_free_years", aggfunc="mean"),
        last_car_value=pd.NamedAgg(column="last_car_value", aggfunc="mean"), 
        last_premium=pd.NamedAgg(column="last_premium", aggfunc="mean"),
        # perc_others_ppl=pd.NamedAgg(column="perc_others_ppl", aggfunc="mean"),
    )
)

Unnamed: 0_level_0,income,count,perc_low_income,perc_high_income,density,household_size,welcome_discount,churn,last_customer_age,last_accident_free_years,last_car_value,last_premium
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,32908.293839,422,37.61019,21.592891,1694.793839,2.227962,0.044561,0.123223,50.623223,13.241706,1050869.0,11178.684171
1,32520.230821,1473,37.748473,21.058452,1722.4759,2.23408,0.062386,0.126952,51.092329,12.632722,522001.0,7660.569841
2,31621.642512,3105,39.630564,19.438422,1712.358776,2.20905,0.059265,0.120129,48.619002,10.057327,218626.2,5661.602561


In [71]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
import pandas as pd

categorical_features = []
continuous_features = []
binary_features = []

# List of columns you want to keep
columns_to_keep = [
     'last_data_year', 'first_datapoint_year',
    'last_datapoint_year', 'first_data_year', 'churn', 'first_premium',
    'last_premium', 'first_split', 'last_split', 'last_customer_age',
    'last_accident_free_years', 'last_car_value', 'last_age_car',
    'last_weight', 'last_fuel_type', 'last_postcode', 'last_product',
    'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal',
    'last_wa-extra', 'last_sales_channel', 'nr_cars', 'fake_alarm',
    'policyholder_change', 'max_nr_coverages', 'last_nr_coverages',
    'accident_years', 'n_last_vs_peak', 'last_vs_first_split', 'lpa',
    'cum_change_premium_abs', 'cum_change_premium_perc', 'pc4', 'nr_years',
    'nr_ppl', 'nr_households', 'household_size', 'nr_homes', 'house_worth',
    'median_income_household', 'perc_low_income', 'perc_high_income',
    'ppl_social_help', 'density'
]

# Filter the DataFrame to keep only the specified columns
df = df[columns_to_keep]


# Define a threshold for the maximum number of unique values for a categorical column
max_unique_values_for_categorical = 5

# Iterate through each column to determine if it's categorical, continuous, or binary
for column in df.columns:
    unique_values = df[column].nunique()
    if unique_values == 2:
        # If exactly 2 unique values, treat column as binary
        binary_features.append(column)
    elif (df[column].dtype == 'object' or unique_values <= max_unique_values_for_categorical) and unique_values > 2:
        # If object type or up to the threshold of unique values (and more than 2), treat as categorical
        categorical_features.append(column)
    else:
        # Otherwise, treat as continuous
        continuous_features.append(column)

categorical_features = [col for col in categorical_features if col != "nr_years"]
continuous_features = continuous_features + ['nr_years']

# print(f'Binary Features: {binary_features}')
# print(f'Categorical Features: {categorical_features}')
# print(f'Continuous Features: {continuous_features}')

df = pd.get_dummies(df, columns=categorical_features, dtype="int")

class PFA(object):
    def __init__(self, n_features, q=None):
        self.q = q
        self.n_features = n_features
    
    def fit(self, X):
        if not self.q:
            self.q = X.shape[1]
    
        sc = StandardScaler()
        X = sc.fit_transform(X)
    
        pca = PCA(n_components=self.q).fit(X) # calculation Covmatrix is embeded in PCA
        A_q = pca.components_.T
    
        kmeans = KMeans(n_clusters=self.n_features).fit(A_q)
        clusters = kmeans.predict(A_q)
        cluster_centers = kmeans.cluster_centers_
    
        dists = defaultdict(list)
        for i, c in enumerate(clusters):
            dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
            dists[c].append((i, dist))
    
        self.indices_ = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
        self.features_ = X[:, self.indices_]
            
# Usage
pfa = PFA(n_features=3)
pfa.fit(df)
# To get the transformed matrix
x = pfa.features_
print(x)
# To get the column indices of the kept features
column_indices = pfa.indices_

Binary Features: ['churn', 'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal', 'last_wa-extra', 'fake_alarm', 'policyholder_change', 'n_last_vs_peak', 'lpa']
Categorical Features: ['count', 'last_data_year', 'first_datapoint_year', 'last_datapoint_year', 'first_data_year', 'last_fuel_type', 'last_product', 'last_sales_channel', 'nr_cars', 'max_nr_coverages', 'last_nr_coverages']
Continuous Features: ['welcome_discount', 'first_premium', 'last_premium', 'first_split', 'last_split', 'last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', 'last_weight', 'last_postcode', 'accident_years', 'last_vs_first_split', 'cum_change_premium_abs', 'cum_change_premium_perc', 'pc4', 'nr_ppl', 'nr_households', 'household_size', 'nr_homes', 'house_worth', 'median_income_household', 'perc_low_income', 'perc_high_income', 'ppl_social_help', 'density', 'nr_years']
[[-0.2508489  -0.37345855 -0.56041096]
 [-0.2508489  -0.37345855 -0.56041096]
 [-0.2508489  -0.37345



In [72]:
# Assuming `df` is your pre-processed DataFrame (after one-hot encoding)

# Get the list of original column names (including dummy variables for categorical features)
original_columns = list(df.columns)

# Map selected indices back to column names
selected_feature_names = [original_columns[i] for i in column_indices]

print("Selected Feature Names:", selected_feature_names)


Selected Feature Names: ['max_nr_coverages_1', 'churn', 'last_nr_coverages_2']
