In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import gower
from sklearn_extra.cluster import KMedoids

In [None]:
# Columns used for clustering. The following columns are currently excluded:
# 'pc4', 'nr_years', 'nr_ppl', 'nr_households', 'household_size', 'nr_homes',
# 'house_worth', 'median_income_household', 'perc_low_income',
# 'perc_high_income', 'ppl_social_help', 'density'.
columns_to_keep = [
    'first_premium', 'last_premium',
    'first_split', 'last_split',
    'last_customer_age', 'last_accident_free_years',
    'last_car_value', 'last_age_car', 'last_weight',
    'last_fuel_type', 'last_postcode', 'last_product',
    'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal',
    'last_wa-extra', 'last_sales_channel',
    'nr_cars', 'fake_alarm', 'policyholder_change',
    'max_nr_coverages', 'last_nr_coverages',
    'accident_years', 'n_last_vs_peak', 'last_vs_first_split', 'lpa',
    'cum_change_premium_abs', 'cum_change_premium_perc',
]

# Read the prepared data, drop duplicate rows, keep the rows whose first data
# year is 2021 or later, and cap the sample at the first 5000 of those rows.
df = (
    pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0)
    .drop_duplicates()
    .loc[lambda frame: frame["first_data_year"] >= 2021]
    .head(5000)
)

# Restrict the frame to the selected columns.
df = df[columns_to_keep]

# Earlier candidate feature list, kept for reference:
#columns_clustering = ['last_customer_age', 'last_accident_free_years', 'last_car_value', 'last_age_car', "last_postcode", "last_fuel_type", "nr_years", "last_premium", 'last_sales_channel', 'pc4', 'median_income_household', 'density', 'perc_others_ppl']

In [None]:
# Pairwise Gower distances between all rows of the feature frame.
# A "cluster" column written by a previously executed clustering cell is
# excluded, so re-running this cell never feeds old labels back in as a feature.
dist_matrix = gower.gower_matrix(df.drop(columns=["cluster"], errors="ignore"))


# K-Medoids

In [None]:
# Fit a 3-cluster K-Medoids model directly on the precomputed distance matrix;
# random_state is fixed so repeated runs give the same medoids.
kmedoids = KMedoids(
    n_clusters=3,
    metric='precomputed',
    random_state=0,
).fit(dist_matrix)

# Store each row's cluster label on the frame, then show the raw label vector.
df["cluster"] = kmedoids.labels_
print("Cluster labels:", kmedoids.labels_)


In [None]:
# Per-cluster profile: mean customer age, number of non-null ages, mean last
# premium, mean accident-free years and mean car value.
# Aggregations over median_income_household, perc_low_income, perc_high_income,
# density, household_size, welcome_discount, churn and perc_others_ppl are
# currently disabled.
cluster_profile = df.groupby("cluster").agg(
    last_customer_age=("last_customer_age", "mean"),
    count=("last_customer_age", "count"),
    premium=("last_premium", "mean"),
    last_accident_free_years=("last_accident_free_years", "mean"),
    last_car_value=("last_car_value", "mean"),
)
display(cluster_profile)

In [None]:
from sklearn.metrics import silhouette_score



# Silhouette coefficient of the K-Medoids partition, evaluated on the
# precomputed distance matrix rather than on raw feature vectors.
score = silhouette_score(
    X=dist_matrix,
    labels=kmedoids.labels_,
    metric='precomputed',
)

print("Silhouette Score:", score)


# Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering
import numpy as np
# Convert the distances into similarities with a Gaussian kernel: a distance
# of 0 maps to similarity 1 and larger distances decay towards 0. `beta`
# controls how fast the similarity falls off.
beta = 1.0
similarity_matrix = np.exp(-beta * dist_matrix ** 2)

# Perform Spectral Clustering using the similarity matrix as affinity.
# affinity='precomputed' makes the estimator use `similarity_matrix` as-is;
# the previous affinity='rbf' on `dist_matrix` treated every row of distances
# as a feature vector and ignored the similarity matrix computed above.
# n_init is omitted because it only applies to assign_labels='kmeans', and
# random_state is fixed so the labels are reproducible.
n_clusters = 3  # Set the number of clusters you wish to find
clustering = SpectralClustering(
    n_clusters=n_clusters,
    affinity='precomputed',
    assign_labels='discretize',
    random_state=0,
)
cluster_labels = clustering.fit_predict(similarity_matrix)

# Output the cluster labels
print("Cluster labels:", cluster_labels)
df["cluster"] = cluster_labels

# Per-cluster summary. Each entry maps an output name to (source column,
# aggregation). Entries whose source column is not present in `df` are skipped
# instead of raising a KeyError, because the column filter applied when loading
# the data may have removed them.
count_column = (
    "median_income_household"
    if "median_income_household" in df.columns
    else "last_customer_age"
)
summary_spec = {
    "income": ("median_income_household", "mean"),
    "count": (count_column, "count"),
    "perc_low_income": ("perc_low_income", "mean"),
    "perc_high_income": ("perc_high_income", "mean"),
    "density": ("density", "mean"),
    "household_size": ("household_size", "mean"),
    "welcome_discount": ("welcome_discount", "mean"),
    "churn": ("churn", "mean"),
    "last_customer_age": ("last_customer_age", "mean"),
    "last_accident_free_years": ("last_accident_free_years", "mean"),
    "last_car_value": ("last_car_value", "mean"),
    "last_premium": ("last_premium", "mean"),
}
available_spec = {
    name: spec for name, spec in summary_spec.items() if spec[0] in df.columns
}

display(df.groupby("cluster").agg(**available_spec))

# Compute the silhouette score on the original distances (not the similarities)
score = silhouette_score(dist_matrix, cluster_labels, metric='precomputed')

print("Silhouette Score:", score)

# K-Prototypes

In [None]:
from kmodes.kprototypes import KPrototypes


# Build the feature matrix WITHOUT the "cluster" column: earlier cells store
# their labels in df["cluster"], and feeding those labels to K-Prototypes
# would leak a previous clustering result into this one.
kproto_features = df.drop(columns=["cluster"], errors="ignore")
X = kproto_features.values

# Positional indices (within X) of the categorical columns, taken from the
# same frame X was built from so the positions line up.
categorical = [
    kproto_features.columns.get_loc(c)
    for c in kproto_features.select_dtypes(['category', 'object']).columns
]


# Initialize the K-Prototypes model (seeded so the assignment is reproducible)
kproto = KPrototypes(n_clusters=3, verbose=2, max_iter=20, random_state=0)

# Fit the model
clusters = kproto.fit_predict(X, categorical=categorical)

# Output the cluster for each instance
print("Cluster assignments:", clusters)

# Cluster centroids
print("Cluster centroids:")
print(kproto.cluster_centroids_)

In [None]:
# Attach the K-Prototypes labels to the frame.
df["cluster"] = clusters

# Per-cluster summary. Each entry maps an output name to (source column,
# aggregation). Entries whose source column is not present in `df` are skipped
# instead of raising a KeyError, because the column filter applied when loading
# the data may have removed them.
count_column = (
    "median_income_household"
    if "median_income_household" in df.columns
    else "last_customer_age"
)
summary_spec = {
    "income": ("median_income_household", "mean"),
    "count": (count_column, "count"),
    "perc_low_income": ("perc_low_income", "mean"),
    "perc_high_income": ("perc_high_income", "mean"),
    "density": ("density", "mean"),
    "household_size": ("household_size", "mean"),
    "welcome_discount": ("welcome_discount", "mean"),
    "churn": ("churn", "mean"),
    "last_customer_age": ("last_customer_age", "mean"),
    "last_accident_free_years": ("last_accident_free_years", "mean"),
    "last_car_value": ("last_car_value", "mean"),
    "last_premium": ("last_premium", "mean"),
}
available_spec = {
    name: spec for name, spec in summary_spec.items() if spec[0] in df.columns
}

display(df.groupby("cluster").agg(**available_spec))

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
import pandas as pd

categorical_features = []
continuous_features = []
binary_features = []

# List of columns you want to keep
columns_to_keep = [
     'last_data_year', 'first_datapoint_year',
    'last_datapoint_year', 'first_data_year', 'churn', 'first_premium',
    'last_premium', 'first_split', 'last_split', 'last_customer_age',
    'last_accident_free_years', 'last_car_value', 'last_age_car',
    'last_weight', 'last_fuel_type', 'last_postcode', 'last_product',
    'last_allrisk basis', 'last_allrisk compleet', 'last_allrisk royaal',
    'last_wa-extra', 'last_sales_channel', 'nr_cars', 'fake_alarm',
    'policyholder_change', 'max_nr_coverages', 'last_nr_coverages',
    'accident_years', 'n_last_vs_peak', 'last_vs_first_split', 'lpa',
    'cum_change_premium_abs', 'cum_change_premium_perc', 'pc4', 'nr_years',
    'nr_ppl', 'nr_households', 'household_size', 'nr_homes', 'house_worth',
    'median_income_household', 'perc_low_income', 'perc_high_income',
    'ppl_social_help', 'density'
]

# The frame left behind by the earlier cells may no longer contain every
# requested column (the load cell keeps a narrower column set, and a previous
# run of this cell replaces categorical columns with dummies). In that case,
# rebuild the same row sample from the source file instead of failing with a
# KeyError on the selection below.
missing_columns = [column for column in columns_to_keep if column not in df.columns]
if missing_columns:
    df = pd.read_csv("../data/prepped_data.csv", low_memory=False, index_col=0).drop_duplicates()
    df = df[df["first_data_year"] >= 2021].head(5000)

# Filter the DataFrame to keep only the specified columns
df = df[columns_to_keep]


# Define a threshold for the maximum number of unique values for a categorical column
max_unique_values_for_categorical = 5

# Iterate through each column to determine if it's categorical, continuous, or binary
for column in df.columns:
    unique_values = df[column].nunique()
    if unique_values == 2:
        # If exactly 2 unique values, treat column as binary
        binary_features.append(column)
    elif (df[column].dtype == 'object' or unique_values <= max_unique_values_for_categorical) and unique_values > 2:
        # If object type or up to the threshold of unique values (and more than 2), treat as categorical
        categorical_features.append(column)
    else:
        # Otherwise, treat as continuous
        continuous_features.append(column)

# 'nr_years' is always treated as continuous, whatever the heuristic above
# decided. It is removed from the other lists and added to the continuous list
# only if it is not already there, so it never appears twice.
for feature_list in (categorical_features, binary_features):
    if "nr_years" in feature_list:
        feature_list.remove("nr_years")
if "nr_years" not in continuous_features:
    continuous_features.append("nr_years")

# print(f'Binary Features: {binary_features}')
# print(f'Categorical Features: {categorical_features}')
# print(f'Continuous Features: {continuous_features}')

# One-hot encode the categorical columns as 0/1 integer indicator columns.
# NOTE(review): binary and continuous columns are passed through unencoded; if
# any of them holds strings, the scaling step in PFA.fit will presumably reject
# them - confirm the dtypes.
df = pd.get_dummies(df, columns=categorical_features, dtype="int")

class PFA:
    """Select a subset of input columns that represent the whole feature set.

    The input is standardized, a PCA with ``q`` components is fitted, and the
    rows of the transposed component matrix (one row per input column) are
    clustered with KMeans into ``n_features`` clusters. For every cluster, the
    input column whose row lies closest to the cluster center is selected.

    Parameters
    ----------
    n_features : int
        Number of KMeans clusters, i.e. the maximum number of columns selected.
    q : int, optional
        Number of PCA components. When falsy (the default), the largest value
        PCA accepts for the data, ``min(n_samples, n_columns)``, is used.
    random_state : int or None, optional
        Seed forwarded to PCA and KMeans. ``None`` (the default) keeps the
        previous unseeded behavior.

    Attributes set by ``fit``
    -------------------------
    q_ : int
        Number of PCA components actually used.
    indices_ : list of int
        Positional indices of the selected input columns.
    features_ : ndarray
        The *standardized* input restricted to the selected columns.
    """

    def __init__(self, n_features, q=None, random_state=None):
        self.q = q
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X):
        """Run the selection on ``X`` (2-D array-like, rows are samples) and return ``self``."""
        X_scaled = StandardScaler().fit_transform(X)

        # Resolve the component count locally instead of overwriting self.q,
        # so the estimator can be refitted on data with a different width.
        # The default is capped by the number of rows, which PCA requires.
        q = self.q if self.q else min(X_scaled.shape)

        # The covariance computation is handled inside PCA.
        pca = PCA(n_components=q, random_state=self.random_state).fit(X_scaled)
        A_q = pca.components_.T  # one row per input column

        kmeans = KMeans(n_clusters=self.n_features, random_state=self.random_state).fit(A_q)
        clusters = kmeans.predict(A_q)

        # Distance from every row of A_q to every cluster center, in one call.
        center_distances = euclidean_distances(A_q, kmeans.cluster_centers_)

        # For each cluster keep the (column index, distance) pair with the
        # smallest distance to its own center. Clusters are recorded in order
        # of first appearance and ties keep the lowest column index.
        closest = {}
        for i, c in enumerate(clusters):
            dist = center_distances[i, c]
            if c not in closest or dist < closest[c][1]:
                closest[c] = (i, dist)

        self.q_ = q
        self.indices_ = [index for index, _ in closest.values()]
        self.features_ = X_scaled[:, self.indices_]
        return self
            
# Fit the PFA selector on the encoded frame, keeping three features.
pfa = PFA(n_features=3)
pfa.fit(df)

# Column positions (within df) of the features that were kept.
column_indices = pfa.indices_

# Matrix restricted to the kept features, printed for inspection.
x = pfa.features_
print(x)

In [None]:
# `df` is expected to be the pre-processed frame (after one-hot encoding), so
# its columns include the dummy variables created for categorical features.
original_columns = df.columns.tolist()

# Translate the positional indices chosen by PFA into column labels.
selected_feature_names = list(map(original_columns.__getitem__, column_indices))

print("Selected Feature Names:", selected_feature_names)
