In [1]:
from kmodes import kprototypes
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics.pairwise import cosine_distances,cosine_similarity,pairwise_distances
from scipy.spatial.distance import euclidean, hamming, cosine,cdist
from sklearn.preprocessing import LabelEncoder
import time
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

In [2]:
df = pd.read_csv('/content/drive/MyDrive/sample_data/BankChurners.csv')

In [3]:
df.shape

(10127, 24)

In [4]:
df.columns

Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')

In [5]:
df.drop([i for i in df.columns if 'unnamed' in i.lower()],axis=1,inplace = True)

In [6]:
categorical_cols = [ i for i in df.columns if len(df[i].unique())<=10]
continuous_cols = [ i for i in df.columns if i not in categorical_cols]
categorical_cols_index = [df.columns.tolist().index(i) for i in categorical_cols]

In [7]:
rows =[0,1]
row_a = np.array(df[categorical_cols].iloc[rows[0]])
row_b = np.array(df[categorical_cols].iloc[rows[1]])

In [8]:
hamming(row_a,row_b)

0.7

In [9]:
rows =[0,1]
row_a = np.array(df[continuous_cols].iloc[rows[0]])
row_b = np.array(df[continuous_cols].iloc[rows[1]])

In [10]:
euclidean(row_a,row_b)

6336.167091254223

In [15]:
cosine(row_a,row_b)

0.0020772267353964535

In [38]:
import numpy as np

def cosine_dissimilarity(P, Q):
    """
    Calculate the cosine dissimilarity between two vectors P and Q.
    Cosine dissimilarity is 1 minus the cosine similarity.

    Parameters:
    P (array-like): First input vector.
    Q (array-like): Second input vector.

    Returns:
    float: The cosine dissimilarity between the two vectors.
    """
    # Convert input vectors to numpy arrays (in case they are lists or other types)
    P = np.array(P)
    Q = np.array(Q)

    # Calculate cosine similarity
    cosine_similarity = np.dot(P, Q) / (np.linalg.norm(P) * np.linalg.norm(Q))

    # Cosine dissimilarity is 1 - cosine similarity
    cosine_dissimilarity = 1 - cosine_similarity

    return cosine_dissimilarity


In [39]:
model = kprototypes.KPrototypes(
    n_clusters= 4,
    init='Cao',
    n_jobs = -1,
    num_dissim=cosine_dissimilarity,
    verbose=2,
    random_state=101,
    n_init=4,
)

Initialization method and algorithm are deterministic. Setting n_init to 1.


In [40]:
cluster_index = model.fit_predict(df,categorical=categorical_cols_index)

Best run was number 1


In [41]:
cluster_index

array([2, 3, 1, ..., 2, 1, 0], dtype=uint16)

In [44]:
np.unique(cluster_index, return_counts=True)

(array([0, 1, 2, 3], dtype=uint16), array([3517, 3589, 1613, 1408]))

In [45]:
def calculate_mixed_dissimilarity(X, categorical_columns, numerical_columns):
    # Split categorical and numerical features
    X_cat = X[categorical_columns]
    X_num = X[numerical_columns]

    le = LabelEncoder()
    for col in range(X_cat.shape[1]):
        X_cat[categorical_columns[col]] = le.fit_transform(X_cat[categorical_columns[col]])

    num_distances = cdist(X_num, X_num, metric='cosine')
    cat_distances = cdist(X_cat, X_cat, metric='hamming')

    return cat_distances,num_distances,cat_distances+num_distances

In [48]:
sample_data = df.sample(n=int(0.25*df.shape[0]),random_state=101)
sample_data_index = sample_data.index.tolist()
print(len(sample_data_index))

2531


In [49]:
cat_matrix, num_matrix,distance_matrix = calculate_mixed_dissimilarity(sample_data, categorical_cols, continuous_cols)

In [50]:
cat_matrix.shape,num_matrix.shape,distance_matrix.shape

((2531, 2531), (2531, 2531), (2531, 2531))

In [51]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

sil_score = silhouette_score(distance_matrix, cluster_index[sample_data_index], metric="precomputed")
print(f"Silhouette Score: {sil_score:.4f}")

# Calculate the Davies-Bouldin Score using the precomputed distance matrix
db_score = davies_bouldin_score(distance_matrix, cluster_index[sample_data_index])
print(f"Davies-Bouldin Score: {db_score:.4f}")

# Calculate the Calinski-Harabasz Score using the precomputed distance matrix
ch_score = calinski_harabasz_score(distance_matrix, cluster_index[sample_data_index])
print(f"Calinski-Harabasz Score: {ch_score:.4f}")

Silhouette Score: 0.0237
Davies-Bouldin Score: 5.2224
Calinski-Harabasz Score: 132.7567


## Gower Matrix

In [68]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def gower_distance_df(X_df, Y_df=None):
    """
    Compute Gower's distance between two datasets (X_df, Y_df).

    Parameters:
        X_df : pandas DataFrame, shape (n_samples, n_features)
            First data matrix (numerical and categorical).
        Y_df : pandas DataFrame, shape (m_samples, n_features), optional
            Second data matrix. If None, pairwise distance with X_df is computed.

    Returns:
        dist : array-like, shape (n_samples, m_samples)
            Gower distance matrix.
    """
    # Separate numerical and categorical columns
    num_cols = X_df.select_dtypes(include=[np.number]).columns
    cat_cols = X_df.select_dtypes(exclude=[np.number]).columns

    # Function to compute Gower distance for numerical features
    def numerical_distance(a, b):
        # Add epsilon to avoid division by zero
        range_val = np.max([a, b]) - np.min([a, b])
        range_val = range_val if range_val > 0 else 1e-6  # Avoid zero range
        return np.abs(a - b) / range_val

    # Function to compute Gower distance for categorical features
    def categorical_distance(a, b):
        return 0 if a == b else 1

    # Function to compute Gower distance between two samples
    def gower_single(x, y):
        # Convert numpy arrays (rows) into pandas Series to index by column names
        x_series = pd.Series(x, index=X_df.columns)
        y_series = pd.Series(y, index=X_df.columns)

        total_distance = 0
        # Calculate numerical distance
        for col in num_cols:
            total_distance += numerical_distance(x_series[col], y_series[col])

        # Calculate categorical distance
        for col in cat_cols:
            total_distance += categorical_distance(x_series[col], y_series[col])

        # Normalize by the number of features
        total_distance /= len(num_cols) + len(cat_cols)
        return total_distance

    # Apply cdist to compute pairwise Gower distance
    dist_matrix = cdist(X_df.values, Y_df.values if Y_df is not None else X_df.values, metric=lambda u, v: gower_single(u, v))

    return dist_matrix

In [77]:
sample_data = df.sample(n=int(0.01*df.shape[0]),random_state=101)
sample_data_index = sample_data.index.tolist()
print(len(sample_data_index))

101


In [78]:
%%time
gower_dist = gower_distance_df(sample_data)

CPU times: user 4.08 s, sys: 7 ms, total: 4.09 s
Wall time: 4.1 s


In [79]:
gower_dist

array([[0.  , 0.85, 0.8 , ..., 0.9 , 0.75, 0.8 ],
       [0.85, 0.  , 0.75, ..., 0.75, 0.8 , 0.85],
       [0.8 , 0.75, 0.  , ..., 0.8 , 0.85, 0.85],
       ...,
       [0.9 , 0.75, 0.8 , ..., 0.  , 0.85, 0.65],
       [0.75, 0.8 , 0.85, ..., 0.85, 0.  , 0.85],
       [0.8 , 0.85, 0.85, ..., 0.65, 0.85, 0.  ]])