In [1]:
import pandas as pd
import numpy as np
from kpplus import KPrototypes_plus
from kmodes.kprototypes import KPrototypes

In [2]:
# Load and clean the processed Cleveland data set.
# Column names are supplied explicitly via `names=`; the trailing `num` column
# is discarded because it is not used as a clustering feature in this notebook.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]

df = (
    # '?' is the file's missing-value marker: parsing it as NaN at read time
    # lets `ca` and `thal` load as numeric columns instead of strings.
    pd.read_csv('processed.cleveland.csv', names=columns, na_values='?')
    .drop(columns='num')
    .assign(
        # Human-readable labels for the two coded columns that are relabelled.
        sex=lambda frame: frame['sex'].replace({1: 'male', 0: 'female'}),
        cp=lambda frame: frame['cp'].replace({
            1: 'typical angina',
            2: 'atypical angina',
            3: 'non-anginal pain',
            4: 'asymptomatic',
        }).astype('category'),
    )
    # Drop every row that has at least one missing value.
    .dropna(axis=0)
    # Explicit cast so these two columns are float64 regardless of how they parsed.
    .astype({'ca': 'float64', 'thal': 'float64'})
)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63.0,male,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,male,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,male,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,male,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,female,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


In [3]:
# Inspect the per-column dtypes of the cleaned frame.
df.dtypes

age          float64
sex           object
cp          category
trestbps     float64
chol         float64
fbs          float64
restecg      float64
thalach      float64
exang        float64
oldpeak      float64
slope        float64
ca           float64
thal         float64
dtype: object

In [4]:
# Count missing values per column; every count should be 0 because rows
# containing NaN were dropped in the loading cell.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

In [5]:
# Bootstrap the cleaned frame to 15x its row count by sampling rows with
# replacement (presumably to give the timing cells a larger workload).
# `random_state` pins the draw so the resampled frame is identical after a
# kernel restart; without it every run clusters a different data set.
# NOTE: this cell reassigns `df`, so re-running it without first re-running
# the loading cell multiplies the row count by 15 again.
df = df.sample(frac=15, replace=True, random_state=42)
df.shape

(4455, 13)

In [6]:
# Fit KPrototypes_plus with 4 clusters. The positional indices passed as
# `categorical` select the columns sex, cp, fbs, restecg, exang, slope, thal;
# the remaining columns are passed through as numeric features.
mdl = KPrototypes_plus(
    n_clusters=4,
    n_init=4,
    n_jobs=-1,
)
mdl.fit_predict(
    X=df,
    categorical=[1, 2, 5, 6, 8, 10, 12],
)

In [7]:
# Report the fitted model's attributes, one heading per attribute.
# Attributes are looked up lazily so each one is printed as soon as it is read.
for heading, attribute in (
    ('cluster labels: \n', 'labels_'),
    ('cluster centroids: \n', 'cluster_centroids_'),
    ('number of iterations: \n', 'n_iter_'),
    ('cost: \n', 'cost_'),
):
    print(heading, getattr(mdl, attribute))

cluster labels: 
 [3 3 3 ... 3 1 1]
cluster centroids: 
 [[52.7928870292887, 'male', 'asymptomatic', 129.84379358437937, 245.9100418410042, 0.0, 2.0, 163.9065550906555, 0.0, 0.7884937238493711, 1.0, 0.5857740585774058, 3.0], [52.38825324180015, 'male', 'asymptomatic', 126.64073226544622, 195.0045766590389, 0.0, 0.0, 149.82074752097634, 0.0, 1.100152555301297, 1.0, 0.532418001525553, 3.0], [56.02930832356389, 'male', 'asymptomatic', 132.5685814771395, 324.9706916764361, 0.0, 2.0, 152.1512309495897, 0.0, 0.9971864009378658, 1.0, 0.794841735052755, 3.0], [59.55892648774796, 'male', 'asymptomatic', 137.91015169194867, 259.80396732788796, 0.0, 2.0, 122.94399066511085, 1.0, 1.5824970828471423, 2.0, 0.9358226371061844, 7.0]]
number of iterations: 
 14
cost: 
 5979433.184415547


In [8]:
%%time
for _ in range(20):
    k_prototype_plus = KPrototypes_plus(n_clusters = 4, n_init = 4, n_jobs = -1)
    k_prototype_plus.fit_predict(X = df, categorical = [1,2,5,6,8,10,12])

Wall time: 58.8 s


In [9]:
%%time
for _ in range(20):    
    k_prototype_normal = KPrototypes(n_clusters = 4, n_init = 4, n_jobs = -1, init = 'Cao')
    k_prototype_normal.fit_predict(X = df, categorical = [1,2,5,6,8,10,12])

Wall time: 2min 36s
