In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score

In [2]:
df= pd.read_csv('hepatitis.data') 
df.head(5)

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,SCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [3]:
print(df.isnull().sum())  

Class               0
 AGE                0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
 ANOREXIA           1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
SCITES              5
VARICES             5
BILIRUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64


In [4]:
df.dtypes

Class                int64
 AGE                 int64
SEX                  int64
STEROID            float64
ANTIVIRALS           int64
FATIGUE            float64
MALAISE            float64
 ANOREXIA          float64
LIVER BIG          float64
LIVER FIRM         float64
SPLEEN PALPABLE    float64
SPIDERS            float64
SCITES             float64
VARICES            float64
BILIRUBIN          float64
ALK PHOSPHATE      float64
SGOT               float64
ALBUMIN            float64
PROTIME            float64
HISTOLOGY            int64
dtype: object

In [5]:
#heandle missing value dengan mean
df.fillna(df.select_dtypes(include='number').mean().iloc[0], inplace=True)

print(df.isnull().sum())


Class              0
 AGE               0
SEX                0
STEROID            0
ANTIVIRALS         0
FATIGUE            0
MALAISE            0
 ANOREXIA          0
LIVER BIG          0
LIVER FIRM         0
SPLEEN PALPABLE    0
SPIDERS            0
SCITES             0
VARICES            0
BILIRUBIN          0
ALK PHOSPHATE      0
SGOT               0
ALBUMIN            0
PROTIME            0
HISTOLOGY          0
dtype: int64


In [6]:
# drop digunakan untuk memotong data kolom
df.drop('PROTIME', axis=1, inplace=True)
df.head(2)

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,SCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,HISTOLOGY
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,1


In [7]:
kmeans = KMeans(n_clusters = 4, random_state=0, init="random")
kmeans.fit(df)                                         #melatih data df
kmeans.labels_

array([0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 3, 1, 0, 3, 3,
       0, 0, 3, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 2, 1, 0,
       3, 0, 0, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0,
       1, 0, 3, 1, 0, 1, 3, 0, 3, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 0, 0,
       0])

In [8]:
predict =kmeans.predict(df)
predict

array([0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 3, 1, 0, 3, 3,
       0, 0, 3, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 2, 1, 0,
       3, 0, 0, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0,
       1, 0, 3, 1, 0, 1, 3, 0, 3, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 0, 0,
       0])

In [9]:
df['cluster']=predict
df.head(5)

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,SCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,HISTOLOGY,cluster
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,1,0
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,1,3
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,1,0
3,2,31,1,1.793548,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,1,0
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.793548,200.0,4.0,1,1


In [10]:
db_index = davies_bouldin_score(df, predict)
print(db_index)

0.7532410027410833


In [11]:
from sklearn.metrics import silhouette_score
K = range (2,15)
for num_clusters in K:
    
    #inisalisai kmeans
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50)
    kmeans.fit(df)
    kluster = kmeans.labels_
    
    #silhouette score
    silhouette_rata2 = silhouette_score(df, kluster)
    print ("for n_clusters={0}, the silhouette score is {1}".format (num_clusters,  silhouette_rata2))

for n_clusters=2, the silhouette score is 0.5905987771312794
for n_clusters=3, the silhouette score is 0.4796485326541399
for n_clusters=4, the silhouette score is 0.4523696913642508
for n_clusters=5, the silhouette score is 0.3969477090129178
for n_clusters=6, the silhouette score is 0.40298762418530826
for n_clusters=7, the silhouette score is 0.4243078092925822
for n_clusters=8, the silhouette score is 0.4160159407068607
for n_clusters=9, the silhouette score is 0.4164780925340256
for n_clusters=10, the silhouette score is 0.4156919652615231
for n_clusters=11, the silhouette score is 0.37771457240961276
for n_clusters=12, the silhouette score is 0.3569717178408401
for n_clusters=13, the silhouette score is 0.3591060172544677
for n_clusters=14, the silhouette score is 0.3390932693762252
