In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch

In [2]:
df = pd.read_excel('hepatitis1.xlsx',  na_values="?" )
df = df.replace(to_replace = ['yes','no'],value = ['1','0'])

In [3]:
#Categorical
df['Steroid'].fillna(df.groupby('CLASS')['Steroid'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Fatique'].fillna(df.groupby('CLASS')['Fatique'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Malaise'].fillna(df.groupby('CLASS')['Malaise'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Anorexia'].fillna(df.groupby('CLASS')['Anorexia'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Liver Big'].fillna(df.groupby('CLASS')['Liver Big'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Liver Firm'].fillna(df.groupby('CLASS')['Liver Firm'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Spleen Palpable'].fillna(df.groupby('CLASS')['Spleen Palpable'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Speiders'].fillna(df.groupby('CLASS')['Speiders'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Ascites'].fillna(df.groupby('CLASS')['Ascites'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)
df['Varices'].fillna(df.groupby('CLASS')['Varices'].transform(lambda x: pd.Series.mode(x)[0]), inplace = True)

#Numeric
df["Bilirubin"] = df["Bilirubin"].fillna(df.groupby('CLASS')['Bilirubin'].transform('mean'))
df["Alk Phosphate"] = df["Alk Phosphate"].fillna(df.groupby('CLASS')['Alk Phosphate'].transform('mean'))
df["SGOT"] = df["SGOT"].fillna(df.groupby('CLASS')['SGOT'].transform('mean'))
df["Albumin"] = df["Albumin"].fillna(df.groupby('CLASS')['Albumin'].transform('mean'))
df["Protime"] = df["Protime"].fillna(df.groupby('CLASS')['Protime'].transform('mean'))


In [5]:
#Drop Number & Class
x = df.drop(columns=['#','CLASS'])

scaler = StandardScaler()
#scaler = MinMaxScaler()
x[x.columns] = scaler.fit_transform(x[x.columns])

In [6]:
#Clustering using K-means
kmeans = KMeans(n_clusters=2)
centroid_collect=[]
for i in range(0,10):
    kmeans.fit(x)
    centroids  = kmeans.cluster_centers_
    centroid_collect.append(centroids)

collect=np.array(centroid_collect)

print(collect)
print(collect.shape)

feature1=collect[:,0,:]
feature2=collect[:,1,:]

mean1=np.mean(feature1, axis=0)
mean2=np.mean(feature2, axis=0)
print(mean1)
print(mean2)


[[[ 0.28627823  0.13025759 -0.23368433  0.27993653 -0.69372118
   -0.72963127 -0.46053532 -0.1440881  -0.37843782 -0.45927933
   -0.91573593 -0.68044853 -0.58506348  0.62754839  0.58363409
    0.36298633 -0.81198576 -0.63486821  0.59958844]
  [-0.16193516 -0.07368106  0.13218507 -0.15834794  0.39240794
    0.41272072  0.26050483  0.08150438  0.21406584  0.25979437
    0.51799204  0.38490018  0.330945   -0.35497687 -0.33013646
   -0.2053256   0.45930508  0.35911737 -0.33916114]]

 [[ 0.28627823  0.13025759 -0.23368433  0.27993653 -0.69372118
   -0.72963127 -0.46053532 -0.1440881  -0.37843782 -0.45927933
   -0.91573593 -0.68044853 -0.58506348  0.62754839  0.58363409
    0.36298633 -0.81198576 -0.63486821  0.59958844]
  [-0.16193516 -0.07368106  0.13218507 -0.15834794  0.39240794
    0.41272072  0.26050483  0.08150438  0.21406584  0.25979437
    0.51799204  0.38490018  0.330945   -0.35497687 -0.33013646
   -0.2053256   0.45930508  0.35911737 -0.33916114]]

 [[-0.18053288 -0.07097091  0.14

In [7]:
newcentroid = np.concatenate((mean1, mean2))
newcentroid=newcentroid.reshape(2,19)
print(newcentroid)

kmeans.fit(x , newcentroid)
centroids2  = kmeans.cluster_centers_
print(centroids2)
clusters=kmeans.predict(x)


[[ 0.01788883  0.00517486 -0.00981747  0.02492354 -0.03838155 -0.03611793
  -0.02493616 -0.00805646 -0.01983488 -0.03417565 -0.04945344 -0.04756526
  -0.04118131  0.04621276  0.039785    0.02557112 -0.05209336 -0.04493741
   0.03543447]
 [ 0.11223584  0.04864148 -0.09282807  0.10171336 -0.26268378 -0.28011959
  -0.17109366 -0.0524022  -0.13957907 -0.16599537 -0.34135915 -0.24870195
  -0.21358278  0.22814846  0.21630015  0.13224918 -0.30452119 -0.23664887
   0.2203822 ]]
[[ 0.28627823  0.13025759 -0.23368433  0.27993653 -0.69372118 -0.72963127
  -0.46053532 -0.1440881  -0.37843782 -0.45927933 -0.91573593 -0.68044853
  -0.58506348  0.62754839  0.58363409  0.36298633 -0.81198576 -0.63486821
   0.59958844]
 [-0.16193516 -0.07368106  0.13218507 -0.15834794  0.39240794  0.41272072
   0.26050483  0.08150438  0.21406584  0.25979437  0.51799204  0.38490018
   0.330945   -0.35497687 -0.33013646 -0.2053256   0.45930508  0.35911737
  -0.33916114]]


In [8]:
#Class
y = df['CLASS'].replace(to_replace = ['Live','Die'],value = ['0','1'])
true_y = y.to_numpy().astype(np.int)

#Cluster analysis
print("Actual Class")
print(clusters)
print("Predicted Class")
print(true_y)
print ("Accuracy score:" , (accuracy_score(clusters,true_y))*100,"%")
print ("Error ratio:" , (1-accuracy_score(clusters,true_y))*100,"%")
print(classification_report(clusters,true_y))

Actual Class
[1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1
 0 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0
 1 1 0 1 0 0 0]
Predicted Class
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0
 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 1
 0 0 1 0 0 0 1]
Accuracy score: 20.64516129032258 %
Error ratio: 79.35483870967742 %
              precision    recall  f1-score   support

           0       0.23      0.50      0.31        56
           1       0.12      0.04      0.06        99

    accuracy                           0.21       155
   macro avg       0.18      0.27      0.19       15