In [19]:
import pandas as pd
csv_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data"

headers = ['Class','AGE','SEX','STEROID','ANTIVIRALS','FATIGUE',
           'MALAISE','ANOREXIA','LIVER BIG','LIVER FIRM','SPLEEN PALPABLE',
           'SPIDERS','ASCITES','VARICES','BILIRUBIN','ALK PHOSPHATE','SGOT',
           'ALBUMIN','PROTIME','HISTOLOGY']

df = pd.read_csv(csv_path, names = headers)

df.head()

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1


In [20]:
df[['STEROID','ANTIVIRALS', 'MALAISE']]

Unnamed: 0,STEROID,ANTIVIRALS,MALAISE
0,1,2,2
1,1,2,2
2,2,2,2
3,?,1,2
4,2,2,2
...,...,...,...
150,2,2,1
151,2,2,2
152,1,2,1
153,1,2,2


In [23]:
# cek type data
df.dtypes

Class               int64
AGE                 int64
SEX                 int64
STEROID            object
ANTIVIRALS          int64
FATIGUE            object
MALAISE            object
ANOREXIA           object
LIVER BIG          object
LIVER FIRM         object
SPLEEN PALPABLE    object
SPIDERS            object
ASCITES            object
VARICES            object
BILIRUBIN          object
ALK PHOSPHATE      object
SGOT               object
ALBUMIN            object
PROTIME            object
HISTOLOGY           int64
dtype: object

In [24]:
# replace missing value "?" menjadi NaN
import numpy as np
df.replace("?", np.nan, inplace = True)
df.head()

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1.0,2,2,2,2,1,2,2,2,2,2,1.0,85.0,18,4.0,,1
1,2,50,1,1.0,2,1,2,2,1,2,2,2,2,2,0.9,135.0,42,3.5,,1
2,2,78,1,2.0,2,1,2,2,2,2,2,2,2,2,0.7,96.0,32,4.0,,1
3,2,31,1,,1,2,2,2,2,2,2,2,2,2,0.7,46.0,52,4.0,80.0,1
4,2,34,1,2.0,2,2,2,2,2,2,2,2,2,2,1.0,,200,4.0,,1


In [25]:
missing_data = df.isnull()
missing_data.head()

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False


In [27]:
df.isnull().sum()

Class               0
AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64

In [28]:
# replace dengan rata-rata
avg_bili = df["BILIRUBIN"].astype("float").mean(axis=0)
df["BILIRUBIN"].replace(np.nan, avg_bili, inplace=True)
avg_alk = df["ALK PHOSPHATE"].astype("float").mean(axis=0)
df["ALK PHOSPHATE"].replace(np.nan, avg_alk, inplace=True)
avg_sgot = df["SGOT"].astype("float").mean(axis=0)
df["SGOT"].replace(np.nan, avg_sgot, inplace=True)
avg_albu = df["ALBUMIN"].astype("float").mean(axis=0)
df["ALBUMIN"].replace(np.nan, avg_albu, inplace=True)
avg_protime = df["PROTIME"].astype("float").mean(axis=0)
df["PROTIME"].replace(np.nan,avg_protime,inplace = True)

In [29]:
# replace dengan frekuensi terbanyak
VAL_STEROID = df["STEROID"].value_counts().idxmax()
df["STEROID"].replace(np.nan,VAL_STEROID,inplace = True)
VAL_FATI = df['FATIGUE'].value_counts().idxmax()
df["FATIGUE"].replace(np.nan, VAL_FATI, inplace=True)
VAL_MAL=df['MALAISE'].value_counts().idxmax()
df["MALAISE"].replace(np.nan, VAL_MAL, inplace=True)
VAL_ANO=df['ANOREXIA'].value_counts().idxmax()
df["ANOREXIA"].replace(np.nan, VAL_ANO, inplace=True)
VAL_LIVB=df['LIVER BIG'].value_counts().idxmax()
df["LIVER BIG"].replace(np.nan, VAL_LIVB, inplace=True)
VAL_LIVF=df['LIVER FIRM'].value_counts().idxmax()
df["LIVER FIRM"].replace(np.nan, VAL_LIVF, inplace=True)
VAL_SA=df['SPLEEN PALPABLE'].value_counts().idxmax()
df["SPLEEN PALPABLE"].replace(np.nan, VAL_SA, inplace=True)
VAL_SP=df['SPIDERS'].value_counts().idxmax()
df["SPIDERS"].replace(np.nan, VAL_SP, inplace=True)
VAL_ASC=df['ASCITES'].value_counts().idxmax()
df["ASCITES"].replace(np.nan, VAL_ASC, inplace=True)
VAL_VARI=df['VARICES'].value_counts().idxmax()
df["VARICES"].replace(np.nan, VAL_VARI, inplace=True)

In [30]:
# setelah menangai missing value, cek apakah masih ada missing value
df.isnull().sum()

Class              0
AGE                0
SEX                0
STEROID            0
ANTIVIRALS         0
FATIGUE            0
MALAISE            0
ANOREXIA           0
LIVER BIG          0
LIVER FIRM         0
SPLEEN PALPABLE    0
SPIDERS            0
ASCITES            0
VARICES            0
BILIRUBIN          0
ALK PHOSPHATE      0
SGOT               0
ALBUMIN            0
PROTIME            0
HISTOLOGY          0
dtype: int64

In [31]:
# cek type data
df.dtypes

Class               int64
AGE                 int64
SEX                 int64
STEROID            object
ANTIVIRALS          int64
FATIGUE            object
MALAISE            object
ANOREXIA           object
LIVER BIG          object
LIVER FIRM         object
SPLEEN PALPABLE    object
SPIDERS            object
ASCITES            object
VARICES            object
BILIRUBIN          object
ALK PHOSPHATE      object
SGOT               object
ALBUMIN            object
PROTIME            object
HISTOLOGY           int64
dtype: object

In [32]:
# memperbaiki format type data
df[["BILIRUBIN", "ALK PHOSPHATE", "SGOT", "ALBUMIN", "PROTIME"]] = df[["BILIRUBIN", "ALK PHOSPHATE", "SGOT", "ALBUMIN", "PROTIME"]].astype("float")
df[['STEROID', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES']] = df[['STEROID', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES']].astype ("int")
df.dtypes

Class                int64
AGE                  int64
SEX                  int64
STEROID              int64
ANTIVIRALS           int64
FATIGUE              int64
MALAISE              int64
ANOREXIA             int64
LIVER BIG            int64
LIVER FIRM           int64
SPLEEN PALPABLE      int64
SPIDERS              int64
ASCITES              int64
VARICES              int64
BILIRUBIN          float64
ALK PHOSPHATE      float64
SGOT               float64
ALBUMIN            float64
PROTIME            float64
HISTOLOGY            int64
dtype: object

In [33]:
# normalisasi
df['AGE'] = (df['AGE']-df['AGE'].min())/(df['AGE'].max()-df['AGE'].min())*(0.9-0.1)+0.1
df['BILIRUBIN'] = (df['BILIRUBIN']-df['BILIRUBIN'].min())/(df['BILIRUBIN'].max()-df['BILIRUBIN'].min())*(0.9-0.1)+0.1
df['ALK PHOSPHATE'] = (df['ALK PHOSPHATE']-df['ALK PHOSPHATE'].min())/(df['ALK PHOSPHATE'].max()-df['ALK PHOSPHATE'].min())*(0.9-0.1)+0.1
df['SGOT'] = (df['SGOT']-df['SGOT'].min())/(df['SGOT'].max()-df['SGOT'].min())*(0.9-0.1)+0.1
df['ALBUMIN'] = (df['ALBUMIN']-df['ALBUMIN'].min())/(df['ALBUMIN'].max()-df['ALBUMIN'].min())*(0.9-0.1)+0.1
df['PROTIME'] = (df['PROTIME']-df['PROTIME'].min())/(df['PROTIME'].max()-df['PROTIME'].min())*(0.9-0.1)+0.1

In [34]:
# rata-rata sgot
df["SGOT"].mean(axis=0)

0.19071803121148176

In [35]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(df[['SEX', 'FATIGUE', 'SPIDERS', 'ANOREXIA', 'LIVER BIG']], df['Class'])
predicted = model.predict([[2,2,2,1,1]])
print ("\nPredicted Value:", predicted)
probabilitas = model.predict_proba([[2,2,2,1,1]])
print("Probabilitas:", probabilitas)


Predicted Value: [2]
Probabilitas: [[0.15276581 0.84723419]]


In [38]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(df[['SEX', 'FATIGUE', 'SPIDERS', 'ANOREXIA', 'LIVER BIG']], df['Class'])
print("\nHasil prediksi [2,2,2,1,1] :",neigh.predict([[2,2,2,1,1]]))


Hasil prediksi [2,2,2,1,1] : [2]
