In [94]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
%config IPCompleter.greedy=True

### Description 
Cho dữ liệu chronic_kidney_disease.csv chứa thông tin của các bệnh nhân. Bộ dữ liệu này có thể được sử dụng để dự đoán bệnh thận mãn tính và nó được thu thập trong bệnh viện gần 2 tháng. 
We have the dataset **chronic_kidney_disease.csv**, which contains patient information. The data was collected in a hospital over a period of nearly 2 months and can be used to predict chronic kidney disease.  
### Data Information
The dataset is described and can be downloaded at: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease# 
* age - age
* bp - blood pressure
* sg - specific gravity
* al - albumin
* su - sugar
* rbc - red blood cells
* pc - pus cell
* pcc - pus cell clumps
* ba - bacteria
* bgr - blood glucose random
* bu - blood urea
* sc - serum creatinine
* sod - sodium
* pot - potassium
* hemo - hemoglobin
* pcv - packed cell volume
* wc - white blood cell count
* rc - red blood cell count
* htn - hypertension
* dm - diabetes mellitus
* cad - coronary artery disease
* appet - appetite
* pe - pedal edema
* ane - anemia
* class - class
### Requirement:
* Read dataset and do basic analysis.
* Choose a method to standardize the data and apply the standardization

In [95]:
# Load the raw CSV. Column labels are supplied explicitly via `names`,
# so the first line of the file is read as data rather than as a header.
dataset = pd.read_csv(
    "data/chronic_kidney_disease.csv",
    names=[
        "age", "bp", "sg",
        "al", "su", "rbc", "pc", "pcc", "ba",
        "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wc",
        "rc", "htn", "dm", "cad", "appet", "pe", "ane", "class",
    ],
)

In [96]:
# Summarise the freshly loaded frame: per-column non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      400 non-null object
bp       400 non-null object
sg       400 non-null object
al       400 non-null object
su       400 non-null object
rbc      400 non-null object
pc       400 non-null object
pcc      400 non-null object
ba       400 non-null object
bgr      400 non-null object
bu       400 non-null object
sc       400 non-null object
sod      400 non-null object
pot      400 non-null object
hemo     400 non-null object
pcv      400 non-null object
wc       400 non-null object
rc       400 non-null object
htn      400 non-null object
dm       400 non-null object
cad      400 non-null object
appet    400 non-null object
pe       400 non-null object
ane      400 non-null object
class    400 non-null object
dtypes: object(25)
memory usage: 78.2+ KB


In [97]:
# The file marks missing values with a literal "?"; swap every such cell
# for NaN so pandas recognises it as missing.
dataset = dataset.replace(to_replace="?", value=np.nan)

In [98]:
# Re-check the non-null counts now that "?" placeholders have become NaN.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      391 non-null object
bp       388 non-null object
sg       353 non-null object
al       354 non-null object
su       351 non-null object
rbc      248 non-null object
pc       335 non-null object
pcc      396 non-null object
ba       396 non-null object
bgr      356 non-null object
bu       381 non-null object
sc       383 non-null object
sod      313 non-null object
pot      312 non-null object
hemo     348 non-null object
pcv      329 non-null object
wc       294 non-null object
rc       269 non-null object
htn      398 non-null object
dm       398 non-null object
cad      398 non-null object
appet    399 non-null object
pe       399 non-null object
ane      399 non-null object
class    400 non-null object
dtypes: object(25)
memory usage: 78.2+ KB


In [99]:
# Preview the first ten rows of the first twelve columns.
dataset.iloc[:10, :12]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc
0,48,80.0,1.02,1,0,,normal,notpresent,notpresent,121.0,36,1.2
1,7,50.0,1.02,4,0,,normal,notpresent,notpresent,,18,0.8
2,62,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,53,1.8
3,48,70.0,1.01,4,0,normal,abnormal,present,notpresent,117.0,56,3.8
4,51,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,26,1.4
5,60,90.0,1.02,3,0,,,notpresent,notpresent,74.0,25,1.1
6,68,70.0,1.01,0,0,,normal,notpresent,notpresent,100.0,54,24.0
7,24,,1.02,2,4,normal,abnormal,notpresent,notpresent,410.0,31,1.1
8,52,100.0,1.02,3,0,normal,abnormal,present,notpresent,138.0,60,1.9
9,53,90.0,1.02,2,0,abnormal,abnormal,present,notpresent,70.0,107,7.2


In [100]:
# Preview the first ten rows of the remaining columns (positions 12-24).
dataset.iloc[:10, 12:25]

Unnamed: 0,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,,,15.4,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,,,11.3,38,6000.0,,no,no,no,good,no,no,ckd
2,,,9.6,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,111.0,2.5,11.2,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,,,11.6,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,142.0,3.2,12.2,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,104.0,4.0,12.4,36,,,no,no,no,good,no,no,ckd
7,,,12.4,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,,,10.8,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,114.0,3.7,9.5,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


### These columns hold numeric values but are stored as strings, so we convert them to a numeric data type

In [101]:
# Columns whose values are numeric but were read in as strings.
cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
# Convert column by column (the default axis) instead of row by row: with
# axis=1 a single unparseable cell would leave its whole row as strings.
# errors='coerce' turns anything unparseable into NaN; errors='ignore' is
# deprecated in recent pandas and silently hands back the original strings.
dataset[cols] = dataset[cols].apply(pd.to_numeric, errors='coerce')

In [102]:
# Confirm the dtypes after the numeric conversion.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      391 non-null float64
bp       388 non-null float64
sg       353 non-null float64
al       354 non-null float64
su       351 non-null float64
rbc      248 non-null object
pc       335 non-null object
pcc      396 non-null object
ba       396 non-null object
bgr      356 non-null float64
bu       381 non-null float64
sc       383 non-null float64
sod      313 non-null float64
pot      312 non-null float64
hemo     348 non-null float64
pcv      329 non-null float64
wc       294 non-null float64
rc       269 non-null float64
htn      398 non-null object
dm       398 non-null object
cad      398 non-null object
appet    399 non-null object
pe       399 non-null object
ane      399 non-null object
class    400 non-null object
dtypes: float64(14), object(11)
memory usage: 78.2+ KB


In [103]:
# Fill gaps in the numeric columns by linear interpolation (the pandas
# default). Only numeric columns are passed to interpolate(): calling it on a
# frame that still contains object (text) columns is deprecated in recent
# pandas. Text columns are left untouched, exactly as before.
# Leading NaNs have no earlier value to interpolate from and stay missing.
# NOTE(review): each row appears to be a separate patient, so values borrowed
# from neighbouring rows may not be meaningful - consider median imputation.
numeric_cols = dataset.select_dtypes(include="number").columns
dataset = dataset.copy()
dataset[numeric_cols] = dataset[numeric_cols].interpolate()

In [104]:
# Check how many missing values remain after interpolation.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      400 non-null float64
bp       400 non-null float64
sg       400 non-null float64
al       400 non-null float64
su       400 non-null float64
rbc      248 non-null object
pc       335 non-null object
pcc      396 non-null object
ba       396 non-null object
bgr      400 non-null float64
bu       400 non-null float64
sc       400 non-null float64
sod      397 non-null float64
pot      397 non-null float64
hemo     400 non-null float64
pcv      400 non-null float64
wc       400 non-null float64
rc       400 non-null float64
htn      398 non-null object
dm       398 non-null object
cad      398 non-null object
appet    399 non-null object
pe       399 non-null object
ane      399 non-null object
class    400 non-null object
dtypes: float64(14), object(11)
memory usage: 78.2+ KB


In [105]:
# Discard every row that still has at least one missing value
# (in the recorded run this shrinks the frame from 400 to 232 rows).
dataset = dataset.dropna(axis=0, how="any")

In [106]:
# Inspect the frame after dropping rows with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232 entries, 3 to 399
Data columns (total 25 columns):
age      232 non-null float64
bp       232 non-null float64
sg       232 non-null float64
al       232 non-null float64
su       232 non-null float64
rbc      232 non-null object
pc       232 non-null object
pcc      232 non-null object
ba       232 non-null object
bgr      232 non-null float64
bu       232 non-null float64
sc       232 non-null float64
sod      232 non-null float64
pot      232 non-null float64
hemo     232 non-null float64
pcv      232 non-null float64
wc       232 non-null float64
rc       232 non-null float64
htn      232 non-null object
dm       232 non-null object
cad      232 non-null object
appet    232 non-null object
pe       232 non-null object
ane      232 non-null object
class    232 non-null object
dtypes: float64(14), object(11)
memory usage: 47.1+ KB


### The columns "rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane" are categorical: they hold text values with no inherent order. We apply one-hot (dummy) encoding to convert them into numbers
### The column "class" is the target variable and contains text values, so we apply label encoding to convert it into numbers

In [107]:
# Unordered text features: expand each one into 0/1 indicator columns.
category_col = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
dataset = pd.get_dummies(data=dataset, columns=category_col)

# The target column holds text labels; map them to integer codes.
label_encode = preprocessing.LabelEncoder()
dataset['class'] = label_encode.fit_transform(dataset['class'])

In [108]:
# Preview the first ten rows of encoded columns 0-17.
dataset.iloc[:10, :18]

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,class,rbc_abnormal,rbc_normal,pc_abnormal
3,48.0,70.0,1.01,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,0,0,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,126.5,2.85,11.6,35.0,7300.0,4.6,0,0,1,0
7,24.0,85.0,1.02,2.0,4.0,410.0,31.0,1.1,107.333333,3.9,12.4,44.0,6900.0,5.0,0,0,1,1
8,52.0,100.0,1.02,3.0,0.0,138.0,60.0,1.9,110.666667,3.8,10.8,33.0,9600.0,4.0,0,0,1,1
9,53.0,90.0,1.02,2.0,0.0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,0,1,0,1
11,63.0,70.0,1.01,3.0,0.0,380.0,60.0,2.7,131.0,4.2,10.8,32.0,4500.0,3.8,0,1,0,1
14,68.0,80.0,1.01,3.0,2.0,157.0,90.0,4.1,130.0,6.4,5.6,16.0,11000.0,2.6,0,0,1,1
20,61.0,80.0,1.02,2.0,0.0,173.0,148.0,3.9,135.0,5.2,7.7,24.0,9200.0,3.2,0,1,0,1
22,48.0,80.0,1.03,4.0,0.0,95.0,163.0,7.7,136.0,3.8,9.8,32.0,6900.0,3.4,0,0,1,1
24,42.0,100.0,1.02,4.0,0.0,103.666667,50.0,1.4,129.0,4.0,11.1,39.0,8300.0,4.6,0,0,1,1


In [109]:
# Preview the first ten rows of encoded columns 18-29.
dataset.iloc[:10, 18:30]

Unnamed: 0,pc_normal,pcc_notpresent,pcc_present,ba_notpresent,ba_present,htn_no,htn_yes,dm_no,dm_yes,cad_no,cad_yes,appet_good
3,0,0,1,1,0,0,1,1,0,1,0,0
4,1,1,0,1,0,1,0,1,0,1,0,1
7,0,1,0,1,0,1,0,0,1,1,0,1
8,0,0,1,1,0,0,1,0,1,1,0,1
9,0,0,1,1,0,0,1,0,1,1,0,0
11,0,0,1,1,0,0,1,0,1,1,0,0
14,0,0,1,0,1,0,1,0,1,0,1,0
20,0,1,0,1,0,0,1,0,1,0,1,0
22,0,1,0,1,0,0,1,1,0,1,0,1
24,0,1,0,0,1,0,1,1,0,1,0,0


In [110]:
# Preview the first ten rows of the remaining encoded columns
# (a slice end past the last column is simply truncated).
dataset.iloc[:10, 30:40]

Unnamed: 0,appet_poor,pe_no,pe_yes,ane_no,ane_yes
3,1,0,1,0,1
4,0,1,0,1,0
7,0,0,1,1,0
8,0,1,0,0,1
9,1,1,0,0,1
11,1,0,1,1,0
14,1,0,1,1,0
20,1,0,1,0,1
22,0,1,0,0,1
24,1,1,0,1,0
