In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

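- **age**:年紀
- **bp**:血壓
- **sg**:比重
- **al**:白蛋白
- **su**:糖分
- **rbc**:紅血球
- **pc**:膿細胞
- **pcc**:膿細胞塊
- **ba**:細菌
- **bgr**:隨機血糖
- **bu**:血尿素
- **sc**:血清肌酐
- **sod**:鈉
- **pot**:鉀
- **hemo**:血紅蛋白
- **pcv**:血球容積比(填充細胞體積)
- **wbcc**:白血球數量
- **rbcc**:紅血球數量
- **htn**:高血壓
- **dm**:糖尿病
- **cad**:冠狀動脈疾病
- **appet**:食慾
- **pe**:足部水腫
- **ane**:貧血
- **class**:分類標籤(`ckd` 表示慢性腎臟病,為預測目標)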

In [28]:
def read_dataset(fname):
    """Load the kidney-disease CSV and return it as an all-numeric DataFrame.

    Processing steps, in order:

    1. Read ``fname`` with its first column as the index.
    2. Drop the columns this analysis does not use.
    3. Encode ``htn``, ``dm``, ``cad``, ``pe`` and ``ane`` as 1 where the cell
       is ``'yes'`` and 0 otherwise (missing cells therefore become 0).
    4. Encode ``class`` as 1 where the cell is ``'ckd'`` and 0 otherwise.
    5. Encode ``appet`` as integer category codes: categories are numbered in
       sorted order starting at 0, and a missing value gets its own code
       equal to the number of observed categories.
    6. Replace every remaining missing value with 0. Note that this also
       applies to numeric measurements such as ``bp``.

    Text labels are compared after stripping surrounding whitespace, so
    cells such as ``' yes'`` or ``'ckd\\t'`` are still recognised.

    Parameters
    ----------
    fname
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns that remain after the drop, with no missing values.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the file.
    """
    unused_columns = ['al', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc',
                      'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']
    yes_no_columns = ['htn', 'dm', 'cad', 'pe', 'ane']

    def is_label(column, label):
        # Non-string cells (i.e. missing values) never match and encode as 0.
        matches = column.map(
            lambda cell: isinstance(cell, str) and cell.strip() == label)
        return matches.astype(bool).astype('int')

    # The first CSV column is used as the row index.
    data = pd.read_csv(fname, index_col=0)
    data = data.drop(columns=unused_columns)

    # Binary flags and the target label.
    for name in yes_no_columns:
        data[name] = is_label(data[name], 'yes')
    data['class'] = is_label(data['class'], 'ckd')

    # Appetite: sorted category codes, so the numbering does not depend on
    # which value happens to appear first in the file.
    appetite = data['appet'].map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell)
    codes, categories = pd.factorize(appetite, sort=True)
    # factorize marks missing values with -1; give them a dedicated code
    # placed after the real categories instead.
    data['appet'] = np.where(codes == -1, len(categories), codes)

    # Remaining gaps (numeric columns) are filled with zero.
    return data.fillna(0)

# Build the encoded table from the CKD CSV and preview its leading rows.
train = read_dataset(fname='chronic_kidney_disease.csv')
train.head(n=20)

Unnamed: 0_level_0,age,bp,sg,su,htn,dm,cad,appet,pe,ane,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,48.0,80.0,1.02,0.0,1,1,0,0,0,0,1
2,7.0,50.0,1.02,0.0,0,0,0,0,0,0,1
3,62.0,80.0,1.01,3.0,0,1,0,1,0,1,1
4,48.0,70.0,1.005,0.0,1,0,0,1,1,1,1
5,51.0,80.0,1.01,0.0,0,0,0,0,0,0,1
6,60.0,90.0,1.015,0.0,1,1,0,0,1,0,1
7,68.0,70.0,1.01,0.0,0,0,0,0,0,0,1
8,24.0,0.0,1.015,4.0,0,1,0,0,1,0,1
9,52.0,100.0,1.015,0.0,1,1,0,0,0,1,1
10,53.0,90.0,1.02,0.0,1,1,0,1,0,1,1


In [29]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = train['class'].values
X = train.drop(['class'], axis=1).values

# Hold out 20% of the rows for validation. A fixed random_state pins the
# shuffle, so the split and every score computed from it are the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print('train dataset: {0}; test dataset: {1}'.format(
    X_train.shape, X_test.shape))

train dataset: (320, 10); test dataset: (80, 10)


In [30]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with no depth or impurity limits.
# scikit-learn permutes the candidate features while searching for a split,
# so random_state is fixed to make the fitted tree and its scores repeatable.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on versus the held-out rows.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))

train score: 1.0; test score: 0.8875


In [31]:
# Pre-pruned tree using the entropy criterion.
# NOTE(review): min_impurity_split was deprecated in scikit-learn 0.19 and
# removed in 1.0, so this cell requires scikit-learn < 1.0. The supported
# replacement, min_impurity_decrease, thresholds the impurity *decrease* of a
# split rather than the impurity of the node, so the value below cannot be
# carried over unchanged and has to be re-tuned when migrating.
# The threshold presumably came from an earlier parameter search - TODO confirm.
clf = DecisionTreeClassifier(
    criterion='entropy',
    min_impurity_split=0.5306122448979591,
    random_state=42,  # fixed seed so the fitted tree and scores are repeatable
)
clf.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on versus the held-out rows.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))

from sklearn import tree

# Write the fitted tree to ckd.dot in Graphviz DOT format.
# Column names label the nodes instead of positional X[i] indices; this relies
# on X having been built from `train` with only the 'class' column removed.
feature_names = train.columns.drop('class').tolist()

# export_graphviz returns None when out_file is given, so its result is
# deliberately not bound to any name (in particular not to the file handle).
with open("ckd.dot", 'w') as dot_file:
    tree.export_graphviz(clf, out_file=dot_file, feature_names=feature_names)

train score: 0.94375; test score: 0.925


