In [57]:
%matplotlib notebook
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer, LabelEncoder, StandardScaler, scale
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

data = pd.read_csv('ckd_data.csv',sep='\t')
data = data.replace('?','NaN')
Numdata = data.loc[0:,'age':'bp'].join(data.loc[0:,'bgr':'rbcc'])
Catdata = data.loc[0:,'sg':'ba'].join(data.loc[0:,'htn':'class'])
data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [58]:
for i in Catdata.columns:
    the_value = str(Catdata[i].mode().values[0])
    Catdata[i].replace('NaN',the_value,inplace = True)
    if i != 'class':
        dummy_data = pd.get_dummies(Catdata[i], prefix=i+"_", drop_first=True)
        Catdata = pd.concat([Catdata, dummy_data], axis=1)
        Catdata.drop(i, axis=1, inplace=True)
    elif i == 'class':
        dummy_data = pd.get_dummies(Catdata[i])
        dummy_data.drop("notckd", axis=1, inplace=True)
        dummy_data.rename(columns={"ckd": "class"}, inplace=True)
        Catdata.drop(i, axis=1, inplace=True)
        Catdata = pd.concat([Catdata, dummy_data], axis=1)

In [59]:
imp1 = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp1.fit(Numdata)
Numdata.loc[0:,:] = imp1.transform(Numdata)
data = Numdata.join(Catdata)
datas = Numdata.join(Catdata)
#datas.loc[:,'age':'ane'] = scale(data.loc[:,'age':'ane'])

In [60]:
data.head()

Unnamed: 0,age,bp,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,...,pc__normal,pcc__present,ba__present,htn__yes,dm__yes,cad__yes,appet__poor,pe__yes,ane__yes,class
0,48.0,80.0,121.0,36.0,1.2,137.528754,4.627244,15.4,44.0,7800.0,...,1,0,0,1,1,0,0,0,0,1
1,7.0,50.0,148.036517,18.0,0.8,137.528754,4.627244,11.3,38.0,6000.0,...,1,0,0,0,0,0,0,0,0,1
2,62.0,80.0,423.0,53.0,1.8,137.528754,4.627244,9.6,31.0,7500.0,...,1,0,0,0,1,0,1,0,1,1
3,48.0,70.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,...,0,1,0,1,0,0,1,1,1,1
4,51.0,80.0,106.0,26.0,1.4,137.528754,4.627244,11.6,35.0,7300.0,...,1,0,0,0,0,0,0,0,0,1


In [92]:
labels = data.loc[0:,['class']]
features = data.drop(['class'],axis=1,inplace=False)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 1)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).reshape(320)
y_test = np.array(y_test)

In [93]:
def fmeasure(predict,true):
    conf = confusion_matrix(true, predict)
    TP = conf[0][0]
    TN = conf[1][1]
    FN = conf[0][1]
    FP = conf[1][0]
    Pre = TP/(TP+FP)
    Rec = TP/(TP+FN)
    f_measure = 2*Pre*Rec/(Pre+Rec)
    return f_measure

# SVM(linear)

In [94]:
clf = SVC(kernel='linear')
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [95]:
pre_train = clf.predict(X_train)
pre_test = clf.predict(X_test)
f_measure_train = fmeasure(pre_train, y_train)
f_measure_test = fmeasure(pre_test, y_test)
print("Training f-measure is:", f_measure_train)
print("Testing f-measure is:", f_measure_test)

Training f-measure is: 0.9655172413793104
Testing f-measure is: 0.955223880597015


# SVM(RBF)

In [96]:
clf = SVC(kernel='rbf')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [97]:
pre_train = clf.predict(X_train)
pre_test = clf.predict(X_test)
f_measure_train = fmeasure(pre_train, y_train)
f_measure_test = fmeasure(pre_test, y_test)
print("Training f-measure is:", f_measure_train)
print("Testing f-measure is:", f_measure_test)

Training f-measure is: 1.0
Testing f-measure is: nan


  import sys


# Random Forest

In [98]:
RF = RandomForestClassifier()
RF.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [99]:
pre_train = RF.predict(X_train)
pre_test = RF.predict(X_test)
f_measure_train = fmeasure(pre_train, y_train)
f_measure_test = fmeasure(pre_test, y_test)
print("Training f-measure is:", f_measure_train)
print("Testing f-measure is:", f_measure_test)

Training f-measure is: 0.9957081545064378
Testing f-measure is: 0.9705882352941176
