In [1]:
from sklearn import tree
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(os.path.join("data/KidneyData.csv"))
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
df = df.dropna(how='any')

In [4]:
data = df["classification"].copy()

data_binary_encoded = pd.get_dummies(data)
data_binary_encoded.head()

Unnamed: 0,ckd,notckd
3,1,0
9,1,0
11,1,0
14,1,0
20,1,0


In [5]:
target = data_binary_encoded["ckd"]
target_names = ["negative", "positive"]

In [6]:
kidney_data = df.drop(columns=["classification", "id"], axis=1)

In [7]:
# kidney_data = kidney_data.dropna(how='any')

In [8]:
data1 = kidney_data.copy()

# data1 = kidney_data[["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]].copy()

data_binary_encoded1 = pd.get_dummies(data1, columns=["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"])
data_binary_encoded1.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,dm_no,dm_yes,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,1,0,1,0,0,1,0,1,0,1
9,53.0,90.0,1.02,2.0,0.0,70.0,107.0,7.2,114.0,3.7,...,0,1,1,0,0,1,1,0,0,1
11,63.0,70.0,1.01,3.0,0.0,380.0,60.0,2.7,131.0,4.2,...,0,1,1,0,0,1,0,1,1,0
14,68.0,80.0,1.01,3.0,2.0,157.0,90.0,4.1,130.0,6.4,...,0,1,0,1,0,1,0,1,1,0
20,61.0,80.0,1.015,2.0,0.0,173.0,148.0,3.9,135.0,5.2,...,0,1,0,1,0,1,0,1,0,1


In [9]:
data_binary_encoded1.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc', 'rbc_abnormal', 'rbc_normal', 'pc_abnormal',
       'pc_normal', 'pcc_notpresent', 'pcc_present', 'ba_notpresent',
       'ba_present', 'htn_no', 'htn_yes', 'dm_no', 'dm_yes', 'cad_no',
       'cad_yes', 'appet_good', 'appet_poor', 'pe_no', 'pe_yes', 'ane_no',
       'ane_yes'],
      dtype='object')

In [10]:
kidney_binary_data = data_binary_encoded1.drop(columns=['rbc_normal', 'pc_normal', 'pcc_present', 'ba_present', 'htn_yes', 'dm_yes','cad_yes', 'appet_poor', 'pe_yes', 'ane_yes'])
kidney_binary_data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc', 'rbc_abnormal', 'pc_abnormal', 'pcc_notpresent',
       'ba_notpresent', 'htn_no', 'dm_no', 'cad_no', 'appet_good', 'pe_no',
       'ane_no'],
      dtype='object')

In [11]:
feature_names = kidney_binary_data.columns

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(kidney_binary_data, target, random_state=630)

# Decision Tree:

In [13]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

# Random Forest:

In [14]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

1.0

In [15]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.15465450931230557, 'al'),
 (0.15318564596788212, 'hemo'),
 (0.13500221282906694, 'pcv'),
 (0.1343730469932806, 'sc'),
 (0.12052029775663148, 'rc'),
 (0.06496705456860692, 'bu'),
 (0.05639548747302218, 'htn_no'),
 (0.05301237549106996, 'sg'),
 (0.022101054649446943, 'bgr'),
 (0.021116930787408356, 'pc_abnormal'),
 (0.02058771867130072, 'dm_no'),
 (0.014864825411759337, 'sod'),
 (0.009434322109647968, 'pe_no'),
 (0.009037748293702534, 'rbc_abnormal'),
 (0.005444204248847402, 'wc'),
 (0.005149864418718055, 'su'),
 (0.005071331954336809, 'bp'),
 (0.00479734255560068, 'appet_good'),
 (0.00317711802463817, 'pot'),
 (0.002125223177888743, 'age'),
 (0.0020486287107838964, 'cad_no'),
 (0.0015994131526639753, 'ane_no'),
 (0.0013336434413907022, 'ba_notpresent'),
 (0.0, 'pcc_notpresent')]

# Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [17]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [18]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [19]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [1 1 1 0 0 1 1 0 0 0]
First 10 Actual labels: [1, 1, 1, 0, 0, 1, 1, 0, 0, 0]


In [20]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,0
4,0,0
5,1,1
6,1,1
7,0,0
8,0,0
9,0,0


# SVC

In [21]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [22]:
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.975


In [23]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      0.96      0.98        28
    positive       0.92      1.00      0.96        12

    accuracy                           0.97        40
   macro avg       0.96      0.98      0.97        40
weighted avg       0.98      0.97      0.98        40



# One-Hot Encoding

In [24]:
from tensorflow.keras.utils import to_categorical

y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.

# Scaling the Data:

In [25]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

In [26]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
from tensorflow.keras.utils import to_categorical

In [28]:
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=24, activation='relu', input_dim=24))
model.add(Dense(units=2, activation='softmax'))

In [34]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 650
Trainable params: 650
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [36]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 118 samples
Epoch 1/100
118/118 - 1s - loss: 0.5138 - accuracy: 0.7458
Epoch 2/100
118/118 - 0s - loss: 0.4516 - accuracy: 0.8559
Epoch 3/100
118/118 - 0s - loss: 0.3991 - accuracy: 0.9322
Epoch 4/100
118/118 - 0s - loss: 0.3543 - accuracy: 0.9492
Epoch 5/100
118/118 - 0s - loss: 0.3147 - accuracy: 0.9915
Epoch 6/100
118/118 - 0s - loss: 0.2799 - accuracy: 1.0000
Epoch 7/100
118/118 - 0s - loss: 0.2510 - accuracy: 1.0000
Epoch 8/100
118/118 - 0s - loss: 0.2240 - accuracy: 1.0000
Epoch 9/100
118/118 - 0s - loss: 0.1995 - accuracy: 1.0000
Epoch 10/100
118/118 - 0s - loss: 0.1783 - accuracy: 1.0000
Epoch 11/100
118/118 - 0s - loss: 0.1593 - accuracy: 1.0000
Epoch 12/100
118/118 - 0s - loss: 0.1424 - accuracy: 1.0000
Epoch 13/100
118/118 - 0s - loss: 0.1271 - accuracy: 1.0000
Epoch 14/100
118/118 - 0s - loss: 0.1138 - accuracy: 1.0000
Epoch 15/100
118/118 - 0s - loss: 0.1023 - accuracy: 1.0000
Epoch 16/100
118/118 - 0s - loss: 0.0920 - accuracy: 1.0000
Epoch 17/100
118/118 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x2011f669308>

In [37]:
deep_model = Sequential()
deep_model.add(Dense(units=24, activation='relu', input_dim=24))
deep_model.add(Dense(units=6, activation='relu'))
deep_model.add(Dense(units=2, activation='softmax'))

In [38]:
deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 118 samples
Epoch 1/100
118/118 - 1s - loss: 0.6339 - accuracy: 0.7966
Epoch 2/100
118/118 - 0s - loss: 0.5892 - accuracy: 0.8475
Epoch 3/100
118/118 - 0s - loss: 0.5462 - accuracy: 0.8729
Epoch 4/100
118/118 - 0s - loss: 0.5033 - accuracy: 0.9068
Epoch 5/100
118/118 - 0s - loss: 0.4625 - accuracy: 0.9492
Epoch 6/100
118/118 - 0s - loss: 0.4216 - accuracy: 0.9746
Epoch 7/100
118/118 - 0s - loss: 0.3843 - accuracy: 0.9831
Epoch 8/100
118/118 - 0s - loss: 0.3483 - accuracy: 0.9915
Epoch 9/100
118/118 - 0s - loss: 0.3149 - accuracy: 0.9915
Epoch 10/100
118/118 - 0s - loss: 0.2830 - accuracy: 0.9915
Epoch 11/100
118/118 - 0s - loss: 0.2537 - accuracy: 1.0000
Epoch 12/100
118/118 - 0s - loss: 0.2269 - accuracy: 1.0000
Epoch 13/100
118/118 - 0s - loss: 0.2022 - accuracy: 1.0000
Epoch 14/100
118/118 - 0s - loss: 0.1801 - accuracy: 1.0000
Epoch 15/100
118/118 - 0s - loss: 0.1601 - accuracy: 1.0000
Epoch 16/100
118/118 - 0s - loss: 0.1424 - accuracy: 1.0000
Epoch 17/100
118/118 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x201209fd988>

In [39]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

40/40 - 0s - loss: 0.0033 - accuracy: 1.0000
Normal Neural Network - Loss: 0.0032674197107553484, Accuracy: 1.0


In [40]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

40/40 - 0s - loss: 0.0028 - accuracy: 1.0000
Deep Neural Network - Loss: 0.002782899048179388, Accuracy: 1.0
