In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

def auc_scorer(clf, X, y, model): # Helper function to plot the ROC curve
    if model=='RF':
        fpr, tpr, _ = roc_curve(y, clf.predict_proba(X)[:,1])
    elif model=='SVM':
        fpr, tpr, _ = roc_curve(y, clf.decision_function(X))
    roc_auc = auc(fpr, tpr)

    plt.figure()    # Plot the ROC curve
    plt.plot(fpr, tpr, label='ROC curve from '+model+' model (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    return fpr,tpr,roc_auc

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

In [2]:
df = pd.read_csv('ckd.csv')


In [3]:
df

Unnamed: 0,Age,Bp,Sg,Al,Su,Rbc,Pc,Pcc,Ba,Bgr,...,Pcv,Wbcc,Rbcc,Htn,Dm,Cad,Appet,pe,Ane,Class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55,80,1.02,0,0,normal,normal,notpresent,notpresent,140,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42,70,1.025,0,0,normal,normal,notpresent,notpresent,75,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12,80,1.02,0,0,normal,normal,notpresent,notpresent,100,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17,60,1.025,0,0,normal,normal,notpresent,notpresent,114,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [4]:
import numpy as np
import pandas as pd

# Read dataset file ckd.csv
dataset = pd.read_csv("ckd.csv",header=0, na_values="?")

# Replace null values "?" by numpy.NaN
dataset.replace("?", np.NaN)

# Convert nominal values to binary values
cleanup = {"Rbc":     {"normal": 1, "abnormal": 0},
           "Pc": {"normal": 1, "abnormal": 0},
           "Pcc": {"present": 1, "notpresent": 0},
           "Ba": {"present": 1, "notpresent": 0},
           "Htn": {"yes": 1, "no": 0},
           "Dm": {"yes": 1, "no": 0},
           "Cad": {"yes": 1, "no": 0},
           "Appet": {"good": 1, "poor": 0},
           "pe": {"yes": 1, "no": 0},
           "Ane": {"yes": 1, "no": 0}}

# Replace binary values into dataset
dataset.replace(cleanup, inplace=True)

dataset=dataset.drop(["Sg","Su","Pcc","Ba","Cad","Appet","pe","Ane"],axis=1)

# Fill null values with mean value of the respective column

dataset.fillna(round(dataset.mean(),2), inplace=True)

# print(dataset)

# Save this dataset as final.csv for further prediction
dataset.to_csv("final.csv", sep=',', index=False)

In [5]:
d=pd.read_csv("final.csv")

In [8]:
d

Unnamed: 0,Age,Bp,Sg,Al,Su,Rbc,Pc,Pcc,Ba,Bgr,...,Pcv,Wbcc,Rbcc,Htn,Dm,Cad,Appet,pe,Ane,Class
0,48.0,80.0,1.020,1.0,0.0,0.81,1.0,0.0,0.0,121.00,...,44.0,7800.0,5.20,1.0,1,0.0,1.0,0.0,0.0,ckd
1,7.0,50.0,1.020,4.0,0.0,0.81,1.0,0.0,0.0,148.04,...,38.0,6000.0,4.71,0.0,0,0.0,1.0,0.0,0.0,ckd
2,62.0,80.0,1.010,2.0,3.0,1.00,1.0,0.0,0.0,423.00,...,31.0,7500.0,4.71,0.0,1,0.0,0.0,0.0,1.0,ckd
3,48.0,70.0,1.005,4.0,0.0,1.00,0.0,1.0,0.0,117.00,...,32.0,6700.0,3.90,1.0,0,0.0,0.0,1.0,1.0,ckd
4,51.0,80.0,1.010,2.0,0.0,1.00,1.0,0.0,0.0,106.00,...,35.0,7300.0,4.60,0.0,0,0.0,1.0,0.0,0.0,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1.00,1.0,0.0,0.0,140.00,...,47.0,6700.0,4.90,0.0,0,0.0,1.0,0.0,0.0,notckd
396,42.0,70.0,1.025,0.0,0.0,1.00,1.0,0.0,0.0,75.00,...,54.0,7800.0,6.20,0.0,0,0.0,1.0,0.0,0.0,notckd
397,12.0,80.0,1.020,0.0,0.0,1.00,1.0,0.0,0.0,100.00,...,49.0,6600.0,5.40,0.0,0,0.0,1.0,0.0,0.0,notckd
398,17.0,60.0,1.025,0.0,0.0,1.00,1.0,0.0,0.0,114.00,...,51.0,7200.0,5.90,0.0,0,0.0,1.0,0.0,0.0,notckd


In [13]:
d.columns

Index(['Age', 'Bp', 'Sg', 'Al', 'Su', 'Rbc', 'Pc', 'Pcc', 'Ba', 'Bgr', 'Bu',
       'Sc', 'Sod', 'Pot', 'Hemo', 'Pcv', 'Wbcc', 'Rbcc', 'Htn', 'Dm', 'Cad',
       'Appet', 'pe', 'Ane', 'Class'],
      dtype='object')

In [None]:
Age : 
B.P : 
A.L : 
P.C.C : 
B.G.R: 
B.U : 
S.C : 
HEMO : 
PCV : 
HTN: 
D.M : 
Appet:

In [32]:
d["Ane"].value_counts()

0.00    339
1.00     60
0.15      1
Name: Ane, dtype: int64

In [39]:
# d_corr = d.corr()
# plt.figure(figsize = (20,20))
# sns.heatmap(d_corr , annot = True, fmt = ".0%")

In [None]:
Sg,Su,Pcc,Ba,Cad,Appet,pe,Ane