# Algorithms before feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

import time

In [2]:
# Read the five KNN-imputed CSVs in one pass; the number in each file name
# presumably is the neighbour count used for imputation (TODO confirm).
(
    df_knn_imputed_uniform_3,
    df_knn_imputed_uniform_5,
    df_knn_imputed_uniform_7,
    df_knn_imputed_uniform_9,
    df_knn_imputed_uniform_11,
) = [pd.read_csv(f'df_knn_imputed_uniform_{k}.csv') for k in (3, 5, 7, 9, 11)]

# The mean-imputed variant of the same data lives in its own file.
df_mean_imputed = pd.read_csv('df_mean_imputed.csv')

In [3]:
# Sanity-check preview of the first rows of one loaded frame.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column ahead of 'age' —
# presumably the row index written when the CSV was saved, not a clinical
# measurement (TODO confirm against the notebook that produced the CSVs).
df_knn_imputed_uniform_3.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,bgr,bu,sc,...,pot,hemo,pcv,wbcc,rbcc,htn,dm,appet,pe,class
0,48.0,80.0,1.02,1.0,0.0,0.666667,1.0,121.0,36.0,1.2,...,4.533333,15.4,44.0,7800.0,5.2,1.0,1.0,1.0,0.0,1.0
1,7.0,50.0,1.02,4.0,0.0,0.333333,1.0,113.666667,18.0,0.8,...,4.0,11.3,38.0,6000.0,5.4,0.0,0.0,1.0,0.0,1.0
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,423.0,53.0,1.8,...,4.666667,9.6,31.0,7500.0,3.966667,0.0,1.0,0.0,0.0,1.0
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,117.0,56.0,3.8,...,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,106.0,26.0,1.4,...,4.233333,11.6,35.0,7300.0,4.6,0.0,0.0,1.0,0.0,1.0


# Applying the Models (LOG, KNN, RF, SVM, NB, MLP) to the KNN-Imputed & Mean-Imputed Datasets

In [4]:
# One ordered label -> frame mapping; the two parallel lists used further down
# (dfs and dfs_name) are derived from it so their order always matches.
datasets = {
    "knn_imputed_3": df_knn_imputed_uniform_3,
    "knn_imputed_5": df_knn_imputed_uniform_5,
    "knn_imputed_7": df_knn_imputed_uniform_7,
    "knn_imputed_9": df_knn_imputed_uniform_9,
    "knn_imputed_11": df_knn_imputed_uniform_11,
    "mean_imputed": df_mean_imputed,
}

dfs = list(datasets.values())
dfs_name = list(datasets)

In [29]:
def models(x):
    """Fit one classifier type on every imputed dataset and collect its scores.

    Parameters
    ----------
    x : int
        Classifier selector: 0 = LogisticRegression, 1 = KNeighborsClassifier,
        2 = RandomForestClassifier (500 trees), 3 = SVC with RBF kernel,
        4 = GaussianNB, any other value = MLPClassifier with one hidden
        layer of 13 units.

    Returns
    -------
    dict
        Maps each entry of ``dfs_name`` to the list
        ``[train_acc, test_acc, tn, fp, fn, tp]`` measured on a 75/25
        train/test split of the frame at the same position in ``dfs``.
    """
    if x == 0:
        classifier = LogisticRegression()
    elif x == 1:
        classifier = KNeighborsClassifier()
    elif x == 2:
        classifier = RandomForestClassifier(n_estimators=500, random_state=42)
    elif x == 3:
        classifier = SVC(kernel='rbf', random_state=42)
    elif x == 4:
        classifier = GaussianNB()
    else:
        # Seeded like the other stochastic models so that the reported scores
        # are the same on every Restart & Run All.
        classifier = MLPClassifier(hidden_layer_sizes=(13,), max_iter=500, random_state=42)

    res = {}

    for i, df in enumerate(dfs):
        # Select features by name rather than by position: drop the target and
        # any 'Unnamed: N' column. The latter is what pandas calls a header-less
        # CSV column (typically a saved row index); it is a row identifier, not
        # a measurement, and must not be offered to the classifier.
        index_like = [col for col in df.columns if str(col).startswith('Unnamed')]
        features = df.drop(columns=['class'] + index_like)
        y = df['class']

        x_train, x_test, y_train, y_test = train_test_split(
            features, y, test_size=0.25, random_state=0
        )

        # Fit the scaler on the training part only so that no statistics of
        # the test part leak into training.
        ss = StandardScaler()
        x_train = ss.fit_transform(x_train)
        x_test = ss.transform(x_test)

        # fit() re-trains from scratch, so reusing the instance across
        # datasets does not carry state over.
        classifier.fit(x_train, y_train)

        test_pred = classifier.predict(x_test)
        train_pred = classifier.predict(x_train)

        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        # Pin the label order to the classes of the whole dataset so the matrix
        # keeps its full shape even if a class is absent from the test split
        # or from the predictions; for two classes ravel() then always yields
        # the four values tn, fp, fn, tp.
        class_labels = sorted(y.unique())
        tn, fp, fn, tp = confusion_matrix(y_test, test_pred, labels=class_labels).ravel()

        res[dfs_name[i]] = [train_acc, test_acc, tn, fp, fn, tp]

    return res

**1.) True negative (TN): notckd samples that were correctly diagnosed as notckd,**

**2.) False positive (FP): notckd samples that were incorrectly diagnosed as ckd,**

**3.) False negative (FN): ckd samples that were incorrectly diagnosed as notckd,**

**4.) True positive (TP): ckd samples that were correctly diagnosed as ckd.**

In [30]:
# Titles for selectors 1-5 of models(); selector 0 is printed on its own branch
# because its header places the bold escape code before the running number.
later_titles = ("KNN", "Random Forest", "SVM", "Naive Bayes", "MultiLayer-Perceptron")
metric_columns = ['train_acc', 'test_acc', 'true_neg', 'false_pos', 'false_neg', 'true_pos']

for i in range(6):
    # One row per dataset, one column per metric returned by models().
    result = models(i)
    res = pd.DataFrame.from_dict(data=result, orient='index', columns=metric_columns)

    if i == 0:
        print("\033[1m", i + 1, ".PERFORMANCE INDICATORS for Logistic Regression \n")
    else:
        print(i + 1, "\033[1m" + ".PERFORMANCE INDICATORS for " + later_titles[i - 1] + " \n")

    print(res, '\n')

[1m 1 .PERFORMANCE INDICATORS for Logistic Regression 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.996667      0.98        36          2          0        62
knn_imputed_5    0.996667      0.97        35          3          0        62
knn_imputed_7    1.000000      0.98        36          2          0        62
knn_imputed_9    1.000000      0.97        35          3          0        62
knn_imputed_11   0.996667      0.97        35          3          0        62
mean_imputed     1.000000      0.99        38          0          1        61 

2 [1m.PERFORMANCE INDICATORS for KNN 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.976667      0.97        37          1          2        60
knn_imputed_5    0.993333      0.95        37          1          4        58
knn_imputed_7    0.983333      0.98        38          0          2        60
knn_imputed_9    0.986667      0.99        

The results above show that several of the models still lose some accuracy on the test set and produce a number of false positives and false negatives.

To increase the accuracy, the next step is **feature selection**, so that variables which do not contribute to the prediction can be removed.