# Ensemble Classifier KNN Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
df_train = pd.read_csv('train_knn_imputed_encoded.csv')
df_test = pd.read_csv('test_knn_imputed_encoded.csv')

In [3]:
len(df_train.columns) == len(df_test.columns)

True

In [4]:
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [5]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [6]:
X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]

# Individual Algorithms

In [7]:
def rforest(X_train, y_train, X_test):

    rforest = RandomForestClassifier()
    rforest.fit(X_train, y_train)
    y_pred = rforest.predict(X_test)
    
    return y_pred

In [8]:
def knn(X_train, y_train, X_test):
    
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    return y_pred

In [9]:
def nb(X_train, y_train, X_test):
    
    nb = GaussianNB()
    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred


In [10]:
def lr(X_train, y_train, X_test):

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

In [11]:
def svm(X_train, y_train, X_test):
    svm = SVC()
    svm.fit(X_train,y_train)    
    y_pred = svm.predict(X_test)
    
    return y_pred

# Ensemble Function


In [12]:
def ensemble(X_train, y_train, X_test):
    
    r_y = rforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    l_y = lr(X_train, y_train, X_test)
    n_y = nb(X_train, y_train, X_test)
    s_y = svm(X_train, y_train, X_test)
    
    
    final_y = []
    
    for i in range(len(X_test)):
        final_y.append(stats.mode([r_y[i], k_y[i], l_y[i], n_y[i], s_y[i]])[0][0])
        
    return final_y

# Mode Ensemble

In [13]:
trial_run = ensemble(X_train, y_train, X_test)

In [14]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_run, y_test).ravel()[i])

tn 12212
fp 2756
fn 223
tp 1090


In [15]:
accuracy_score(trial_run, y_test)

0.81702598120508563

In [16]:
precision_score(trial_run, y_test)

0.28341133645345812

In [17]:
recall_score(trial_run, y_test)

0.83015993907083019

In [18]:
f1_score(trial_run, y_test)

0.42256251211475093