# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
# Read the train and test CSVs; column 0 of each file becomes the row index.
df_train, df_test = [
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('mode_train.csv', 'mode_test.csv')
]

In [3]:
# Sanity check: both frames must have the same number of columns.
df_train.shape[1] == df_test.shape[1]

True

In [4]:
# Row counts of the train and test frames.
print(df_train.shape[0], df_test.shape[0])

32561 16281


# Separate Data into X and y

In [5]:
# Every column except the last is a feature; the last column is used as the target.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [6]:
# Hold out part of df_train for model selection (train_test_split's default split size).
# The split is taken straight from df_train rather than from X_train/y_train so that
# re-running this cell is idempotent: feeding the cell's own outputs back in would
# shrink the training set a little more on every execution.
# NOTE: X_test / y_test here are a hold-out carved from df_train, not df_test.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.iloc[:, :-1],
    df_train.iloc[:, -1],
    random_state=42,
)

# KNN

In [7]:
def knn_run(x_train, x_test, y_train, y_test, k):
    """Fit a k-nearest-neighbours classifier and score it on a held-out set.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Value passed to ``KNeighborsClassifier`` as ``n_neighbors``.

    Returns
    -------
    tuple
        ``(k, accuracy)``, where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so the
    # value is unchanged, but the documented order keeps the call correct if it is
    # ever swapped for an asymmetric metric such as precision or recall.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [8]:
# Score KNN on the held-out split once per candidate neighbour count,
# collecting the (k, accuracy) tuples in sweep order.
knn_accuracy = [
    knn_run(X_train, X_test, y_train, y_test, n_neighbors)
    for n_neighbors in [3, 5, 8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50]
]

In [9]:
# Show the (k, accuracy) pair recorded for each candidate k.
knn_accuracy

[(3, 0.80850018425254888),
 (5, 0.81243090529418993),
 (8, 0.82188920280063871),
 (10, 0.82667976907013885),
 (12, 0.82545141874462602),
 (15, 0.82938213978626707),
 (20, 0.83220734553494657),
 (25, 0.83404987102321582),
 (30, 0.8349097162510748),
 (35, 0.83171600540474144),
 (40, 0.83282152069770299),
 (45, 0.83159317037219016),
 (50, 0.83036482004667733)]

Best k for KNN: 30 (highest accuracy in the sweep above, ≈ 0.835 on the held-out split).

# Random Forest

In [10]:
# Random forest instance with default parameters.
# NOTE(review): nothing visible in this section uses this object (rf_run builds its
# own estimator in a local variable) — confirm whether a later cell depends on it.
rf = RandomForestClassifier()

In [11]:
def rf_run(x_train, x_test, y_train, y_test, k, random_state=42):
    """Fit a random forest with ``k`` trees and score it on a held-out set.

    Parameters
    ----------
    x_train, y_train
        Features and labels the forest is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Value passed to ``RandomForestClassifier`` as ``n_estimators``.
    random_state
        Seed forwarded to ``RandomForestClassifier``. It is fixed by default so
        that repeated runs give the same scores and differences between values
        of ``k`` are not confounded by seed-to-seed noise. Pass ``None`` to get
        an unseeded forest.

    Returns
    -------
    tuple
        ``(k, accuracy)``, where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    rf = RandomForestClassifier(n_estimators=k, random_state=random_state)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred); keep the documented order.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [12]:
# Score a random forest on the held-out split once per candidate tree count,
# collecting the (n_estimators, accuracy) tuples in sweep order.
rf_accuracy = [
    rf_run(X_train, X_test, y_train, y_test, n_trees)
    for n_trees in [3, 5, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
]

In [13]:
# Show the (n_estimators, accuracy) pair recorded for each forest size tried.
rf_accuracy

[(3, 0.80284977275518976),
 (5, 0.79928755681120256),
 (8, 0.81255374032674121),
 (10, 0.81083404987102325),
 (15, 0.81427343078245917),
 (20, 0.815501781107972),
 (30, 0.81562461614052328),
 (40, 0.81709863653113868),
 (50, 0.82152069770298486),
 (60, 0.81808131679154894),
 (70, 0.82139786267043358),
 (80, 0.81758997666134381),
 (90, 0.81734430659624124),
 (100, 0.8198010072472669)]