# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
# Load the train and test frames; index_col = 0 makes the first CSV column the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in ('train_rf_norm.csv', 'test_rf_norm.csv')
)

In [3]:
# Sanity check: both frames have the same number of columns (only counts are compared, not names).
df_train.shape[1] == df_test.shape[1]

True

In [4]:
# Row counts of the train and test frames.
print(df_train.shape[0], df_test.shape[0])

32561 16281


# Separate Data into X and y

In [5]:
# Every column except the last goes to X; the last column goes to y.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [6]:
# Hold out part of the training frame for model selection. The inputs are
# sliced from df_train directly (instead of re-reading X_train / y_train) so
# that re-running this cell does not split an already-split training set.
# No test_size is passed, so train_test_split's default split size applies.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.iloc[:, :-1], df_train.iloc[:, -1], random_state = 42
)

# KNN = 70

In [7]:
def knn_run(x_train, x_test, y_train, y_test, k):
    """Fit a k-nearest-neighbours classifier and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Passed to ``KNeighborsClassifier`` as ``n_neighbors``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    knn = KNeighborsClassifier(n_neighbors = k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred); keep that order so the
    # call stays correct if it is ever swapped for an asymmetric metric.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [8]:
# Coarse sweep over n_neighbors; each entry is the (k, accuracy) tuple returned by knn_run.
knn_accuracy = [
    knn_run(X_train, X_test, y_train, y_test, n_neighbors)
    for n_neighbors in (3, 5, 8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50)
]

In [9]:
knn_accuracy

[(3, 0.82864512959095937),
 (5, 0.83552389141383121),
 (8, 0.84522785898538266),
 (10, 0.84596486918069036),
 (12, 0.84829873479916473),
 (15, 0.84891290996192115),
 (20, 0.84768455963640832),
 (25, 0.84805306473406217),
 (30, 0.84915858002702371),
 (35, 0.84964992015722884),
 (40, 0.85038693035253654),
 (45, 0.85038693035253654),
 (50, 0.85075543545019039)]

In [15]:
# Second sweep, extending n_neighbors beyond 50; same (k, accuracy) tuples as the first sweep.
knn_accuracy_2 = [
    knn_run(X_train, X_test, y_train, y_test, n_neighbors)
    for n_neighbors in (60, 70, 80, 90, 100)
]

In [16]:
knn_accuracy_2

[(60, 0.85087827048274167),
 (70, 0.85112394054784424),
 (80, 0.84903574499447243),
 (90, 0.84903574499447243),
 (100, 0.84952708512467756)]

Best k for KNN = 70 (highest held-out accuracy, 0.8511, among the values of k tried in the two sweeps above).

# Random Forest = 60

In [10]:
# NOTE(review): this default-configured instance is not referenced by any later
# cell visible here (rf_run constructs its own estimator) — confirm it is unused
# elsewhere before removing.
rf = RandomForestClassifier()

In [11]:
def rf_run(x_train, x_test, y_train, y_test, k, random_state = 42):
    """Fit a random forest with ``k`` trees and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the forest is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    random_state
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        with the same inputs give the same forest. Pass ``None`` to get an
        unseeded forest.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    rf = RandomForestClassifier(n_estimators = k, random_state = random_state).fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [12]:
# Sweep the number of trees; each entry is the (k, accuracy) tuple returned by rf_run.
rf_accuracy = [
    rf_run(X_train, X_test, y_train, y_test, n_trees)
    for n_trees in (3, 5, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)
]

In [13]:
rf_accuracy

[(3, 0.83662940670679276),
 (5, 0.84485935388772881),
 (8, 0.85198378577570322),
 (10, 0.85554600171969042),
 (15, 0.8571428571428571),
 (20, 0.85910821766367773),
 (30, 0.85787986733816479),
 (40, 0.86082790811939569),
 (50, 0.86254759857511365),
 (60, 0.86303893870531878),
 (70, 0.8629161036727675),
 (80, 0.86168775334725467),
 (90, 0.86181058837980595),
 (100, 0.8614420832821521)]

# Logistic Regression 0.1

In [21]:
def lr_run(x_train, x_test, y_train, y_test, k):
    """Fit a logistic regression with ``C = k`` and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Passed to ``LogisticRegression`` as ``C``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    lr = LogisticRegression(C = k).fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [22]:
# Sweep C over powers of ten; each entry is the (C, accuracy) tuple returned by lr_run.
lr_accuracy = [
    lr_run(X_train, X_test, y_train, y_test, c_value)
    for c_value in (0.01, 0.1, 1, 10, 100)
]

In [23]:
lr_accuracy

[(0.01, 0.85530033165458785),
 (0.1, 0.85886254759857517),
 (1, 0.85837120746837003),
 (10, 0.85775703230561351),
 (100, 0.85751136224051094)]

# SVM rbf C = 10, gamma = 0.01

parameters_svm = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

### RBF gamma = 'auto' 

In [25]:
def svm_run(x_train, x_test, y_train, y_test, k):
    """Fit an RBF-kernel SVC with ``C = k`` and ``gamma = 'auto'`` and score it.

    The kernel and gamma are spelled out rather than left to SVC's defaults:
    scikit-learn changed the default ``gamma`` from ``'auto'`` to ``'scale'``
    in version 0.22, and this section of the notebook is headed
    "RBF gamma = 'auto'".

    Parameters
    ----------
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Passed to ``SVC`` as ``C``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` for the predictions on ``x_test``.
    """
    svm = SVC(C = k, kernel = 'rbf', gamma = 'auto').fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [26]:
# svm_run forwards k to SVC as C, so this run uses C = 1.
a = svm_run(X_train, X_test, y_train, y_test, k = 1)

(1, 0.8571428571428571)

In [None]:
# C = 10. This cell has no execution count in the saved notebook, so no result is recorded.
b = svm_run(X_train, X_test, y_train, y_test, k = 10)

In [None]:
b

In [None]:
# C = 100. This cell has no execution count in the saved notebook, so no result is recorded.
c = svm_run(X_train, X_test, y_train, y_test, k = 100)

In [None]:
c

In [None]:
# C = 1000. This cell has no execution count in the saved notebook, so no result is recorded.
d = svm_run(X_train, X_test, y_train, y_test, k = 1000)

In [None]:
d

### RBF 

In [8]:
def svm_rbf(x_train, x_test, y_train, y_test, k,l):
    """Fit an RBF-kernel SVC with ``C = k`` and ``gamma = l`` and score it.

    Parameters
    ----------
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Features and true labels used to measure accuracy.
    k
        Passed to ``SVC`` as ``C``.
    l
        Passed to ``SVC`` as ``gamma``.

    Returns
    -------
    tuple
        ``(k, accuracy)``. Note that gamma (``l``) is NOT part of the returned
        tuple, so two runs with the same ``C`` but different gamma can only be
        told apart by the call that produced them.
    """
    svm = SVC(C = k, gamma = l, kernel = 'rbf').fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [9]:
# RBF SVC with C = 100, gamma = 0.001 (the returned tuple carries only C and the accuracy).
e = svm_rbf(X_train, X_test, y_train, y_test, k = 100, l = 0.001)

In [15]:
e

(100, 0.85787986733816479)

In [10]:
# RBF SVC with C = 100, gamma = 0.0001 (the returned tuple carries only C and the accuracy).
f = svm_rbf(X_train, X_test, y_train, y_test, k = 100, l = 0.0001)

In [16]:
f

(100, 0.85775703230561351)

In [11]:
# RBF SVC with C = 10, gamma = 0.001 (the returned tuple carries only C and the accuracy).
g = svm_rbf(X_train, X_test, y_train, y_test, k = 10, l = 0.001)

In [17]:
g

(10, 0.85837120746837003)

In [12]:
# RBF SVC with C = 10, gamma = 0.01 (the returned tuple carries only C and the accuracy).
h = svm_rbf(X_train, X_test, y_train, y_test, k = 10, l = 0.01)

In [18]:
h

(10, 0.86131924824960082)

In [13]:
# RBF SVC with C = 1, gamma = 0.001 (the returned tuple carries only C and the accuracy).
i = svm_rbf(X_train, X_test, y_train, y_test, k = 1, l = 0.001)

In [19]:
i

(1, 0.84952708512467756)

In [14]:
# RBF SVC with C = 1, gamma = 0.01 (the returned tuple carries only C and the accuracy).
j = svm_rbf(X_train, X_test, y_train, y_test, k = 1, l = 0.01)

In [20]:
j

(1, 0.85751136224051094)

# Naive Bayes

There are no adjustable parameters for Naive Bayes (other than the class priors).