# Ensemble Classifier Mode Data

# Import Libraries

In [3]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [4]:
df_train = pd.read_csv('train_rf_norm.csv', index_col = 0)
df_test = pd.read_csv('test_rf_norm.csv', index_col = 0)

In [5]:
len(df_train.columns) == len(df_test.columns)

True

In [6]:
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [7]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state = 42)

In [9]:
#X_test = df_test.iloc[:,:-1]
#y_test = df_test.iloc[:,-1]

# KNN

n_neighbors = 30

In [13]:
knn = KNeighborsClassifier()

In [14]:
parameters_knn = [{'n_neighbors': [3,5,8,10,12,15,20,25,30,35,40,45,50]}]

In [15]:
clf = GridSearchCV(knn, parameters_knn)

In [16]:
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [3, 5, 8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
clf.best_params_

{'n_neighbors': 15}

In [18]:
clf.score(X_train, y_train)

0.85819000819000824

# Random Forest

n_estimators = 90

In [25]:
rf = RandomForestClassifier()

In [26]:
parameters_rf = [{'n_estimators': [3,5,8,10,15,20,30,40,50,60,70,80,90,100]}]

In [27]:
clf = GridSearchCV(rf, parameters_rf)

In [28]:
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 5, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [29]:
clf.best_params_

{'n_estimators': 90}

In [30]:
clf.score(X_train, y_train)

0.99995904995904994

# Logistic Regression 

C = 10

In [19]:
lr = LogisticRegression()

In [20]:
parameters_lr = [{'C': [0.01,0.1,1,10,100]}]

In [21]:
clf = GridSearchCV(lr, parameters_lr)

In [22]:
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.01, 0.1, 1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [23]:
clf.best_params_

{'C': 10}

In [24]:
clf.score(X_train, y_train)

0.85180180180180176

# SVM

In [10]:
from sklearn import preprocessing
X_train_n = preprocessing.scale(X_train) 

In [11]:
svm = SVC()

In [12]:
parameters_svm = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

In [13]:
clf = GridSearchCV(svm, parameters_svm)

In [None]:
clf.fit(X_train_n, y_train)

In [None]:
clf.best_params_

In [None]:
clf.score(X_train, y_train)

# Naive Bayes

There are no adjustable parameters for naive bayes (other than prior)