In [None]:
#!usr/bin/env python3

In [None]:
"""
Now that we have a functional and feature-engineered dataset, we can build some 
ML models. We will be classifying data using three models of my choice. I am choosing
Random Forest, SVC, and K-nearest neighbors
"""

In [25]:
#Importing proper libraries and modules to create and optimize models

%store -r X_train
%store -r X_test
%store -r X_train_PCA
%store -r X_test_PCA
%store -r y_train
%store -r y_test
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

In [26]:
# Models with no feature optimization and no PCA

rf = RandomForestClassifier()
svc = SVC()
knn = KNeighborsClassifier()

rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))

svc.fit(X_train, y_train)
print(svc.score(X_test, y_test))

knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

0.832258064516129
0.546236559139785
0.8193548387096774


In [28]:
# Models with no feature optimization and PCA

rf_PCA = RandomForestClassifier()
svc_PCA = SVC()
knn_PCA = KNeighborsClassifier()

rf_PCA.fit(X_train_PCA, y_train)
print(rf_PCA.score(X_test_PCA, y_test))

svc_PCA.fit(X_train_PCA, y_train)
print(svc_PCA.score(X_test_PCA, y_test))

knn_PCA.fit(X_train_PCA, y_train)
print(knn_PCA.score(X_test_PCA, y_test))

0.7118279569892473
0.546236559139785
0.6645161290322581


In [None]:
"""
SVC is clearly the weakest performing model. But both Random Forest and KNN demonstrate an
accuracy of over 80% with no PCA and around 70% with PCA. This is a good accuracy but we can make it better. Let's perform 
a hyperparameter grid search in order to optimize all of our models.
"""

In [37]:
# Random Forest (PCA and no PCA)

params_rf = {'n_estimators': np.arange(50,200,50,int),
             'max_depth': np.arange(5,10,1,int),
             'min_samples_leaf': [1,2,3],
             'bootstrap': [True,False]}

rf_grid = GridSearchCV(RandomForestClassifier(), params_rf, verbose=3)
rf_PCA_grid = GridSearchCV(RandomForestClassifier(), params_rf, verbose=3)

rf_grid.fit(X_train, y_train)
rf_PCA_grid.fit(X_train_PCA, y_train)

print(rf_grid.best_params_)
print(rf_PCA_grid.best_params_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.760 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.797 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.811 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.802 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.778 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=100;, score=0.783 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=100;, score=0.779 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=5, min_samples_leaf=1, n_estimators=100;, score=0.811 total time=   0.3s
[CV 4/5] END bootstrap=True, max_depth=5, min_s

[CV 4/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.825 total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.773 total time=   0.4s
[CV 1/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.779 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.779 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.825 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.820 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.782 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=100;, score=0.788 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=6, min_samples_leaf=3, n_estimators=100;, score=0.770 total time=   0.

[CV 3/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.843 total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.825 total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.824 total time=   0.4s
[CV 1/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.774 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.806 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.853 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.839 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.792 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=8, min_samples_leaf=2, n_estimators=100;, score=0.774 total time=   0.

[CV 2/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.793 total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.848 total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.843 total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.806 total time=   0.5s
[CV 1/5] END bootstrap=False, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.779 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.788 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.825 total time=   0.3s
[CV 4/5] END bootstrap=False, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.797 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=5, min_samples_leaf=1, n_estimators=50;, score=0.792 total time=

[CV 5/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.787 total time=   0.4s
[CV 1/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.783 total time=   0.5s
[CV 2/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.793 total time=   0.6s
[CV 3/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.839 total time=   0.6s
[CV 4/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.811 total time=   0.5s
[CV 5/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.787 total time=   0.5s
[CV 1/5] END bootstrap=False, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.770 total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.783 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.825 total

[CV 3/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.862 total time=   0.4s
[CV 4/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.848 total time=   0.4s
[CV 5/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.810 total time=   0.4s
[CV 1/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.779 total time=   0.5s
[CV 2/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.816 total time=   0.6s
[CV 3/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.862 total time=   0.5s
[CV 4/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.843 total time=   0.6s
[CV 5/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.801 total time=   0.6s
[CV 1/5] END bootstrap=False, max_depth=8, min_samples_leaf=2, n_estimators=50;, score=0.783 tot

[CV 1/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.788 total time=   0.4s
[CV 2/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.825 total time=   0.4s
[CV 3/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.853 total time=   0.4s
[CV 4/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.843 total time=   0.4s
[CV 5/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.819 total time=   0.4s
[CV 1/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.783 total time=   0.5s
[CV 2/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.811 total time=   0.5s
[CV 3/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.866 total time=   0.5s
[CV 4/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.834 to

[CV 1/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.654 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.664 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.645 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.654 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.634 total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.650 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.673 total time=   0.3s
[CV 3/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.659 total time=   0.3s
[CV 4/5] END bootstrap=True, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.668 total time=

[CV 1/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.650 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.682 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.677 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.691 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.681 total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.654 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.682 total time=   0.3s
[CV 3/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.687 total time=   0.3s
[CV 4/5] END bootstrap=True, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.700 total time=

[CV 1/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.659 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.700 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.677 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.682 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.653 total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.650 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.705 total time=   0.3s
[CV 3/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.668 total time=   0.3s
[CV 4/5] END bootstrap=True, max_depth=9, min_samples_leaf=3, n_estimators=150;, score=0.673 total time=

[CV 1/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.645 total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.659 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.659 total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.673 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=100;, score=0.653 total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.641 total time=   0.3s
[CV 2/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.668 total time=   0.3s
[CV 3/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.664 total time=   0.3s
[CV 4/5] END bootstrap=False, max_depth=6, min_samples_leaf=2, n_estimators=150;, score=0.668 to

[CV 4/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=50;, score=0.687 total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=50;, score=0.662 total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.631 total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.682 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.668 total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.687 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=100;, score=0.681 total time=   0.2s
[CV 1/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.631 total time=   0.3s
[CV 2/5] END bootstrap=False, max_depth=8, min_samples_leaf=1, n_estimators=150;, score=0.677 tota

[CV 2/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=50;, score=0.677 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=50;, score=0.654 total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=50;, score=0.696 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=50;, score=0.681 total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.645 total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.691 total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.664 total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.700 total time=   0.2s
[CV 5/5] END bootstrap=False, max_depth=9, min_samples_leaf=3, n_estimators=100;, score=0.676 total 

In [39]:
#SVC (PCA and no PCA)

params_svc = {'kernel': ['rbf','poly','sigmoid','linear'],
             'degree': np.arange(1,4,1),
             'tol': [1e-3, 1e-4, 1e-2]}
svc_grid = GridSearchCV(SVC(), params_svc, verbose=3)
svc_PCA_grid = GridSearchCV(SVC(), params_svc, verbose=3)

svc_grid.fit(X_train, y_train)
svc_PCA_grid.fit(X_train_PCA, y_train)

print(svc_grid.best_params_)
print(svc_PCA_grid.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ...degree=1, kernel=rbf, tol=0.001;, score=0.613 total time=   0.1s
[CV 2/5] END ...degree=1, kernel=rbf, tol=0.001;, score=0.608 total time=   0.1s
[CV 3/5] END ...degree=1, kernel=rbf, tol=0.001;, score=0.608 total time=   0.0s
[CV 4/5] END ...degree=1, kernel=rbf, tol=0.001;, score=0.608 total time=   0.1s
[CV 5/5] END ...degree=1, kernel=rbf, tol=0.001;, score=0.611 total time=   0.1s
[CV 1/5] END ..degree=1, kernel=rbf, tol=0.0001;, score=0.613 total time=   0.0s
[CV 2/5] END ..degree=1, kernel=rbf, tol=0.0001;, score=0.608 total time=   0.1s
[CV 3/5] END ..degree=1, kernel=rbf, tol=0.0001;, score=0.608 total time=   0.1s
[CV 4/5] END ..degree=1, kernel=rbf, tol=0.0001;, score=0.608 total time=   0.1s
[CV 5/5] END ..degree=1, kernel=rbf, tol=0.0001;, score=0.611 total time=   0.1s
[CV 1/5] END ....degree=1, kernel=rbf, tol=0.01;, score=0.613 total time=   0.1s
[CV 2/5] END ....degree=1, kernel=rbf, tol=0.01

[CV 2/5] END degree=2, kernel=sigmoid, tol=0.01;, score=0.608 total time=   0.0s
[CV 3/5] END degree=2, kernel=sigmoid, tol=0.01;, score=0.608 total time=   0.1s
[CV 4/5] END degree=2, kernel=sigmoid, tol=0.01;, score=0.604 total time=   0.0s
[CV 5/5] END degree=2, kernel=sigmoid, tol=0.01;, score=0.611 total time=   0.0s
[CV 1/5] END degree=2, kernel=linear, tol=0.001;, score=0.613 total time=   0.0s
[CV 2/5] END degree=2, kernel=linear, tol=0.001;, score=0.608 total time=   0.0s
[CV 3/5] END degree=2, kernel=linear, tol=0.001;, score=0.608 total time=   0.0s
[CV 4/5] END degree=2, kernel=linear, tol=0.001;, score=0.608 total time=   0.0s
[CV 5/5] END degree=2, kernel=linear, tol=0.001;, score=0.611 total time=   0.0s
[CV 1/5] END degree=2, kernel=linear, tol=0.0001;, score=0.613 total time=   0.0s
[CV 2/5] END degree=2, kernel=linear, tol=0.0001;, score=0.608 total time=   0.0s
[CV 3/5] END degree=2, kernel=linear, tol=0.0001;, score=0.608 total time=   0.0s
[CV 4/5] END degree=2, ke

[CV 5/5] END ...degree=1, kernel=poly, tol=0.01;, score=0.611 total time=   0.0s
[CV 1/5] END degree=1, kernel=sigmoid, tol=0.001;, score=0.599 total time=   0.0s
[CV 2/5] END degree=1, kernel=sigmoid, tol=0.001;, score=0.604 total time=   0.0s
[CV 3/5] END degree=1, kernel=sigmoid, tol=0.001;, score=0.604 total time=   0.0s
[CV 4/5] END degree=1, kernel=sigmoid, tol=0.001;, score=0.599 total time=   0.0s
[CV 5/5] END degree=1, kernel=sigmoid, tol=0.001;, score=0.602 total time=   0.1s
[CV 1/5] END degree=1, kernel=sigmoid, tol=0.0001;, score=0.599 total time=   0.0s
[CV 2/5] END degree=1, kernel=sigmoid, tol=0.0001;, score=0.604 total time=   0.0s
[CV 3/5] END degree=1, kernel=sigmoid, tol=0.0001;, score=0.604 total time=   0.0s
[CV 4/5] END degree=1, kernel=sigmoid, tol=0.0001;, score=0.599 total time=   0.0s
[CV 5/5] END degree=1, kernel=sigmoid, tol=0.0001;, score=0.602 total time=   0.0s
[CV 1/5] END degree=1, kernel=sigmoid, tol=0.01;, score=0.599 total time=   0.0s
[CV 2/5] END 

[CV 1/5] END ....degree=3, kernel=rbf, tol=0.01;, score=0.613 total time=   0.1s
[CV 2/5] END ....degree=3, kernel=rbf, tol=0.01;, score=0.608 total time=   0.1s
[CV 3/5] END ....degree=3, kernel=rbf, tol=0.01;, score=0.608 total time=   0.0s
[CV 4/5] END ....degree=3, kernel=rbf, tol=0.01;, score=0.608 total time=   0.0s
[CV 5/5] END ....degree=3, kernel=rbf, tol=0.01;, score=0.611 total time=   0.1s
[CV 1/5] END ..degree=3, kernel=poly, tol=0.001;, score=0.613 total time=   0.1s
[CV 2/5] END ..degree=3, kernel=poly, tol=0.001;, score=0.608 total time=   0.1s
[CV 3/5] END ..degree=3, kernel=poly, tol=0.001;, score=0.608 total time=   0.1s
[CV 4/5] END ..degree=3, kernel=poly, tol=0.001;, score=0.608 total time=   0.1s
[CV 5/5] END ..degree=3, kernel=poly, tol=0.001;, score=0.611 total time=   0.1s
[CV 1/5] END .degree=3, kernel=poly, tol=0.0001;, score=0.613 total time=   0.1s
[CV 2/5] END .degree=3, kernel=poly, tol=0.0001;, score=0.608 total time=   0.1s
[CV 3/5] END .degree=3, kern

In [40]:
params_knn = {'n_neighbors': np.arange(4,8,1),
             'weights': ['uniform','distance'],
             'algorithm': ['auto','ball_tree','kd_tree','auto'],
             'leaf_size': np.arange(20,50,10)}
knn_grid = GridSearchCV(KNeighborsClassifier(), params_knn, verbose=3)
knn_PCA_grid = GridSearchCV(KNeighborsClassifier(), params_knn, verbose=3)

knn_grid.fit(X_train, y_train)
knn_PCA_grid.fit(X_train_PCA, y_train)

print(knn_grid.best_params_)
print(knn_PCA_grid.best_params_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.774 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.760 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.765 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.797 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.787 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=distance;, score=0.825 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=distance;, score=0.765 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=distance;, score=0.770 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=4, weights=distance;, sc

[CV 4/5] END algorithm=auto, leaf_size=40, n_neighbors=4, weights=distance;, score=0.806 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=4, weights=distance;, score=0.796 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=uniform;, score=0.765 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=uniform;, score=0.747 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=uniform;, score=0.779 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=uniform;, score=0.788 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=uniform;, score=0.778 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=distance;, score=0.816 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=40, n_neighbors=5, weights=distance;, score=0.765 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf

[CV 4/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=4, weights=distance;, score=0.806 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=4, weights=distance;, score=0.796 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.770 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.747 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.779 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.788 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.782 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.816 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.765 total t

[CV 2/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.747 total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.779 total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.788 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.782 total time=   0.0s
[CV 1/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=distance;, score=0.816 total time=   0.0s
[CV 2/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=distance;, score=0.765 total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=distance;, score=0.779 total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=distance;, score=0.802 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=5, weights=distance;, score=0.806 total time=   0.0s
[CV 1

[CV 2/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=5, weights=distance;, score=0.765 total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=5, weights=distance;, score=0.779 total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=5, weights=distance;, score=0.802 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=5, weights=distance;, score=0.806 total time=   0.0s
[CV 1/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.765 total time=   0.0s
[CV 2/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.747 total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.793 total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.806 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.769 total time=   0.0s
[CV 1/

[CV 4/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.806 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.769 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=distance;, score=0.802 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=distance;, score=0.760 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=distance;, score=0.793 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=distance;, score=0.802 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=30, n_neighbors=6, weights=distance;, score=0.801 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=30, n_neighbors=7, weights=uniform;, score=0.765 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=30, n_neighbors=7, weights=uniform;, score=0.742 total time=   0.0s
[CV 3/5] END algorithm=auto, lea

[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.636 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.611 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.618 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.599 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.636 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.625 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.622 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, lea

[CV 2/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.618 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.664 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.639 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.641 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.659 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.645 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.622 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.606 total time=   0.0s
[CV 1/5]

[CV 2/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=uniform;, score=0.659 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=uniform;, score=0.645 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=uniform;, score=0.622 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=uniform;, score=0.606 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=distance;, score=0.618 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=distance;, score=0.645 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=distance;, score=0.618 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=distance;, score=0.631 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=4, weights=distance;, score=0.597 total 

[CV 4/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.608 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=uniform;, score=0.620 total time=   0.0s
[CV 1/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.631 total time=   0.0s
[CV 2/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.664 total time=   0.0s
[CV 3/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.604 total time=   0.0s
[CV 4/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.645 total time=   0.0s
[CV 5/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.630 total time=   0.0s
[CV 1/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.618 total time=   0.0s
[CV 2/5] END algorithm=kd_tree, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.664 total time=   0.0s
[CV 3

[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.636 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.611 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.618 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.599 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.636 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=distance;, score=0.625 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.622 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, lea

[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=uniform;, score=0.644 total time=   0.0s
[CV 1/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.636 total time=   0.0s
[CV 2/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.668 total time=   0.0s
[CV 3/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.618 total time=   0.0s
[CV 4/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.664 total time=   0.0s
[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=7, weights=distance;, score=0.639 total time=   0.0s
{'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 5, 'weights': 'distance'}
{'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 7, 'weights': 'uniform'}


In [49]:
"""
With the best parameters above, we can now create new, optimized models for Random Forest,
SVC, and KNN. Let's see how their accuracies stack up
"""

#Random Forest

rf_opt = RandomForestClassifier(bootstrap=False,
                               max_depth=9,
                               min_samples_leaf=2,
                               n_estimators=50)
rf_opt.fit(X_train, y_train)
print('Optimized RF Accuracy: ' + str(rf_opt.score(X_test, y_test)))

rf_opt_PCA = RandomForestClassifier(bootstrap=False,
                                    max_depth=9,
                                    min_samples_leaf=2,
                                    n_estimators=100)
rf_opt_PCA.fit(X_train_PCA, y_train)
print('Optimized RF Accuracy (PCA): ' + str(rf_opt_PCA.score(X_test_PCA, y_test)))

#SVC

svc_opt = svc_opt_PCA = SVC(degree=1,
                            kernel='rbf',
                            tol=0.001)
svc_opt.fit(X_train, y_train)
print('\nOptimized SVC Accuracy: ' + str(svc_opt.score(X_test, y_test)))

svc_opt_PCA.fit(X_train_PCA, y_train)
print('Optimized SVC Accuracy (PCA): ' + str(svc_opt_PCA.score(X_test_PCA, y_test)))

#KNN

knn_opt = KNeighborsClassifier(algorithm='auto',
                              leaf_size=20,
                              n_neighbors=5,
                              weights='distance')

knn_opt.fit(X_train, y_train)
print('\nOptimized KNN Accuracy: ' + str(knn_opt.score(X_test, y_test)))

knn_opt = KNeighborsClassifier(algorithm='auto',
                              leaf_size=20,
                              n_neighbors=7,
                              weights='uniform')

knn_opt.fit(X_train_PCA, y_train)
print('Optimized KNN Accuracy (PCA): ' + str(knn_opt.score(X_test_PCA, y_test)))

Optimized RF Accuracy: 0.8344086021505376
Optimized RF Accuracy (PCA): 0.6860215053763441

Optimized SVC Accuracy: 0.546236559139785
Optimized SVC Accuracy (PCA): 0.546236559139785

Optimized KNN Accuracy: 0.8193548387096774
Optimized KNN Accuracy (PCA): 0.6645161290322581


In [None]:
"""
We can see that our grid search improves the performance of all of our models and allows us
to make better classifications from our engineered data. Random Forest is our most
robust model and can get us an accuracy well over 80% which is impressive considering the 
amount of initial features we began with.
"""