In [27]:
from keras.datasets import mnist
import numpy as np

### Load Data

In [2]:
from sklearn.model_selection import train_test_split

# Fetch MNIST as (images, labels) pairs for the predefined train and test splits.
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Carve one sixth of the training data off as a validation set (seeded split).
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=1/6., random_state=42
)

# Flatten every sample to a single feature row: (n_samples, ...) -> (n_samples, n_features).
train_X, val_X, test_X = (
    images.reshape((images.shape[0], -1)) for images in (train_X, val_X, test_X)
)

print("Train Size", train_X.shape)
print("Validation Size", val_X.shape)
print("Test Size", test_X.shape)

Train Size (50000, 784)
Validation Size (10000, 784)
Test Size (10000, 784)


### Grid Search

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier


# Random forest baseline: 10 trees with a depth cap of 700.
# random_state is pinned (as in the Extra Trees cell) so the reported
# validation accuracy can be reproduced on a fresh kernel.
# A GridSearchCV over max_depth in (100, 300, 500, 700, None) was sketched here
# earlier but is not run; it referenced an undefined `rf` estimator.
rfcls = RandomForestClassifier(max_depth=700, n_estimators=10, random_state=42)
rfcls.fit(train_X, train_y)

# Score on the held-out validation split.
prediction = rfcls.predict(val_X)
print("Random Forest validation accuracy", accuracy_score(val_y, prediction))

Random Forest validation accuracy 0.9436


### Extra Trees

In [5]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra Trees baseline: 10 seeded trees; fit() returns the estimator itself,
# so construction and training are chained.
etcls = ExtraTreesClassifier(n_estimators=10, random_state=42).fit(train_X, train_y)

# Score on the held-out validation split.
prediction = etcls.predict(val_X)
print(
    "Extra Trees validation accuracy",
    accuracy_score(y_true=val_y, y_pred=prediction),
)

Extra Trees validation accuracy 0.9507


### AdaBoost

In [8]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-20 decision trees: 10 boosting rounds, learning rate 0.8.
# random_state is pinned so repeated runs build the same ensemble; per the
# scikit-learn docs it seeds each base estimator that exposes random_state.
# NOTE(review): algorithm="SAMME.R" is deprecated in newer scikit-learn
# releases — confirm against the installed version before upgrading.
adacls = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=20),
    n_estimators=10,
    algorithm="SAMME.R",
    learning_rate=0.8,
    random_state=42,
)
adacls.fit(train_X, train_y)

# Score on the held-out validation split.
prediction = adacls.predict(val_X)
print("Adaboost validation accuracy", accuracy_score(val_y, prediction))

### Voting

In [15]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble: the predicted label is the majority vote of the base models.
vcls_h = VotingClassifier(
    estimators=[('random forest', rfcls), ('extra trees', etcls), ('adaviist', adacls)],
    voting='hard',
)

# rfcls, etcls and adacls were already fitted in their own cells, and
# VotingClassifier trains its own clones of them, so only the ensemble
# needs fitting here. Re-fitting the base models would also replace the
# models whose validation scores were reported above.
vcls_h.fit(train_X, train_y)

# Compare each base model with the ensemble on the validation split.
for clf in (rfcls, etcls, adacls, vcls_h):
    prediction = clf.predict(val_X)
    # accuracy_score expects (y_true, y_pred), matching the earlier cells.
    print(clf.__class__.__name__, accuracy_score(val_y, prediction))

In [23]:
# Soft-voting ensemble: the predicted label is the argmax of the summed
# class probabilities predicted by the base models.
vcls_s = VotingClassifier(
    estimators=[('random forest', rfcls), ('extra trees', etcls), ('adaviist', adacls)],
    voting='soft',
)
vcls_s.fit(train_X, train_y)

# Score on the validation split; accuracy_score expects (y_true, y_pred),
# matching the earlier cells.
prediction = vcls_s.predict(val_X)
print(vcls_s.__class__.__name__, accuracy_score(val_y, prediction))

### Test Set

In [26]:
# Final evaluation of every fitted model on the untouched test split.
for clf in (rfcls, etcls, adacls, vcls_h, vcls_s):
    prediction = clf.predict(test_X)
    label = clf.__class__.__name__
    # Both ensembles share the class name VotingClassifier; append the voting
    # mode so the hard and soft results can be told apart in the output.
    if hasattr(clf, "voting"):
        label = "{} ({})".format(label, clf.voting)
    # accuracy_score expects (y_true, y_pred), matching the earlier cells.
    print(label, accuracy_score(test_y, prediction))

RandomForestClassifier 0.9421
ExtraTreesClassifier 0.9467
AdaBoostClassifier 0.9414
VotingClassifier 0.9549
VotingClassifier 0.9523


## Best Model: Hard Voting Classifier (highest test accuracy, 0.9549)