## Ensemble learning prototyping

In [1]:
from sklearn.datasets import fetch_openml

# Download (or load from the local cache) the 784-feature MNIST dataset.
# as_frame=False pins the returned data/target to NumPy arrays so the
# container type does not depend on the installed scikit-learn version's
# default for this parameter.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [2]:
# Pull the feature matrix and the label vector out of the fetched bunch.
X = mnist["data"]
y = mnist["target"]

In [3]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the
# variance (a float n_components in (0, 1) selects by explained variance).
pca = PCA(n_components=0.95)

# Bind the projection to a name so it is not lost once the cell output
# scrolls away. X itself is left untouched.
# NOTE(review): the PCA is fit on every row of X, before any train/held-out
# split, and the cells below slice the raw X rather than this projection —
# confirm whether the reduced features were meant to be used for training.
X_reduced = pca.fit_transform(X)
X_reduced


array([[ 122.25525533, -316.23384391,  -51.13183087, ...,   34.71703473,
         -14.22575676,   21.38272145],
       [1010.49400346, -289.96362059,  576.1207452 , ...,   23.87884359,
          -6.54283564,  -24.90277545],
       [ -58.99594719,  393.69744499, -161.99818411, ...,   -5.36282742,
          55.00020853,  -96.73397123],
       ...,
       [-271.50701323,  590.07850009,  341.36886918, ...,  -43.7571469 ,
          35.78216024,   49.96612771],
       [-310.22482291, -116.72715081,  635.71999693, ...,  -21.86345345,
          20.40152778,  -42.68277473],
       [1058.86212574,  -83.39253843,  731.34218396, ...,   41.22834049,
         -20.05206663,  -49.92361814]])

In [4]:
import numpy as np

# Re-encode the labels as unsigned 8-bit integers.
y = y.astype(dtype=np.uint8)

In [5]:
# Positional three-way split: rows [0, 50000) for training, rows
# [50000, 60000) as the test set, and everything from row 60000 on as the
# validation set.
# NOTE(review): every score below is computed on the validation split, and
# X_test / y_test are only ever printed for their shape — confirm that the
# roles of the two held-out sets are the intended ones.
train_rows = slice(None, 50000)
test_rows = slice(50000, 60000)
val_rows = slice(60000, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]
X_val, y_val = X[val_rows], y[val_rows]

In [6]:
# Sanity check: report the shapes of both held-out feature matrices,
# then of their label vectors (same order as before: val, test).
for held_out in (X_val, X_test, y_val, y_test):
    print(held_out.shape)

(10000, 784)
(10000, 784)
(10000,)
(10000,)


## Training models

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
# Baseline 1: support vector classifier with default hyperparameters,
# trained on the training split and scored (accuracy) on the validation split.
svm_clf = SVC().fit(X_train, y_train)
svm_clf.score(X_val, y_val)

0.9785

In [9]:
# Baseline 2: random forest with default hyperparameters.
# The forest is stochastic (bootstrap samples, random feature subsets), so it
# is seeded to make a Restart & Run All reproduce the same model and score;
# 42 matches the seed already used for the blender further down.
random_forest_clf = RandomForestClassifier(random_state=42)
random_forest_clf.fit(X_train, y_train)
random_forest_clf.score(X_val, y_val)

0.9677

In [10]:
# Baseline 3: extremely randomized trees with default hyperparameters.
# Split thresholds are drawn at random, so the model is seeded to make a
# Restart & Run All reproduce the same ensemble and score; 42 matches the
# seed already used for the blender further down.
xtra_trees_clf = ExtraTreesClassifier(random_state=42)
xtra_trees_clf.fit(X_train, y_train)
xtra_trees_clf.score(X_val, y_val)

0.9702

In [11]:
# Hard-voting ensemble: each member casts one vote per sample and the
# majority class wins.
voting_members = [
    ('svm', svm_clf),
    ('rf', random_forest_clf),
    ('xt', xtra_trees_clf),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

# fit() returns the estimator itself, so no re-assignment is needed.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_val, y_val)

0.9733

In [12]:
# Base models whose predictions feed the blender; the order here fixes the
# column order of the stacked prediction matrix.
estimators = [
    random_forest_clf,
    xtra_trees_clf,
    svm_clf,
]

In [13]:
# Stack each base model's validation-set predictions side by side:
# one row per validation sample, one float32 column per estimator.
X_val_predictions = np.column_stack(
    [estimator.predict(X_val) for estimator in estimators]
).astype(np.float32)

In [14]:
# Inspect the stacked predictions: each row is one validation sample, each
# column the class predicted by one of the base estimators.
X_val_predictions

array([[7., 7., 7.],
       [2., 2., 2.],
       [1., 1., 1.],
       ...,
       [4., 4., 4.],
       [5., 5., 5.],
       [6., 6., 6.]], dtype=float32)

In [15]:
# Blender (stacking meta-learner): a seeded random forest trained on the base
# models' validation predictions. oob_score=True makes the forest record an
# out-of-bag accuracy estimate during fitting.
blender_params = dict(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender = RandomForestClassifier(**blender_params)
rnd_forest_blender.fit(X_val_predictions, y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
# Out-of-bag accuracy of the blender, recorded during fit because it was
# constructed with oob_score=True.
rnd_forest_blender.oob_score_

0.9748

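Neither the voting classifier (0.9733 validation accuracy) nor the stacking classifier (0.9748 out-of-bag accuracy) seems to be able to beat the SVM on its own (0.9785 validation accuracy) in this case.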