# 8) Combine different classifiers into an ensemble and train on the MNIST dataset

In [1]:
from sklearn.datasets import fetch_mldata
import numpy as np
from sklearn.model_selection import train_test_split
mnist = fetch_mldata('MNIST original')
X, y = mnist['data'], mnist['target']
X = X.astype(np.float64)
X_tr_val, X_test, y_tr_val, y_test = train_test_split(X, y, test_size=15000)
X_train, X_val, y_train, y_val = train_test_split(X_tr_val, y_tr_val, test_size=15000)

In [2]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((40000, 784), (40000,), (15000, 784), (15000,), (15000, 784), (15000,))

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
ensemble = {'Random Forest': RandomForestClassifier(max_depth=6),
            'Extra Trees': ExtraTreesClassifier(max_depth=6),
            'Stochastic Gradient Descent': SGDClassifier(loss='log')}

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.base import clone
estimators_alone = []
y_train_preds = dict()
y_val_preds = dict()
y_test_preds = dict()
for name, base_est in ensemble.items():
    est = clone(base_est)
    est.fit(X_train, y_train)
    estimators_alone.append(est)
    y_pred = est.predict(X_train)
    y_train_preds[name] = y_pred
    y_val_pred = est.predict(X_val)
    y_val_preds[name] = y_val_pred
    # For the stacking ensemble later:
    y_test_pred = est.predict(X_test)
    y_test_preds[name] = y_test_pred
    print("{}:\ttrain: {:.0%}\tvalidation: {:.0%}".format(name, accuracy_score(y_pred, y_train), accuracy_score(y_val_pred, y_val)))

Random Forest:	train: 86%	validation: 85%
Extra Trees:	train: 81%	validation: 80%




Stochastic Gradient Descent:	train: 84%	validation: 83%


In [6]:
from sklearn.ensemble import VotingClassifier
vote_clf = VotingClassifier(ensemble.items(), voting='hard')
vote_clf.fit(X_train, y_train)
y_pred = vote_clf.predict(X_train)
y_val_pred = vote_clf.predict(X_val)
print("{:.0%} {:.0%}".format(accuracy_score(y_train, y_pred), accuracy_score(y_val_pred, y_val)))

  if diff:


89% 88%


  if diff:


# 9) Make a new training set with the predictors and run them through a blender to make a stacked blender

In [23]:
def format_predictions(vals_dict):
    arr_list = np.array(list(vals_dict.values()))
    return arr_list.T
X_stacking = format_predictions(y_train_preds)
X_val_stacking = format_predictions(y_val_preds)
X_test_stacking = format_predictions(y_test_preds)
X_stacking[:5]

array([[8., 8., 6.],
       [7., 7., 7.],
       [4., 4., 4.],
       [0., 0., 0.],
       [2., 2., 2.]])

In [39]:
def fit_and_model_predictions(model, train, val, test, y_train):
    model.fit(train, y_train)
    train_stacked_pred = model.predict(train)
    val_stacked_pred = model.predict(val)
    test_stacked_pred = model.predict(test)
    return model, train_stacked_pred, val_stacked_pred, test_stacked_pred

from sklearn.svm import LinearSVC
stack = ExtraTreesClassifier(max_depth=9, oob_score=True, bootstrap=True)
stack, stacking_preds, stacking_val_preds, stacking_test_preds = fit_and_model_predictions(stack,
                                                                                          X_stacking,
                                                                                          X_val_stacking,
                                                                                          X_test_stacking,
                                                                                          y_train)

stack.oob_score_, accuracy_score(stacking_preds, y_train), accuracy_score(stacking_val_preds, y_val), accuracy_score(stacking_test_preds, y_test)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


(0.883, 0.9048, 0.8902, 0.8909333333333334)