**Chapter 7 – Ensemble Learning and Random Forests**

# 8. Train various classifiers on MNIST data, then combine them into an ensemble

In [32]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, f1_score
import numpy as np
import time

In [2]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
X, y = mnist["data"], mnist["target"]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 10000, random_state=43)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 10000, random_state=42)
print 'X_train shape:', X_train.shape
print 'X_val shape:', X_val.shape
print 'X_test shape:', X_test.shape

X_train shape: (50000, 784)
X_val shape: (10000, 784)
X_test shape: (10000, 784)


In [9]:
# It takes about an hour to train the log_reg model. The others are pretty fast.
vote_clf = VotingClassifier([('log_reg', LogisticRegression(penalty='l2', n_jobs=-1)), 
                             ('rf', RandomForestClassifier(n_estimators=100, min_samples_leaf=1, n_jobs=-1)),
                             ('et', ExtraTreesClassifier(n_estimators=100, min_samples_leaf=1, n_jobs=-1))],
               voting='soft')
vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('log_reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', R...mators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

In [19]:
print 'Validation accuracy:'
print 'Logistic Regression:', accuracy_score(y_val, vote_clf.estimators_[0].predict(X_val))
print 'Random Forest:', accuracy_score(y_val, vote_clf.estimators_[1].predict(X_val))
print 'Extra Trees Classifier:', accuracy_score(y_val, vote_clf.estimators_[2].predict(X_val))
print 'Ensemble:', accuracy_score(y_val, vote_clf.predict(X_val))

Validation accuracy:
Logistic Regression: 0.9142
Random Forest: 0.9686
Extra Trees Classifier: 0.971
Ensemble: 0.9538


In [16]:
print 'Test accuracy:'
print 'Logistic Regression:', accuracy_score(y_test, vote_clf.estimators_[0].predict(X_test))
print 'Random Forest:', accuracy_score(y_test, vote_clf.estimators_[1].predict(X_test))
print 'Extra Trees Classifier:', accuracy_score(y_test, vote_clf.estimators_[2].predict(X_test))
print 'Ensemble:', accuracy_score(y_test, vote_clf.predict(X_test))

Test accuracy:
Logistic Regression: 0.9154
Random Forest: 0.9672
Extra Trees Classifier: 0.9723
Ensemble: 0.9579


# Combine predictions from the individual classifiers above and train a classifier on those predictions.

In [36]:
preds = []
for est in vote_clf.estimators_:
    preds.append(est.predict_proba(X_train))

In [35]:
m_train = np.c_[preds[0], preds[1], preds[2]]
m_train.shape

(50000, 30)

In [48]:
rf_blender = RandomForestClassifier(n_estimators=100)
param_grid = {'min_samples_leaf':[1, 2, 3, 5, 10]}
grid_search = GridSearchCV(rf_blender, param_grid, scoring='accuracy', n_jobs=-1)
grid_search.fit(m_train, y_train)
print 'Best parameters:', grid_search.best_params_

 Best parameters: {'min_samples_leaf': 1}


In [44]:
preds = []
for est in vote_clf.estimators_:
    preds.append(est.predict_proba(X_val))
m_val = np.c_[preds[0], preds[1], preds[2]]

In [51]:
blender_val_preds = grid_search.best_estimator_.predict(m_val)

print 'Random Forest Blender validation accuracy:', accuracy_score(y_val, blender_val_preds)

Random Forest Blender validation accuracy: 0.9385
