### Ensemble Learning And Random Forests

In [1]:
import numpy as np

### Voting Classifier With MNIST Data Set

In [2]:
from sklearn.datasets import fetch_openml

# Download the MNIST dataset (70,000 samples x 784 pixel features) from OpenML.
# as_frame=False forces NumPy arrays: newer scikit-learn releases default to
# as_frame='auto', which can hand back pandas objects for this dataset, so
# pinning it keeps the splits below as plain arrays regardless of the
# installed version.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Cast the labels to 64-bit integers so they can be used as numeric classes.
mnist.target = mnist.target.astype(np.int64)

mnist["data"].shape

(70000, 784)

In [3]:
from sklearn.model_selection import train_test_split

# Carve the 70,000 samples into 50,000 training, 10,000 validation and
# 10,000 test samples using two consecutive, identically seeded splits.

# Split 1: hold out 10,000 samples as the final test set.
(X_train_validate, X_test,
 y_train_validate, y_test) = train_test_split(
    mnist["data"],
    mnist.target,
    test_size=10000,
    random_state=11,
)

# Split 2: hold out another 10,000 samples from the remainder for validation.
(X_train, X_validate,
 y_train, y_validate) = train_test_split(
    X_train_validate,
    y_train_validate,
    test_size=10000,
    random_state=11,
)

X_train.shape

(50000, 784)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Base learner 1: a random forest with default hyper-parameters, seeded so
# the fit is reproducible, trained on the 50,000-sample training split.
rf_clf = RandomForestClassifier(random_state=12)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=12, verbose=0,
                       warm_start=False)

In [5]:
# Accuracy of the random forest on the held-out validation split.
rf_clf.score(X_validate, y_validate)

0.9672

In [6]:
from sklearn.ensemble import ExtraTreesClassifier

# Base learner 2: an extra-trees ensemble with default hyper-parameters,
# seeded for reproducibility and trained on the same training split.
et_clf = ExtraTreesClassifier(random_state=13)
et_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=13, verbose=0,
                     warm_start=False)

In [7]:
# Accuracy of the extra-trees classifier on the held-out validation split.
et_clf.score(X_validate, y_validate)

0.9687

In [10]:
from sklearn.svm import LinearSVC

# Base learner 3: a linear SVM with default hyper-parameters, seeded for
# reproducibility and trained on the same training split.
# NOTE(review): the pixel features are passed in unscaled and the validation
# accuracy reported below (0.8713) trails both tree ensembles -- consider
# feature scaling and/or a higher max_iter; confirm whether the fit converged.
lin_svm_clf = LinearSVC(random_state=14)
lin_svm_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=14, tol=0.0001,
          verbose=0)

In [12]:
lin_svm_clf.score(X_validate, y_validate)

0.8713

In [13]:
from sklearn.ensemble import VotingClassifier

# (label, estimator) pairs that make up the ensemble; the labels are the
# keys later used to address individual members through set_params().
estimators = [
    ('random_forest_clf', rf_clf),
    ('extra_trees_clf', et_clf),
    ('lin_svm_clf', lin_svm_clf),
]
# Hard voting: the ensemble predicts the class chosen by most members.
vote_clf_hard = VotingClassifier(estimators=estimators, voting='hard')

In [14]:
# Train the hard-voting ensemble on the training split.
vote_clf_hard.fit(X_train, y_train)



VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_lea

In [15]:
# Validation accuracy of the three-member hard-voting ensemble.
vote_clf_hard.score(X_validate, y_validate)

0.9655

In [17]:
# Report each individual base learner's validation accuracy, one per line,
# for comparison with the ensemble's score.
for clf_name, clf in estimators:
    validation_accuracy = clf.score(X_validate, y_validate)
    print(clf_name, validation_accuracy)

random_forest_clf 0.9672
extra_trees_clf 0.9687
lin_svm_clf 0.8713


In [19]:
# Disable the linear SVM (the weakest member on the validation split) in the
# ensemble configuration. The string 'drop' is the supported way to switch
# off a named estimator; passing None for this purpose was deprecated in
# scikit-learn 0.22 and removed in later releases.
# This changes the configuration (`estimators`) only; the list of already
# fitted members (`estimators_`) is handled separately.
vote_clf_hard.set_params(lin_svm_clf='drop')

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_lea

In [20]:
# Inspect the configured (label, estimator) pairs after the set_params() call.
vote_clf_hard.estimators

[('random_forest_clf',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=12, verbose=0,
                         warm_start=False)),
 ('extra_trees_clf',
  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fracti

In [21]:
# Inspect the fitted members (trailing underscore = set during fit).
vote_clf_hard.estimators_

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=12, verbose=0,
                        warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs

In [22]:
# Remove the fitted linear SVM from the list of trained members so that
# predictions no longer use it, without refitting the whole ensemble.
# Filtering by type (rather than deleting a fixed index) keeps this cell
# idempotent: running it a second time is a no-op instead of deleting a
# different estimator or raising IndexError.
vote_clf_hard.estimators_ = [
    fitted for fitted in vote_clf_hard.estimators_
    if not isinstance(fitted, LinearSVC)
]

In [23]:
# Validation accuracy of the hard-voting ensemble with the linear SVM removed.
vote_clf_hard.score(X_validate, y_validate)

0.969

In [24]:
# Switch the already-fitted ensemble from hard to soft voting.
# NOTE(review): the variable is still called `vote_clf_hard` although it
# performs soft voting from this point on.
vote_clf_hard.voting = "soft"

In [25]:
# Validation accuracy of the two-member ensemble under soft voting.
vote_clf_hard.score(X_validate, y_validate)

0.9694

In [27]:
# Final check of the soft-voting ensemble on the held-out test split.
vote_clf_hard.score(X_test, y_test)

0.9706

In [30]:
# Test accuracy of each remaining fitted member, in ensemble order, for
# comparison with the ensemble's own test score.
[estimator.score(X_test, y_test) for estimator in vote_clf_hard.estimators_]

[0.9676, 0.9707]

### Stacking With The MNIST Data Set Using A Blender

In [32]:
# Fresh, unfitted base learners for the stacking experiment. These rebind
# the names used in the voting section and use different seeds.
rf_clf, et_clf, lin_svm_clf = (
    RandomForestClassifier(random_state=11),
    ExtraTreesClassifier(random_state=12),
    LinearSVC(random_state=13),
)

In [33]:
# Train the stacking random forest on the training split only; the
# validation split is used later to build the blender's inputs.
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=11, verbose=0,
                       warm_start=False)

In [34]:
# Train the stacking extra-trees classifier on the training split only.
et_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=12, verbose=0,
                     warm_start=False)

In [35]:
# Train the stacking linear SVM on the training split only.
lin_svm_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=13, tol=0.0001,
          verbose=0)

In [37]:
# Uninitialised float32 matrix with one row per validation sample and one
# column per base learner; it is filled with predicted labels afterwards.
X_val_predictions = np.empty((len(X_validate), 3), dtype=np.float32)

In [44]:
# Build the blender's training features: column k holds base learner k's
# predicted label for every validation sample (order: random forest,
# extra trees, linear SVM).
# Each classifier predicts the whole validation set in a single batched
# call. This produces the same labels as predicting row by row, but avoids
# 30,000 single-sample predict() calls and their per-call overhead.
for col, base_clf in enumerate((rf_clf, et_clf, lin_svm_clf)):
    X_val_predictions[:, col] = base_clf.predict(X_validate)

# Sanity check: show the three predictions for every 1000th sample.
for row in X_val_predictions[::1000]:
    print(row)

[9. 9. 9.]
[9. 9. 9.]
[1. 1. 1.]
[8. 8. 7.]
[9. 9. 3.]
[3. 3. 3.]
[3. 3. 3.]
[6. 6. 6.]
[6. 6. 6.]
[3. 3. 3.]


In [48]:
# Blender (meta-learner): a 200-tree random forest that learns to combine
# the base learners' predictions. oob_score=True requests an out-of-bag
# accuracy estimate during fit.
# NOTE(review): the out-of-bag score is not read in the cells shown here.
blender_clf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=14)

In [49]:
# Train the blender on the base learners' validation-set predictions; the
# base learners themselves were fitted on the training split only.
blender_clf.fit(X_val_predictions, y_validate)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=14, verbose=0,
                       warm_start=False)

In [50]:
# Confirm the blender input has one row per validation sample and one
# column per base learner.
X_val_predictions.shape

(10000, 3)

In [51]:
# Build the blender's test-time features the same way as its training
# features: one float32 column of predicted labels per base learner
# (order: random forest, extra trees, linear SVM).
X_test_predictions = np.empty((len(X_test), 3), dtype=np.float32)
# One batched predict() per classifier instead of one call per sample:
# same labels, without 30,000 single-row prediction calls.
for col, base_clf in enumerate((rf_clf, et_clf, lin_svm_clf)):
    X_test_predictions[:, col] = base_clf.predict(X_test)

# Sanity check: show the three predictions for every 1000th test sample.
for row in X_test_predictions[::1000]:
    print(row)

[0. 0. 0.]
[8. 8. 3.]
[3. 3. 3.]
[0. 0. 0.]
[4. 4. 4.]
[4. 4. 9.]
[3. 3. 3.]
[3. 3. 3.]
[0. 0. 0.]
[2. 2. 4.]


In [57]:
# Final stacked predictions for the test set: the blender maps the three
# base-learner predictions per sample to a single label.
# NOTE(review): `stacking_test_predictions` is not used again in the cells
# shown here; the score below recomputes the predictions internally.
stacking_test_predictions = blender_clf.predict(X_test_predictions)
# Test accuracy of the full stacking ensemble.
blender_clf.score(X_test_predictions, y_test)

0.9694