In [3]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

1 -  Possible to have 5 different models that each have a 95% accuracy and build an ensemble model with these predictors that has an accuracy higher than 95% 
How: A predictor can be making more accurate predictions in instances where another predictor is making an inacurrate prediction

What helps: 
- Very different models (independent models, that will make different types of errors) 
- Trained on different subsets of the data (bagging) 
- Use a voting ensemble, predicting the majority voted class

2 - Difference between hard and soft voting classifier
- The hard voting ensemble selects the majority voted class 
- The soft voting ensemble selects the class with the highest class probab, averaged over all individual classifiers (all classifiers need to be able to output class probab)
---> High confidence classes are given more weight

3 - Bagging, pasting, RF models can be sped up by distributing across multiple servers, each predictor is built independently of the other 

On the other hand, boosting and stacking predictors are built on top of one another, and cannot be trained in // (predictors in the same layer can be trained in // w. stacking, but layers need to be trained sequentially) 

4 - Benefit of out of bag evaluation: Only around 63% of instances are sampled in bagging ensembles- 37% of training ensembles are not sampled and cannot therefore be used as out of bag samples. Instead of using less training data (w/ a train / val set split), can use the entire set knowing that ~1/3 of obs. won't be sampled in training set for each predictor

5 - In random forests, each predictor (tree) uses a random subset of features to split each node
With Extra Trees, the same applies, but instead of looking for the optimal split in this random subset of features, a random threshold is set for each feature. Therefore, Extra Trees can be thought of as regularized Random Forests and may perform better in the scenario where Random Forests overfit. 

Since ET don't search for optimal split, they can be trained faster than RFs (but predictions are the same)

6 - If AdaBoost underfits the training data, the hyperparameters that should be tweaked are: 
- Learning rate (increase) 
- n_estimators (increase)

7 - To avoid overfitting in Gradient Boosting: 
- Decrease learning rate
- Implement early stopping (too many estimators / predictors) 

## Question 8 
- Load MNIST data set
- Train SVM, RF, Extra Trees, SVM 
- Bring them into ensemble (soft / hard voting) 

In [2]:
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(np.int64)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

NameError: name 'np' is not defined

In [4]:
X = mnist.data
y = mnist.target

## Get Test set 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 10000, random_state=42)

## Get train and val set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size = 10000, random_state=42)

In [5]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

In [6]:
## Fit the models on the train set, no need for cross validation for now 
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)
et_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
svc_clf = LinearSVC(random_state=42)

In [8]:
## Train models 
estimators = [rf_clf, et_clf, svc_clf]

for estimator in estimators: 
    print("Training the estimator: ", estimator)
    estimator.fit(X_train,y_train)

Training the estimator:  RandomForestClassifier(n_estimators=10, random_state=42)
Training the estimator:  ExtraTreesClassifier(n_estimators=10, random_state=42)
Training the estimator:  LinearSVC(random_state=42)




In [10]:
# the score() method -> Return the mean accuracy on the given test data and labels.
[estimator.score(X_val, y_val) for estimator in estimators]

[0.9469, 0.9492, 0.8695]

In [14]:
## Put predictors in training ensemble 
named_estimators = [
    ("rf",rf_clf), 
    ("et",et_clf), 
    ("svc",svc_clf)
]

voting_classifier = VotingClassifier(named_estimators)
voting_classifier.fit(X_train, y_train)



VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=42)),
                             ('et',
                              ExtraTreesClassifier(n_estimators=10,
                                                   random_state=42)),
                             ('svc', LinearSVC(random_state=42))])

In [15]:
voting_classifier.score(X_val, y_val)

0.9511

In [16]:
[estimator.score(X_val, y_val) for estimator in voting_classifier.estimators_]

## @note: 
## voting_classifier.estimators return an array of tuples - see *1 
## On the other hand voting_classifier.estimators_ returns the models - see *2 


## *1
# [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
#  ('et', ExtraTreesClassifier(n_estimators=10, random_state=42)),
#  ('svc', LinearSVC(random_state=42))]

## *2 
# [RandomForestClassifier(n_estimators=10, random_state=42),
#  ExtraTreesClassifier(n_estimators=10, random_state=42),
#  LinearSVC(random_state=42)]

[0.9469, 0.9492, 0.8695]

In [25]:
## Remove SVC - underperforms 
del voting_classifier.estimators_[2]
voting_classifier.estimators_

[RandomForestClassifier(n_estimators=10, random_state=42),
 ExtraTreesClassifier(n_estimators=10, random_state=42)]

In [26]:
voting_classifier.score(X_val, y_val) ## Bizarrily enough, underperforms! Probably RF and ET performing same errors, models are too similar 

0.9445

In [28]:
voting_classifier.voting = 'soft'
voting_classifier.score(X_val, y_val) ## Better 

0.9595

In [30]:
print("Voting classifier score: ", voting_classifier.score(X_test, y_test))

[estimator.score(X_test, y_test) for estimator in voting_classifier.estimators_]



Voting classifier score:  0.961


[0.9437, 0.9474]

## Question 9

In [None]:
## Run classifiers aboive to make predicftions on validation set 
## Create new training set with predictions 
##    -> Each training instance is a vector containing the set of predictions from all your classifiers for an image, target is image's class 

## Train a classifier on new training set --> Have a blender and form a stacking ensemble 

## Evaluate ensemble on teset set 
##   For each image in test set, make prediction with classifiers, then feed predictions to the blender to get ensemble predictions

In [8]:
## retrain predictors in case
estimators = [rf_clf, et_clf, svc_clf]


for estimator in estimators: 
    print("Training estimator: ", estimator)
    estimator.fit(X_train,y_train)

Training estimator:  RandomForestClassifier(n_estimators=10, random_state=42)
Training estimator:  ExtraTreesClassifier(n_estimators=10, random_state=42)
Training estimator:  LinearSVC(random_state=42)




In [21]:
## Make preditions on validation set 
val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators): 
    val_predictions[:, index] = estimator.predict(X_val)

In [24]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(val_predictions, y_val)

RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)

In [25]:
rnd_forest_blender.oob_score_


0.951

In [26]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators): 
    X_test_predictions[:, index] = estimator.predict(X_test)

y_pred = rnd_forest_blender.predict(X_test_predictions)

array(['8', '4', '8', ..., '3', '8', '3'], dtype=object)