In [36]:
import numpy as np
import pandas as pd 


from sklearn.datasets import make_moons,load_iris,fetch_openml
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from time import time
from sklearn.svm import LinearSVC

In [3]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

X_train,X_test,y_train,y_test = train_test_split(X,y, random_state = 42)

In [4]:
log_clf = LogisticRegression()
rdn_clf = RandomForestClassifier()
svm_clf = SVC()

In [5]:
voting_clf = VotingClassifier(
    estimators=[('lr',log_clf), ('rf', rdn_clf), ('svc', svm_clf)],
    voting = 'hard'
)
voting_clf.fit(X_train,y_train)

In [6]:
for clf in (log_clf,rdn_clf,svm_clf,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.904


## How bagging works towards ensemble method

In [7]:
# Ensemble method of 500 decision trees classifiers
# Each tree is trained using 100 training instances randomly sampled from the training set
# With replacement (boostrap = true)

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples = 100,
    bootstrap= True,
    n_jobs = -1 # use all gpu
)
bag_clf.fit(X_train,y_train)
y_pred = bag_clf.predict(X_test)

# Begging performs by standard the soft voting instead of hard voting

## Taking advantage of Out of bag evaluation , instead of validation set

In [8]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples = 100,
    bootstrap= True,
    n_jobs = -1,
    oob_score = True
)
bag_clf.fit(X_train,y_train)
bag_clf.oob_score_

0.9253333333333333

## So, according to the oob evaluation , this bagging classifier is likely to achieve about 92% accuracy on the test set

In [9]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.92

In [10]:
# These are the probabilities of each instance has to belong to the positive or negative class, respectively
bag_clf.oob_decision_function_

array([[0.36132316, 0.63867684],
       [0.37795276, 0.62204724],
       [0.99487179, 0.00512821],
       [0.01025641, 0.98974359],
       [0.01832461, 0.98167539],
       [0.09207161, 0.90792839],
       [0.39493671, 0.60506329],
       [0.05483029, 0.94516971],
       [0.96266667, 0.03733333],
       [0.82552083, 0.17447917],
       [0.52238806, 0.47761194],
       [0.056     , 0.944     ],
       [0.72606383, 0.27393617],
       [0.85459184, 0.14540816],
       [0.90288714, 0.09711286],
       [0.09473684, 0.90526316],
       [0.0492228 , 0.9507772 ],
       [0.93041237, 0.06958763],
       [0.65974026, 0.34025974],
       [0.95372751, 0.04627249],
       [0.04533333, 0.95466667],
       [0.25692695, 0.74307305],
       [0.905     , 0.095     ],
       [0.98697917, 0.01302083],
       [0.95263158, 0.04736842],
       [0.00802139, 0.99197861],
       [0.96124031, 0.03875969],
       [1.        , 0.        ],
       [0.02956989, 0.97043011],
       [0.70437018, 0.29562982],
       [0.

## Random Forests

In [11]:
#ensemble of decision trees after Bagging

rdn_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1
)

# 500 trees of 16 nodes using all gpu
rdn_clf.fit(X_train,y_train)
rdn_clf.predict(X_test)

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

## Important of features

With random forests we can set the importance of each features

scikit-learn verifies which features decreases the most of impurity with each tree , making a weighted average 

In [14]:
iris=  load_iris()
rnd_clf= RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1
)
X = iris.data
y = iris.target
rdn_clf.fit(X,y)

In [15]:
for name,score in zip(iris['feature_names'] , rdn_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.10711348023519676
sepal width (cm) 0.023471095498304163
petal length (cm) 0.45187286891340145
petal width (cm) 0.41754255535309753


### Extra trees add randonmness with the decision of a simple threeshold for every tree, instead of trying to find the best parameter to make the division . This increases bias and decreases variance -> Speed grows very fast

# Ensemble methods on MNIST

In [18]:
mnist = fetch_openml('mnist_784', version=1, as_frame= False)

X = mnist['data'].astype(np.float32)
y = mnist['target'].astype(int)

In [21]:
# Normalizing the pixels

X /= 255

In [23]:
# 50 k training
# 10 k validation
# 10 k testing

# First separate 60 k train + val and 10 k testing
# stratify to keep same proportion of classes of vector y in train and test

X_temp, X_test, y_temp, y_test= train_test_split(X,y,test_size = 10000, random_state=42 , stratify=y)

# now separate training into 50 k training and 10 k validation

X_train,X_val,y_train,y_val=train_test_split(X_temp,y_temp,test_size = 10000, random_state=42, stratify = y_temp)


### Time to train different models 
- random forest
- Extra trees
- SVM

In [33]:
clf_rand_forest = RandomForestClassifier(
    n_estimators= 100,
    random_state = 42
)

start = time()
clf_rand_forest.fit(X_train,y_train)
train_time = time() - start

y_pred = clf_rand_forest.predict(X_val)
acc = accuracy_score(y_pred,y_val)


print("Random Forest Accuracy:  ", acc)
print("Random Forest Train time: ", train_time)

Random Forest Accuracy:   0.9703
Random Forest Train time:  14.82360291481018


In [42]:
clf_extra_trees = ExtraTreesClassifier(
    n_estimators= 100,
    random_state=42
)

start = time()
clf_extra_trees.fit(X_train,y_train)
train_time = time() - start

y_pred = clf_extra_trees.predict(X_val)
acc = accuracy_score(y_pred,y_val)

print("Extra Trees Accuracy:  ", acc)
print("Extra Trees Train time: ", train_time)

Extra Trees Accuracy:   0.9735
Extra Trees Train time:  10.087021112442017


In [43]:
clf_svm = LinearSVC(
    random_state=42
)

start = time()
clf_svm.fit(X_train,y_train)
train_time = time() - start

y_pred = clf_svm.predict(X_val)
acc = accuracy_score(y_pred,y_val)

print("SVM Accuracy:  ", acc)
print("SVM Train time: ", train_time)

SVM Accuracy:   0.9173
SVM Train time:  21.39802598953247


## Time to verify the ensemble

In [44]:
# Since LINEARSVC does not have any predict_proba , we can only use HARD VOTING

ensemble = VotingClassifier(
    estimators=[
        ('rf', clf_rand_forest),
        ('et', clf_extra_trees),
        ('svm',clf_svm)
    ],
    voting = 'hard'
)
start = time()
ensemble.fit(X_train,y_train)
train_time = time() - start

y_pred=ensemble.predict(X_val)
acc = accuracy_score(y_pred,y_val)

print("Ensemble Accuracy:  ", acc)
print("Ensemble Train time: ", train_time)


Ensemble Accuracy:   0.971
Ensemble Train time:  45.98756504058838


# Since random forests and extra trees can capture similar mistakes , ensemble does not perform better than extra trees 

# But ensemble can be more stable and decrease individual mistakes combining the advantages of each model