# Ensemble Learning and Random Forests

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

##  Imports

In [166]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier


## Models

### Voting technique

In [155]:
X, y = make_moons(1000, noise=0.15)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [156]:
svc_clf, rf_clf, log_clf = SVC(), RandomForestClassifier(), LogisticRegression()

voting_clf = VotingClassifier(
    estimators=[('svc', svc_clf), ('rf', rf_clf), ('log',log_clf)],
    voting='hard', 
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('svc',
                              SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='rbf', max_iter=-1,
                                  probability=False, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gi...
                                                     random_state=None,
                                                     verbose=0,
                                                     warm_start=False)),

In [157]:
for clf in (svc_clf, rf_clf, log_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,  accuracy_score(y_test, y_pred))

SVC 0.995
RandomForestClassifier 0.995
LogisticRegression 0.9
VotingClassifier 0.995


###  Bagging and Pasting technique

In [164]:
# Bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, n_jobs=3)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.97

In [175]:
# Cross validation for bagging
score = cross_val_score(bag_clf, X_train, y_train, cv=3)
print(score)
print(np.mean(score))
print(np.std(score))

[0.98127341 0.96629213 0.96992481]
0.9724967850337455
0.006380748236260552


In [167]:
# Pasting
past_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=False, n_jobs=3)
past_clf.fit(X_train, y_train)
y_pred = past_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.97

In [176]:
# Cross validation for pasting
score = cross_val_score(past_clf, X_train, y_train, cv=3)
print(score)
print(np.mean(score))
print(np.std(score))

[0.98127341 0.96254682 0.97368421]
0.972501478415139
0.007690706563740269


#### Bagging implementation 

In [345]:
# Trying to implement bagging algorithm
np.random.seed(42)
sample_size = 100
scores = []
for _ in range(0,100,25):
    rand_set = np.random.permutation(sample_size)
    voting_clf.fit(X_train[rand_set], y_train[rand_set])
    y_pred = voting_clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    print(f'score {score}')


print(f'Mean score : {np.mean(scores)}')
print(f'Score std : {np.std(scores):.4f}')

score 0.975
score 0.97
score 0.975
score 0.975
Mean score : 0.97375
Score std : 0.0022


### Out_of_bag score

In [341]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, oob_score=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(f'oob_score : {bag_clf.oob_score_}')
print(f'score: {accuracy_score(y_test, y_pred)}')

oob_score : 0.975
score: 0.97


In [179]:
bag_clf.oob_decision_function_

array([[0.9182243 , 0.0817757 ],
       [0.97550111, 0.02449889],
       [0.        , 1.        ],
       ...,
       [1.        , 0.        ],
       [0.2639821 , 0.7360179 ],
       [0.21428571, 0.78571429]])

#### oob implementation

In [344]:
# Trying to implement Out_of_bag sample score
'''
What I did here was to use python's set data type to find the intersection between a range of 
numbers from 1 to 100 that where no in the random_sample set(rand_set). 
This set of numbers is what is used as the out_of_bag sample.

Since the  OOB sample size is about 37% of the total training set. if you run ( X_train_oob.size) you would 
get 217 from a total of 1000

X_train_oob.shape => (217, 2)

This oob_score implementation would output a score that is an approximate if oob_score=True

Note: voting is set to hard so it won't give out a probability score 
'''
lst = np.arange(100)
oob_ =[]
np.random.seed(42)
sample_size = 100
scores = []

for _ in range(0,100,25):
    rand_set = np.random.permutation(sample_size)
    oob_sample = set(lst | rand_set)
    
    # getting the oob_samples
    for instance in oob_sample:
        oob_.append(instance)
    X_train_oob = X_train[oob_] # gets all the remaining samples that were not used
    y_train_oob = y_train[oob_]
    
    # fits a model on the oob samples and get a prediction 
    voting_clf.fit(X_train[rand_set], y_train[rand_set])   
    y_pred = voting_clf.predict(X_test)
    
    oob_score = voting_clf.fit(X_train_oob, y_train_oob)
    
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    print(f'score {score}')


print(f'Mean score : {np.mean(scores)}')
print(f'Score std : {np.std(scores):.4f}')
print('================================')
print(f'OOb_score_ : {oob_score.score(X_train_oob, y_train_oob):.4f}')

score 0.975
score 0.975
score 0.97
score 0.975
Mean score : 0.97375
Score std : 0.0022
OOb_score_ : 0.9952


# Random Forest