# Ensemble Learning and Random Forests

## Voting classifier

In [81]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate a 500-sample two-moons toy dataset, then hold out 33% of it as a test set.
# Both calls are seeded, so the data and the split are identical on every run.
X, y = make_moons(random_state=235, n_samples=500)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=122,
)

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [83]:
# Three different kinds of base learner for the voting ensemble.
log_clf = LogisticRegression()
# Seed the forest: its bootstrap samples and per-split feature draws are random,
# so without a fixed random_state its score changes from run to run.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class that gets the most votes
# from the three base classifiers.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

In [85]:
from sklearn.metrics import accuracy_score

# Fit every base classifier and the voting ensemble on the training set, then
# print each model's class name next to its test-set accuracy (4 decimals).
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # fit() returns the fitted estimator, so the prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, round(accuracy_score(y_test, y_pred), 4))

LogisticRegression 0.9091
RandomForestClassifier 0.9939
SVC 1.0
VotingClassifier 1.0




## Bagging in scikit-learn

In [100]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag 500 decision trees, each trained on 100 training instances drawn with
# replacement (bootstrap=True). oob_score=True additionally requests the
# out-of-bag score, and n_jobs=-1 trains on all available CPU cores.
# random_state pins the bootstrap draws so the accuracy and the OOB score
# are reproducible across runs.
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            max_samples=100,
                            bootstrap=True,
                            n_jobs=-1,
                            oob_score=True,
                            random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

# Test-set accuracy of the 500-tree ensemble (100 samples per tree)
accuracy_score(y_test, y_pred)

0.9939393939393939

In [101]:
# Baseline: a single decision tree trained on the first 100 training instances,
# for comparison with the bagged ensemble.
# The tree permutes candidate features at each split, so seed it to keep the
# baseline score reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train[:100], y_train[:100])
y_pred = tree_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9515151515151515

In [102]:
# Out-of-bag accuracy estimate of the bagging ensemble.
# The attribute is available because bag_clf was created with oob_score=True.
bag_clf.oob_score_

0.9850746268656716

## Random Forest

In [104]:
from sklearn.ensemble import RandomForestClassifier

In [112]:
# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all
# available CPU cores (n_jobs=-1). Only the first 100 training instances are used.
# random_state pins the forest's randomness so the score below is reproducible.
rnd_clf = RandomForestClassifier(n_estimators=500,
                                 max_leaf_nodes=16,
                                 n_jobs=-1,
                                 random_state=42)
rnd_clf.fit(X_train[:100], y_train[:100])
y_pred_rf = rnd_clf.predict(X_test)

In [113]:
# Test-set accuracy of the random forest's predictions
accuracy_score(y_test, y_pred_rf)

1.0

In [119]:
# Build a (roughly) equivalent model with the BaggingClassifier class.
# A random forest grows each tree with the default best-split search over a
# random subset of sqrt(n_features) candidate features, so the base tree needs
# max_features='sqrt'. splitter='random' changes how split thresholds are chosen
# instead, which is not what RandomForestClassifier does.
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
                            n_estimators=500,
                            max_samples=1.0,
                            bootstrap=True,
                            n_jobs=-1,
                            random_state=42)  # seeded so the comparison is reproducible
bag_clf.fit(X_train[:100], y_train[:100])
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0