<a href="https://colab.research.google.com/github/yashveersinghsohi/Hands_On_ML_Book_Practice/blob/master/Chapter_7/Chapter7_Ensembling_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Packages

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import (
    VotingClassifier, 
    RandomForestClassifier, 
    BaggingClassifier, 
    AdaBoostClassifier, 
    GradientBoostingRegressor
  )
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons, load_iris

import xgboost

# Data

In [15]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Voting Classifier

## Hard Voting

**Base estimators**

In [16]:
log_clf = LogisticRegression(random_state=42, solver="lbfgs")
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
svm_clf = SVC(random_state=42, gamma="scale")

**Ensemble**

In [17]:
voting_clf = VotingClassifier(
    estimators=[
                ("lr", log_clf), 
                ("rf", rf_clf), 
                ("svm", svm_clf)
              ], 
    voting="hard"
   )

In [18]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                               

**Accuracy**

In [20]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


## Soft Voting

**Base Estimators**

In [21]:
log_clf = LogisticRegression(random_state=42, solver="lbfgs")
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
svm_clf = SVC(random_state=42, gamma="scale", probability=True)

**Ensemble**

In [22]:
voting_clf = VotingClassifier(
    estimators=[
                ("lr", log_clf), 
                ("rf", rf_clf), 
                ("svm", svm_clf)
              ], 
    voting="hard"
   )

In [23]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                               

**Accuracy**

In [24]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


# Bagging and Pasting

In [31]:
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=42), 
    n_estimators=500, 
    bootstrap=True, 
    n_jobs=-1, 
    random_state=42
  )

In [32]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [33]:
accuracy_score(y_test, y_pred)

0.912

**OOB Score**

In [34]:
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=42), 
    n_estimators=500, 
    bootstrap=True, 
    n_jobs=-1, 
    oob_score=True, 
    random_state=42
  )
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8986666666666666

In [35]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [37]:
bag_clf.oob_decision_function_[:5]

array([[0.32352941, 0.67647059],
       [0.35625   , 0.64375   ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

# Random Forests

In [38]:
rnd_clf = RandomForestClassifier(
    n_estimators=500, 
    max_leaf_nodes=16, 
    n_jobs=-1
  )
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

# Feature Importance

In [42]:
iris = load_iris()

rnd_clf = RandomForestClassifier(
    random_state=42, 
    n_estimators=500, 
    n_jobs=-1
  )
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
  print(name, score)

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


# Boosting

## Ada-Boost

In [44]:
ada_clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=42, max_depth=1), 
    n_estimators=200, 
    algorithm="SAMME.R", 
    learning_rate=0.5
  )
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [45]:
accuracy_score(
    y_true=y_test, 
    y_pred=ada_clf.predict(X_test)
  )

0.896

## Gradient Boosting

In [47]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

**GBM with Early Stopping**

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(
    random_state=42, 
    max_depth=2, 
    n_estimators=120
  )
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [52]:
errors = [mean_squared_error(y_true=y_val, y_pred=y_pred) 
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1

In [53]:
gbrt_best = GradientBoostingRegressor(
    random_state=42, 
    max_depth=2, 
    n_estimators=bst_n_estimators
  )
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=53,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
gbrt = GradientBoostingRegressor(
    random_state=42, 
    max_depth=2, 
    warm_start=True
  )

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
  gbrt.n_estimators = n_estimators
  gbrt.fit(X_train, y_train)
  
  y_pred = gbrt.predict(X_val)
  val_error = mean_squared_error(y_val, y_pred)
  
  if val_error < min_val_error:
    min_val_error = val_error
    error_going_up = 0
  else:
    error_going_up += 1
    if error_going_up == 5:
      break # early stopping

In [55]:
gbrt

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=47,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=True)

## XGBoost

In [57]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)



In [58]:
xgb_reg.fit(
    X_train, 
    y_train, 
    eval_set=[(X_val, y_val)], 
    early_stopping_rounds=2
  )
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.264628
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.238158
[2]	validation_0-rmse:0.213671
[3]	validation_0-rmse:0.191839
[4]	validation_0-rmse:0.173536
[5]	validation_0-rmse:0.15676
[6]	validation_0-rmse:0.142701
[7]	validation_0-rmse:0.130993
[8]	validation_0-rmse:0.119751
[9]	validation_0-rmse:0.11102
[10]	validation_0-rmse:0.103301
[11]	validation_0-rmse:0.096759
[12]	validation_0-rmse:0.090696
[13]	validation_0-rmse:0.085071
[14]	validation_0-rmse:0.080967
[15]	validation_0-rmse:0.076923
[16]	validation_0-rmse:0.073651
[17]	validation_0-rmse:0.070516
[18]	validation_0-rmse:0.067908
[19]	validation_0-rmse:0.065979
[20]	validation_0-rmse:0.064584
[21]	validation_0-rmse:0.063215
[22]	validation_0-rmse:0.062086
[23]	validation_0-rmse:0.061187
[24]	validation_0-rmse:0.060842
[25]	validation_0-rmse:0.060447
[26]	validation_0-rmse:0.060134
[27]	validation_0-rmse:0.059679
[28]	validation_0-rmse:0.059234
[29]	validation_0-rm