<a href="https://colab.research.google.com/github/yashveersinghsohi/Hands_On_ML_Book_Practice/blob/master/Chapter_7/Practice_Chapter7_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Packages

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.datasets import (
    make_moons, 
    load_iris
  )

from sklearn.ensemble import (
    RandomForestClassifier, 
    VotingClassifier, 
    BaggingClassifier, 
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingRegressor
  )
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score, 
    mean_squared_error
  )

import xgboost

# Voting Classifier

In [2]:
# Two-class "moons" toy dataset; the generator and the split are both seeded.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42
  )
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42
  )

In [3]:
# Three dissimilar base learners that the voting ensemble will combine.
# The random forest draws bootstrap samples and random feature subsets, so it
# is seeded; otherwise its accuracy changes on every Restart & Run All.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

Hard Voting

In [4]:
# Hard voting: the ensemble is configured with voting="hard" over the three
# (name, estimator) pairs below.
named_estimators = [
    ("lr", log_clf),
    ("rf", rnd_clf),
    ("svc", svm_clf),
]
voting_clf = VotingClassifier(estimators=named_estimators, voting="hard")
# Last expression of the cell, so the fitted estimator's repr is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [5]:
# Fit each base model and the ensemble on the training split, then print the
# class name alongside its accuracy on the test split.
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
  print(type(model).__name__, test_accuracy)

LogisticRegression 0.864
RandomForestClassifier 0.888
SVC 0.896
VotingClassifier 0.896


Soft Voting

In [6]:
# Soft voting combines the members' predicted class probabilities, so the SVC
# is created with probability=True to make it expose predict_proba.
# Both the random forest and the SVC's probability calibration use random
# numbers; they are seeded so the reported accuracies are repeatable.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[
                ("lr", log_clf), 
                ("rf", rnd_clf), 
                ("svc", svm_clf)
              ], 
    voting="soft"
  )
# Last expression of the cell, so the fitted estimator's repr is displayed.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [7]:
# Same comparison as for hard voting: train every model, score it on the test
# split, and print "<ClassName> <accuracy>".
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
  print(type(model).__name__, test_accuracy)

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.896


# Bagging 

In [8]:
# Rebuild the seeded moons dataset and its train/test split for this section.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42
  )
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42
  )

In [9]:
# Bagging ensemble of 500 decision trees, each trained on 100 instances drawn
# with replacement; oob_score=True requests the out-of-bag evaluation.
#
# The base learner is passed positionally rather than as `base_estimator=`:
# scikit-learn 1.2 renamed that keyword to `estimator` and later removed the
# old name, whereas the first positional argument works with both spellings.
# random_state pins the bootstrap draws so the scores below are repeatable.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators=500, 
    max_samples=100, 
    bootstrap=True, 
    n_jobs=-1, 
    oob_score=True, 
    random_state=42
  )
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
# Show the out-of-bag score next to the accuracy measured on the test split.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
(bag_clf.oob_score_, test_accuracy)

(0.9253333333333333, 0.912)

In [11]:
# Peek at the first five rows of the fitted ensemble's out-of-bag decision
# function (only a slice is shown to keep the output short).
bag_clf.oob_decision_function_[:5]

array([[0.36031332, 0.63968668],
       [0.40052356, 0.59947644],
       [1.        , 0.        ],
       [0.00755668, 0.99244332],
       [0.0225    , 0.9775    ]])

# Random Forests

In [12]:
# Rebuild the seeded moons dataset and split for this section.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Forest of 500 trees, each capped at 16 leaf nodes, trained on all cores.
# random_state is set so the test accuracy is the same on every re-run;
# without it the forest is rebuilt from different random draws each time.
rnd_clf = RandomForestClassifier(
    n_estimators=500, 
    max_leaf_nodes=16, 
    n_jobs=-1, 
    random_state=42
  )
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
# Last expression of the cell: accuracy on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.92

# Feature Importances

In [13]:
# Fit a forest on the full iris data purely to read its feature importances.
iris = load_iris()
X, y = iris.data, iris.target

# Seeded so the importance figures printed below do not drift between runs.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(X, y)

# One line per feature: its name and its importance rounded to 4 decimals.
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
  print(name, round(score, 4))

sepal length (cm) 0.1029
sepal width (cm) 0.024
petal length (cm) 0.4334
petal width (cm) 0.4397


# AdaBoost

In [14]:
# Rebuild the seeded moons dataset and split for this section.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# AdaBoost over 200 depth-1 trees (decision stumps) with learning rate 0.5.
#
# * The base learner is passed positionally instead of `base_estimator=`:
#   scikit-learn 1.2 renamed the keyword to `estimator` and later dropped the
#   old name; the first positional argument works with both.
# * `algorithm="SAMME.R"` is no longer passed. It was the default in the
#   releases that support it and is rejected by newer releases, so relying on
#   the library default keeps the cell runnable on both.
# * random_state makes the reported accuracy repeatable.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), 
    n_estimators=200, 
    learning_rate=0.5, 
    random_state=42
  )
ada_clf.fit(X_train, y_train)

y_pred = ada_clf.predict(X_test)
# Last expression of the cell: accuracy on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.896

# Gradient Boosting

In [15]:
# Noisy quadratic toy data: 100 inputs drawn uniformly from [-0.5, 0.5) and
# targets y = 3*x^2 plus Gaussian noise scaled by 0.05.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
noise = 0.05 * np.random.randn(100)
y = 3 * X[:, 0] ** 2 + noise

In [19]:
# Tiny gradient-boosted ensemble: three depth-2 trees with learning rate 1.0.
# random_state is set for consistency with the other GradientBoostingRegressor
# cells in this notebook, which all pass random_state=42.
gbrt = GradientBoostingRegressor(
    max_depth=2, 
    n_estimators=3, 
    learning_rate=1.0, 
    random_state=42
  )
gbrt.fit(X, y)
# RMSE on the same data the model was fitted on (a training error, not a
# held-out estimate).
np.sqrt(mean_squared_error(y_true=y, y_pred=gbrt.predict(X)))

0.07097928528094154

Gradient Boosting with early stopping

In [20]:
# Regenerate the seeded quadratic toy data: x uniform in [-0.5, 0.5) and
# y = 3*x^2 plus Gaussian noise scaled by 0.05.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
noise = 0.05 * np.random.randn(100)
y = 3 * X[:, 0] ** 2 + noise

# Hold out 20% of the samples; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Deliberately generous ensemble (120 depth-2 trees); a later cell looks for
# the stage count with the lowest held-out error.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

# RMSE of the full 120-tree model on the held-out split.
test_mse = mean_squared_error(y_true=y_test, y_pred=gbrt.predict(X_test))
np.sqrt(test_mse)

0.04759227782421127

In [23]:
# One held-out MSE per entry yielded by staged_predict.
errors = [
    mean_squared_error(y_true=y_test, y_pred=staged_pred)
    for staged_pred in gbrt.staged_predict(X_test)
]
# argmin is a 0-based position, so +1 turns it into a count of estimators.
best_n_estimators = np.argmin(errors) + 1

In [24]:
# Display the selected number of boosting stages.
best_n_estimators

94

In [25]:
# Retrain with exactly the stage count that minimised the held-out error.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=best_n_estimators,
    random_state=42,
)
gbrt.fit(X_train, y_train)

# RMSE of the trimmed model on the held-out split.
test_mse = mean_squared_error(y_true=y_test, y_pred=gbrt.predict(X_test))
np.sqrt(test_mse)

0.04740912595893978

# XGBoost

In [27]:
# Regenerate the seeded quadratic toy data for the XGBoost section:
# x uniform in [-0.5, 0.5) and y = 3*x^2 plus Gaussian noise scaled by 0.05.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
noise = 0.05 * np.random.randn(100)
y = 3 * X[:, 0] ** 2 + noise

# Hold out 20% of the samples; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# XGBoost regressor with early stopping: training halts once the metric on
# eval_set has failed to improve for 2 consecutive rounds.
#
# early_stopping_rounds is given to the constructor rather than to fit():
# passing it to fit() was deprecated in xgboost 1.6 and is rejected by
# xgboost 2.0+, where it raises a TypeError.
# NOTE(review): this form needs xgboost >= 1.6; pin the version if the
# notebook must also run on older installs.
xgb_reg = xgboost.XGBRegressor(random_state=42, early_stopping_rounds=2)
xgb_reg.fit(
    X_train, y_train, 
    eval_set=[(X_test, y_test)]
  )

y_pred = xgb_reg.predict(X_test)
# Last expression of the cell: RMSE on the held-out split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

[0]	validation_0-rmse:0.260696
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.234901
[2]	validation_0-rmse:0.211825
[3]	validation_0-rmse:0.191044
[4]	validation_0-rmse:0.173044
[5]	validation_0-rmse:0.156515
[6]	validation_0-rmse:0.141707
[7]	validation_0-rmse:0.128399
[8]	validation_0-rmse:0.116099
[9]	validation_0-rmse:0.105241
[10]	validation_0-rmse:0.095796
[11]	validation_0-rmse:0.087486
[12]	validation_0-rmse:0.080175
[13]	validation_0-rmse:0.074239
[14]	validation_0-rmse:0.069596
[15]	validation_0-rmse:0.065375
[16]	validation_0-rmse:0.061101
[17]	validation_0-rmse:0.058473
[18]	validation_0-rmse:0.056363
[19]	validation_0-rmse:0.053959
[20]	validation_0-rmse:0.05253
[21]	validation_0-rmse:0.05154
[22]	validation_0-rmse:0.05064
[23]	validation_0-rmse:0.049467
[24]	validation_0-rmse:0.048814
[25]	validation_0-rmse:0.048431
[26]	validation_0-rmse:0.047989
[27]	validation_0-rmse:0.047679
[28]	validation_0-rmse:0.047529
[29]	validation_0-rms

0.046856680237859674