In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)

In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


In [4]:
# Bagging and Pasting

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [5]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
accuracy_score(y_test, y_pred_tree)

0.856

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=40
)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9226666666666666

In [7]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.92

In [8]:
bag_clf.oob_decision_function_

array([[0.29350649, 0.70649351],
       [0.38743455, 0.61256545],
       [1.        , 0.        ],
       [0.01333333, 0.98666667],
       [0.02857143, 0.97142857],
       [0.08888889, 0.91111111],
       [0.38121547, 0.61878453],
       [0.08108108, 0.91891892],
       [0.9327957 , 0.0672043 ],
       [0.84224599, 0.15775401],
       [0.5309973 , 0.4690027 ],
       [0.03674541, 0.96325459],
       [0.73453608, 0.26546392],
       [0.83819629, 0.16180371],
       [0.93523316, 0.06476684],
       [0.09677419, 0.90322581],
       [0.01804124, 0.98195876],
       [0.9276808 , 0.0723192 ],
       [0.69553806, 0.30446194],
       [0.94750656, 0.05249344],
       [0.06111111, 0.93888889],
       [0.20104439, 0.79895561],
       [0.88947368, 0.11052632],
       [0.97662338, 0.02337662],
       [0.96373057, 0.03626943],
       [0.0026455 , 0.9973545 ],
       [0.9469496 , 0.0530504 ],
       [1.        , 0.        ],
       [0.01595745, 0.98404255],
       [0.73209549, 0.26790451],
       [0.

In [10]:
# random forest

from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred_rf)

0.92

In [14]:
# feature importance

from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09755163695612955
sepal width (cm) 0.02360291953162654
petal length (cm) 0.4442316032773431
petal width (cm) 0.4346138402349008


In [15]:
# Ada boosting

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5
)

ada_clf.fit(X_train, y_train)

y_pred_ada = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred_ada)

0.896

In [28]:
# Gradient boosting

from sklearn.tree import DecisionTreeRegressor
import numpy as np

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

y_pred = np.rint(sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3)))

In [30]:
accuracy_score(y_test, y_pred)

0.896

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)

y_pred = np.rint(gbrt.predict(X_test))

In [42]:
accuracy_score(y_test, y_pred)

0.896

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

m = 200
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train.ravel())

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train.ravel())

In [60]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train.ravel())
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break   # early stopping

In [63]:
import xgboost

xgb_reg = xgboost.XGBRegressor(early_stopping_rounds=2)
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_val, y_val)])
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:2.64507
[1]	validation_0-rmse:1.99173
[2]	validation_0-rmse:1.60981
[3]	validation_0-rmse:1.39387
[4]	validation_0-rmse:1.27240
[5]	validation_0-rmse:1.22875
[6]	validation_0-rmse:1.21273
[7]	validation_0-rmse:1.23555
[8]	validation_0-rmse:1.24589
