In [75]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [76]:
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

In [77]:
type(X)

numpy.ndarray

In [78]:
type(y)

numpy.ndarray

In [79]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

In [80]:
from sklearn.tree import export_graphviz

In [81]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "/home/takashi/ml"
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, fig_id)

def save_fig(fig_id, tight_layout=True):
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(image_path(fig_id) + ".png", format='png', dpi=300)

In [82]:
export_graphviz(
        tree_clf,
        out_file=image_path("iris_tree.dot"),
        feature_names=iris.feature_names[2:],
        rounded=True,
        filled=True
)

In [83]:
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [84]:
tree_clf.predict([[5, 1.5]])

array([1])

In [85]:
from sklearn.tree import DecisionTreeRegressor

In [86]:
m = 1000
X = 6*np.random.rand(m, 1) - 3
y = 0.5*X**2 + X + 2 + np.random.randn(m, 1)

In [87]:
type(X)

numpy.ndarray

In [88]:
type(y)

numpy.ndarray

In [89]:
data = np.concatenate((X, y), axis=1)
data

array([[-0.75275929,  1.70826499],
       [ 2.70428584,  7.02552243],
       [ 1.39196365,  4.7409429 ],
       ...,
       [-2.17908821,  2.15808771],
       [ 2.70142412,  7.92096804],
       [-0.32396536,  1.03609044]])

In [90]:
type(data)

numpy.ndarray

In [91]:
import pandas as pd

column_values = ['X', 'y']
  
df = pd.DataFrame(data = data, 
                  columns = column_values)
df.head()

Unnamed: 0,X,y
0,-0.752759,1.708265
1,2.704286,7.025522
2,1.391964,4.740943
3,0.591951,3.37774
4,-2.063888,2.625719


In [92]:
X = df.X
y = df.y

In [93]:
X.head()

0   -0.752759
1    2.704286
2    1.391964
3    0.591951
4   -2.063888
Name: X, dtype: float64

In [94]:
type(X)

pandas.core.series.Series

In [95]:
y.head()

0    1.708265
1    7.025522
2    4.740943
3    3.377740
4    2.625719
Name: y, dtype: float64

In [96]:
type(y)

pandas.core.series.Series

In [97]:
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X.values.reshape(-1, 1), y)

In [98]:
export_graphviz(
        tree_reg,
        out_file=image_path("noisy_quadratic.dot"),
        #feature_names="X",
        rounded=True,
        filled=True
)

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [100]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [101]:
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

In [105]:
from sklearn.metrics import accuracy_score

In [106]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 1.0
RandomForestClassifier 1.0
SVC 1.0
VotingClassifier 1.0


In [107]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [108]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1)

In [109]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [110]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True)

In [111]:
bag_clf.fit(X_train, y_train)

In [112]:
bag_clf.oob_score_

0.9583333333333334

In [113]:
from sklearn.metrics import accuracy_score

In [114]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [115]:
bag_clf.oob_decision_function_

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.07421227, 0.92578773],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.87883333, 0.12116667],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.39766082, 0.60233918],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.

In [116]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

In [118]:
y_pred_rf = rnd_clf.predict(X_test)

In [119]:
from sklearn.datasets import load_iris

In [120]:
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

In [121]:
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.1048219474498254
sepal width (cm) 0.0247718947754257
petal length (cm) 0.40579618491689
petal width (cm) 0.46460997285785893


In [122]:
from sklearn.ensemble import AdaBoostClassifier

In [123]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)

ada_clf.fit(X_train, y_train)

In [124]:
m = 1000000
X = 6*np.random.rand(m, 1) - 3
y = 0.5*X**2 + X + 2 + np.random.randn(m, 1)

In [125]:
X.shape

(1000000, 1)

In [126]:
y.shape

(1000000, 1)

In [127]:
from sklearn.tree import DecisionTreeRegressor

In [128]:
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

In [129]:
tree_reg1.predict([[1.5]])

array([3.69888352])

In [130]:
y2 = y - tree_reg1.predict(X).reshape(m, 1)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

In [131]:
tree_reg1.predict(X).reshape(m, 1)

array([[3.69888352],
       [2.07131913],
       [2.07131913],
       ...,
       [3.69888352],
       [3.69888352],
       [2.07131913]])

In [132]:
y3 = y2 - tree_reg2.predict(X).reshape(m, 1)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

In [133]:
tree_reg3.predict([[1.5]])

array([0.23515598])

In [134]:
m = 10
X_new = 6*np.random.rand(m, 1) - 3

In [135]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [136]:
y_pred

array([5.82452493, 7.83012604, 5.82452493, 1.6242052 , 5.82452493,
       7.83012604, 5.82452493, 2.16016657, 2.16016657, 3.78773096])

In [137]:
from sklearn.ensemble import GradientBoostingRegressor

In [138]:
y = np.ravel(y)

In [139]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

In [140]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [141]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [142]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

In [143]:
errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
errors

[4.959034071818929,
 4.266803478085732,
 3.70395529652249,
 3.2407131136878924,
 2.86290031878359,
 2.5546493704473954,
 2.301574419298073,
 2.091031186793325,
 1.91856708550536,
 1.7730778035536943,
 1.6565419740907654,
 1.559864124726004,
 1.474400262692707,
 1.4035703041658536,
 1.3454394470837947,
 1.297796759244327,
 1.2590834438043486,
 1.2236652533930377,
 1.1967630560855373,
 1.1743063527522868,
 1.149820168045955,
 1.128248447868228,
 1.1094689455553646,
 1.0956076267540185,
 1.0819474143561711,
 1.070540012266968,
 1.0624875482028595,
 1.0539682344629464,
 1.0466739716144617,
 1.0405911556451655,
 1.0357778418411643,
 1.0312263545847211,
 1.0280662916506749,
 1.0245235809446596,
 1.021510775012297,
 1.0194354763467193,
 1.0171127900561794,
 1.0152110663231837,
 1.0135191963949222,
 1.0123382837462687,
 1.0109329407712078,
 1.0100436093216243,
 1.009310296359136,
 1.0085208406651438,
 1.00762563100221,
 1.007092087427015,
 1.0066462598317405,
 1.0061690145497377,
 1.0057902960

In [144]:
best_n_estimators = np.argmin(errors)
best_n_estimators

118

In [145]:
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

In [146]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

In [147]:
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range (1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

In [148]:
gbrt.n_estimators

108

In [150]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [151]:
import xgboost

In [152]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
val_error

1.0019756530314015

In [155]:
xgb_reg.fit(X_train, y_train,
           eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
val_error



[0]	validation_0-rmse:2.78408
[1]	validation_0-rmse:2.07546
[2]	validation_0-rmse:1.61873
[3]	validation_0-rmse:1.33926
[4]	validation_0-rmse:1.17842
[5]	validation_0-rmse:1.09108
[6]	validation_0-rmse:1.04570
[7]	validation_0-rmse:1.02281
[8]	validation_0-rmse:1.01144
[9]	validation_0-rmse:1.00584
[10]	validation_0-rmse:1.00311
[11]	validation_0-rmse:1.00179
[12]	validation_0-rmse:1.00116
[13]	validation_0-rmse:1.00086
[14]	validation_0-rmse:1.00071
[15]	validation_0-rmse:1.00066
[16]	validation_0-rmse:1.00064
[17]	validation_0-rmse:1.00064
[18]	validation_0-rmse:1.00064
[19]	validation_0-rmse:1.00062
[20]	validation_0-rmse:1.00063
[21]	validation_0-rmse:1.00063


1.0012474511942608