### P047 决策树 - 训练决策树分类模型

In [19]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Seed NumPy's global RNG before anything stochastic runs.
np.random.seed(42)
# Draw a noisy two-moons toy dataset; the generator has its own fixed seed.
# The call returns a (features, labels) pair, kept as `raw_data` and unpacked.
raw_data = make_moons(n_samples=2000, noise=0.25, random_state=42)
data, target = raw_data

In [22]:
# Sanity-check the array shapes: feature matrix first, label vector second.
data.shape, target.shape

((2000, 2), (2000,))

In [23]:
# Hold out a test split (test_size is left at the library default).
# random_state is pinned so the partition no longer depends on the global NumPy
# RNG state at the moment this cell runs: re-running the cell, or running cells
# out of order, always yields the same train/test rows.
# NOTE: scores displayed further down predate this change; re-run the cells
# below to refresh them.
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [24]:
# Baseline: a decision tree with every hyperparameter left at its default.
# fit() hands back the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the estimator repr as the cell output.
classifer = DecisionTreeClassifier().fit(x_train, y_train)
classifer

DecisionTreeClassifier()

In [25]:
# Score the baseline tree on the held-out test split
# (for scikit-learn classifiers `score` reports mean accuracy).
classifer.score(x_test, y_test)

0.902

### P048 决策树 - max_depth 树的最大深度

In [28]:
# Same training data, but the tree is capped at six levels via max_depth.
# Chained construct-and-fit; the trailing bare name preserves the repr output.
classifer = DecisionTreeClassifier(max_depth=6).fit(x_train, y_train)
classifer

DecisionTreeClassifier(max_depth=6)

In [29]:
# Score the depth-limited tree on the held-out test split.
classifer.score(x_test, y_test)

0.928

### P049 决策树 - min_samples_leaf 叶节点所需的最小样本数

In [30]:
# Keep the depth cap of six and additionally set min_samples_leaf to six.
# Chained construct-and-fit; the trailing bare name preserves the repr output.
classifer = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=6,
).fit(x_train, y_train)
classifer

DecisionTreeClassifier(max_depth=6, min_samples_leaf=6)

In [32]:
# Score the depth- and leaf-size-constrained tree on the held-out test split.
classifer.score(x_test, y_test)

0.93

### P050 决策树 - 使用网格搜索获得最优的模型参数

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
# Search space for the grid search: depths 1..9 and minimum leaf sizes 1..19
# (np.arange excludes the stop value), i.e. 9 x 19 = 171 combinations.
params = dict(
    max_depth=np.arange(1, 10),
    min_samples_leaf=np.arange(1, 20),
)

In [37]:
# Configure a search over every combination in `params`, scored by accuracy
# with 5-fold cross-validation. `classifer` serves as the template estimator;
# both of its preset hyperparameters are also dimensions of the grid.
grid_search = GridSearchCV(estimator=classifer, param_grid=params, scoring="accuracy", cv=5)

In [38]:
# Run the cross-validated search on the training split only; the test split
# stays untouched (171 grid candidates x 5 folds).
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(max_depth=6, min_samples_leaf=6),
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             scoring='accuracy')

In [39]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'max_depth': 6, 'min_samples_leaf': 6}