https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html  
https://mljar.com/blog/visualize-decision-tree/

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.datasets import make_moons, make_circles
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split


In [None]:
# Build a noisy two-class "circles" toy data set.
X, y = make_circles(
    n_samples=400,
    noise=0.3,  # lowering the noise makes the problem easier
    factor=0.1,
    shuffle=True,
    random_state=195397,  # use your own student ID as the seed
)

# Split the data: half for training, half held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=195397  # change to your own student ID
)

print(f"X shape: {X.shape}")
# print("y:", y)

# X has exactly two columns, so unpacking the transpose yields the
# (x-coordinates, y-coordinates) pair that scatter expects.
# Training points are outlined in black, test points in gray.
plt.scatter(*X_train.T, c=y_train, cmap=plt.cm.brg, s=80, edgecolor='k')
plt.scatter(*X_test.T, c=y_test, cmap=plt.cm.brg, s=80, edgecolor='gray')
plt.show()

In [None]:
# Configure an entropy-based decision tree and fit it on the training split
# in one expression (fit() returns the estimator itself).
model = DecisionTreeClassifier(
    criterion='entropy',   # impurity measure used to score candidate splits
    max_depth=None,        # no depth limit; a tree that is too deep risks overfitting
    min_samples_split=20,  # minimum number of samples needed to split an internal node
    min_samples_leaf=1,    # minimum number of samples required at a leaf node
    max_leaf_nodes=None,
    random_state=0,        # fixed
).fit(X_train, y_train)

# Report the size of the learned tree.
print(f"Depth : {model.get_depth()}")
print(f"n leaves : {model.get_n_leaves()}")

# Draw the tree structure itself.
plt.figure(figsize=(5, 10))
plot_tree(model, filled=True, node_ids=True)
plt.show()

pred_train = model.predict(X_train)

# Decision regions with the training points on top: large markers show the
# true label, the small marker inside shows the predicted label.
DecisionBoundaryDisplay.from_estimator(
    model,
    X_train,
    grid_resolution=100,
    response_method="predict",
    cmap=plt.cm.brg,
    alpha=0.8,
    eps=0.5,
)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.brg, s=80, edgecolor='k')
plt.scatter(X_train[:, 0], X_train[:, 1], c=pred_train, cmap=plt.cm.brg, s=10)
plt.show()

# Training accuracy: fraction of training samples predicted correctly.
print("train acc:", np.mean(pred_train == y_train))

In [None]:
# Predict on the held-out test split.
pred_test = model.predict(X_test)

# Test accuracy is the fraction of correct predictions among the TEST samples.
# Averaging the boolean match array normalises by len(y_test); dividing by
# len(y_train) would only be right when both splits have the same size.
print("test acc:", np.mean(pred_test == y_test))


# Decision regions of the fitted model (grid extent is derived from X_train)
# with the test points on top: large markers show the true label, the small
# marker inside shows the predicted label.
DecisionBoundaryDisplay.from_estimator(
            model, X_train, grid_resolution=100, response_method="predict", cmap=plt.cm.brg, alpha=0.8, eps=0.5
)
plt.scatter(X_test[:,0], X_test[:,1], c=y_test, cmap=plt.cm.brg, s=80, edgecolor='gray')
plt.scatter(X_test[:,0], X_test[:,1], c=pred_test, cmap=plt.cm.brg, s=10)
plt.show()

In [None]:
# Fit a decision tree on the training split, then report tree size, train/test
# accuracy and the decision regions for both splits.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,  # depth is not used as a stopping condition here; it does not matter much
    min_samples_split=20,  # how far nodes keep being split seems to be what matters
    min_samples_leaf=1,
    max_leaf_nodes=None,
    random_state=0,  # fixed
)

model.fit(X_train, y_train)

print('Depth :', model.get_depth())
print('n leaves :', model.get_n_leaves())

# Draw the tree structure itself.
plt.figure(figsize=(15, 30))
plot_tree(model, filled=True, node_ids=True)
plt.show()


pred_train = model.predict(X_train)


# Decision regions with the training points on top: large markers show the
# true label, the small marker inside shows the predicted label.
DecisionBoundaryDisplay.from_estimator(
    model, X_train, grid_resolution=100, response_method="predict", cmap=plt.cm.brg, alpha=0.8, eps=0.5
)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.brg, s=80, edgecolor='k')
plt.scatter(X_train[:, 0], X_train[:, 1], c=pred_train, cmap=plt.cm.brg, s=10)
plt.show()

# Training accuracy: fraction of training samples predicted correctly.
print("train acc:", np.mean(pred_train == y_train))

# Predict on the held-out test split.
pred_test = model.predict(X_test)
# Test accuracy must be normalised by the number of TEST samples; averaging
# the boolean match array does that. Dividing by len(y_train) would only be
# right when both splits have the same size.
print("test acc:", np.mean(pred_test == y_test))


# Same decision regions (grid extent is derived from X_train), now with the
# test points on top.
DecisionBoundaryDisplay.from_estimator(
    model, X_train, grid_resolution=100, response_method="predict", cmap=plt.cm.brg, alpha=0.8, eps=0.5
)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.brg, s=80, edgecolor='gray')
plt.scatter(X_test[:, 0], X_test[:, 1], c=pred_test, cmap=plt.cm.brg, s=10)
plt.show()

In [None]:
## Let's try the same thing with the iris data
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split the data: half for training, half held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=195397  # change to your own student ID
)

print(f"X shape: {X.shape}")
print("y:", y)

# Only the first two feature columns are plotted here.
# Training points are outlined in black, test points in gray.
plt.scatter(*X_train.T[:2], c=y_train, cmap=plt.cm.brg, s=80, edgecolor='k')
plt.scatter(*X_test.T[:2], c=y_test, cmap=plt.cm.brg, s=80, edgecolor='gray')
plt.show()

In [None]:
## For inspecting the data statistics
import pandas as pd
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every feature against every other, coloured by
# class label. scatter_matrix creates its own figure from `figsize`, so no
# separate plt.figure() call is made beforehand (it would only leave an
# empty extra figure behind).
scatter_matrix(
    pd.DataFrame(X, columns=iris['feature_names']),
    c=y, cmap=plt.cm.brg, s=100, figsize=(10, 10),
)
plt.show()

In [None]:
## Run a Decision Tree on the iris data

In [None]:
# Fit a decision tree on the iris training split and report tree size and
# train/test accuracy.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,  # depth is not used as a stopping condition here; it does not matter much
    min_samples_split=20,  # how far nodes keep being split seems to be what matters
    min_samples_leaf=1,
    max_leaf_nodes=None,
    random_state=0,  # fixed
)

model.fit(X_train, y_train)

print('Depth :', model.get_depth())
print('n leaves :', model.get_n_leaves())

# Draw the tree structure itself.
plt.figure(figsize=(5, 10))
plot_tree(model, filled=True, node_ids=True)
plt.show()


pred_train = model.predict(X_train)

# NOTE: no DecisionBoundaryDisplay plots in this cell. from_estimator expects
# input with exactly two features, while the model here is fitted on all iris
# feature columns (four), so the 2-D boundary plots used for the circles data
# do not apply.

# Training accuracy: fraction of training samples predicted correctly.
print("train acc:", np.mean(pred_train == y_train))

# Predict on the held-out test split.
pred_test = model.predict(X_test)
# Test accuracy must be normalised by the number of TEST samples; averaging
# the boolean match array does that. Dividing by len(y_train) would only be
# right when both splits have the same size.
print("test acc:", np.mean(pred_test == y_test))
