jupyter notebook 단축키  
ctrl+enter: 셀 실행  
shift+enter: 셀 실행 및 다음 셀 이동  
alt+enter: 셀 실행, 다음 셀 이동, 새로운 셀 생성  
a: 상단에 새로운 셀 만들기  
b: 하단에 새로운 셀 만들기  
dd: 셀 삭제(x: 셀 잘라내기)  
함수 ( ) 안에서 shift+tab: 인자 설명(arguments description) 보기. shift+tab을 두 번 누르면 설명을 더 길게 볼 수 있음

In [None]:
!pip install IPython
from IPython.display import Image

In [None]:
## 필요 Library 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap

In [None]:
# Clone the course repository so its files are available locally (intended for Colab).
!git clone https://github.com/yunkio/Datamining.git #코랩 사용

In [None]:
# Display an illustration image stored under /content.
# NOTE(review): this notebook clones `Datamining.git`, which by default creates
# a `Datamining` directory, but this path points at `Datamining_DT` -- confirm
# that the path actually exists after cloning.
Image('/content/Datamining_DT/image/image1.png')

# 예제 (1) - 인공 데이터셋

---

## 데이터 셋 준비

### 데이터 만들기

In [None]:
# Generate a synthetic dataset: 1000 samples, 2 informative features, 3 classes
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=3,
    n_clusters_per_class=1,
    class_sep=1.5,
    flip_y=0.05,
    random_state=42,
)

### 데이터 확인

In [None]:
# Collect the two features and the class label into one DataFrame for inspection
df = pd.DataFrame(X, columns=['Feature 1', 'Feature 2']).assign(Target=y)
df

In [None]:
# Colour maps: a pale palette and a saturated palette, one colour per class
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Scatter the samples (feature 1 vs. feature 2), coloured by class label
plt.scatter(*X.T, c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.show()

### 데이터 나누기 (Train - Test split)

In [None]:
# Hold out 30% of the samples as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 모델 훈련하기

### Decision Tree 정의

In [None]:
# Create a decision-tree classifier with a fixed seed and fit it on the training split
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train) # fit on the training data

### 예측 및 Test Data를 활용한 평가

In [None]:
# Predict on the held-out test split, then report each metric against the true labels
y_pred = tree.predict(X_test)
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix :\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

### Hyper Parameter Tuning

* max_depth : Decision Tree의 최대 깊이  
* min_samples_split : 노드를 분할하기 위해 필요한 최소 샘플 개수

In [None]:
# Candidate hyper-parameter values to search over
param_grid = {
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

### Tree 시각화

In [None]:
# Draw the best tree found by the grid search on a wide figure
plt.figure(figsize=(20, 10))
plot_tree(
    grid_search.best_estimator_,
    feature_names=['Feature 1', 'Feature 2'],
    class_names=['Class 0', 'Class 1', 'Class 2'],
    filled=True,
)
plt.show()

### Decision Boundary 시각화

In [None]:
# Bounding box of the data, padded by 1 on every side
x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1

# Dense grid (step 0.01) covering the padded box
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.01),
    np.arange(y_min, y_max, 0.01),
)

In [None]:
# Classify every grid point with the tuned tree, then fold the predicted
# labels back into the grid's 2-D shape
Z = grid_search.best_estimator_.predict(
    np.column_stack((xx.ravel(), yy.ravel()))
).reshape(xx.shape)

In [None]:
# Draw the predicted class regions, then overlay the samples on top
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# The scatter draws the whole dataset (X, y), i.e. train and test samples
# together, so the title must not claim that only training points are shown.
plt.title("2D Decision Boundary plotted with all data points")
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

---

# 예제 (2) - 분류가 더 어려운 인공 데이터셋

## 데이터 셋(2) 준비

In [None]:
# Generate a dataset that is harder to classify: fewer samples and a smaller
# class separation than the first example
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=3,
    n_clusters_per_class=1,
    class_sep=0.7,
    flip_y=0,
    random_state=45,
)

In [None]:
# Tabular view of the new dataset: two feature columns plus the class label
df = pd.DataFrame(X, columns=['Feature 1', 'Feature 2']).assign(Target=y)

# Scatter the samples, coloured by class label
plt.scatter(*X.T, c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.show()

In [None]:
# Split the new dataset 70/30 into train and test parts with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 모델 학습 및 시각화

In [None]:
# Hyper-parameter values to compare in the grid of plots
depths = [2, 5, None]  # values passed as max_depth
min_samples_splits = [20, 5, 2]  # values passed as min_samples_split

In [None]:
# One row per min_samples_split value; every max_depth value gets two columns:
# a narrow tree panel followed by a three-times-wider decision-boundary panel.
# squeeze=False keeps `axes` two-dimensional even when only one row is requested.
fig, axes = plt.subplots(
    len(min_samples_splits),
    len(depths) * 2,
    figsize=(20, 15),
    gridspec_kw={'width_ratios': [1, 3] * len(depths)},
    squeeze=False,
)

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The mesh grid depends only on X, not on the model, so build it (and the
# flattened list of grid points) once instead of once per fitted tree.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for i, min_samples_split in enumerate(min_samples_splits):
    for j, max_depth in enumerate(depths):
        # Fit a tree with this combination of hyper-parameters
        dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
        dt.fit(X_train, y_train)

        # Predicted class for every grid point, reshaped back to the grid
        Z = dt.predict(grid_points).reshape(xx.shape)

        # Decision boundary goes in the odd column (2*j + 1) of this pair
        ax_boundary = axes[i, 2 * j + 1]
        ax_boundary.pcolormesh(xx, yy, Z, cmap=cmap_light)
        ax_boundary.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        ax_boundary.set_xlim(xx.min(), xx.max())
        ax_boundary.set_ylim(yy.min(), yy.max())
        # The accuracy in the title is measured on the held-out test split
        ax_boundary.set_title(f"Boundary: Depth={max_depth}, Split={min_samples_split}\nAccuracy: {dt.score(X_test, y_test):.2f}")

        # Tree diagram goes in the even column (2*j) of this pair
        ax_tree = axes[i, 2 * j]
        plot_tree(dt, filled=True, feature_names=['Feature 1', 'Feature 2'], class_names=['Class 0', 'Class 1', 'Class 2'], ax=ax_tree)

plt.tight_layout()
plt.show()

# 끝