In [None]:
import os
import numpy as np
import pandas as pd
import hds
from plt_rcs import *
# Default every figure to a 4x4 canvas.
# NOTE(review): `plt` is not imported explicitly; presumably it is re-exported by the star import from plt_rcs — confirm.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Show the current working directory before it is changed below.
os.getcwd()

In [None]:
# Move into the data directory that the relative file names below resolve against.
# Guarded so that re-running this cell does not climb two more levels from inside data/
# (the unguarded relative chdir is not idempotent).
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('../../data')

In [None]:
# List the entries of the working directory in sorted order.
sorted(os.listdir())

In [None]:
# Load the pickled objects for this exercise from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
objs = pd.read_pickle('WhiteWine.pkl')

In [None]:
# Inject every entry of `objs` into the notebook namespace as a variable.
# NOTE(review): the names are created implicitly; which variables exist depends on the pickle's contents.
globals().update(objs)

In [None]:
# List the variables now defined in the interactive namespace.
%whos

In [None]:
# Bind the four data splits explicitly from the unpickled container instead of
# re-assigning the injected names to themselves (a no-op). This keeps their provenance
# visible and fails fast with a KeyError if the pickle lacks one of them.
# NOTE(review): assumes `objs` supports lookup by these names (it is passed to
# globals().update as a mapping) — confirm.
X_train, X_valid, y_train, y_valid = (
    objs[name] for name in ('X_train', 'X_valid', 'y_train', 'y_valid')
)

## 가지치기 전 분류 모델 학습

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Unpruned baseline tree: a node needs at least 100 samples to be split further;
# random_state is fixed so refits are repeatable.
model_full = DecisionTreeClassifier(min_samples_split=100, random_state=0)

In [None]:
# Fit the unpruned tree on the training split.
model_full.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the training split
model_full.score(X=X_train, y=y_train)
# 0.8445157526254375
# Accuracy on the validation split
model_full.score(X=X_valid, y=y_valid)
# 0.791156462585034

## 모델 학습 결과 확인

In [None]:
# Inspect the model's feature importances, largest first
(
    pd.Series(model_full.feature_importances_, index=model_full.feature_names_in_)
    .sort_values(ascending=False)
)
# alcohol                 0.532137
# volatile acidity        0.101741
# pH                      0.067792
# residual sugar          0.067128
# chlorides               0.058272
# density                 0.055429
# sulphates               0.049044
# total sulfur dioxide    0.034607
# fixed acidity           0.033050
# citric acid             0.000800
# dtype: float64

In [None]:
# Plot the fitted model's feature importances with the hds helper
# (rendering details live in hds and are not visible here).
hds.plot.feature_importance(model_full)

## 트리 모델 시각화

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the unpruned tree on a wide canvas so its many nodes stay legible.
plt.figure(figsize=(12, 6))
plot_tree(
    model_full,
    # Pass plain lists: scikit-learn 1.3.0 validates these two parameters as
    # list-or-None and rejects a pandas Index / NumPy array.
    feature_names=list(X_train.columns),
    class_names=model_full.classes_.astype(str).tolist(),
    filled=True,
)
plt.show()

In [None]:
# Render the unpruned tree through the hds helper.
# NOTE(review): presumably writes 'dtc_full.png' into the working directory (the cells below look for it) — confirm against hds.
hds.plot.tree(model=model_full, fileName='dtc_full')

In [None]:
# List the PNG files in the working directory. Match on the file extension
# (case-insensitively) rather than on a bare 'png' substring anywhere in the name.
[name for name in os.listdir() if name.lower().endswith('.png')]

In [None]:
from IPython.display import Image

In [None]:
# Display the file dtc_full.png from the working directory inline.
Image('dtc_full.png')

In [None]:
# Check the number of terminal (leaf) nodes
model_full.get_n_leaves()
# np.int64(62)

In [None]:
# Look up the maximum depth of the tree
model_full.get_depth()
# 13

## 사후 가지치기 경로 확인

In [None]:
# Compute the cost-complexity pruning path on the training split
# (the ccp_alphas / impurities pairs previewed below).
path = model_full.cost_complexity_pruning_path(X_train, y_train)

In [None]:
# Convert the returned path into a DataFrame so per-alpha columns can be added later.
path = pd.DataFrame(path)

In [None]:
# Preview the first rows of the pruning path
path.head()
# ccp_alphas	impurities
# 0	0.000000	0.211743
# 1	0.000022	0.211765
# 2	0.000032	0.211797
# 3	0.000049	0.211845
# 4	0.000054	0.211900

## 최적의 비용 복잡도 파라미터 탐색

In [None]:
from sklearn.base import clone

In [None]:
def clone_tree(alpha):
    """Return a fresh copy of ``model_full`` refit on the training split with the given pruning strength.

    Parameters
    ----------
    alpha : float
        Value assigned to the clone's ``ccp_alpha`` parameter.

    Returns
    -------
    The clone after fitting on ``X_train`` / ``y_train``; ``model_full`` itself is not refit.
    """
    # set_params() and fit() both hand back the estimator, so the calls can be chained.
    return clone(model_full).set_params(ccp_alpha=alpha).fit(X_train, y_train)

In [None]:
# Fit one pruned tree per alpha on the pruning path, in path order.
trees = list(map(clone_tree, path['ccp_alphas']))

In [None]:
# Record, next to each alpha, the pruned tree's leaf count and its accuracy
# on the training and validation splits.
path['leaves'] = [tree.get_n_leaves() for tree in trees]
path['tr_acc'] = [tree.score(X_train, y_train) for tree in trees]
path['vl_acc'] = [tree.score(X_valid, y_valid) for tree in trees]
path.head()
# ccp_alphas	impurities	leaves	tr_acc	vl_acc
# 0	0.000000	0.211743	62	0.844516	0.791156
# 1	0.000022	0.211765	61	0.844516	0.791156
# 2	0.000032	0.211797	60	0.844516	0.791156
# 3	0.000049	0.211845	59	0.844516	0.791156
# 4	0.000054	0.211900	58	0.844516	0.791156

In [None]:
# Training (red) and validation (blue) accuracy against ccp_alpha, drawn with the hds step-plot helper.
hds.plot.step(data=path, x='ccp_alphas', y='tr_acc', color='red')
hds.plot.step(data=path, x='ccp_alphas', y='vl_acc', color='blue')

In [None]:
# Position of the highest validation accuracy; argmax reports the first one when several rows tie.
np.argmax(path['vl_acc'])
# np.int64(41)

In [None]:
# Rank the rows by validation accuracy. Several rows tie for the best score, and the
# default sort does not define the order of ties. A stable sort keeps tied rows in
# their original order (ascending ccp_alpha in the path shown above), so the last
# position is reliably the largest alpha, i.e. the most strongly pruned tree, among the best.
indices = np.argsort(path['vl_acc'], kind='stable')
indices.iloc[-1]
# np.int64(42)  (recorded with the default sort; the stable sort may pick another tied row)

In [None]:
# Alpha at the top-ranked position; .iloc makes the positional lookup explicit.
best_alpha = path['ccp_alphas'].iloc[indices.iloc[-1]]
# np.float64(0.0077337653548115864)

## 가지치기 후 분류 모델 학습

In [None]:
# Unfitted copy of model_full carrying the same hyperparameters.
model_prun = clone(model_full)

In [None]:
# Apply the selected pruning strength and refit on the training split.
model_prun.set_params(ccp_alpha=best_alpha)
model_prun.fit(X_train, y_train)

In [None]:
# Accuracy of the pruned tree on the training split
model_prun.score(X_train, y_train)
# 0.8042590431738623
# Accuracy of the pruned tree on the validation split
model_prun.score(X_valid, y_valid)
# 0.7959183673469388

In [None]:
# Maximum depth after pruning
model_prun.get_depth()
# 2

In [None]:
# Number of leaf nodes after pruning
model_prun.get_n_leaves()
# np.int64(3)

In [None]:
# Draw the pruned tree with readable class labels.
plot_tree(
    model_prun,
    # Pass a plain list: scikit-learn 1.3.0 validates feature_names as
    # list-or-None and rejects a pandas Index.
    feature_names=list(X_train.columns),
    # Names are matched to the classes in ascending order.
    class_names=['Good', 'Best'],
    filled=True,
)
plt.show()

## 분류 모델 성능 평가

In [None]:
# Hard class predictions on the validation split from the unpruned and the pruned tree.
y_pred_full = model_full.predict(X_valid)
y_pred_prun = model_prun.predict(X_valid)

In [None]:
# Validation metrics for the unpruned tree (hds helper; the exact report contents are not visible here).
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_full)

In [None]:
# Validation metrics for the pruned tree (hds helper; the exact report contents are not visible here).
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_prun)

## ROC / PR 곡선

In [None]:
# Generate predicted class probabilities on the validation split
y_prob_full = model_full.predict_proba(X_valid)
y_prob_prun = model_prun.predict_proba(X_valid)

In [None]:
# ROC curves via the hds helper: unpruned tree in red, pruned tree in blue
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_full, color='red')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_prun, color='blue')

In [None]:
# Precision-recall curves via the hds helper: unpruned tree in red, pruned tree in blue
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_full, color='red')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_prun, color='blue')

## 불균형 데이터셋에 대한 처리 방법
0. 타겟 벡터의 실제값 상대도수로 분류 기준점 설정
1. 검증셋 정확도 대신 F1 점수 기준으로 가지치기
2. SMOTE를 활용한 데이터셋 균형화
3. class_weight 매개변수에 'balanced' 지정하여 학습

### 타겟 벡터의 실제값 상대도수로 분류 기준점 설정

In [None]:
# Class proportions of the validation target
y_valid.value_counts(normalize=True)
# grade
# 0    0.787075
# 1    0.212925
# Name: proportion, dtype: float64

In [None]:
# Use the positive-class share of the validation target (about 0.2129, see the
# proportions above) as the decision threshold. It is computed from the data rather
# than hand-copied as a rounded constant, so it stays correct if the split changes.
cutoff = (y_valid == 1).mean()
# Label a row positive when its class-1 probability reaches the threshold.
y_pred_best_0 = np.where(y_prob_full[:, 1] >= cutoff, 1, 0)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_0)

### F1 스코어 기준 가지치기

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score helper
def valid_f1_score(tree):
    """Return the F1 score of ``tree``'s predictions on the validation split.

    ``tree`` must expose ``predict``; the predictions for ``X_valid`` are scored
    against ``y_valid`` with ``f1_score``.
    """
    return f1_score(y_true=y_valid, y_pred=tree.predict(X=X_valid))

In [None]:
# Add a column holding each pruned tree's validation F1 score
path['vl_f1s'] = list(map(valid_f1_score, trees))

In [None]:
# Preview the path with the new F1 column
path.head()

In [None]:
# Validation F1 against ccp_alpha (hds step-plot helper), with the x-axis limited to [-0.005, 0.025].
hds.plot.step(data=path, x='ccp_alphas', y='vl_f1s')
plt.xlim(-0.005, 0.025)
plt.show()

In [None]:
# Rank the rows by validation F1. A stable sort keeps tied rows in their original
# order (ascending ccp_alpha), so the last position is reliably the largest alpha
# among the best scores instead of depending on the sort's unspecified tie order.
index = np.argsort(path['vl_f1s'], kind='stable')
best_alpha_f1s = path['ccp_alphas'][index.iloc[-1]]
# np.float64(0.0014843053552403666)  (recorded with the default sort)

In [None]:
# Refit a clone of model_full at the F1-selected alpha and report its validation metrics.
model_best_1 = clone(model_full)
model_best_1.set_params(ccp_alpha=best_alpha_f1s)
model_best_1.fit(X_train, y_train)
y_pred_best_1 = model_best_1.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_1)

### SMOTE 활용

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Resample the training split with SMOTE (5 neighbours, fixed seed), then fit a tree
# with the same settings as model_full on the resampled data.
smote = SMOTE(k_neighbors=5, random_state=0)
X_bal, y_bal = smote.fit_resample(X_train, y_train)
model_best_2 = DecisionTreeClassifier(min_samples_split=100, random_state=0)
model_best_2.fit(X_bal, y_bal)

In [None]:
# Accuracy on the resampled training data (not the original training split)
model_best_2.score(X_bal, y_bal)
# 0.8508019395747856
# Accuracy on the untouched validation split
model_best_2.score(X_valid, y_valid)
# 0.736734693877551

In [None]:
# Validation metrics for the tree trained on the SMOTE-resampled data.
y_pred_best_2 = model_best_2.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_2)

### class_weight='balanced' 설정

In [None]:
# Clone model_full, switch on class_weight='balanced', and refit on the training split.
model_best_3 = clone(model_full)
model_best_3.set_params(class_weight='balanced')
model_best_3.fit(X_train, y_train)

In [None]:
# Accuracy of the class-weighted tree on the training split
model_best_3.score(X_train, y_train)
# 0.793757292882147
# Accuracy of the class-weighted tree on the validation split
model_best_3.score(X_valid, y_valid)
# 0.717687074829932

In [None]:
# Validation metrics for the class-weighted tree.
y_pred_best_3 = model_best_3.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_3)

In [None]:
def cutoff_search(cutoff):
    """Return the validation F1 score obtained by thresholding ``y_prob_full`` at ``cutoff``.

    Rows whose last-column probability is at least ``cutoff`` are labelled 1,
    all others 0, and the labels are scored against ``y_valid``.
    """
    last_col_prob = y_prob_full[:, -1]
    labels = (last_col_prob >= cutoff).astype(int)
    return f1_score(y_true=y_valid, y_pred=labels)

In [None]:
# Candidate thresholds 0.00, 0.01, ..., 1.00. linspace fixes the number of points (101)
# and both endpoints explicitly, whereas the length of a float-step arange depends on
# how stop/step happens to round.
cutoffs = np.linspace(0.0, 1.0, 101)

In [None]:
# Validation F1 for every candidate cutoff, in cutoff order.
vl_f1s = list(map(cutoff_search, cutoffs))

In [None]:
# Highest validation F1 over all candidate cutoffs.
np.max(vl_f1s)

In [None]:
# Position of that maximum; argmax reports the first one when several cutoffs tie.
np.argmax(vl_f1s)

In [None]:
# Rank the cutoffs by validation F1. Neighbouring cutoffs can yield identical predictions
# and therefore tied scores; a stable sort keeps ties in ascending-cutoff order, so the
# last position is reliably the largest cutoff among the best scores instead of
# depending on the sort's unspecified tie order.
index = np.argsort(vl_f1s, kind='stable')
best_cutoff = cutoffs[index[-1]]
best_cutoff

In [None]:
# Re-label the validation rows with the F1-optimal cutoff (this overwrites the earlier
# y_pred_best_0) and report the resulting metrics.
y_pred_best_0 = (y_prob_full[:, -1] >= best_cutoff).astype(int)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_0)