In [None]:
import os
import pandas as pd
import numpy as np
import hds
from plt_rcs import *
# Default figure size (width, height in inches) for every plot in this notebook.
# NOTE(review): `plt` is never imported explicitly here; it presumably comes
# from the star import of `plt_rcs` — confirm.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Show the current working directory before changing it in the next cell.
os.getcwd()

In [None]:
# Move to the data folder so the files below can be opened by bare name.
# NOTE(review): the path is relative to the *current* working directory, so
# running this cell a second time in the same kernel climbs two more levels
# instead of staying put — run it once per kernel session.
os.chdir('../../data')

In [None]:
# First entry of the working directory whose name contains 'Dia'
# (presumably a check that the Diabetes pickle is present — confirm).
list(filter(lambda name: 'Dia' in name, os.listdir()))[0]

In [None]:
# Load the previously saved objects from the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
# `objs` is presumably a dict mapping variable names to objects, since it is
# passed to globals().update() in the next cell — confirm.
objs = pd.read_pickle('Diabetes.pkl')

In [None]:
# Inject every entry of `objs` into the notebook namespace as a global variable.
# The variable names are whatever keys `objs` holds, so they are not visible in
# this source.
globals().update(objs)

In [None]:
# List the variables currently defined in the interactive namespace
# (IPython line magic), including those injected from the pickle.
%whos

In [None]:
# Bind the four data splits explicitly from the loaded mapping. This makes
# their origin visible to readers and static analysers, and a missing key
# fails right here (KeyError) instead of surfacing later as a NameError.
X_train, X_valid, y_train, y_valid = (
    objs[name] for name in ('X_train', 'X_valid', 'y_train', 'y_valid')
)

## 가지치기 전 회귀 모델 학습

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Reference regression tree, fitted without cost-complexity pruning
# (ccp_alpha is left at its default). min_samples_split=30 stops nodes with
# fewer than 30 samples from being split; random_state is fixed so the fit is
# reproducible.
model_full = DecisionTreeRegressor(min_samples_split=30, random_state=0)
model_full.fit(X_train, y_train)

In [None]:
# R^2 of the unpruned tree on the training and validation splits, shown
# together. Under IPython's default settings only the last bare expression of
# a cell is displayed, so two separate `score` calls would hide the training
# score.
pd.Series(
    {
        'train': model_full.score(X_train, y_train),
        'valid': model_full.score(X_valid, y_valid),
    },
    name='r_squared',
)
# Values recorded from a previous run:
# train    0.716395830725133
# valid    0.35409551601258393

In [None]:
# Check the feature importances, largest first.
(
    pd.Series(model_full.feature_importances_, index=model_full.feature_names_in_)
    .sort_values(ascending=False)
)
# Output recorded from a previous run:
# Glucose          0.741621
# BloodPressure    0.062569
# SkinThickness    0.054211
# BMI              0.053741
# Age              0.045612
# Pedigree         0.026888
# Pregnancies      0.015359
# dtype: float64

In [None]:
# Plot the feature importances of the unpruned tree with the `hds` helper.
# NOTE(review): the exact chart type is defined inside `hds` — confirm there.
hds.plot.feature_importance(model_full)

## 트리 모델 시각화

In [None]:
# Number of leaf nodes in the unpruned tree.
model_full.get_n_leaves()

In [None]:
# Depth of the unpruned tree.
model_full.get_depth()

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the unpruned tree on a wide 12x6 canvas with colour-filled nodes.
plt.figure(figsize=(12,6))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` strictly as a list and reject a pandas Index.
plot_tree(model_full, feature_names=list(X_train.columns), filled=True)
plt.show()

In [None]:
# Render the unpruned tree with the `hds` helper.
# NOTE(review): presumably writes 'dtr_full.png' into the working directory,
# since that file is loaded for display afterwards — confirm.
hds.plot.tree(model=model_full, fileName='dtr_full')

In [None]:
from IPython.display import Image

In [None]:
# Display the image file 'dtr_full.png' inline in the notebook.
Image('dtr_full.png')

## 사후 가지치기 경로 확인

In [None]:
# Candidate alphas for cost-complexity pruning, each paired with the summed
# leaf impurity of the corresponding pruned subtree, collected in a DataFrame.
path = pd.DataFrame(model_full.cost_complexity_pruning_path(X_train, y_train))
path.head()
# Output recorded from a previous run:
# ccp_alphas	impurities
# 0	0.000000	2562.532263
# 1	2.634336	2565.166599
# 2	3.450466	2568.617064
# 3	3.849355	2572.466420
# 4	4.250502	2580.967423

## 최적의 비용 복잡도 파라미터 탐색

In [None]:
from sklearn.base import clone

In [None]:
def clone_tree(alpha):
    """Return a freshly fitted copy of ``model_full`` pruned with ``ccp_alpha=alpha``.

    The copy is an unfitted clone that carries the same hyperparameters as
    ``model_full``; only ``ccp_alpha`` is overridden before the clone is
    fitted on the notebook-level ``X_train`` / ``y_train``. ``model_full``
    itself is not modified.

    Parameters
    ----------
    alpha : float
        Cost-complexity pruning strength to set on the clone.

    Returns
    -------
    The fitted clone.
    """
    # set_params() and fit() both return the estimator, so the calls chain.
    return clone(model_full).set_params(ccp_alpha=alpha).fit(X_train, y_train)

In [None]:
# One fitted tree per candidate alpha, in the same order as the rows of `path`.
trees = list(map(clone_tree, path['ccp_alphas']))

In [None]:
# Record the size and fit quality of the tree built for each candidate alpha.
# `trees` was built by iterating over path['ccp_alphas'], so each list lines
# up row by row with `path`.
path['leaves'] = [tree.get_n_leaves() for tree in trees]  # number of leaf nodes
path['tr_rsq'] = [tree.score(X_train, y_train) for tree in trees]  # R^2 on the training split
path['vl_rsq'] = [tree.score(X_valid, y_valid) for tree in trees]  # R^2 on the validation split

In [None]:
# Preview the first rows of `path` with the newly added columns.
path.head()

In [None]:
# Training R^2 (red) and validation R^2 (blue) against ccp_alpha, drawn with
# the `hds` step-plot helper.
hds.plot.step(data=path, x='ccp_alphas', y='tr_rsq', color='red')
hds.plot.step(data=path, x='ccp_alphas', y='vl_rsq', color='blue')
# Restrict the x-axis to the range -5 .. 250.
plt.xlim(-5, 250)
plt.show()

In [None]:
# Pick the alpha whose pruned tree reaches the highest validation R^2.
# np.argsort yields *positions* (ascending by score), so both lookups go
# through .iloc. Plain `[]` would treat the position as an index label, which
# only happens to work while `path` keeps its default 0..n-1 index.
index = np.argsort(path['vl_rsq'])
best_alpha = path['ccp_alphas'].iloc[index.iloc[-1]]
# Display the selected value (an assignment alone shows nothing).
best_alpha
# Value recorded from a previous run:
# np.float64(38.711664229416954)

In [None]:
# Refit the tree at the selected alpha through the notebook's `clone_tree`
# helper rather than repeating its clone / set_params / fit steps by hand.
model_prun = clone_tree(best_alpha)
# Display the fitted estimator, as the original cell's final `fit` call did.
model_prun

In [None]:
# Validation-set predictions from the unpruned and the pruned tree, in that order.
y_pred_full, y_pred_prun = (
    fitted.predict(X_valid) for fitted in (model_full, model_prun)
)

In [None]:
# Regression metrics on the validation split for the unpruned tree
# (computed by the `hds` helper; the exact metric set is defined there).
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_full)

In [None]:
# Regression metrics on the validation split for the pruned tree
# (computed by the `hds` helper; the exact metric set is defined there).
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_prun)