In [None]:
# 대학 중도탈락률 예측 모델 (Decision Tree 기반)

# 1. 라이브러리 불러오기
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
import graphviz

# 2. Load the data
# Columns are selected by position: the first two columns are skipped, the
# last column is the prediction target, and everything in between is used
# as a feature.
df = pd.read_csv("data/수도권.csv", encoding="utf-8")
X = df.iloc[:, 2:-1]
y = df.iloc[:, -1]

# 3. Split into training and test sets
# 20% of the rows are held out; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

# 4. Build the pipeline
# A single-step pipeline; the step name "decisiontreeregressor" is the prefix
# used for every hyper-parameter name below.
pipe_tree = make_pipeline(DecisionTreeRegressor(random_state=23))
# Lists the tunable parameter names. The result is discarded when the file is
# run as a script; it is only informative when evaluated interactively.
pipe_tree.get_params().keys()

# 5. Validation curve (max_depth)
# Cross-validate on the training split only, so the held-out test split never
# influences the choice of max_depth (the grid search uses the same split).
param_range = list(range(1, 11))
train_scores, test_scores = validation_curve(
    estimator=pipe_tree,
    X=X_train,
    y=Y_train,
    param_name='decisiontreeregressor__max_depth',
    param_range=param_range,
    scoring="neg_mean_squared_error",
    cv=10,
)

# Scores are negated MSE values (one row per max_depth, one column per fold);
# flip the sign so the plot shows plain MSE.
train_mean = -np.mean(train_scores, axis=1)
train_std = np.std(-train_scores, axis=1)
test_mean = -np.mean(test_scores, axis=1)
test_std = np.std(-test_scores, axis=1)

# Mean curve with a +/- one standard deviation band for each score set.
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='Training MSE')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
# These are cross-validation fold scores, not scores on the held-out test
# split, hence the "Validation" label.
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation MSE')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.grid()
plt.xlabel('max_depth')
plt.ylabel('MSE')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig("img/dt_validation_curve.png")
plt.show()

# 6. Search for the best hyper-parameters with GridSearchCV
# "squared_error" / "absolute_error" are the current names of the criteria
# formerly spelled "mse" / "mae"; the old spellings were deprecated in
# scikit-learn 1.0 and removed in 1.2.
param_grid = [{
    'decisiontreeregressor__max_depth': list(range(1, 21)),
    'decisiontreeregressor__min_samples_leaf': list(range(1, 31)),
    'decisiontreeregressor__criterion': ['squared_error', 'absolute_error'],
}]
gs = GridSearchCV(
    estimator=pipe_tree,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
# Only the training split is searched; the test split stays untouched.
gs.fit(X_train, Y_train)

# best_score_ is a negated MSE (see `scoring`), so flip the sign for display.
print("Best Score (MSE):", -gs.best_score_)
print("Best Parameters:", gs.best_params_)

# 7. Train the final model
# NOTE(review): these hyper-parameters are hard-coded, presumably copied from
# an earlier grid-search run; confirm they still match gs.best_params_.
# "absolute_error" is the current name of the criterion formerly spelled
# "mae" (removed in scikit-learn 1.2). The seed matches the pipeline estimator
# so that tie-breaking between equally good splits is reproducible.
treeSeoul = DecisionTreeRegressor(
    criterion='absolute_error',
    max_depth=18,
    min_samples_leaf=4,
    random_state=23,
)
# Fit on the training split only. Fitting on the full data set would let the
# model see the test rows and make the test MAE below meaningless.
treeSeoul.fit(X_train, Y_train)

# 8. Predict and evaluate
# MAE on the held-out split versus the training split; a large gap between
# the two indicates over-fitting.
y_pred_test = treeSeoul.predict(X_test)
y_pred_train = treeSeoul.predict(X_train)
print("Test MAE:", mean_absolute_error(Y_test, y_pred_test))
print("Train MAE:", mean_absolute_error(Y_train, y_pred_train))

# 9. Feature importance plot
colnames = treeSeoul.feature_names_in_
importances = treeSeoul.feature_importances_
# Ascending order: the feature with the largest importance gets the highest
# bar position in the horizontal bar chart.
indices = np.argsort(importances)
bar_positions = range(len(indices))
sorted_labels = [colnames[i] for i in indices]

plt.figure(figsize=(7, 7))
plt.barh(bar_positions, importances[indices], align='center', color='b')
plt.yticks(bar_positions, sorted_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.savefig("img/dt_feature_importance.png")
plt.show()

# 10. Tree visualization
# out_file=None makes export_graphviz hand back the DOT source, which is then
# wrapped in a graphviz.Source object.
graph_data = tree.export_graphviz(
    treeSeoul,
    out_file=None,
    feature_names=colnames,
    filled=True,
    rounded=True,
    special_characters=True,
)
# The Source object is not assigned or rendered to a file here, so it is only
# visible when this expression is evaluated interactively.
graphviz.Source(graph_data)

# 11. Partial dependence plot
# One partial-dependence curve per feature, addressed by column index.
features = list(range(len(colnames)))
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Partial Dependence Plot of treeSeoul")
PartialDependenceDisplay.from_estimator(
    treeSeoul,
    X_train,
    features,
    feature_names=colnames,
    # Use every available core instead of a machine-specific worker count,
    # matching the n_jobs setting of the grid search.
    n_jobs=-1,
    ax=ax,
)
# Display the figure explicitly, like the other plots in this script.
plt.show()
