# In [None]:
# 대학 중도탈락률 예측 모델 (Random Forest 기반)

# 1. 라이브러리 불러오기
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from pdpbox.pdp import pdp_isolate, pdp_plot
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, validation_curve, GridSearchCV

# 2. Load the data
df = pd.read_csv("data/수도권.csv", encoding="utf-8")

# 3. Separate input variables (X) from the target variable (Y):
#    the first two columns are skipped, the last column is the target,
#    and every column in between is used as a model input.
X, Y = df.iloc[:, 2:-1], df.iloc[:, -1]

# 4. Hold out 20% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=33,
)

# 5. Validation curve over max_depth (hyperparameter tuning aid)
# The curve is computed on the training split only, so the held-out test
# rows never influence the choice of max_depth (step 6 tunes on the same split).
param_range = range(1, 11)
train_scores, test_scores = validation_curve(
    RandomForestRegressor(random_state=33),
    X_train, Y_train,
    param_name='max_depth',
    param_range=param_range,
    scoring='neg_mean_squared_error',
    cv=10
)

# The scorer returns negated MSE; flip the sign and average over the CV folds
# (axis 1) to get one mean MSE per max_depth value.
train_mean = -np.mean(train_scores, axis=1)
test_mean = -np.mean(test_scores, axis=1)

# 'Test MSE' here is the cross-validation (held-out fold) error, not the
# error on X_test.
plt.plot(param_range, train_mean, marker='o', label='Train MSE')
plt.plot(param_range, test_mean, marker='s', linestyle='--', label='Test MSE')
plt.xlabel('max_depth')
plt.ylabel('MSE')
plt.legend()
plt.title('Validation Curve for max_depth')
plt.grid(True)
plt.tight_layout()
# savefig does not create missing directories, so make sure img/ exists first.
os.makedirs("img", exist_ok=True)
plt.savefig("img/validation_curve.png")
plt.show()

# 6. Search for the best hyperparameters with GridSearchCV
# scikit-learn 1.0 renamed the 'mae' split criterion to 'absolute_error' and
# 1.2 removed the old spelling, so use whichever name the installed version
# accepts. Only the major version is parsed, which is enough to tell them apart.
sklearn_major = int(sklearn.__version__.split(".")[0])
mae_criterion = 'absolute_error' if sklearn_major >= 1 else 'mae'

param_grid = {
    'n_estimators': [10, 20, 30, 40, 50],
    'max_depth': range(5, 11),
    'criterion': [mae_criterion]
}

# The estimator is seeded (same seed as the rest of the script) so that the
# selected parameters are reproducible from run to run.
gs = GridSearchCV(
    RandomForestRegressor(random_state=33),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1
)
gs.fit(X_train, Y_train)

print("Best Parameters:", gs.best_params_)

# 7. Train the model with the best parameters found by the grid search
# The forest is seeded (same seed used elsewhere in this script) so the
# reported errors and feature importances are reproducible across runs.
best_model = RandomForestRegressor(random_state=33, **gs.best_params_)
best_model.fit(X_train, Y_train)

# 8. Evaluate with mean absolute error on both splits; a large gap between
# the two numbers indicates overfitting.
train_pred = best_model.predict(X_train)
test_pred = best_model.predict(X_test)
print("Train MAE:", mean_absolute_error(Y_train, train_pred))
print("Test MAE:", mean_absolute_error(Y_test, test_pred))

# 9. Visualize feature importances
# One horizontal bar per input column, sized by the fitted forest's
# feature_importances_ (same order as X.columns).
importances = best_model.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(X.columns, importances)
plt.title('Feature Importances')
plt.tight_layout()
# savefig does not create missing directories, so make sure img/ exists first.
os.makedirs("img", exist_ok=True)
plt.savefig("img/rf_feature_importance.png")
plt.show()

# 10. Partial dependence plots (PDP)
# For each feature listed below, compute its isolated partial dependence with
# the fitted best_model over the full feature matrix X, then plot it.
selected_features = [
    '1work_std',
    'Get_Job',
    'Tuition',
    'Scholar',
    '1prof_std',
    '1Prof_Paper',
    'Edu_Fund',
    'A_Ratio',
    'Dorm_Capa',
    'Fresh_Ratio',
    'Fresh_Comp',
]
for feature in selected_features:
    pdp_result = pdp_isolate(model=best_model, dataset=X, model_features=X.columns, feature=feature)
    pdp_plot(pdp_result, feature)
