In [10]:
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [11]:
df = pd.read_csv("./data/heart.csv", encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [12]:
target_feature = "HeartDisease"
X = df.drop(columns=[target_feature])
y = df[target_feature]

In [13]:
# 변수 자동 분류
binary_features, categorical_features, numeric_features = [], [], []

for col in X.columns:
    unique_vals = X[col].dropna().unique()
    n_unique = len(unique_vals)
    dtype = X[col].dtype

    if n_unique == 2 and X[col].dropna().isin([0, 1]).all():
        binary_features.append(col)
    elif dtype == 'object' or dtype.name == 'category':
        categorical_features.append(col)
    elif dtype in ['int64', 'float64']:
        if n_unique <= 10:
            categorical_features.append(col)
        else:
            numeric_features.append(col)

In [14]:
# 전처리 구성
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ],
    remainder="passthrough"
)

In [15]:
# Optuna 최적화 함수 정의
def tune_model(model_name):
    def objective(trial):
        if model_name == "SVM":
            C = trial.suggest_loguniform("C", 1e-3, 1e2)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
            gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
            clf = SVC(C=C, kernel=kernel, gamma=gamma, probability=True, random_state=42)

        elif model_name == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 20)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
            clf = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42
            )

        elif model_name == "XGBoost":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            subsample = trial.suggest_float("subsample", 0.5, 1.0)
            clf = XGBClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                subsample=subsample,
                use_label_encoder=False,
                eval_metric="logloss",
                random_state=42
            )

        pipe = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", clf)
        ])
        score = cross_val_score(pipe, X, y, scoring="f1", cv=5, n_jobs=-1).mean()
        return score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)
    return study.best_params, study.best_value

In [16]:
# 최적 파라미터 탐색 실행
svm_best_params, svm_best_score = tune_model("SVM")
rf_best_params, rf_best_score = tune_model("RandomForest")
xgb_best_params, xgb_best_score = tune_model("XGBoost")

(svm_best_params, svm_best_score, rf_best_params, rf_best_score, xgb_best_params, xgb_best_score)

[I 2025-07-29 21:11:02,230] A new study created in memory with name: no-name-e7a6c9a2-cbd5-4b3f-bd7f-590ba7e5527c
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-07-29 21:11:04,615] Trial 0 finished with value: 0.3552072993249464 and parameters: {'C': 0.02761657233929445, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.3552072993249464.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[W 2025-07-29 21:50:13,944] Trial 1 failed with parameters: {'C': 0.005788826903325548, 'kernel': 'poly', 'gamma': 'auto'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\workspace\hit_ml_dl\venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\wsm\AppData\Local\Temp\ipykernel_22172\2672864329.py", line 40, in objective
    score = cross_val_score(pipe, X, y, scoring="f1", cv=5, n_jobs=-1).mean()
            ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
import optuna.visualization.matplotlib as plt_optuna
import matplotlib.pyplot as plt

# 최적화 이력 (score 변화)
plt_optuna.plot_optimization_history(study)
plt.title("Optimization History")
plt.show()

# 파라미터 중요도
plt_optuna.plot_param_importances(study)
plt.title("Hyperparameter Importance")
plt.show()

# 파라미터별 F1-score 분포
plt_optuna.plot_slice(study)
plt.title("Parameter Slices")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

df = study.trials_dataframe()
df = df[df['state'] == 'COMPLETE']  # 완료된 실험만

# 예: C 값 vs F1-score 산점도
sns.scatterplot(data=df, x='params_C', y='value')
plt.title("C vs F1-score")
plt.xscale('log')
plt.show()

sns.pairplot(df[['value', 'params_C', 'params_solver']])
plt.show()