## Heart Failure Prediction 데이터셋 분석

### 주제
환자의 임상/건강 관련 변수들을 기반으로 심장병 유무를 예측하는 이진 분류 과제

### 목적
- 심혈관질관은 전 세계적으로 주요 사망원인의 하나이며, 사전 예측 및 예방의 중요성이 강조됨
- 머신러닝 모델을 통해 다음을 달성하고자 함
    - 환자의 심장병 위험도 예측
    - 조기 진단 및 의료 개입 지원
    - 의료자원 효율화 및 환자 맞춤 치료 기회 제공

In [22]:
import os
print(os.getcwd())
print(os.listdir())

d:\workspace\hit_ml_dl\practice
['data', 'data_dictionary.png', 'heart.ipynb', 'linear_regression', 'stroke.ipynb', 'stroke_fixed.py', 'titanic.ipynb']


In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import loguniform
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
df = pd.read_csv("./data/heart.csv", encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [25]:
# 결측치 확인
print(df.isnull().sum()) # 결측치 없음

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [26]:
# 컬럼 타입 분류 (수치형, 범주형, 이진 변수)
target_feature = "HeartDisease"
df_features = df.drop(columns=[target_feature])

binary_features, categorical_features, numeric_features = [], [], []

for col in df_features.columns:
    unique_vals = df_features[col].dropna().unique()
    n_unique = len(unique_vals)
    dtype = df_features[col].dtype

    if n_unique == 2 and df_features[col].dropna().isin([0, 1]).all():
        binary_features.append(col)
    elif dtype == 'object' or dtype.name == 'category':
        categorical_features.append(col)
    elif dtype in ['int64', 'float64']:
        if n_unique <= 10:  # 커스터마이징 가능
            categorical_features.append(col)
        else:
            numeric_features.append(col)

print("Binary features: ", binary_features)
print("Categorical features: ", categorical_features)
print("Numeric features: ", numeric_features)

Binary features:  ['FastingBS']
Categorical features:  ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
Numeric features:  ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']


In [27]:
# 데이터 분할
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]

In [28]:
# 기초 통계 확인 (숫자형)
print(df.describe())

              Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.432617   18.514154   109.384145    0.423046   25.460334   
min     28.000000    0.000000     0.000000    0.000000   60.000000   
25%     47.000000  120.000000   173.250000    0.000000  120.000000   
50%     54.000000  130.000000   223.000000    0.000000  138.000000   
75%     60.000000  140.000000   267.000000    0.000000  156.000000   
max     77.000000  200.000000   603.000000    1.000000  202.000000   

          Oldpeak  HeartDisease  
count  918.000000    918.000000  
mean     0.887364      0.553377  
std      1.066570      0.497414  
min     -2.600000      0.000000  
25%      0.000000      0.000000  
50%      0.600000      1.000000  
75%      1.500000      1.000000  
max      6.200000      1.000000  


In [29]:
# 범주형 데이터 분포 확인
for col in categorical_features:
    print(f"{col} 분포:")
    print(df[col].value_counts())
    print("\n")

Sex 분포:
Sex
M    725
F    193
Name: count, dtype: int64


ChestPainType 분포:
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64


RestingECG 분포:
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64


ExerciseAngina 분포:
ExerciseAngina
N    547
Y    371
Name: count, dtype: int64


ST_Slope 분포:
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64




In [30]:
# 타겟 변수 불균형 확인
print(df['HeartDisease'].value_counts(normalize=True))

HeartDisease
1    0.553377
0    0.446623
Name: proportion, dtype: float64


In [31]:
# 전처리 구성
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough" # binary_features 그대로 사용
)

In [32]:
# 로지스틱 회귀용 파이프라인
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

# GridSearchCV 적용
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],# 규제 강도 조절
    "classifier__penalty": ["l2"], # 규제 유형
    "classifier__solver": ["lbfgs", "liblinear"], # 최적화 알고리즘
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X, y)
grid_best = grid_search.best_params_
grid_score = grid_search.best_score_

# RandomizedSearchCV 적용
param_dist = {
    "classifier__C": loguniform(1e-3, 1e+2),
    "classifier__penalty": ["l2"],
    "classifier__solver": ["lbfgs", "liblinear"]
}
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, cv=5, scoring="f1", n_jobs=-1, random_state=42)
random_search.fit(X, y)
random_best = random_search.best_params_
random_score = random_search.best_score_

(grid_best, grid_score, random_best, random_score)

({'classifier__C': 1,
  'classifier__penalty': 'l2',
  'classifier__solver': 'liblinear'},
 np.float64(0.8472780146670418),
 {'classifier__C': np.float64(0.37253938395788866),
  'classifier__penalty': 'l2',
  'classifier__solver': 'liblinear'},
 np.float64(0.8490932392893358))

### Optuna
자동 하이퍼파라미터 최적화 라이브러리



In [None]:
# https://optuna.org/
import optuna
from sklearn.model_selection import cross_val_score # 교차 검증(k-fold) 기반 모델 평가

# 하이퍼파라미터 탐색
# LogisticRegression().get_params().keys()
# help(LogisticRegression)

def objective(trial):
    """
    Optuna 프레임워크에서 호출되는 함수
    함수가 반환하는 score를 기준으로 탐색을 진행
    """

    # 하이퍼파라미터 탐색 범위(공간) 설정
    C = trial.suggest_loguniform("C", 1e-3, 1e2) # 로그스케일에서 0.001 ~ 100
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    clf = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(C=C, solver=solver, penalty="l2", max_iter=1000, random_state=42))
    ])
    
    score = cross_val_score(clf, X, y, scoring="f1", cv=5, n_jobs=-1).mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params)
print(study.best_value)

best_params = study.best_params

[I 2025-07-29 21:00:34,636] A new study created in memory with name: no-name-dc735333-7de9-480b-8929-766b974ff3c7
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-07-29 21:00:34,675] Trial 0 finished with value: 0.8416681790683607 and parameters: {'C': 29.061924156208907, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8416681790683607.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-07-29 21:00:34,779] Trial 1 finished with value: 0.7394887860403395 and parameters: {'C': 0.002073894077575059, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8416681790683607.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-07-29 21:00:34,971] Trial 2 finished with value: 0.8432217144479475 and parameters: {'C': 0.7361758371506628, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8432217144479475.
  C = trial.suggest_loguniform("C", 1e-3, 1e2)
[I 2025-07-29 21:00:35,008] Trial 3 finished with value: 0.8416681790683607 and parameters: {'C': 59.40019209559979, 'solver': 'liblin

{'C': 0.29500179145061245, 'solver': 'liblinear'}
0.8498406123839045


In [39]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [40]:
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model._logistic:

class LogisticRegression(sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin, sklearn.base.BaseEstimator)
 |  LogisticRegression(
 |      penalty='l2',
 |      *,
 |      dual=False,
 |      tol=0.0001,
 |      C=1.0,
 |      fit_intercept=True,
 |      intercept_scaling=1,
 |      class_weight=None,
 |      random_state=None,
 |      solver='lbfgs',
 |      max_iter=100,
 |      multi_class='deprecated',
 |      verbose=0,
 |      warm_start=False,
 |      n_jobs=None,
 |      l1_ratio=None
 |  )
 |
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note
 |  that regularization is applied by default**. It can handle both dense
 |  and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit
 |  floats for opti