<a href="https://colab.research.google.com/github/veryHapppy/study_ai/blob/main/Kaggle/predicting_heart_disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna
!pip install catboost



In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna
import joblib

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
'''
Chest pain type : 범주형 1,2,3,4-정상
bp: 혈압
FBS over 120 : 공복혈당
EKG results : 심전도 결과, 0-정상, 2-좌심실 비대증
Max HR : 최대 심박수
Exercise angina : 운동성 협심증 ,
ST depression : ST분절하강의 깊이, 심장근육에 산소가 부족(협심증의 신호)
Slope of ST : 심장질환의 위험도 1,2,3
Number of vessels fluro : 협착된 혈관 개수
Thallium : 3-정상, 6-특정부위에 혈액공급X, 7-협심증 위험

범주형
Sex
Chest pain type
EKG results
Exercise angina
Thallium
'''

'\nChest pain type : 범주형 1,2,3,4-정상\nbp: 혈압\nFBS over 120 : 공복혈당 \nEKG results : 심전도 결과, 0-정상, 2-좌심실 비대증\nMax HR : 최대 심박수\nExercise angina : 운동성 협심증 ,\nST depression : ST분절하강의 깊이, 심장근육에 산소가 부족(협심증의 신호)\nSlope of ST : 심장질환의 위험도 1,2,3\nNumber of vessels fluro : 협착된 혈관 개수\nThallium : 3-정상, 6-특정부위에 혈액공급X, 7-협심증 위험\n\n범주형\nSex\nChest pain type\nEKG results\nExercise angina\nThallium\n'

In [None]:
from os import replace
def feature_engineering(data) :
  """Turn a raw heart-disease frame into a numeric model matrix.

  Steps:
    * move the ``id`` column into the index,
    * one-hot encode the categorical columns, dropping the first level of each,
    * clip ``Max HR`` into the range [80, 200],
    * if a ``Heart Disease`` column is present, replace it with a binary
      ``Have_disease`` column (1 where the label equals ``'Presence'``, else 0).

  Note: the dummy columns are derived from the levels present in the frame that
  is passed in, so two frames only produce identical columns when they contain
  the same levels for every categorical column.

  Args:
    data: DataFrame holding ``id``, ``Max HR`` and the five categorical
      columns listed below; ``Heart Disease`` is optional.

  Returns:
    A new DataFrame indexed by ``id``. The input frame is left unmodified.
  """
  categorical_columns = ['Sex', 'Chest pain type', 'EKG results', 'Exercise angina', 'Thallium']

  # set_index('id') moves the column into the index in a single step.
  data = data.set_index('id')
  data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
  data['Max HR'] = data['Max HR'].clip(lower=80, upper=200)

  # Only labelled (training) frames carry the target column.
  if 'Heart Disease' in data.columns :
    data['Have_disease'] = (data['Heart Disease'] == 'Presence').astype(int)
    data = data.drop(columns='Heart Disease')

  return data

In [None]:
# Read the labelled training CSV from the mounted Drive folder and run it
# through the shared preprocessing function.
data_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/[Kaggle 6-2] Predicting Heart Disease/train.csv')
train_data = feature_engineering(data_set)

# Separate the binary target from the predictor columns.
y = train_data['Have_disease']
X = train_data.drop(columns='Have_disease')

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def objective(trial) :
  """Optuna objective: validation ROC-AUC of a weighted soft-voting ensemble.

  Samples hyperparameters for XGBoost, LightGBM and CatBoost plus one voting
  weight per model, fits the ensemble on an inner training split and scores it
  on the matching inner validation split.

  The inner split is carved out of the module-level ``X_train`` / ``y_train``
  only. Splitting ``X`` / ``y`` with the same ``test_size`` and ``random_state``
  as the hold-out split reproduces that exact hold-out set, which would tune
  the hyperparameters on the very rows used to report the final score.

  Args:
    trial: the Optuna trial used to sample parameters.

  Returns:
    ROC-AUC of the positive-class probabilities on the inner validation split.
  """
  # Inner train/validation split; (X_test, y_test) stays unseen during tuning.
  X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

  xgb_params = {
    'n_estimators': trial.suggest_int('xgb_n', 500, 2000),
    'max_depth': trial.suggest_int('xgb_depth', 3, 7),
    'colsample_bytree': trial.suggest_float('xgb_colsample', 0.6, 0.8),
    'reg_alpha': trial.suggest_float('xgb_alpha', 1e-3, 10.0, log=True),
    'reg_lambda': trial.suggest_float('xgb_lambda', 1e-3, 10.0, log=True),
    'learning_rate': trial.suggest_float('xgb_lr', 0.005, 0.1),
  }
  lgbm_params = {
    'n_estimators': trial.suggest_int('lgbm_n', 500, 2000),
    'num_leaves': trial.suggest_int('lgbm_leaves', 15, 50),
    'learning_rate': trial.suggest_float('lgbm_lr', 0.005, 0.1),
    'verbose': -1
  }
  cat_params = {
    'iterations': trial.suggest_int('cat_iter', 500, 2000),
    'depth': trial.suggest_int('cat_depth', 3, 7),
    'learning_rate': trial.suggest_float('cat_lr', 0.005, 0.1),
    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
    'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
    'logging_level': 'Silent' # keep CatBoost quiet
  }
  # Relative soft-voting weight of each model.
  w_xgb = trial.suggest_float('w_xgb', 0.2, 1.0)
  w_lgbm = trial.suggest_float('w_lgbm', 0.2, 1.0)
  w_cat = trial.suggest_float('w_cat', 0.2, 1.0)

  xgb = XGBClassifier(**xgb_params, tree_method='hist', device='cuda', n_jobs=-1, random_state=42)
  lgbm = LGBMClassifier(**lgbm_params, device='gpu', n_jobs=-1, random_state=42)
  cat = CatBoostClassifier(**cat_params, task_type='GPU', random_state=42)

  voting_model = VotingClassifier(
    estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat)],
    weights=[w_xgb, w_lgbm, w_cat],
    voting='soft',
  )

  voting_model.fit(X_tr, y_tr)
  preds = voting_model.predict_proba(X_val)[:, 1]
  return roc_auc_score(y_val, preds)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# from run to run. NOTE(review): the fitted GPU models may still vary slightly
# between runs, so trial values are not guaranteed to be bit-identical.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=80)
print(study.best_value)

[I 2026-02-02 05:43:19,776] A new study created in memory with name: no-name-640c9aa0-5083-40c6-9c83-f649a144385c
[I 2026-02-02 05:44:53,447] Trial 0 finished with value: 0.9552650602395889 and parameters: {'xgb_n': 749, 'xgb_depth': 3, 'xgb_colsample': 0.7521979451791054, 'xgb_alpha': 0.35703359764875453, 'xgb_lambda': 0.6275496723120393, 'xgb_lr': 0.07134718566062627, 'lgbm_n': 1950, 'lgbm_leaves': 34, 'lgbm_lr': 0.04805742354279338, 'cat_iter': 1059, 'cat_depth': 3, 'cat_lr': 0.07074761546031506, 'l2_leaf_reg': 4.811618580220381, 'random_strength': 3.4367738584659473, 'w_xgb': 0.7896567402983667, 'w_lgbm': 0.9668946970980514, 'w_cat': 0.9733056143465992}. Best is trial 0 with value: 0.9552650602395889.
[I 2026-02-02 05:46:16,864] Trial 1 finished with value: 0.9550360957177018 and parameters: {'xgb_n': 1253, 'xgb_depth': 5, 'xgb_colsample': 0.6097428625739416, 'xgb_alpha': 0.011272084824065998, 'xgb_lambda': 0.010561452721791702, 'xgb_lr': 0.08147865590724575, 'lgbm_n': 1607, 'lgbm_

0.9553718909063473


In [None]:
# Best hyperparameters found by the Optuna study, keyed by trial parameter name.
best = study.best_params

# --- XGBoost -----------------------------------------------------------------
xgb_kwargs = {
    'n_estimators': best['xgb_n'],
    'max_depth': best['xgb_depth'],
    'learning_rate': best['xgb_lr'],
    'colsample_bytree': best['xgb_colsample'],
    'reg_alpha': best['xgb_alpha'],
    'reg_lambda': best['xgb_lambda'],
    'random_state': 42,
    'tree_method': 'hist',
    'device': 'cuda',
}
xgb_final = XGBClassifier(**xgb_kwargs)

# --- LightGBM ----------------------------------------------------------------
lgbm_kwargs = {
    'n_estimators': best['lgbm_n'],
    'num_leaves': best['lgbm_leaves'],
    'learning_rate': best['lgbm_lr'],
    'random_state': 42,
    'verbose': -1,
    'device': 'gpu',
}
lgbm_final = LGBMClassifier(**lgbm_kwargs)

# --- CatBoost ----------------------------------------------------------------
cat_kwargs = {
    'iterations': best['cat_iter'],
    'depth': best['cat_depth'],
    'learning_rate': best['cat_lr'],
    'l2_leaf_reg': best['l2_leaf_reg'],
    'random_strength': best['random_strength'],
    'task_type': 'GPU',  # run on GPU
    'devices': '0',
    'random_state': 42,
    'logging_level': 'Silent',
}
cat_final = CatBoostClassifier(**cat_kwargs)

# The same three named base learners feed both ensembles; each ensemble gets
# its own list object.
base_estimators = [('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)]
voting_weights = [best[key] for key in ('w_xgb', 'w_lgbm', 'w_cat')]

# Weighted average of predicted probabilities.
voting_model = VotingClassifier(
    estimators=list(base_estimators),
    weights=voting_weights,
    voting='soft',
)

# Logistic-regression meta-learner over 3-fold out-of-fold base predictions.
stacking_model = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=LogisticRegression(),
    cv=3,
)


# Hold-out split (same seed and ratio as the split made right after loading).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

voting_model.fit(X_train, y_train)
stacking_model.fit(X_train, y_train)

voting_pred = voting_model.predict_proba(X_test)[:, 1]
stacking_pred = stacking_model.predict_proba(X_test)[:, 1]

# Score each ensemble once and reuse the values for both reporting and selection.
voting_auc = roc_auc_score(y_test, voting_pred)
stacking_auc = roc_auc_score(y_test, stacking_pred)

print(f'voting 점수 : {voting_auc}')
print(f'stacking 점수 : {stacking_auc}')

# Persist the models as fitted on the training portion only.
joblib.dump(voting_model, 'voting_best.pkl')
joblib.dump(stacking_model, 'stacking_best.pkl')

# Voting wins only on a strictly higher AUC; ties go to stacking.
best_model = voting_model if voting_auc > stacking_auc else stacking_model

# Refit the winner on all labelled rows before predicting the test set.
# This refits the selected ensemble object in place.
best_model.fit(X, y)
joblib.dump(best_model, 'best_model.pkl')

voting 점수 : 0.9553736540626898
stacking 점수 : 0.9552343407681617


['best_model.pkl']

In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/[Kaggle 6-2] Predicting Heart Disease/test.csv')
# Keep the test features under their own name. Rebinding ``X`` here would
# replace the training matrix that the tuning and training cells read, so
# re-running those cells afterwards would pair test rows with training labels.
X_submit = feature_engineering(test_data)

# Probability of the positive class for every test row.
pred = best_model.predict_proba(X_submit)[:, 1]

submission = pd.DataFrame({
    'id': X_submit.index,
    'Heart Disease': pred
})

submission.to_csv('submission.csv', index=False)
print("파일 생성 완료")

파일 생성 완료


In [None]:
import os
# Also store a copy of the submission in the Drive project folder.
save_path = '/content/drive/MyDrive/Colab Notebooks/[Kaggle 6-2] Predicting Heart Disease/'
submission_file = os.path.join(save_path, 'submission.csv')
submission.to_csv(submission_file, index=False)