**Importing Libraries**


In [3]:
!pip install catboost
!pip install optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier,StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from imblearn.combine import SMOTEENN
import optuna
import shap


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m15.2 MB/s[0m eta [36m0

LOADING AND PREPROCESSING DATA

In [4]:
df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

In [5]:
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
df.drop('id',axis=1,inplace=True)


FEATURE ENGINEERING

In [6]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
df['age_glucose'] = df['age'] * df['avg_glucose_level']
df['bmi_age_ratio'] = df['bmi'] / (df['age'] + 1)
df = pd.get_dummies(df, drop_first=True)

In [8]:
X = df.drop('stroke', axis=1)
y = df['stroke']

RESAMPLE

In [9]:
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X, y)

TRAIN/TEST SPLIT

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

SCALE FEATURES

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Feature Selection with SHAP on LightGBM

In [12]:
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_scaled, y_train)
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_train_scaled)
shap_abs_mean = np.abs(shap_values).mean(axis=0)
important_features_idx = np.argsort(shap_abs_mean)[-15:]



[LightGBM] [Info] Number of positive: 2906, number of negative: 2490
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 5396, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.538547 -> initscore=0.154495
[LightGBM] [Info] Start training from score 0.154495




In [13]:
X_train_sel = X_train_scaled[:, important_features_idx]
X_test_sel = X_test_scaled[:, important_features_idx]

HYPERPARAMETER OPTIMIZATION WITH OPTUNA

In [14]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    xgb = XGBClassifier(**param)
    xgb.fit(X_train_sel, y_train)
    preds = xgb.predict(X_test_sel)
    return accuracy_score(y_test, preds)

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

[I 2025-06-06 14:33:41,202] A new study created in memory with name: no-name-02201028-6b24-4520-b3b2-e9e393400270
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:33:41,718] Trial 0 finished with value: 0.9822090437361009 and parameters: {'n_estimators': 248, 'max_depth': 8, 'learning_rate': 0.07055491367559284, 'subsample': 0.8979187570837075, 'colsample_bytree': 0.8974771351506733}. Best is trial 0 with value: 0.9822090437361009.
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:33:42,242] Trial 1 finished with value: 0.9644180874722016 and parameters: {'n_estimators': 203, 'max_depth': 8, 'learning_rate': 0.012022570500134007, 'subsample': 0.9896581614973698, 'colsample_bytree': 0.9640203861472302}. Best is trial 0 with value: 0.9822090437361009.
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:33:42,356] Trial 2 finished with value: 0.9629355077835434 and parameters: {'n_estimators': 159, 'max_depth': 3, 'learning_rate': 0.094

OPTIMIZED BASE MODEL WITH BEST PARAMS

In [16]:
xgb_opt = XGBClassifier(**best_params)
lgbm_opt = LGBMClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    random_state=42
)
catboost_opt = CatBoostClassifier(verbose=0, random_seed=42)
svc_opt = SVC(probability=True, random_state=42)

estimators = [
    ('xgb', xgb_opt),
    ('lgbm', lgbm_opt),
    ('catboost', catboost_opt),
    ('svc', svc_opt)
]

STACKING CLASSIFIER WITH LOGISTIC REG

In [17]:
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    passthrough=False
)

COMBINING STACKING WITH VOTING CLASSIFIER

In [18]:
#used soft voting here
voting_clf = VotingClassifier(
    estimators=[
        ('stacking', stacking_clf),
        ('lgbm', lgbm_opt),
        ('catboost', catboost_opt)
    ],
    voting='soft',
    n_jobs=-1
)

TRAINING

In [19]:
voting_clf.fit(X_train_sel, y_train)

PREDICTION AND EVALUATION

In [20]:
y_pred = voting_clf.predict(X_test_sel)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")



              precision    recall  f1-score   support

           0       0.99      0.98      0.99       622
           1       0.98      0.99      0.99       727

    accuracy                           0.99      1349
   macro avg       0.99      0.99      0.99      1349
weighted avg       0.99      0.99      0.99      1349

Accuracy: 0.9881


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.combine import SMOTEENN
import optuna
import shap


df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')


df = df[df['gender'] != 'Other']
df.drop('id', axis=1, inplace=True)
df = df[df['work_type'] != 'Never_worked']


df['bmi'] = df['bmi'].fillna(df['bmi'].median())


df['age_glucose'] = df['age'] * df['avg_glucose_level']
df['bmi_age_ratio'] = df['bmi'] / (df['age'] + 1)


df = pd.get_dummies(df, drop_first=True)


X = df.drop('stroke', axis=1)
y = df['stroke']


smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_scaled, y_train)
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_train_scaled)
shap_abs_mean = np.abs(shap_values).mean(axis=0)
important_features_idx = np.argsort(shap_abs_mean)[-15:]

X_train_sel = X_train_scaled[:, important_features_idx]
X_test_sel = X_test_scaled[:, important_features_idx]


def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    xgb = XGBClassifier(**param)
    xgb.fit(X_train_sel, y_train)
    preds = xgb.predict(X_test_sel)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

xgb_opt = XGBClassifier(**best_params)
lgbm_opt = LGBMClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    random_state=42
)
catboost_opt = CatBoostClassifier(verbose=0, random_seed=42)
svc_opt = SVC(probability=True, random_state=42)

estimators = [
    ('xgb', xgb_opt),
    ('lgbm', lgbm_opt),
    ('catboost', catboost_opt),
    ('svc', svc_opt)
]


stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    passthrough=False
)


voting_clf = VotingClassifier(
    estimators=[
        ('stacking', stacking_clf),
        ('lgbm', lgbm_opt),
        ('catboost', catboost_opt)
    ],
    voting='soft',
    n_jobs=-1
)


voting_clf.fit(X_train_sel, y_train)

y_pred = voting_clf.predict(X_test_sel)
print(classification_report(y_test, y_pred))
print(f"Final Optimized Accuracy: {accuracy_score(y_test, y_pred):.4f}")




[LightGBM] [Info] Number of positive: 2855, number of negative: 2434
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 5289, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.539800 -> initscore=0.159536
[LightGBM] [Info] Start training from score 0.159536


[I 2025-06-06 14:34:42,996] A new study created in memory with name: no-name-713298b6-d13c-4bf5-a07f-11e0aee6e427
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:34:43,168] Trial 0 finished with value: 0.9773242630385488 and parameters: {'n_estimators': 94, 'max_depth': 6, 'learning_rate': 0.12093568738686385, 'subsample': 0.9863378916386731, 'colsample_bytree': 0.8335377263877248}. Best is trial 0 with value: 0.9773242630385488.
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:34:43,435] Trial 1 finished with value: 0.9856386999244142 and parameters: {'n_estimators': 188, 'max_depth': 6, 'learning_rate': 0.2611850593135319, 'subsample': 0.7358722158252515, 'colsample_bytree': 0.6424260573288976}. Best is trial 1 with value: 0.9856386999244142.
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-06 14:34:43,668] Trial 2 finished with value: 0.9758125472411187 and parameters: {'n_estimators': 177, 'max_depth': 5, 'learning_rate': 0.071975

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       609
           1       0.98      0.99      0.99       714

    accuracy                           0.99      1323
   macro avg       0.99      0.99      0.99      1323
weighted avg       0.99      0.99      0.99      1323

Final Optimized Accuracy: 0.9856


