In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Import dataset
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.drop('id', axis=1, inplace=True)
df.fillna(df.mean(), inplace=True)
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,Female,59.0,0,0,Yes,Private,Rural,76.15,28.893237,Unknown,1
9,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [3]:
# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

columns = [col for col in df.columns if df[col].dtype == 'object']
transformer = make_column_transformer((OneHotEncoder(), columns), remainder='passthrough')
df = pd.DataFrame(transformer.fit_transform(df))
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,67.0,0.0,1.0,228.69,36.6,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,61.0,0.0,0.0,202.21,28.893237,1.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,80.0,0.0,1.0,105.92,32.5,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,49.0,0.0,0.0,171.23,34.4,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,79.0,1.0,0.0,174.12,24.0,1.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,81.0,0.0,0.0,186.21,29.0,1.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,74.0,1.0,1.0,70.09,27.4,1.0
7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,69.0,0.0,0.0,94.39,22.8,1.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,59.0,0.0,0.0,76.15,28.893237,1.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,78.0,0.0,0.0,58.57,24.2,1.0


In [4]:
# Split dataset into training and testing set
from sklearn.model_selection import train_test_split

X = df.drop(columns=[len(df.columns) - 1], axis=1)
y = df[len(df.columns) - 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

pipe = [('Data Scaling', MinMaxScaler()),
        ('Feature Selection', SelectKBest()),
        ('Logistic Regression', LogisticRegression())]

estimators = Pipeline(pipe)
estimators.fit(X_train, y_train)
test_score = estimators.score(X_test, y_test)
print("Test score:", test_score)

Test score: 0.9419439008480104


In [6]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

k = list(range(1, df.columns.size - 1))
C = np.arange(0.001, 0.01, 1).tolist()

params_grid = {
    'Feature Selection__k': k,
    'Logistic Regression__C': C
}

GSCV3 = GridSearchCV(estimators, param_grid=params_grid, cv=StratifiedKFold(n_splits=5))
GSCV3.fit(X_train, y_train)

GStest_score = GSCV3.score(X_test, y_test)
print("Test score:", GStest_score)
print("Best params:", GSCV3.best_params_)

mask = GSCV3.best_estimator_.named_steps['Feature Selection'].get_support()
selected_features = X.columns[mask]
print("Selected features:", selected_features)

Test score: 0.9419439008480104
Best params: {'Feature Selection__k': 1, 'Logistic Regression__C': 0.001}
Selected features: Int64Index([16], dtype='int64')


In [7]:
# Classification Report
from sklearn.metrics import confusion_matrix, classification_report

gs_pred = GSCV3.predict(X_test)
print("Confusion matrix:", confusion_matrix(y_test, gs_pred))
print("Classification report:", classification_report(y_test, gs_pred))

Confusion matrix: [[1444    0]
 [  89    0]]
Classification report:               precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      1444
         1.0       0.00      0.00      0.00        89

    accuracy                           0.94      1533
   macro avg       0.47      0.50      0.49      1533
weighted avg       0.89      0.94      0.91      1533



In [8]:
# AUC Score
from sklearn.metrics import roc_auc_score

gs_pred_proba = GSCV3.predict_proba(X_test)
print("AUC score:", roc_auc_score(y_test, gs_pred_proba[:, 1]))

AUC score: 0.8355029723925426


Jawab pertanyaan berikut di dalam notebook menggunakan cell markdown:
1. Model algoritme apakah yang paling cocok untuk dataset kasus tersebut dilihat dari
metrik evaluasi yang digunakan?
[ Best params: {'Feature Selection__k': 1, 'Logistic Regression__C': 0.001} ]

2. Apa saja feature-feature yang penting untuk membedakan antara pasien yang
rawan terkena stroke dengan yang sehat berdasarkan model terbaik yang dibuat?
[ Umur ]

