In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.pipeline import make_pipeline

# Load the raw stroke dataset.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Preprocessing: drop the identifier and the columns not used as features.
df = df.drop(['id', 'ever_married', 'work_type', 'Residence_type'], axis=1)

# Encode the remaining categorical columns as numbers. Any category that is
# not listed in a mapping becomes NaN and is removed by the dropna() below.
df['gender'] = df['gender'].map({'Male': 1, 'Female': 2})
df['smoking_status'] = df['smoking_status'].map({
    'Unknown': 0,
    'never smoked': 1,
    'formerly smoked': 2,
    'smokes': 3,
})

# Drop rows with a missing value in any column except 'bmi'. Missing BMI is
# deliberately left in place here: it is imputed inside the pipeline so the
# median is learned from the training rows only. Fitting the imputer on the
# full frame before splitting would let test rows influence the fill value.
df = df.dropna(subset=[col for col in df.columns if col != 'bmi'])

# Features and target.
X = df[['gender', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'smoking_status', 'bmi']]
y = df['stroke']

# Split before any fitted preprocessing or SMOTE to avoid data leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Build the pipeline: median imputation -> scaling -> SMOTE -> classifier.
# Every fitted step sees only the data passed to pipeline.fit().
imputer = SimpleImputer(strategy='median')
pipeline = make_pipeline(
    imputer,
    StandardScaler(),
    SMOTE(random_state=42),
    LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        random_state=42,
    ),
)

# Train the model on the training split.
pipeline.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = pipeline.predict(X_test)

# Evaluation: compute every metric first, then print the report.
banner = "=" * 50
train_accuracy = pipeline.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# Column 1 of predict_proba is the score for the positive class (stroke == 1).
stroke_scores = pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, stroke_scores)

print("\n" + banner)
print("Model Evaluation with SMOTE")
print(banner)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Logistic Regression Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)
print("ROC AUC Score:", auc)

# Example predictions for the first (up to) three rows of the test split.
sample_data = X_test.iloc[:3]

# Score the whole sample in one call instead of once per row, and predict the
# labels from the sample itself rather than relying on y_pred sharing the
# same row order as X_test.
sample_probs = pipeline.predict_proba(sample_data)[:, 1]
sample_preds = pipeline.predict(sample_data)

print("\n" + "="*50)
print("Sample Predictions")
print("="*50)
# Iterate over the rows actually present in the sample rather than a
# hard-coded count of 3.
for i, ((_, row), prob, pred) in enumerate(
    zip(sample_data.iterrows(), sample_probs, sample_preds), start=1
):
    print(f"Pasien {i}:")
    # to_dict() yields plain Python numbers, so the printed dict stays
    # readable regardless of how NumPy formats its scalar types.
    print(f"  Faktor Risiko: {row.to_dict()}")
    print(f"  Prediksi: {'Stroke' if pred == 1 else 'Tidak Stroke'}")
    print(f"  Probabilitas Stroke: {prob:.2%}\n")


Model Evaluation with SMOTE
Training Accuracy: 0.7424
Test Accuracy: 0.7368
Logistic Regression Accuracy: 0.7367906066536204

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.73      0.84       972
           1       0.13      0.80      0.23        50

    accuracy                           0.74      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.74      0.81      1022

ROC AUC Score: 0.8385390946502057

Sample Predictions
Pasien 1:
  Faktor Risiko: {'gender': 2.0, 'age': 28.0, 'hypertension': 1.0, 'heart_disease': 0.0, 'avg_glucose_level': 83.66, 'smoking_status': 1.0, 'bmi': 36.4}
  Prediksi: Tidak Stroke
  Probabilitas Stroke: 8.02%

Pasien 2:
  Faktor Risiko: {'gender': 2.0, 'age': 3.0, 'hypertension': 0.0, 'heart_disease': 0.0, 'avg_glucose_level': 93.3, 'smoking_status': 0.0, 'bmi': 19.5}
  Prediksi: Tidak Stroke
  Probabilitas Stroke: 0.54%

Pasien 3:
  Faktor Risiko: {'gende