In [55]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.pipeline import make_pipeline

In [7]:
# Load data
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
# Preporcessing data (Data cleansing)
df = df.drop(['id', 'ever_married', 'work_type', 'Residence_type'], axis=1)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,202.21,,never smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
3,Female,49.0,0,0,171.23,34.4,smokes,1
4,Female,79.0,1,0,174.12,24.0,never smoked,1


In [11]:
# Ubah value gender supaya mudah dimanipulasi
df['gender'] = df['gender'].map({'Male': 1, 'Female': 2})

In [13]:
# Ubah value smoking_status 
df['smoking_status'] = df['smoking_status'].map({
    'Unknown': 0,
    'never smoked': 1,
    'formerly smoked': 2,
    'smokes': 3
})

In [19]:
# Handle missing values 
imputer = SimpleImputer(strategy='median')
df['bmi'] = imputer.fit_transform(df[['bmi']])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,67.0,0,1,228.69,36.6,2,1
1,2.0,61.0,0,0,202.21,28.1,1,1
2,1.0,80.0,0,1,105.92,32.5,1,1
3,2.0,49.0,0,0,171.23,34.4,3,1
4,2.0,79.0,1,0,174.12,24.0,1,1


In [21]:
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
count,5109.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,1.586025,43.226614,0.097456,0.054012,106.147677,28.862035,1.179843,0.048728
std,0.492592,22.612647,0.296607,0.226063,45.28356,7.699562,1.029961,0.21532
min,1.0,0.08,0.0,0.0,55.12,10.3,0.0,0.0
25%,1.0,25.0,0.0,0.0,77.245,23.8,0.0,0.0
50%,2.0,45.0,0.0,0.0,91.885,28.1,1.0,0.0
75%,2.0,61.0,0.0,0.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,271.74,97.6,3.0,1.0


In [25]:
#Jumlah data tidak sesuai, perlu disamakan/ dihilangkan yang na
df = df.dropna()
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,1.586025,43.229986,0.097475,0.054022,106.140399,28.8633,1.179683,0.048738
std,0.492592,22.613575,0.296633,0.226084,45.285004,7.699785,1.029998,0.21534
min,1.0,0.08,0.0,0.0,55.12,10.3,0.0,0.0
25%,1.0,25.0,0.0,0.0,77.24,23.8,0.0,0.0
50%,2.0,45.0,0.0,0.0,91.88,28.1,1.0,0.0
75%,2.0,61.0,0.0,0.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,271.74,97.6,3.0,1.0


In [27]:
#Split to X and y param
X = df[['gender','age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'smoking_status','bmi']]
y = df['stroke']

In [31]:
# Standardisasi fitur numerik
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

In [35]:
model_rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # menangani data imbalance
    random_state=42
)

In [37]:
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

In [41]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9481409001956947
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



In [47]:
from imblearn.over_sampling import SMOTE

# Setelah split
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Fit ulang Random Forest
model_rf.fit(X_train_res, y_train_res)
y_pred_rf = model_rf.predict(X_test)

In [49]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8943248532289628
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94       972
           1       0.15      0.24      0.18        50

    accuracy                           0.89      1022
   macro avg       0.55      0.58      0.56      1022
weighted avg       0.92      0.89      0.91      1022



In [63]:
print("ROC AUC Score:", roc_auc_score(y_test, model_rf.predict_proba(X_test)[:, 1]))

ROC AUC Score: 0.7895267489711935


In [71]:
sample_data = X_test.iloc[:3]  # Data asli, belum discale
sample_scaled = X_test[:3]        # Data yang sudah discale untuk prediksi

print("\n" + "="*50)
print("Sample Predictions")
print("="*50)
for i in range(3):
    prob = model_rf.predict_proba(sample_scaled[i].reshape(1, -1))[0][1]
    pred = model_rf.predict(sample_scaled[i].reshape(1, -1))[0]
    print(f"Pasien {i+1}:")
    print(f"  Faktor Risiko: {dict(sample_data.iloc[i])}")
    print(f"  Prediksi: {'Stroke' if pred == 1 else 'Tidak Stroke'}")
    print(f"  Probabilitas Stroke: {prob:.2%}\n")


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'