In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score,accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# Load data
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
# Preporcessing data (Data cleansing)
df = df.drop(['id', 'ever_married', 'work_type', 'Residence_type'], axis=1)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,202.21,,never smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
3,Female,49.0,0,0,171.23,34.4,smokes,1
4,Female,79.0,1,0,174.12,24.0,never smoked,1


In [3]:
# Ubah value gender supaya mudah dimanipulasi
df['gender'] = df['gender'].map({'Male': 1, 'Female': 2})

In [4]:
# Ubah value smoking_status 
df['smoking_status'] = df['smoking_status'].map({
    'Unknown': 0,
    'never smoked': 1,
    'formerly smoked': 2,
    'smokes': 3
})

In [5]:
# Handle missing values 
imputer = SimpleImputer(strategy='median')
df['bmi'] = imputer.fit_transform(df[['bmi']])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,67.0,0,1,228.69,36.6,2,1
1,2.0,61.0,0,0,202.21,28.1,1,1
2,1.0,80.0,0,1,105.92,32.5,1,1
3,2.0,49.0,0,0,171.23,34.4,3,1
4,2.0,79.0,1,0,174.12,24.0,1,1


In [6]:
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
count,5109.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,1.586025,43.226614,0.097456,0.054012,106.147677,28.862035,1.179843,0.048728
std,0.492592,22.612647,0.296607,0.226063,45.28356,7.699562,1.029961,0.21532
min,1.0,0.08,0.0,0.0,55.12,10.3,0.0,0.0
25%,1.0,25.0,0.0,0.0,77.245,23.8,0.0,0.0
50%,2.0,45.0,0.0,0.0,91.885,28.1,1.0,0.0
75%,2.0,61.0,0.0,0.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,271.74,97.6,3.0,1.0


In [27]:
# Handle untuk menyamakan data
df = df.dropna()
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,1.586025,43.229986,0.097475,0.054022,106.140399,28.8633,1.179683,0.048738
std,0.492592,22.613575,0.296633,0.226084,45.285004,7.699785,1.029998,0.21534
min,1.0,0.08,0.0,0.0,55.12,10.3,0.0,0.0
25%,1.0,25.0,0.0,0.0,77.24,23.8,0.0,0.0
50%,2.0,45.0,0.0,0.0,91.88,28.1,1.0,0.0
75%,2.0,61.0,0.0,0.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,271.74,97.6,3.0,1.0


In [9]:
#Split to X and y param
X = df[['gender','age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'smoking_status','bmi']]
y = df['stroke']

In [10]:
# Standardisasi fitur numerik
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Split data menjadi train dan test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

In [12]:
# Latih model Logistic Regression
model = LogisticRegression(multi_class="multinomial",solver="lbfgs",max_iter=1000,class_weight='balanced',random_state=42)
model.fit(X_train, y_train)



In [13]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")

Training Accuracy: 0.7397
Test Accuracy: 0.7329


In [15]:
y_pred = model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.7328767123287672
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.73      0.84       972
           1       0.13      0.80      0.23        50

    accuracy                           0.73      1022
   macro avg       0.56      0.76      0.53      1022
weighted avg       0.94      0.73      0.81      1022



In [17]:
#Try to test the model 
import pandas as pd

# Contoh data pasien dalam format dictionary
patient_data = [
    { # Pasien 1: Risiko tinggi
        'gender': 1,           # 1 = Female, 0 = Male
        'age': 68,
        'hypertension': 1,     # 1 = Ya, 0 = Tidak
        'heart_disease': 1,    # 1 = Ya, 0 = Tidak
        'avg_glucose_level': 220.5,
        'smoking_status': 3,   # 0=Unknown, 1=never, 2=former, 3=current
        'bmi': 34.2
    },
    { # Pasien 2: Risiko rendah
        'gender': 2,
        'age': 45,
        'hypertension': 0,
        'heart_disease': 0,
        'avg_glucose_level': 95.1,
        'smoking_status': 1,
        'bmi': 22.8
    },
    { # Pasien 3: Kasus borderline
        'gender': 2,
        'age': 58,
        'hypertension': 0,
        'heart_disease': 1,
        'avg_glucose_level': 142.3,
        'smoking_status': 0,
        'bmi': 28.6
    }
]

In [19]:
# Convert ke DataFrame
test_df = pd.DataFrame(patient_data)
# Preprocessing (sama seperti saat training)
test_scaled = scaler.transform(test_df)  # Gunakan scaler yang sudah difit

In [21]:
# Prediksi
predictions = model.predict(test_scaled)
pred_probs = model.predict_proba(test_scaled)[:, 1]  # Probabilitas kelas positif (stroke)

In [23]:
for i, (pred, prob) in enumerate(zip(predictions, pred_probs)):
    print(f"Pasien {i+1}:")
    print(f"  Faktor Risiko: {dict(test_df.iloc[i])}")
    print(f"  Prediksi: {'Stroke' if pred == 1 else 'Tidak Stroke'}")
    print(f"  Probabilitas Stroke: {prob:.2%}")
    print("\n")

Pasien 1:
  Faktor Risiko: {'gender': 1.0, 'age': 68.0, 'hypertension': 1.0, 'heart_disease': 1.0, 'avg_glucose_level': 220.5, 'smoking_status': 3.0, 'bmi': 34.2}
  Prediksi: Stroke
  Probabilitas Stroke: 89.33%


Pasien 2:
  Faktor Risiko: {'gender': 2.0, 'age': 45.0, 'hypertension': 0.0, 'heart_disease': 0.0, 'avg_glucose_level': 95.1, 'smoking_status': 1.0, 'bmi': 22.8}
  Prediksi: Tidak Stroke
  Probabilitas Stroke: 22.77%


Pasien 3:
  Faktor Risiko: {'gender': 2.0, 'age': 58.0, 'hypertension': 0.0, 'heart_disease': 1.0, 'avg_glucose_level': 142.3, 'smoking_status': 0.0, 'bmi': 28.6}
  Prediksi: Stroke
  Probabilitas Stroke: 54.47%




In [25]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)