In [1]:
import pandas as pd

# Load the dataset from the path relative to this notebook.
df = pd.read_csv(filepath_or_buffer="../data/dataset.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
# Target vector: the stroke label column.
y = df.loc[:, "stroke"]

# Feature matrix: everything except the label and the ID column.
X = df.drop(["stroke", "id"], axis="columns")

(X.head(), y.head())

(   gender   age  hypertension  heart_disease ever_married      work_type  \
 0    Male  67.0             0              1          Yes        Private   
 1  Female  61.0             0              0          Yes  Self-employed   
 2    Male  80.0             0              1          Yes        Private   
 3  Female  49.0             0              0          Yes        Private   
 4  Female  79.0             1              0          Yes  Self-employed   
 
   Residence_type  avg_glucose_level   bmi   smoking_status  
 0          Urban             228.69  36.6  formerly smoked  
 1          Rural             202.21   NaN     never smoked  
 2          Rural             105.92  32.5     never smoked  
 3          Urban             171.23  34.4           smokes  
 4          Rural             174.12  24.0     never smoked  ,
 0    1
 1    1
 2    1
 3    1
 4    1
 Name: stroke, dtype: int64)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

(X_train.shape, X_test.shape)


((4088, 10), (1022, 10))

In [4]:
# Partition the feature columns by dtype: int64/float64 columns go to the
# numeric list, object-dtype columns to the categorical list.
numeric_features = X.select_dtypes(["int64", "float64"]).columns
categorical_features = X.select_dtypes(["object"]).columns

(numeric_features, categorical_features)

(Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'], dtype='object'),
 Index(['gender', 'ever_married', 'work_type', 'Residence_type',
        'smoking_status'],
       dtype='object'))

In [5]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [7]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [8]:
# Route each column group through its own branch; the numeric block is
# listed first, the categorical block second.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [9]:
# Fit the imputers, scaler and one-hot encoder on the training split only,
# so nothing is learned from the test rows.
preprocessor.fit(X_train)

In [10]:
# Apply the already-fitted preprocessor to both splits (transform only, no refit).
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

(X_train_processed.shape, X_test_processed.shape)

((4088, 21), (1022, 21))

In [11]:
# Rebuild the output column names (get_feature_names_out needs a newer sklearn).
# Numeric columns keep their names and come first, matching the transformer
# order in the preprocessor; the one-hot columns from the fitted encoder follow.
onehot = preprocessor.named_transformers_["cat"]["onehot"]
cat_feature_names = onehot.get_feature_names_out(categorical_features)

final_feature_names = [*numeric_features, *cat_feature_names]
(len(final_feature_names), final_feature_names[:10])

(21,
 ['age',
  'hypertension',
  'heart_disease',
  'avg_glucose_level',
  'bmi',
  'gender_Female',
  'gender_Male',
  'gender_Other',
  'ever_married_No',
  'ever_married_Yes'])

In [12]:
# NOTE(review): repeats the shape check already displayed right after the
# transform step; nothing is recomputed here.
X_train_processed.shape, X_test_processed.shape

((4088, 21), (1022, 21))

In [13]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

In [14]:
def evaluate_model(trained_pipeline, X_test, y_test):
    """
    Score a fitted binary-classification pipeline on held-out data.

    Parameters
    ----------
    trained_pipeline : fitted sklearn Pipeline (preprocessor + classifier)
        Must implement ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test : raw test DataFrame (NOT preprocessed manually)
    y_test : true labels (0/1)

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall", "F1", "AUC" and "MCC" to
        their values on the test data.

    Raises
    ------
    AttributeError
        If the pipeline exposes neither ``predict_proba`` nor
        ``decision_function``.
    """

    # 1) Hard class labels (0/1) feed every threshold-based metric.
    y_pred = trained_pipeline.predict(X_test)

    # 2) AUC needs a continuous ranking score. Prefer the positive-class
    #    probability; fall back to the raw decision value for classifiers
    #    that do not expose probabilities (roc_auc_score accepts both).
    if hasattr(trained_pipeline, "predict_proba"):
        y_score = trained_pipeline.predict_proba(X_test)[:, 1]
    else:
        y_score = trained_pipeline.decision_function(X_test)

    # 3) Compute all required metrics. zero_division=0 makes precision,
    #    recall and F1 report 0 instead of warning when a denominator is empty.
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_score),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    return metrics

In [15]:
# NOTE(review): a bare name only displays the function's repr; it does not call it.
evaluate_model

<function __main__.evaluate_model(trained_pipeline, X_test, y_test)>

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [17]:
# Give this pipeline its own unfitted copy of the preprocessor. Passing the
# shared `preprocessor` object directly would mean that fitting this pipeline
# refits the very same transformer instance used elsewhere in the notebook.
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000))
])

In [18]:
# Fit the pipeline's preprocessing and classifier steps on the training split.
logreg_pipeline.fit(X_train, y_train)

In [19]:
# Score the fitted logistic-regression pipeline on the raw test split.
logreg_metrics = evaluate_model(
    trained_pipeline=logreg_pipeline,
    X_test=X_test,
    y_test=y_test,
)
logreg_metrics

{'Accuracy': 0.952054794520548,
 'Precision': 1.0,
 'Recall': 0.02,
 'F1': 0.0392156862745098,
 'AUC': np.float64(0.8416666666666667),
 'MCC': np.float64(0.1379860743303784)}

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Give this pipeline its own unfitted copy of the preprocessor so that fitting
# it never refits a transformer object shared with the other pipelines.
knn_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", KNeighborsClassifier(n_neighbors=5))
])

# Train on the raw training split; preprocessing happens inside the pipeline.
knn_pipeline.fit(X_train, y_train)

# Score on the raw test split.
knn_metrics = evaluate_model(knn_pipeline, X_test, y_test)
knn_metrics

{'Accuracy': 0.9481409001956947,
 'Precision': 0.0,
 'Recall': 0.0,
 'F1': 0.0,
 'AUC': np.float64(0.6214197530864197),
 'MCC': np.float64(-0.012306240693812793)}

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# 1) Build pipeline: preprocessing + Naive Bayes.
#    The preprocessor is cloned so this pipeline owns an independent, unfitted
#    copy instead of refitting the transformer object shared across the notebook.
nb_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", GaussianNB())
])

# 2) Train the model on the raw training split
nb_pipeline.fit(X_train, y_train)

# 3) Evaluate the model on the raw test split
nb_metrics = evaluate_model(nb_pipeline, X_test, y_test)
nb_metrics

{'Accuracy': 0.2984344422700587,
 'Precision': 0.06405228758169934,
 'Recall': 0.98,
 'F1': 0.12024539877300613,
 'AUC': np.float64(0.7860493827160494),
 'MCC': np.float64(0.12100293429194203)}

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Give this pipeline its own unfitted copy of the preprocessor so that fitting
# it never refits a transformer object shared with the other pipelines.
# random_state is fixed so the fitted tree is reproducible across runs.
dt_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", DecisionTreeClassifier(
        random_state=42
    ))
])

# Train on the raw training split; preprocessing happens inside the pipeline.
dt_pipeline.fit(X_train, y_train)

# Score on the raw test split.
dt_metrics = evaluate_model(dt_pipeline, X_test, y_test)
dt_metrics

{'Accuracy': 0.9119373776908023,
 'Precision': 0.15517241379310345,
 'Recall': 0.18,
 'F1': 0.16666666666666666,
 'AUC': np.float64(0.5647942386831275),
 'MCC': np.float64(0.12081810174057676)}

In [23]:
# NOTE(review): re-displays the metrics dict already shown by the previous cell;
# nothing is recomputed here.
dt_metrics


{'Accuracy': 0.9119373776908023,
 'Precision': 0.15517241379310345,
 'Recall': 0.18,
 'F1': 0.16666666666666666,
 'AUC': np.float64(0.5647942386831275),
 'MCC': np.float64(0.12081810174057676)}