In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [2]:
# Load the raw stroke dataset into `data`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as-is on the original author's machine. Consider a path relative to the
# notebook (or a configurable data directory) before sharing.
file_path = 'C:/Users/zen/Documents/-- four/s2/FYP I/XAl-on-healthcare-diagnostics/version_3.0/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female

<font color="slategray" size="+1"><b>Preprocess</b></font>

In [3]:
# === CLEANING ===

# Drop unnecessary ID column. Rebinding `data` (instead of inplace=True) avoids
# mutating the frame object that the loading cell created.
data = data.drop(columns=['id'])

# Remove rare gender outlier. The boolean filter returns a new frame that pandas
# still tracks as derived from the original; .copy() makes it independent so the
# column assignments below do not raise SettingWithCopyWarning.
data = data[data['gender'] != 'Other'].copy()

# Fill missing BMI with mean (you could also bin it, but keeping numeric for now)
# NOTE(review): the mean is computed over all rows, i.e. before the train/test
# split further down, so test rows contribute to the imputed value — confirm
# this is acceptable for the evaluation.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Handle smoking status for children: an 'Unknown' status at age 12 or younger
# is recorded as 'never smoked'.
data.loc[(data['smoking_status'] == 'Unknown') & (data['age'] <= 12), 'smoking_status'] = 'never smoked'

# === SIMPLIFYING CATEGORICALS ===

# Binary mappings (keep these — they're interpretable)
data['gender'] = data['gender'].map({'Male': 1, 'Female': 0})
data['ever_married'] = data['ever_married'].map({'Yes': 1, 'No': 0})
data['Residence_type'] = data['Residence_type'].map({'Urban': 1, 'Rural': 0})

# Series.map turns every value that is missing from the mapping into NaN without
# complaint; fail loudly here rather than passing NaNs on to later steps.
binary_columns = ['gender', 'ever_married', 'Residence_type']
assert data[binary_columns].notna().all().all(), (
    f"Unmapped values in binary columns: {data[binary_columns].isna().sum().to_dict()}"
)

# Normalize and simplify category values (still readable)
data['work_type'] = data['work_type'].replace({
    'Private': 'private',
    'Self-employed': 'self_employed',
    'Govt_job': 'govt_job',
    'Never_worked': 'never_worked',
    'children': 'children'  # already normalized; listed for completeness
})

data['smoking_status'] = data['smoking_status'].replace({
    'formerly smoked': 'formerly_smoked',
    'never smoked': 'never_smoked',
    'Unknown': 'unknown'
})

# Optional: Convert work_type & smoking_status to category dtype (helpful for EBM)
data = data.astype({'work_type': 'category', 'smoking_status': 'category'})

# === BIN BMI IF YOU WANT HUMAN-FRIENDLY OUTPUT (OPTIONAL) ===
def categorize_bmi(bmi):
    """Map a numeric BMI value to its standard adult weight-status label.

    Each lower bound is inclusive, matching the standard adult cut-offs:
    below 18.5 -> 'Underweight', 18.5 to <25 -> 'Normal',
    25 to <30 -> 'Overweight', 30 and above -> 'Obese'.

    Parameters
    ----------
    bmi : float
        Body-mass index. May be missing (NaN or None).

    Returns
    -------
    str or float
        One of the four labels above, or NaN when `bmi` is missing. A missing
        value fails every comparison, so without the explicit check it would
        fall through to 'Obese'.
    """
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Swap the numeric BMI column for its labelled bucket, stored as a categorical.
data = (
    data
    .assign(bmi_category=data['bmi'].apply(categorize_bmi))
    .drop(columns=['bmi'])
    .astype({'bmi_category': 'category'})
)

# === Define features and target ===
target_column = 'stroke'
y = data[target_column]
X = data.drop(columns=[target_column])

# === Train-test split (no scaling, no balancing here) ===
# Stratified on the target so both splits keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
# Persist each split as a CSV with the target re-attached as the last column.
# DataFrame.assign works on a copy, so X_train / X_test themselves are unchanged.

# training set
train_data = X_train.assign(stroke=y_train)
train_data.to_csv('train_data.csv', index=False)

# test set
test_data = X_test.assign(stroke=y_test)
test_data.to_csv('test_data.csv', index=False)