In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw stroke dataset and show a quick structural summary.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact location exists. Consider a path relative to a
# configurable data directory.
file_path = 'C:/Users/zen/Documents/-- four/s2/FYP I/XAl-on-healthcare-diagnostics/version_3.0/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line.
data.info()
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female

<font color="slategray" size="+1"><b>Preprocess</b></font>

In [3]:
# Drop the row identifier column before building the feature matrix.
# errors='ignore' keeps this cell re-runnable once `id` has already been removed.
data = data.drop(columns=['id'], errors='ignore')

target_column = 'stroke'

# Separate features and target.
# Missing `bmi` values are deliberately left as NaN at this point: they are
# imputed inside preprocess_data() using the mean of the *training* split only.
# Filling them with the mean of the whole dataset before splitting would leak
# test-set statistics into the training features.
X = data.drop(columns=[target_column])
y = data[target_column]

def preprocess_data(X_train, X_test, categorical_columns, numerical_columns):
    """
    Fit the preprocessing pipeline on the training data and apply it to both splits.

    Numerical columns are mean-imputed and then standardised; categorical
    columns are one-hot encoded. Every fitted statistic (imputation means,
    scaling parameters, category vocabulary) comes from X_train only and is
    reused unchanged on X_test. Columns listed in neither argument are dropped
    (ColumnTransformer's default remainder='drop').

    Args:
    - X_train: The training feature data.
    - X_test: The testing feature data.
    - categorical_columns: List of categorical columns to be one-hot encoded.
    - numerical_columns: List of numerical columns to be imputed and scaled.

    Returns:
    - X_train_processed: Preprocessed training data (dense array).
    - X_test_processed: Preprocessed testing data (dense array).
    - pipeline: The fitted Pipeline; its 'preprocessor' step is the fitted
      ColumnTransformer, with the numerical branch first and the categorical
      branch second.
    """
    # Impute before scaling so missing values are filled with the training mean
    # rather than passed through the scaler as NaN.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Set up a column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),  # Impute + scale numerical columns
            # handle_unknown='ignore': categories unseen during fit encode as all zeros
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),  # Encode categorical columns
        ],
        # Always return a dense array so the result can be wrapped in a
        # DataFrame regardless of how sparse the one-hot block is.
        sparse_threshold=0,
    )

    # Single-step wrapper kept so callers can reach the transformer via
    # pipeline.named_steps['preprocessor'].
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    X_train_processed = pipeline.fit_transform(X_train)
    X_test_processed = pipeline.transform(X_test)

    print(f"Processed Training Data Shape: {X_train_processed.shape}")
    print(f"Processed Testing Data Shape: {X_test_processed.shape}")

    return X_train_processed, X_test_processed, pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): `hypertension` and `heart_disease` appear in neither list, so the
# ColumnTransformer discards them from the processed output. Confirm this is
# intended before modelling on the saved data.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_columns = ['age', 'avg_glucose_level', 'bmi']

X_train_processed, X_test_processed, pipeline = preprocess_data(X_train, X_test, categorical_columns, numerical_columns)

Processed Training Data Shape: (4088, 18)
Processed Testing Data Shape: (1022, 18)


In [4]:
# Look up the fitted one-hot encoder by its transformer name ('cat') and ask it
# for the expanded column names it produced for the categorical inputs.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
processed_features = fitted_encoder.get_feature_names_out(categorical_columns)

# Column order mirrors the transformer order: numerical columns first, then one-hot columns.
all_features = [*numerical_columns, *processed_features]

# Rebuild a labelled frame, append the target as the last column, and persist it.
processed_data_train = pd.DataFrame(
    X_train_processed, columns=all_features
).assign(stroke=y_train.values)
processed_data_train.to_csv('processed_stroke_data_train.csv', index=False)
print("Processed data saved to 'processed_stroke_data_train.csv'")

Processed data saved to 'processed_stroke_data_train.csv'


In [5]:
# The test split was transformed by the same fitted preprocessor, so its column
# names come from the same fitted one-hot encoder (registered as 'cat').
column_transformer = pipeline.named_steps['preprocessor']
processed_features_test = (
    column_transformer.named_transformers_['cat'].get_feature_names_out(categorical_columns)
)

# Numerical columns come first, followed by the one-hot encoded columns.
all_features_test = [*numerical_columns, *processed_features_test]

# Create df for the test data, with the target appended as the last column
processed_data_test = pd.DataFrame(
    X_test_processed, columns=all_features_test
).assign(stroke=y_test.values)

# Save the processed test data to a CSV file
processed_data_test.to_csv('processed_stroke_data_test.csv', index=False)
print("Processed test data saved to 'processed_stroke_data_test.csv'")

Processed test data saved to 'processed_stroke_data_test.csv'
