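# Feature Engineering

Prepares the Kaggle stroke-prediction dataset for modelling: median-imputes missing BMI values, one-hot encodes the categorical features, robust-scales the numerical features, and saves scaled and unscaled copies of the data together with the fitted scaler.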


In [1]:
# Load required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

# Plot appearance: the 'seaborn-v0_8' matplotlib style with seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

# Set pandas' column-count, width and column-width display options to None
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

print("All packages loaded successfully!")


All packages loaded successfully!


In [2]:
import os
import kagglehub
from dotenv import load_dotenv

# Read variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Kaggle credentials used by kagglehub — confirm.
load_dotenv()


True

# Load Data


In [3]:
# Download the Kaggle stroke-prediction dataset; the returned path is used below
# as the directory that contains the CSV.
dataset_path = kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")
csv_file_path = os.path.join(dataset_path, 'healthcare-dataset-stroke-data.csv')

# Read the raw records into the working frame, then show its size and first rows
s_df = pd.read_csv(csv_file_path)
print("Dataset shape:", s_df.shape)
s_df.head()


Dataset shape: (5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Initial Preprocessing


In [4]:
# Use the patient identifier as the row index so it is not carried along as a column.
# Guarded so the cell can be re-run: once 'id' is the index it is no longer a
# column, and a second set_index('id') would raise KeyError.
if 'id' in s_df.columns:
    s_df = s_df.set_index('id')
s_df.head()


Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# Fold the 'Other' gender category into 'Female' (same choice as in the EDA)
is_other_gender = s_df['gender'].eq('Other')
s_df['gender'] = s_df['gender'].mask(is_other_gender, 'Female')

print(
    "Gender distribution after handling 'Other' category:",
    s_df['gender'].value_counts(),
    sep="\n",
)


Gender distribution after handling 'Other' category:
gender
Female    2995
Male      2115
Name: count, dtype: int64


In [6]:
# Recode the 0/1 indicator columns as No/Yes labels (as done in EDA).
# replace() leaves values that are already labels alone, so re-running is harmless.
yes_no_labels = {0: 'No', 1: 'Yes'}
for indicator in ('hypertension', 'heart_disease'):
    s_df[indicator] = s_df[indicator].replace(yes_no_labels)

print("Binary variable distributions:")
for display_name, indicator in (("Hypertension", 'hypertension'), ("Heart Disease", 'heart_disease')):
    print(f"{display_name}: {s_df[indicator].value_counts().to_dict()}")


Binary variable distributions:
Hypertension: {'No': 4612, 'Yes': 498}
Heart Disease: {'No': 4834, 'Yes': 276}


# Feature Engineering


## Handle Missing Values


In [7]:
# Report every column that has missing values, with its count and share of rows
print("Missing values summary:")
missing_values = s_df.isnull().sum()
total_rows = len(s_df)
for column_name, n_missing in missing_values.loc[missing_values.gt(0)].items():
    share = (n_missing / total_rows) * 100
    print(f"  {column_name}: {n_missing:,} ({share:.2f}%)")

print(f"\nTotal missing values: {missing_values.sum()}")


Missing values summary:
  bmi: 201 (3.93%)

Total missing values: 201


In [8]:
# Impute missing BMI with the column median (less sensitive to outliers than the mean).
# NOTE(review): the median is taken over every row of s_df; if a train/test split is
# applied later, consider deriving it from the training rows only — confirm intent.
bmi_median = s_df['bmi'].median()
print(f"BMI median value for imputation: {bmi_median:.2f}")

# Overwrite only the rows where BMI is absent
s_df.loc[s_df['bmi'].isna(), 'bmi'] = bmi_median

# Confirm nothing is left to impute
remaining_bmi_gaps = s_df['bmi'].isna().sum()
remaining_total_gaps = s_df.isna().sum().sum()
print(f"Missing BMI values after imputation: {remaining_bmi_gaps}")
print(f"Total missing values in dataset: {remaining_total_gaps}")


BMI median value for imputation: 28.10
Missing BMI values after imputation: 0
Total missing values in dataset: 0


## Encode Categorical Features


In [9]:
# Group the columns by role: the label, the continuous inputs and the categorical inputs
target_feature = 'stroke'
numerical_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = [
    'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'smoking_status',
]

print(f"Numerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print(f"Target feature: {target_feature}")

# Sanity check: the three groups together should cover exactly the frame's columns
all_features = [*numerical_features, *categorical_features, target_feature]
columns_match = set(all_features) == set(s_df.columns)
print(f"\nAll features accounted for: {columns_match}")
print(f"Dataset columns: {list(s_df.columns)}")
print(f"Identified features: {all_features}")


Numerical features (3): ['age', 'avg_glucose_level', 'bmi']
Categorical features (7): ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Target feature: stroke

All features accounted for: True
Dataset columns: ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']
Identified features: ['age', 'avg_glucose_level', 'bmi', 'gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke']


In [10]:
# Work on a copy so later steps do not touch s_df
fe_df = s_df.copy()

# Show the level counts of every categorical feature prior to encoding
print("Categorical feature distributions before encoding:")
for feature_name in categorical_features:
    level_counts = fe_df[feature_name].value_counts()
    print(f"\n{feature_name}:", level_counts, sep="\n")


Categorical feature distributions before encoding:

gender:
gender
Female    2995
Male      2115
Name: count, dtype: int64

hypertension:
hypertension
No     4612
Yes     498
Name: count, dtype: int64

heart_disease:
heart_disease
No     4834
Yes     276
Name: count, dtype: int64

ever_married:
ever_married
Yes    3353
No     1757
Name: count, dtype: int64

work_type:
work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

Residence_type:
Residence_type
Urban    2596
Rural    2514
Name: count, dtype: int64

smoking_status:
smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64


In [11]:
# Apply One-Hot Encoding to categorical features.
# The encoding reads from s_df rather than fe_df: a later cell rebinds fe_df to the
# encoded frame, after which the categorical columns no longer exist in fe_df and
# re-running this cell would fail with a KeyError. fe_df was copied from s_df and
# has not been changed since, so the encoded result is the same.
print("Applying one-hot encoding to categorical features:")

# drop_first=True drops the first level of each feature; dtype=int gives 0/1 integers
fe_df_encoded = pd.get_dummies(s_df, columns=categorical_features, drop_first=True, dtype=int)

print(f"Dataset shape before encoding: {s_df.shape}")
print(f"Dataset shape after encoding: {fe_df_encoded.shape}")

# Every column that is neither a numerical feature nor the target was produced by the encoding
passthrough_columns = set(numerical_features) | {target_feature}
new_columns = [col for col in fe_df_encoded.columns if col not in passthrough_columns]
print(f"\nNew one-hot encoded columns ({len(new_columns)}):")
for col in new_columns:
    print(f"  - {col}")

print("\nOne-hot encoding completed!")


Applying one-hot encoding to categorical features:
Dataset shape before encoding: (5110, 11)
Dataset shape after encoding: (5110, 16)

New one-hot encoded columns (12):
  - gender_Male
  - hypertension_Yes
  - heart_disease_Yes
  - ever_married_Yes
  - work_type_Never_worked
  - work_type_Private
  - work_type_Self-employed
  - work_type_children
  - Residence_type_Urban
  - smoking_status_formerly smoked
  - smoking_status_never smoked
  - smoking_status_smokes

One-hot encoding completed!


In [12]:
# Adopt the encoded frame as the working dataset (independent copy)
fe_df = fe_df_encoded.copy()

# Recap of the encoding result
print(
    "One-hot encoded features verification:",
    f"All columns in encoded dataset: {list(fe_df_encoded.columns)}",
    f"\nNumerical features: {numerical_features}",
    f"Target feature: {target_feature}",
    f"One-hot encoded features: {len(new_columns)}",
    sep="\n",
)
print(f"\nUpdated working dataset shape: {fe_df.shape}")


One-hot encoded features verification:
All columns in encoded dataset: ['age', 'avg_glucose_level', 'bmi', 'stroke', 'gender_Male', 'hypertension_Yes', 'heart_disease_Yes', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']

Numerical features: ['age', 'avg_glucose_level', 'bmi']
Target feature: stroke
One-hot encoded features: 12

Updated working dataset shape: (5110, 16)


## Scale Numerical Features


In [13]:
# Display numerical feature statistics before scaling.
# Values are read from fe_df_encoded (not modified after encoding) instead of fe_df:
# the scaling cell overwrites these columns in fe_df, so re-running this cell after
# scaling would otherwise report and store already-scaled values as "unscaled".
print("Numerical features statistics before scaling:")
print(fe_df_encoded[numerical_features].describe().round(2))

# Save unscaled numerical features for NeMo Data Designer
# NDD needs real values for LLM medical coherence validation
unscaled_numerical = fe_df_encoded[numerical_features].copy()
print(f"\n✓ Saved unscaled numerical features for NDD: {unscaled_numerical.shape}")

Numerical features statistics before scaling:
           age  avg_glucose_level      bmi
count  5110.00            5110.00  5110.00
mean     43.23             106.15    28.86
std      22.61              45.28     7.70
min       0.08              55.12    10.30
25%      25.00              77.24    23.80
50%      45.00              91.88    28.10
75%      61.00             114.09    32.80
max      82.00             271.74    97.60

✓ Saved unscaled numerical features for NDD: (5110, 3)


In [14]:
# Apply RobustScaler to numerical features only
# Based on EDA, avg_glucose_level and bmi have outliers
#
# The scaler is fitted on the untouched values in fe_df_encoded rather than on fe_df:
# this cell overwrites fe_df's numerical columns, so fitting on fe_df would, on a
# re-run, refit the scaler to already-scaled data and the scaler exported later
# would no longer map raw values onto the scaled columns.
# NOTE(review): the scaler is fitted on every row; if a hold-out split is made
# downstream, consider fitting on the training rows only — confirm intent.
robust_scaler = RobustScaler()
fe_df[numerical_features] = robust_scaler.fit_transform(fe_df_encoded[numerical_features])

print("Numerical features statistics after robust scaling:")
print(fe_df[numerical_features].describe().round(3))

# One-hot encoded features don't need scaling as they are already binary (0/1)
print("\nOne-hot encoded features remain as binary (0/1) - no scaling needed")
print(f"Total features after encoding and scaling: {fe_df.shape[1]}")

# The fitted scaler stays in `robust_scaler`; it is written to disk in the Save Data section
print("\n✓ Fitted scaler saved (will be exported with data)")

Numerical features statistics after robust scaling:
            age  avg_glucose_level       bmi
count  5110.000           5110.000  5110.000
mean     -0.049              0.387     0.085
std       0.628              1.229     0.856
min      -1.248             -0.998    -1.978
25%      -0.556             -0.397    -0.478
50%       0.000              0.000     0.000
75%       0.444              0.603     0.522
max       1.028              4.881     7.722

One-hot encoded features remain as binary (0/1) - no scaling needed
Total features after encoding and scaling: 16

✓ Fitted scaler saved (will be exported with data)


## Create Final Prepared Dataset


In [15]:
# Keep an independent copy of the engineered frame under its final name
prep_df = fe_df.copy()

# Overview: shape, dtypes, remaining gaps and the label balance
target_counts = prep_df[target_feature].value_counts().sort_index()
print(
    "Final prepared dataset info:",
    f"Shape: {prep_df.shape}",
    "Data types:",
    prep_df.dtypes,
    f"\nMissing values: {prep_df.isna().sum().sum()}",
    "Target variable distribution:",
    target_counts,
    sep="\n",
)


Final prepared dataset info:
Shape: (5110, 16)
Data types:
age                               float64
avg_glucose_level                 float64
bmi                               float64
stroke                              int64
gender_Male                         int64
hypertension_Yes                    int64
heart_disease_Yes                   int64
ever_married_Yes                    int64
work_type_Never_worked              int64
work_type_Private                   int64
work_type_Self-employed             int64
work_type_children                  int64
Residence_type_Urban                int64
smoking_status_formerly smoked      int64
smoking_status_never smoked         int64
smoking_status_smokes               int64
dtype: object

Missing values: 0
Target variable distribution:
stroke
0    4861
1     249
Name: count, dtype: int64


In [16]:
# Preview the prepared data; the trailing expression renders as the cell output
print("First 5 rows of prepared dataset:")
prep_df.head(5)


First 5 rows of prepared dataset:


Unnamed: 0_level_0,age,avg_glucose_level,bmi,stroke,gender_Male,hypertension_Yes,heart_disease_Yes,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9046,0.611111,3.712987,0.944444,1,1,0,1,1,0,1,0,0,1,1,0,0
51676,0.444444,2.9943,0.0,1,0,0,0,1,0,0,1,0,0,0,1,0
31112,0.972222,0.38092,0.488889,1,1,0,1,1,0,1,0,0,0,0,1,0
60182,0.111111,2.153481,0.7,1,0,0,0,1,0,1,0,0,1,0,0,1
1665,0.944444,2.231917,-0.455556,1,0,1,0,1,0,0,1,0,0,0,1,0


In [17]:
# Separate the prepared frame into the label vector and the feature matrix
y = prep_df[target_feature]
X = prep_df.drop(target_feature, axis=1)

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Feature columns: {list(X.columns)}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# Class imbalance: size of the larger class relative to the smaller one
class_counts = y.value_counts().sort_index()
minority_class, majority_class = class_counts.min(), class_counts.max()
imbalance_ratio = majority_class / minority_class
minority_share = (minority_class / len(y)) * 100
print(f"Class imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"Minority class percentage: {minority_share:.1f}%")


Features (X) shape: (5110, 15)
Target (y) shape: (5110,)
Feature columns: ['age', 'avg_glucose_level', 'bmi', 'gender_Male', 'hypertension_Yes', 'heart_disease_Yes', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']
Target distribution: {0: 4861, 1: 249}
Class imbalance ratio: 19.52:1
Minority class percentage: 4.9%


## Feature Engineering Summary


In [18]:
# Print a recap of the whole notebook as one block of text.
# NOTE(review): the BMI figures (201 values, median 28.1) and the feature counts in
# the "TRANSFORMATIONS APPLIED" part are literal text, not computed — update them
# by hand if the data changes.
rule = "=" * 50
one_hot_count = sum(1 for col in X.columns if col not in numerical_features)
all_numeric = all(prep_df.dtypes.apply(lambda x: x in ['int64', 'float64']))
nothing_missing = prep_df.isnull().sum().sum() == 0

summary_lines = [
    "FEATURE ENGINEERING SUMMARY",
    rule,
    # --- what was done ---
    "TRANSFORMATIONS APPLIED:",
    "1. Missing Value Handling:",
    "   - BMI: 201 missing values imputed with median (28.1)",
    "\n2. Categorical Encoding:",
    "   - Applied One-Hot Encoding to all 7 categorical features",
    "   - Used drop_first=True to avoid multicollinearity",
    "   - Nominal categories properly converted to binary features",
    "\n3. Numerical Feature Scaling:",
    "   - Applied RobustScaler to 3 numerical features (age, avg_glucose_level, bmi)",
    "   - One-hot encoded features left as binary (0/1) - no scaling needed",
    "   - Features normalized while maintaining distribution shape",
    "\n4. Data Preprocessing:",
    "   - Handled 'Other' gender category (combined with Female)",
    "   - Converted binary variables to consistent Yes/No format before encoding",
    "   - Set patient ID as index",
    # --- what came out (computed from the live objects) ---
    "\nFINAL DATASET CHARACTERISTICS:",
    f"- Shape: {prep_df.shape}",
    f"- Total features: {len(X.columns)}",
    f"- Numerical features (scaled): {len(numerical_features)}",
    f"- One-hot encoded features: {one_hot_count}",
    f"- All numeric data types: {all_numeric}",
    f"- No missing values: {nothing_missing}",
    f"- Target variable: {target_feature}",
    f"- Class imbalance: {imbalance_ratio:.1f}:1 (Severe imbalance suitable for testing techniques)",
    # --- which objects to use next ---
    "\nREADY FOR MACHINE LEARNING:",
    "- prep_df: Complete preprocessed dataset (SCALED)",
    "- X: Feature matrix ready for model training",
    "- y: Target vector ready for model training",
    "- Categorical features properly one-hot encoded",
    "- Numerical features properly scaled",
    "- Dataset ready for train/test split and model experimentation",
    "\nFOR NEMO DATA DESIGNER:",
    "- unscaled_numerical: Original numerical values for NDD",
    "- robust_scaler: Fitted scaler to apply to synthetic data",
    "- NDD should generate with unscaled values, then re-scale",
    rule,
]
print(*summary_lines, sep="\n")


FEATURE ENGINEERING SUMMARY
TRANSFORMATIONS APPLIED:
1. Missing Value Handling:
   - BMI: 201 missing values imputed with median (28.1)

2. Categorical Encoding:
   - Applied One-Hot Encoding to all 7 categorical features
   - Used drop_first=True to avoid multicollinearity
   - Nominal categories properly converted to binary features

3. Numerical Feature Scaling:
   - Applied RobustScaler to 3 numerical features (age, avg_glucose_level, bmi)
   - One-hot encoded features left as binary (0/1) - no scaling needed
   - Features normalized while maintaining distribution shape

4. Data Preprocessing:
   - Handled 'Other' gender category (combined with Female)
   - Converted binary variables to consistent Yes/No format before encoding
   - Set patient ID as index

FINAL DATASET CHARACTERISTICS:
- Shape: (5110, 16)
- Total features: 15
- Numerical features (scaled): 3
- One-hot encoded features: 12
- All numeric data types: True
- No missing values: True
- Target variable: stroke
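- Class imbalance: 19.5:1 (Severe imbalance suitable for testing techniques)

READY FOR MACHINE LEARNING:
- prep_df: Complete preprocessed dataset (SCALED)
- X: Feature matrix ready for model training
- y: Target vector ready for model training
- Categorical features properly one-hot encoded
- Numerical features properly scaled
- Dataset ready for train/test split and model experimentation

FOR NEMO DATA DESIGNER:
- unscaled_numerical: Original numerical values for NDD
- robust_scaler: Fitted scaler to apply to synthetic data
- NDD should generate with unscaled values, then re-scale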

# Save Data

In [19]:
import joblib

# Output directory: the working directory with its last 'notebooks' path component
# swapped for 'data' (e.g. <repo>/notebooks -> <repo>/data). Only a whole path
# component is replaced, so a folder whose name merely contains "notebooks" is
# left alone. If no component is named 'notebooks', the working directory is used.
cwd_parts = os.getcwd().split(os.sep)
if 'notebooks' in cwd_parts:
    last_match = len(cwd_parts) - 1 - cwd_parts[::-1].index('notebooks')
    cwd_parts[last_match] = 'data'
data_dir = os.sep.join(cwd_parts)
# Create the directory if needed so the writes below do not fail when it is missing
os.makedirs(data_dir, exist_ok=True)

# Save the scaled dataset (for all models)
prep_df.to_csv(os.path.join(data_dir, 'stroke_data_prepared.csv'), index=True)
print(f"✓ Scaled dataset saved: {data_dir}/stroke_data_prepared.csv")

# Create unscaled version for NeMo Data Designer
# Combine unscaled numerical features with one-hot encoded features and target
# The values are written back by position, so the rows must be in the same order
assert unscaled_numerical.index.equals(prep_df.index), \
    "unscaled_numerical is not row-aligned with prep_df"
unscaled_df = prep_df.copy()
unscaled_df[numerical_features] = unscaled_numerical.values
unscaled_df.to_csv(os.path.join(data_dir, 'stroke_data_unscaled.csv'), index=True)
print(f"✓ Unscaled dataset saved: {data_dir}/stroke_data_unscaled.csv")

# Save the fitted scaler for applying to NDD synthetic data
scaler_path = os.path.join(data_dir, 'robust_scaler.pkl')
joblib.dump(robust_scaler, scaler_path)
print(f"✓ Scaler saved: {scaler_path}")

print(f"\n📦 All files saved to: {data_dir}/")
print("   - stroke_data_prepared.csv (scaled, for ML models)")
print("   - stroke_data_unscaled.csv (unscaled, for NDD)")
print("   - robust_scaler.pkl (to scale NDD synthetic data)")

✓ Scaled dataset saved: /Users/wmurray/Documents/Personal Work/class-imbalance-testing/data/stroke_data_prepared.csv
✓ Unscaled dataset saved: /Users/wmurray/Documents/Personal Work/class-imbalance-testing/data/stroke_data_unscaled.csv
✓ Scaler saved: /Users/wmurray/Documents/Personal Work/class-imbalance-testing/data/robust_scaler.pkl

📦 All files saved to: /Users/wmurray/Documents/Personal Work/class-imbalance-testing/data/
   - stroke_data_prepared.csv (scaled, for ML models)
   - stroke_data_unscaled.csv (unscaled, for NDD)
   - robust_scaler.pkl (to scale NDD synthetic data)
