In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('../data/heart_disease.csv')

# Convert target to binary
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

print(f"Original dataset shape: {df.shape}")
print(f"\nMissing values:\n{df.isnull().sum()[df.isnull().sum() > 0]}")

Original dataset shape: (303, 14)

Missing values:
ca      4
thal    2
dtype: int64


In [2]:
# We have 4 missing in 'ca' and 2 in 'thal'
# These are important features (high correlation with target)
# Strategy: Since only 6 rows (2% of data), we'll drop them

print(f"Rows before dropping missing values: {len(df)}")

# Drop rows with missing values
df_clean = df.dropna()

print(f"Rows after dropping missing values: {len(df_clean)}")
print(f"Dropped: {len(df) - len(df_clean)} rows")
print(f"\nMissing values now:\n{df_clean.isnull().sum().sum()}")

# Check target distribution after dropping
print(f"\nTarget distribution after cleaning:")
print(df_clean['target'].value_counts())

Rows before dropping missing values: 303
Rows after dropping missing values: 297
Dropped: 6 rows

Missing values now:
0

Target distribution after cleaning:
target
0    160
1    137
Name: count, dtype: int64


In [3]:
# Let's examine outliers more carefully
# For medical data, extreme values might be real critical cases

def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Check outliers in key numerical features
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

for col in numerical_cols:
    outliers, lower, upper = detect_outliers_iqr(df_clean, col)
    print(f"\n{col}:")
    print(f"  Lower bound: {lower:.2f}, Upper bound: {upper:.2f}")
    print(f"  Number of outliers: {len(outliers)}")
    if len(outliers) > 0:
        print(f"  Outlier values: {sorted(outliers[col].values)}")


age:
  Lower bound: 28.50, Upper bound: 80.50
  Number of outliers: 0

trestbps:
  Lower bound: 90.00, Upper bound: 170.00
  Number of outliers: 9
  Outlier values: [np.float64(172.0), np.float64(174.0), np.float64(178.0), np.float64(178.0), np.float64(180.0), np.float64(180.0), np.float64(180.0), np.float64(192.0), np.float64(200.0)]

chol:
  Lower bound: 113.50, Upper bound: 373.50
  Number of outliers: 5
  Outlier values: [np.float64(394.0), np.float64(407.0), np.float64(409.0), np.float64(417.0), np.float64(564.0)]

thalach:
  Lower bound: 83.50, Upper bound: 215.50
  Number of outliers: 1
  Outlier values: [np.float64(71.0)]

oldpeak:
  Lower bound: -2.40, Upper bound: 4.00
  Number of outliers: 5
  Outlier values: [np.float64(4.2), np.float64(4.2), np.float64(4.4), np.float64(5.6), np.float64(6.2)]


In [4]:
# Medical reasoning for outliers:
# - High cholesterol (>400) and high BP (>180) are real health conditions
# - Very low max heart rate (<100) could indicate heart problems
# - We'll keep outliers as they're medically valid

# However, let's cap extreme values to reduce their impact
def cap_outliers(data, column, lower_percentile=1, upper_percentile=99):
    lower = data[column].quantile(lower_percentile/100)
    upper = data[column].quantile(upper_percentile/100)
    data[column] = data[column].clip(lower, upper)
    return data

# Make a copy before capping
df_processed = df_clean.copy()

# Cap extreme outliers (1st and 99th percentile)
for col in ['trestbps', 'chol', 'oldpeak']:
    df_processed = cap_outliers(df_processed, col, 1, 99)
    
print("Outliers capped successfully!")
print(f"\nProcessed dataset shape: {df_processed.shape}")

Outliers capped successfully!

Processed dataset shape: (297, 14)


In [5]:
# Create some new features that might be useful

# Age groups
df_processed['age_group'] = pd.cut(df_processed['age'], 
                                    bins=[0, 40, 50, 60, 100], 
                                    labels=['<40', '40-50', '50-60', '>60'])

# Cholesterol categories (based on medical standards)
df_processed['chol_category'] = pd.cut(df_processed['chol'], 
                                        bins=[0, 200, 240, 1000], 
                                        labels=['Normal', 'Borderline', 'High'])

# Blood pressure categories
df_processed['bp_category'] = pd.cut(df_processed['trestbps'], 
                                      bins=[0, 120, 140, 1000], 
                                      labels=['Normal', 'Elevated', 'High'])

print("New features created:")
print(df_processed[['age_group', 'chol_category', 'bp_category']].head(10))

# We'll keep these for EDA but won't use in initial model
# (to keep it simple first)

New features created:
  age_group chol_category bp_category
0       >60    Borderline        High
1       >60          High        High
2       >60    Borderline      Normal
3       <40          High    Elevated
4     40-50    Borderline    Elevated
5     50-60    Borderline      Normal
6       >60          High    Elevated
7     50-60          High      Normal
8       >60          High    Elevated
9     50-60    Borderline    Elevated


In [6]:
# Separate features and target
X = df_clean.drop('target', axis=1)
y = df_clean['target']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTarget distribution in train set:")
print(y_train.value_counts())
print(f"\nTarget distribution in test set:")
print(y_test.value_counts())

Features shape: (297, 13)
Target shape: (297,)

Features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

Training set: (237, 13)
Test set: (60, 13)

Target distribution in train set:
target
0    128
1    109
Name: count, dtype: int64

Target distribution in test set:
target
0    32
1    28
Name: count, dtype: int64


In [7]:
# Scale numerical features for better model performance
# Important: Fit scaler only on training data to avoid data leakage!

scaler = StandardScaler()

# Identify numerical columns to scale
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Fit and transform training data
X_train_scaled = X_train.copy()
X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])

# Transform test data (using same scaler)
X_test_scaled = X_test.copy()
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

print("Feature scaling completed!")
print(f"\nBefore scaling (first row):")
print(X_train[numerical_features].iloc[0])
print(f"\nAfter scaling (first row):")
print(X_train_scaled[numerical_features].iloc[0])

Feature scaling completed!

Before scaling (first row):
age          54.0
trestbps    124.0
chol        266.0
thalach     109.0
oldpeak       2.2
Name: 55, dtype: float64

After scaling (first row):
age        -0.085668
trestbps   -0.462582
chol        0.312737
thalach    -1.827448
oldpeak     0.967117
Name: 55, dtype: float64


In [9]:
import os

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)
print("Models directory created!")

Models directory created!


In [10]:
# Save for modeling
X_train_scaled.to_csv('../data/X_train.csv', index=False)
X_test_scaled.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

# Save the scaler for later use in deployment
import pickle
with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("✅ All processed data saved!")
print("Ready for modeling!")

✅ All processed data saved!
Ready for modeling!
