In [5]:
# Step 2: Data Acquisition & Preprocessing

import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1) Load dataset
df = pd.read_csv("../data/raw/patients_raw.csv")
print("Dataset shape:", df.shape)

# 2) Rename target column for clarity
# In the Kaggle dataset, DEATH_EVENT is 1 if patient died
# We'll treat 1 as HIGH RISK
df = df.rename(columns={"DEATH_EVENT": "high_risk"})

# A row without a label cannot be used for supervised training or for
# stratification, and a label must never be invented by an imputer, so such
# rows are dropped instead of being filled in.
df = df.dropna(subset=["high_risk"])

# 3) Separate features and target
# The target is split off before any imputation so only features are imputed.
X = df.drop(columns=["high_risk"])
y = df["high_risk"]

# 4) Train-test split
# Splitting happens BEFORE imputation/scaling so that no statistic computed
# from the test rows can leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5) Handle missing values (if any)
# The medians are learned from the training split only and then re-used,
# unchanged, on the test split.
imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X.columns, index=X_test.index
)

# 6) Scale numeric features
# Same rule as for the imputer: fit on train, only transform test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7) Save processed data
# to_csv does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs("../data/processed", exist_ok=True)
pd.DataFrame(X_train_scaled, columns=X.columns).to_csv("../data/processed/X_train.csv", index=False)
pd.DataFrame(X_test_scaled, columns=X.columns).to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

print("✅ Preprocessing complete. Files saved in data/processed/")


Dataset shape: (299, 13)
✅ Preprocessing complete. Files saved in data/processed/
