In [None]:
# --- Data Handling ---
import pandas as pd
import numpy as np
import os

# --- Data Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Preprocessing ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- To Save The Processed Data ---
import joblib

# Notebook-wide display settings: never truncate DataFrame columns, and give
# seaborn plots a white background with grid lines
pd.options.display.max_columns = None
sns.set_style(style='whitegrid')

print("Libraries imported successfully!")

In [None]:
# Relative location of the raw CSV. The path is assembled from its components
# so the separator matches whatever the host operating system uses.
raw_path_parts = ("..", "data", "heart_disease", "raw", "heart_failure_clinical_records_dataset.csv")
RAW_DATA_PATH = os.path.join(*raw_path_parts)

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

print("Data loaded successfully!")

In [None]:
# Peek at the top of the table (head() returns 5 rows by default)
print("First 5 rows of the dataset:")
preview_rows = df.head()
display(preview_rows)

# Column dtypes and non-null counts; info() prints its own report
print("\nDataset Information:")
df.info()

# Summary statistics as produced by describe()
print("\nDescriptive Statistics:")
summary_stats = df.describe()
display(summary_stats)

In [None]:
# Share of each class in the target column (fractions, because normalize=True)
class_shares = df['DEATH_EVENT'].value_counts(normalize=True)
print(class_shares)

# Bar chart of the raw class counts
target_ax = sns.countplot(data=df, x='DEATH_EVENT')
target_ax.set_title('Distribution of Death Events (0 = Survived, 1 = Died)')
plt.show()

In [None]:
# Columns to compare across outcome groups, paired with their panel titles.
# Order matters: panels are filled row by row across the 2x3 grid.
panel_specs = [
    ('age', 'Age'),
    ('ejection_fraction', 'Ejection Fraction'),
    ('serum_creatinine', 'Serum Creatinine'),
    ('time', 'Follow-up Time (days)'),
    ('serum_sodium', 'Serum Sodium'),
    ('platelets', 'Platelets'),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Feature Distributions by Death Event', fontsize=16)

# One boxplot per feature, split by the DEATH_EVENT value
for panel_ax, (column, panel_title) in zip(axes.flat, panel_specs):
    sns.boxplot(ax=panel_ax, x='DEATH_EVENT', y=column, data=df)
    panel_ax.set_title(panel_title)

# Restrict the layout to a sub-rectangle of the figure; the 0.95 upper bound
# leaves a strip free at the top (presumably for the suptitle)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of All Features')
plt.show()

In [None]:
# Target: the DEATH_EVENT column. Features: every remaining column of df.
# NOTE(review): 'time' (labelled as follow-up time in days in the boxplots) stays
# in X — confirm it is known at prediction time, otherwise it may leak the outcome.
y = df['DEATH_EVENT']
X = df.drop(columns=['DEATH_EVENT'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both parts; the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
scaler = StandardScaler()

# Always start from the unscaled rows of X. Looking them up by index label
# (instead of reading X_train / X_test directly) keeps this cell safe to re-run:
# X_train and X_test are overwritten with scaled values below, so a second run
# would otherwise fit the scaler on already-scaled data.
X_train_raw = X.loc[X_train.index]
X_test_raw = X.loc[X_test.index]

# Fit on the training rows only, then apply the SAME fitted scaler to the test rows
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Wrap the scaled arrays back into DataFrames. The original row labels are passed
# explicitly: without index=..., the frames would get a fresh 0..n-1 index and no
# longer line up by label with y_train / y_test, which keep the labels from the split.
X_train = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test_raw.index)

print("\nFirst 5 rows of the scaled training data:")
display(X_train.head())

In [None]:
# Output locations: the processed splits go under data/, the fitted scaler under models/
PROCESSED_DATA_DIR = os.path.join("..", "data", "heart_disease", "processed")
SCALER_PATH = os.path.join("..", "models", "heart_disease",  "scaler.joblib")

# Make sure both target folders exist before anything is written
for target_dir in (PROCESSED_DATA_DIR, os.path.dirname(SCALER_PATH)):
    os.makedirs(target_dir, exist_ok=True)

# Persist each split under its own file name inside the processed-data folder
split_files = (
    (X_train, "X_train.joblib"),
    (X_test, "X_test.joblib"),
    (y_train, "y_train.joblib"),
    (y_test, "y_test.joblib"),
)
for split_obj, file_name in split_files:
    joblib.dump(split_obj, os.path.join(PROCESSED_DATA_DIR, file_name))

# Persist the fitted scaler alongside the models
joblib.dump(scaler, SCALER_PATH)

print("\nProcessed data and scaler saved successfully!")