In [None]:
"""Data Analyzing & Cleaning"""

from data.clean_data import fetch_check
import matplotlib.pyplot as plt
import ipyparallel as ipp
import pandas as pd
import numpy as np

Key Features for a Predictive Model

For a predictive machine learning model, the strongest predictors are likely:

	- HadHeartAttack
	- HadAngina
	- HadStroke
	- HadDiabetes
	- SmokerStatus
	- BMI
	- AgeCategory
	- Sex

In [None]:
# Column names selected from the fetched dataset; the scaling cell below
# restricts the frame to exactly these columns via `df[keepers]`.
# NOTE(review): this list does not include HadHeartAttack, HadAngina,
# HadStroke or HadDiabetes, which the markdown cell above names as likely
# predictors and which load_and_preprocess_data reads -- confirm that the
# omission is intentional for this analysis.
keepers = [
    "Sex",
    "GeneralHealth",
    "PhysicalHealthDays",
    "SleepHours",
    "SmokerStatus",
    "ECigaretteUsage",
    "RaceEthnicityCategory",
    "AgeCategory",
    "WeightInKilograms",
    "BMI",
    "AlcoholDrinkers",
    "HighRiskLastYear",
]


In [None]:
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Scaler used to standardise every selected column.
scaler = StandardScaler()

# Load the dataset through the project helper (flags unchanged) and keep only
# the columns listed in `keepers`.
df = fetch_check(to_fetch=True, to_fillna=True, to_dropna=False)
df = df[keepers]

# NOTE(review): StandardScaler only accepts numeric input; this assumes
# fetch_check returns these columns already numerically encoded -- confirm.
# fit_transform fits the scaler and returns the scaled array in one call, so
# X_scaled is never temporarily bound to the scaler object itself (which is
# what the previous `X_scaled = scaler.fit(df)` line did).
X_scaled = scaler.fit_transform(df)

In [None]:
def load_and_preprocess_data(df):
    """Build the HeartDisease target and impute missing feature values.

    The target is 1 where ``HadHeartAttack == 1`` or ``HadAngina == 1`` and 0
    otherwise. Its normalised value distribution is printed.

    Missing values in the features are filled per column: numeric columns use
    the column median, all other columns use the most frequent value. Columns
    with no usable statistic (entirely missing, or an empty frame) are left
    untouched instead of raising.

    The input frame is not modified; a new feature frame is returned.

    NOTE(review): the returned features still contain ``HadHeartAttack`` and
    ``HadAngina``, the two columns the target is derived from. Drop them
    before training a model to avoid target leakage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``HadHeartAttack`` and ``HadAngina``.

    Returns
    -------
    x : pandas.DataFrame
        Every column of ``df`` except ``HeartDisease``, with gaps imputed.
    y : pandas.Series
        Integer 0/1 target named ``HeartDisease``, aligned with ``df.index``.

    Raises
    ------
    KeyError
        If ``HadHeartAttack`` or ``HadAngina`` is missing from ``df``.
    """
    # Compute the target as a standalone Series so the caller's frame is not
    # mutated as a side effect of calling this function.
    y = (
        ((df["HadHeartAttack"] == 1) | (df["HadAngina"] == 1))
        .astype(int)
        .rename("HeartDisease")
    )

    print("\nHeartDisease column distribution:")
    print(y.value_counts(normalize=True))

    # drop() returns a new frame; errors="ignore" covers inputs that already
    # carry a HeartDisease column (e.g. from an earlier run of this cell).
    x = df.drop(columns=["HeartDisease"], errors="ignore")

    # Collect one fill value per column, then fill in a single call. This
    # avoids `x[col].fillna(..., inplace=True)`, which is chained assignment
    # and does not update `x` under pandas Copy-on-Write.
    fill_values = {}
    for column in x.columns:
        series = x[column]
        is_numeric = pd.api.types.is_numeric_dtype(
            series
        ) and not pd.api.types.is_bool_dtype(series)
        if is_numeric:
            # Covers float32/int32/nullable numerics, not just int64/float64.
            median = series.median()
            if not pd.isna(median):
                fill_values[column] = median
        else:
            modes = series.mode()
            # mode() is empty when the column has no non-missing values.
            if not modes.empty:
                fill_values[column] = modes.iloc[0]

    if fill_values:
        x = x.fillna(value=fill_values)

    return x, y


In [None]:
from pathlib import Path

# Carry over the column names and row index of `df` so the pair plot axes are
# labelled with feature names instead of positional integers.
X = pd.DataFrame(X_scaled, columns=df.columns, index=df.index)

# Create the output folder if needed; savefig fails when it does not exist.
plot_path = Path('./data/plots/target_columns_all_nans_all_fills.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)

# Pairwise scatter plots of the scaled features, histograms on the diagonal.
sns.pairplot(X, diag_kind="hist").savefig(fname=plot_path)