In [2]:
"""Data Analyzing & Cleaning"""

from pathlib import Path
import matplotlib.pyplot as plt
import ipyparallel as ipp
import pandas as pd
import numpy as np
import requests
import time

In [3]:
# On-disk location of the cleaned dataset; the loading cell checks whether this file exists.
clean_data_path: str = './data/csv/clean_data.csv'
# NOTE(review): not referenced in the visible cells; presumably where the raw download is kept — confirm.
raw_data_path: str = './data/csv/raw_data.csv'
# Remote CSV that is read when the dataset is (re)fetched.
csv_url: str = "https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv"

Key Features for a Predictive Model

For a predictive machine learning model, the strongest predictors are likely:

	- HadHeartAttack
	- HadAngina
	- HadStroke
	- HadDiabetes
	- SmokerStatus
	- BMI
	- AgeCategory
	- Sex

In [4]:
from data.clean_data import clean_data

# Switches controlling how this cell obtains `df`.
to_refetch: bool = True   # True: download and clean again even if a cleaned CSV is already on disk
to_fillna: bool = False   # forwarded to clean_data as `fillna`
to_dropna: bool = True    # forwarded to clean_data as `dropna`

# The cached cleaned CSV is reused only when it exists AND no re-fetch was requested.
use_cached_copy = Path(clean_data_path).exists() and not to_refetch

if use_cached_copy:
    # NOTE(review): read without index_col; if the cached file was written with its
    # index, this frame will differ from the freshly cleaned one — confirm in clean_data.
    df = pd.read_csv(clean_data_path)
else:
    # Download the source CSV and hand it to the project's cleaning helper.
    # NOTE(review): 'data/csv/' is presumably the output directory clean_data writes to — confirm.
    df = clean_data(pd.read_csv(csv_url), 'data/csv/', fillna=to_fillna, dropna=to_dropna)

In [5]:
# Bare expression as the cell's last line so the notebook renders the DataFrame.
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HadDiabetes,SmokerStatus,ECigaretteUsage,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HighRiskLastYear
342,11,1,2.0,4.0,0.0,0.0,1.0,9.0,0.0,0.0,...,0.0,1.0,0.0,3.0,9.0,1.60,71.67,27.99,0.0,0.0
343,11,0,2.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,...,1.0,1.0,0.0,3.0,10.0,1.78,95.25,30.13,0.0,0.0
345,11,0,2.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,1.0,0.0,3.0,11.0,1.85,108.86,31.66,1.0,0.0
346,11,1,3.0,5.0,0.0,0.0,1.0,9.0,0.0,0.0,...,0.0,0.0,0.0,3.0,12.0,1.70,90.72,31.32,0.0,0.0
347,11,1,1.0,3.0,15.0,0.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,3.0,12.0,1.55,79.38,33.07,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445117,0,0,2.0,0.0,0.0,1.0,1.0,6.0,0.0,0.0,...,0.0,0.0,0.0,3.0,8.0,1.78,102.06,32.28,1.0,0.0
445123,0,1,3.0,0.0,7.0,0.0,1.0,7.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,1.93,90.72,24.34,0.0,0.0
445124,0,0,1.0,0.0,15.0,0.0,1.0,7.0,0.0,0.0,...,1.0,0.0,0.0,1.0,9.0,1.68,83.91,29.86,1.0,0.0
445128,0,1,0.0,2.0,2.0,0.0,1.0,7.0,0.0,0.0,...,0.0,0.0,0.0,2.0,6.0,1.70,83.01,28.66,0.0,0.0


In [6]:
# Print the index range plus each column's non-null count and dtype for df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 238647 entries, 342 to 445130
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   State                  238647 non-null  int64  
 1   Sex                    238647 non-null  int64  
 2   GeneralHealth          238647 non-null  float64
 3   PhysicalHealthDays     238647 non-null  float64
 4   MentalHealthDays       238647 non-null  float64
 5   LastCheckupTime        238647 non-null  float64
 6   PhysicalActivities     238647 non-null  float64
 7   SleepHours             238647 non-null  float64
 8   HadHeartAttack         238647 non-null  float64
 9   HadAngina              238647 non-null  float64
 10  HadStroke              238647 non-null  float64
 11  HadArthritis           238647 non-null  float64
 12  HadDiabetes            238647 non-null  float64
 13  SmokerStatus           238647 non-null  float64
 14  ECigaretteUsage        238647 non-null 

In [7]:
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler