In [12]:
"""Data Analyzing & Cleaning"""

from pathlib import Path
from data.clean_data import clean_data
import matplotlib.pyplot as plt
import ipyparallel as ipp
import pandas as pd
import numpy as np
import requests
import time

In [13]:
# Load the raw survey data into `df`, preferring the local CSV over the remote copy.
if Path('./data/csv/raw_data.csv').exists():
  df = pd.read_csv('data/csv/raw_data.csv', index_col=None)
else:
  # No local raw file: report it, then read the CSV directly from the remote URL.
  # This cell does not write the downloaded data to disk.
  print('none file')
  df = pd.read_csv(
    "https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv"
  )

Key Features for a Predictive Model

For a predictive machine learning model, the strongest predictors are likely:

	- HadHeartAttack
	- HadAngina
	- HadStroke
	- HadDiabetes
	- SmokerStatus
	- BMI
	- AgeCategory
	- Sex

In [14]:
# Display the raw DataFrame loaded by the previous cell (bare last expression -> rich display).
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.60,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.50,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.50,23.30,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,Virgin Islands,Female,Good,0.0,3.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.65,69.85,25.63,,Yes,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
445128,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
445129,Virgin Islands,Female,Poor,30.0,30.0,5 or more years ago,No,5.0,1 to 5,No,...,1.70,49.90,17.23,,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
445130,Virgin Islands,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,5.0,None of them,Yes,...,1.83,108.86,32.55,No,Yes,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes


In [15]:
# Flag consulted below: the cached clean CSV is reused only when this is NOT False.
# With the value False, the raw data is always downloaded and cleaned again.
not_refetch: bool = False

if Path('./data/csv/clean_data.csv').exists() and not_refetch is not False:
	# A cached clean file exists and the flag permits reusing it.
	df = pd.read_csv('./data/csv/clean_data.csv')
else:
	# Either no cached clean file exists or the flag is False:
	# download the raw CSV, then pass it through clean_data.
	df = pd.read_csv(
		"https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv"
	)
	# NOTE(review): 'data/csv/' is presumably the directory clean_data writes to — confirm in data/clean_data.py.
	df = clean_data(df, 'data/csv/', fillna=False, dropna=False)

In [16]:
# Display `df` as reassigned by the preceding cell (cached clean CSV or freshly cleaned download).
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HadDiabetes,SmokerStatus,ECigaretteUsage,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HighRiskLastYear
0,11,1,2.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,1.0,0.0,1.0,3.0,12.0,,,,0.0,0.0
1,11,1,0.0,0.0,0.0,,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,3.0,12.0,1.60,68.04,26.57,0.0,0.0
2,11,1,2.0,2.0,3.0,0.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,3.0,7.0,1.57,63.50,25.61,0.0,0.0
3,11,1,0.0,0.0,0.0,0.0,1.0,7.0,0.0,0.0,...,0.0,2.0,0.0,3.0,,1.65,63.50,23.30,0.0,0.0
4,11,1,3.0,2.0,0.0,0.0,1.0,9.0,0.0,0.0,...,0.0,0.0,0.0,3.0,4.0,1.57,53.98,21.77,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,0,1,1.0,0.0,3.0,1.0,1.0,6.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,1.65,69.85,25.63,,0.0
445128,0,1,0.0,2.0,2.0,0.0,1.0,7.0,0.0,0.0,...,0.0,0.0,0.0,2.0,6.0,1.70,83.01,28.66,0.0,0.0
445129,0,1,4.0,30.0,30.0,3.0,0.0,5.0,0.0,0.0,...,0.0,3.0,3.0,,9.0,1.70,49.90,17.23,,0.0
445130,0,0,2.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,...,0.0,0.0,0.0,2.0,10.0,1.83,108.86,32.55,0.0,0.0


In [17]:
# Print the index range plus each column's non-null count and dtype, to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   State                  445132 non-null  int64  
 1   Sex                    445132 non-null  int64  
 2   GeneralHealth          443934 non-null  float64
 3   PhysicalHealthDays     434205 non-null  float64
 4   MentalHealthDays       436065 non-null  float64
 5   LastCheckupTime        436824 non-null  float64
 6   PhysicalActivities     444039 non-null  float64
 7   SleepHours             439679 non-null  float64
 8   HadHeartAttack         442067 non-null  float64
 9   HadAngina              440727 non-null  float64
 10  HadStroke              443575 non-null  float64
 11  HadArthritis           442499 non-null  float64
 12  HadDiabetes            429880 non-null  float64
 13  SmokerStatus           409670 non-null  float64
 14  ECigaretteUsage        409472 non-nu

In [18]:
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [19]:
# NOTE(review): this entire cell is disabled (commented-out) code. If re-enabled, it would
# standardize the listed columns of `df` with StandardScaler and draw a seaborn pairplot of
# the scaled values. Presumably disabled because a pairplot over this many columns and rows
# is expensive — TODO confirm, then either restore it (e.g. on a sample) or delete the cell.
# scaler = StandardScaler()

# features = [
# 	'State',
# 	'Sex',
# 	'GeneralHealth',
# 	'PhysicalHealthDays',
# 	'MentalHealthDays',
# 	'LastCheckupTime',
# 	'PhysicalActivities',
# 	'SleepHours',
# 	'HadHeartAttack',
# 	'HadAngina',
# 	'HadStroke',
# 	'HadArthritis',
# 	'HadDiabetes',
# 	'SmokerStatus',
# 	'ECigaretteUsage',
# 	'RaceEthnicityCategory',
# 	'AgeCategory',
# 	'HeightInMeters',
# 	'WeightInKilograms',
# 	'BMI',
# 	'AlcoholDrinkers',
# 	'HighRiskLastYear',
# ]
# X = df[features]

# X_scaled = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled)
# sns.pairplot(X_scaled)
# plt.show()