In [20]:
"""Data Analyzing & Cleaning"""

from pathlib import Path
from data.clean_data import clean_data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests


In [21]:
# Check if raw data exists
if not Path('./data/csv/raw_data.csv').exists():
  print('none file')
  df = pd.read_csv(
    "https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv"
	)
else: df = pd.read_csv('data/csv/raw_data.csv', index_col=None)

Key Features for a Predictive Model

For a predictive machine learning model, the strongest predictors are likely:

	- HadHeartAttack
	- HadAngina
	- HadStroke
	- HadDiabetes
	- SmokerStatus
	- BMI
	- AgeCategory
	- Sex

In [22]:
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.60,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.50,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.50,23.30,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,Virgin Islands,Female,Good,0.0,3.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.65,69.85,25.63,,Yes,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
445128,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
445129,Virgin Islands,Female,Poor,30.0,30.0,5 or more years ago,No,5.0,1 to 5,No,...,1.70,49.90,17.23,,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
445130,Virgin Islands,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,5.0,None of them,Yes,...,1.83,108.86,32.55,No,Yes,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes


In [23]:
if not Path('./data/csv/clean_data.csv').exists():
	df = clean_data(df, 'data/csv/')
else: df = pd.read_csv('./data/csv/clean_data.csv')

In [24]:
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,...,HadDiabetes,SmokerStatus,ECigaretteUsage,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HighRiskLastYear
0,11,1,2,0.0,0.0,0,0,8.0,0,0,...,1.0,0,1,3,12,1.70,80.74,27.44,0,0
1,11,1,0,0.0,0.0,0,0,6.0,0,0,...,0.0,0,0,3,12,1.60,68.04,26.57,0,0
2,11,1,2,2.0,3.0,0,1,5.0,0,0,...,0.0,0,0,3,7,1.57,63.50,25.61,0,0
3,11,1,0,0.0,0.0,0,1,7.0,0,0,...,0.0,2,0,3,9,1.65,63.50,23.30,0,0
4,11,1,3,2.0,0.0,0,1,9.0,0,0,...,0.0,0,0,3,4,1.57,53.98,21.77,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,0,1,1,0.0,3.0,1,1,6.0,0,0,...,0.0,0,0,2,0,1.65,69.85,25.63,1,0
445128,0,1,0,2.0,2.0,0,1,7.0,0,0,...,0.0,0,0,2,6,1.70,83.01,28.66,0,0
445129,0,1,4,30.0,30.0,3,0,5.0,0,0,...,0.0,3,3,3,9,1.70,49.90,17.23,1,0
445130,0,0,2,0.0,0.0,0,0,5.0,1,0,...,0.0,0,0,2,10,1.83,108.86,32.55,0,0


In [25]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "browser"
cmap = plt.colormaps.get_cmap("tab10")


def multi_scatter(df):
  x_df = df['AgeCategory']
  y_df = df[df.drop(columns=['AgeCategory'])]
  for i, col in enumerate(x_df):
      plt.scatter(
        x_df,
        y_df[col],
        alpha=0.6,
        label=col,
        color=cmap(i)
      )

  # Add labels, title, and legend
  plt.title(f'Scatter Plots Overlaid for AgeCategory', fontsize=16)
  plt.xlabel('AgeCategory', fontsize=12)
  plt.ylabel("Values", fontsize=12)
  plt.legend(title="Columns", fontsize=10, loc='upper right')
  plt.grid(True, linestyle='--', alpha=0.5)
  plt.show()

In [26]:
multi_scatter(df)

ValueError: Boolean array expected for the condition, not int64