In [20]:
"""Data Analyzing & Cleaning"""

import io

import pandas as pd # type: ignore
import requests # type: ignore


In [21]:
from value_maps import (
    e_smoking_history,
    last_checkup,
    smoking_history,
    race_ethnicity_category,
    age_ranges,
    gen_health_weights,
    states,
    binaryValues
)

In [22]:
# Other published variants of the same dataset, kept for reference (not loaded):
# 'https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2020/heart_2020_cleaned.csv',
# 'https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_no_nans.csv',
data_eps = [
    'https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv'
]

# Seconds requests may wait on the server (connect / between reads) before
# raising, so an unresponsive host cannot hang the kernel indefinitely.
REQUEST_TIMEOUT_S = 60

# Maps 'df_<position in data_eps>' -> DataFrame parsed from that URL.
# URLs that do not answer with HTTP 200 get no entry.
dfs = {}

for i, url in enumerate(data_eps):
    key = f'df_{i}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)

    if response.status_code == 200:
        # Parse the body that was just downloaded instead of handing the URL to
        # pandas, which would fetch the whole file a second time.
        dfs[key] = pd.read_csv(io.BytesIO(response.content))
        print('Success')
    else:
        print('Failure')
    print(url)
    print(key)


Success
https://web-app-media-assests.sfo3.cdn.digitaloceanspaces.com/Indicators_of_Heart_Disease/2022/heart_2022_with_nans.csv
df_0


In [23]:
# Give the first downloaded dataset a short working name.
df = pd.DataFrame(data=dfs["df_0"])

In [24]:
# Count how often each column label occurs; any count above 1 would mean a duplicated column name.
df.columns.value_counts()

State                        1
Sex                          1
DifficultyWalking            1
DifficultyDressingBathing    1
DifficultyErrands            1
SmokerStatus                 1
ECigaretteUsage              1
ChestScan                    1
RaceEthnicityCategory        1
AgeCategory                  1
HeightInMeters               1
WeightInKilograms            1
BMI                          1
AlcoholDrinkers              1
HIVTesting                   1
FluVaxLast12                 1
PneumoVaxEver                1
TetanusLast10Tdap            1
HighRiskLastYear             1
DifficultyConcentrating      1
BlindOrVisionDifficulty      1
DeafOrHardOfHearing          1
HadHeartAttack               1
GeneralHealth                1
PhysicalHealthDays           1
MentalHealthDays             1
LastCheckupTime              1
PhysicalActivities           1
SleepHours                   1
RemovedTeeth                 1
HadAngina                    1
HadDiabetes                  1
HadStrok

Key Features for a Predictive Model

For a predictive machine learning model, the strongest predictors are likely:

	- HadHeartAttack
	- HadAngina
	- HadStroke
	- HadDiabetes
	- SmokerStatus
	- BMI
	- AgeCategory
	- Sex

In [25]:
"""
Cell generated by Data Wrangler.
"""


def clean_data(df):
    """Impute missing values, add numeric encodings, and drop the text columns.

    Steps, in order:

    1. Fill gaps: 18 categorical columns get their most frequent value, three
       numeric columns (SleepHours, BMI, WeightInKilograms) get their median,
       and a missing State becomes the literal "Unknown". Columns not listed
       here are left as they are, missing values included.
    2. Append 16 encoded columns by looking each source column up in the
       matching table from ``value_maps`` ("...IDs" for multi-valued
       categories, "...BI" for the ones mapped through ``binaryValues``).
    3. Drop every column whose dtype is ``object``.

    Args:
        df: Raw survey frame; must contain every column named below.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    NOTE(review): a source value that is absent from its lookup table becomes
    NaN in the encoded column. The rendered HadDiabetesBI column shows floats,
    which suggests that happens for HadDiabetes - confirm against binaryValues.
    NOTE(review): HadKidneyDisease, HadDepressiveDisorder and HadAsthma are
    imputed but never encoded, so they are removed in step 3 if they are
    object-typed - confirm that is intended.
    """
    most_frequent_filled = (
        "HighRiskLastYear",
        "AlcoholDrinkers",
        "AgeCategory",
        "RaceEthnicityCategory",
        "ECigaretteUsage",
        "SmokerStatus",
        "HadDiabetes",
        "HadArthritis",
        "HadKidneyDisease",
        "HadDepressiveDisorder",
        "HadAsthma",
        "HadStroke",
        "HadAngina",
        "HadHeartAttack",
        "PhysicalActivities",
        "LastCheckupTime",
        "GeneralHealth",
        "Sex",
    )
    median_filled = ("SleepHours", "BMI", "WeightInKilograms")

    replacements = {name: df[name].mode()[0] for name in most_frequent_filled}
    replacements.update({name: df[name].median() for name in median_filled})
    replacements["State"] = "Unknown"
    df = df.fillna(replacements)

    # (source column, new column, lookup table); new columns are appended in this order.
    encodings = (
        ("GeneralHealth", "GeneralHealthIDs", gen_health_weights),
        ("AgeCategory", "AgeCategoryIDs", age_ranges),
        ("RaceEthnicityCategory", "RaceEthnicityCategoryIDs", race_ethnicity_category),
        ("SmokerStatus", "SmokerStatusIDs", smoking_history),
        ("ECigaretteUsage", "ECigaretteUsageIDs", e_smoking_history),
        ("LastCheckupTime", "LastCheckupTimeIDs", last_checkup),
        ("State", "StateIDs", states),
        ("Sex", "SexBI", binaryValues),
        ("PhysicalActivities", "PhysicalActivitiesBI", binaryValues),
        ("HadHeartAttack", "HadHeartAttackBI", binaryValues),
        ("HadAngina", "HadAnginaBI", binaryValues),
        ("HadStroke", "HadStrokeBI", binaryValues),
        ("HadArthritis", "HadArthritisBI", binaryValues),
        ("HadDiabetes", "HadDiabetesBI", binaryValues),
        ("AlcoholDrinkers", "AlcoholDrinkersBI", binaryValues),
        ("HighRiskLastYear", "HighRiskLastYearBI", binaryValues),
    )
    for source, target, lookup in encodings:
        df[target] = df[source].map(lookup)

    text_columns = df.select_dtypes(include=["object"]).columns
    return df.drop(columns=text_columns)


# Clean the untouched download rather than whatever `df` currently holds, so this
# cell can be re-run: once `df` has been replaced by the cleaned frame it no longer
# has the text columns that clean_data() reads, and a second run would raise KeyError.
df = clean_data(dfs['df_0'].copy())

In [26]:
# Show `df` (reassigned to the result of clean_data() in the previous cell) as this cell's output.
df

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,GeneralHealthIDs,AgeCategoryIDs,RaceEthnicityCategoryIDs,SmokerStatusIDs,...,StateIDs,SexBI,PhysicalActivitiesBI,HadHeartAttackBI,HadAnginaBI,HadStrokeBI,HadArthritisBI,HadDiabetesBI,AlcoholDrinkersBI,HighRiskLastYearBI
0,0.0,0.0,8.0,,80.74,27.44,2,12,3,0,...,11,1,0,0,0,0,0,1.0,0,0
1,0.0,0.0,6.0,1.60,68.04,26.57,0,12,3,0,...,11,1,0,0,0,0,0,0.0,0,0
2,2.0,3.0,5.0,1.57,63.50,25.61,2,7,3,0,...,11,1,1,0,0,0,0,0.0,0,0
3,0.0,0.0,7.0,1.65,63.50,23.30,0,9,3,2,...,11,1,1,0,0,0,1,0.0,0,0
4,2.0,0.0,9.0,1.57,53.98,21.77,3,4,3,0,...,11,1,1,0,0,0,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,0.0,3.0,6.0,1.65,69.85,25.63,1,0,2,0,...,0,1,1,0,0,0,0,0.0,1,0
445128,2.0,2.0,7.0,1.70,83.01,28.66,0,6,2,0,...,0,1,1,0,0,0,0,0.0,0,0
445129,30.0,30.0,5.0,1.70,49.90,17.23,4,9,3,3,...,0,1,0,0,0,0,0,0.0,1,0
445130,0.0,0.0,5.0,1.83,108.86,32.55,2,10,2,0,...,0,0,0,1,0,0,0,0.0,0,0


In [29]:
# Write the frame to data.csv in the working directory; with the default
# settings the row index is written too, as the first (unnamed) column.
df.to_csv("./data.csv")