In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from pylab import rcParams
import plotly.graph_objects as go
import missingno as msno
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
df = pd.read_csv('datasets/heart_cdc.csv')

In [3]:
# Define a dictionary for column renaming
columns_name_mapping = {
    "cvdcrhd4": "HeartDisease",
    "weight2": "BMI1", 
    "height3":"BMI2", 
    "_smoker3":"Smoking", 
    "drnkany5":"AlcoholDrinking", 
    "cvdstrk3":"Stroke", 
    "physhlth":"PhysicalHealth",
    "menthlth":"MentalHealth", 
    "diffwalk":"DiffWalking", 
    "_sex":"Sex", 
    "_ageg5yr":"AgeCategory", 
    "_race":"Race", 
    "diabete4":"Diabetic", 
    "exerany2":"PhysicalActivity", 
    "genhlth":"GenHealth", 
    "sleptim1":"SleepTime", 
    "_asthms1":"Asthma", 
    "chckdny2":"KidneyDisease", 
    "chcscncr":"SkinCancer",
    "_bmi5":"BMI_calc"
}

# Define a list of the columns you want to keep
columns_to_keep = [ 
    "HeartDisease",
    "BMI1", 
    "BMI2", 
    "Smoking", 
    "AlcoholDrinking", 
    "Stroke", 
    "PhysicalHealth",
    "MentalHealth", 
    "DiffWalking", 
    "Sex", 
    "AgeCategory", 
    "Race", 
    "Diabetic", 
    "PhysicalActivity", 
    "GenHealth", 
    "SleepTime", 
    "Asthma", 
    "KidneyDisease", 
    "SkinCancer",
    "BMI_calc"
]

# Rename columns and select the columns to keep in one step
df = df.rename(columns=columns_name_mapping)[columns_to_keep]

In [4]:
value_mapping = {
    'HeartDisease': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'AlcoholDrinking': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'Stroke': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'PhysicalActivity': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'DiffWalking': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'SkinCancer': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'KidneyDisease': {1:'Yes', 2:'No', 7:'Not Clear', 9:'Refused'},
    'Asthma': {1:'Current', 2:'Former', 3:'Never', 9:'Not Clear'},
    'Smoking': {1:'Smoker', 2:'Approximate Smoker', 3:'Former Smoker', 4:'Never', 9:'Not Clear'},
    'Sex': {1:'Male', 2:'Female'},
    'AgeCategory': {1:'18-24', 2:'25-29', 3:'30-34', 4:'35-39', 5:'40-44', 6:'45-49', 7:'50-54', 8:'55-59', 9:'60-64', 10:'65-69', 11:'70-74', 12:'75-79', 13:'80+', 14:'Not Clear'},
    'Race': {1:'White', 2:'Black', 3:'Native', 4:'Asian', 5:'Hawaian', 6:'Other', 7:'Multiracial', 8:'Hispanic', 9:'Not Clear'},
    'Diabetic': {1:'Yes', 2:'Yes(Pregnant)', 3:'No', 4:'Border Line', 7:'Not Clear', 9:'Refused'},
    'GenHealth': {1:'Excellent', 2:'Very Good', 3:'Good', 4:'Fair', 5:'Poor', 7:'Not Clear', 9:'Refused'},
}

# Use the replace() method to change the values in the DataFrame
df.replace(value_mapping, inplace=True)

In [4]:
value_mapping = {
    'HeartDisease': {1:1, 2:2, 7:3, 9:4},
    'AlcoholDrinking': {1:1, 2:2, 7:3, 9:4},
    'Stroke': {1:1, 2:2, 7:3, 9:4},
    'PhysicalActivity': {1:1, 2:2, 7:3, 9:4},
    'DiffWalking': {1:1, 2:2, 7:3, 9:4},
    'SkinCancer': {1:1, 2:2, 7:3, 9:4},
    'KidneyDisease': {1:1, 2:2, 7:3, 9:4},
    'Asthma': {1:1, 2:2, 3:3, 9:4},
    'Smoking': {1:1, 2:2, 3:3, 4:4, 9:5},
    'Diabetic': {1:1, 2:2, 3:3, 4:4, 7:5, 9:6},
    'GenHealth': {1:1, 2:2, 3:3, 4:4, 5:5, 7:6, 9:7},
}

# Use the replace() method to change the values in the DataFrame
df.replace(value_mapping, inplace=True)

In [5]:
df.head()

Unnamed: 0,HeartDisease,BMI1,BMI2,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_calc
0,2.0,106.0,507.0,1,2,2.0,3.0,30.0,2.0,2,8,1.0,1.0,1.0,2.0,5.0,1,2.0,1.0,1660.0
1,2.0,170.0,504.0,5,4,2.0,88.0,88.0,2.0,2,10,2.0,3.0,1.0,3.0,7.0,2,2.0,2.0,2918.0
2,2.0,7777.0,508.0,4,2,2.0,88.0,88.0,2.0,2,10,2.0,3.0,1.0,3.0,7.0,3,2.0,2.0,
3,2.0,9999.0,9999.0,4,2,2.0,88.0,88.0,2.0,2,13,1.0,3.0,2.0,1.0,6.0,3,2.0,2.0,
4,2.0,126.0,506.0,4,2,1.0,88.0,88.0,2.0,2,13,1.0,3.0,1.0,2.0,7.0,3,2.0,2.0,2034.0


In [5]:
df.isna().sum()

HeartDisease            3
BMI1                 9852
BMI2                10824
Smoking                 0
AlcoholDrinking         0
Stroke                  3
PhysicalHealth          5
MentalHealth            5
DiffWalking         15280
Sex                     0
AgeCategory             0
Race                    1
Diabetic                6
PhysicalActivity        3
GenHealth               8
SleepTime               3
Asthma                  0
KidneyDisease           6
SkinCancer              3
BMI_calc            41357
dtype: int64

In [6]:
var_known = df.dropna(subset=['DiffWalking'])
var_missing = df[df['DiffWalking'].isna()]

In [9]:
X_known = var_known.drop(columns=['DiffWalking'])
y_known = pd.DataFrame(var_known['DiffWalking'])

In [12]:
le = LabelEncoder()

In [13]:
X_known = X_known.apply(le.fit_transform)

In [14]:
cat_tree = DecisionTreeClassifier(random_state=0)
cat_tree.fit(X_known, y_known)

In [15]:
missing_values = var_missing.drop(columns=['DiffWalking'])

In [16]:
missing_values['DiffWalking'] = cat_tree.predict(missing_values)

In [17]:
df.loc[var_missing.index, 'DiffWalking'] = missing_values['DiffWalking']

In [24]:
df.head()

Unnamed: 0,HeartDisease,BMI1,BMI2,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,BMI_calc
0,2.0,106.0,507.0,1,2,2.0,3.0,30.0,2.0,2,8,1.0,1.0,1.0,2.0,5.0,1,2.0,1.0,1660.0
1,2.0,170.0,504.0,5,4,2.0,88.0,88.0,2.0,2,10,2.0,3.0,1.0,3.0,7.0,2,2.0,2.0,2918.0
2,2.0,7777.0,508.0,4,2,2.0,88.0,88.0,2.0,2,10,2.0,3.0,1.0,3.0,7.0,3,2.0,2.0,
3,2.0,9999.0,9999.0,4,2,2.0,88.0,88.0,2.0,2,13,1.0,3.0,2.0,1.0,6.0,3,2.0,2.0,
4,2.0,126.0,506.0,4,2,1.0,88.0,88.0,2.0,2,13,1.0,3.0,1.0,2.0,7.0,3,2.0,2.0,2034.0


In [19]:
for column in df.columns:
    unique_values = len(df[column].unique())
    print(f"{column} : {unique_values}")

HeartDisease : 5
BMI1 : 575
BMI2 : 148
Smoking : 5
AlcoholDrinking : 4
Stroke : 5
PhysicalHealth : 34
MentalHealth : 34
DiffWalking : 4
Sex : 2
AgeCategory : 14
Race : 10
Diabetic : 7
PhysicalActivity : 5
GenHealth : 8
SleepTime : 27
Asthma : 4
KidneyDisease : 5
SkinCancer : 5
BMI_calc : 3770


In [20]:
df['DiffWalking'].value_counts()

2.0    335375
1.0     64639
3.0      1172
4.0       772
Name: DiffWalking, dtype: int64

In [21]:
missing_values['DiffWalking'].shape

(15280,)

In [22]:
missing_values.isna().sum()

HeartDisease            3
BMI1                 9851
BMI2                10824
Smoking                 0
AlcoholDrinking         0
Stroke                  3
PhysicalHealth          3
MentalHealth            3
Sex                     0
AgeCategory             0
Race                    1
Diabetic                6
PhysicalActivity        3
GenHealth               4
SleepTime               3
Asthma                  0
KidneyDisease           6
SkinCancer              3
BMI_calc            11885
DiffWalking             0
dtype: int64

# BMI

**Add BMI Column:**

Weight: How much do you weigh without shoes?

Height: How tall are you without shoes?

height(meters) = height(inches) / 12 * 0.0254

BMI = weight(pounds) / (height(meters) ** 2)

In [None]:
df['BMI'] = np.nan
for i in range(len(df['Height'])):
        if not pd.isna(df['Height'].iloc[i]) and not pd.isna(df['Weight'].iloc[i]):
                height_str = str(df['Height'].iloc[i])
                height_inches = int(height_str[0]) * 12 + int(height_str[1:3])
                df['BMI'].iloc[i] = df['Weight'].iloc[i] / (height_inches ** 2) * 703
                
df = df.drop(['Weight', 'Height', 'BMI_calc'], axis=1)

In [None]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(features_data)

plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cluster_labels)
plt.title("t-SNE Visualization with Cluster Assignments")
plt.show()