In [246]:
from enum import unique

import pandas as pd
import numpy as np
# https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease

In [247]:
df = pd.read_csv('heart_2022_with_nans.csv')
target = df.pop('HadHeartAttack')
df = df.drop("RemovedTeeth", axis=1)
df['HadHeartAttack'] = target
df.head(5)


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadAngina,HadStroke,...,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,No,No,...,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,No,No,...,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,No,No,...,63.5,25.61,No,No,No,No,,No,Yes,No
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,No,No,...,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,No,No,...,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,No


In [248]:
df.shape

(445132, 39)

## 1. Missing value

In [249]:
percent_na = df.isna().mean() * 100
print(percent_na)

State                         0.000000
Sex                           0.000000
GeneralHealth                 0.269134
PhysicalHealthDays            2.454777
MentalHealthDays              2.036924
LastCheckupTime               1.866413
PhysicalActivities            0.245545
SleepHours                    1.225030
HadAngina                     0.989594
HadStroke                     0.349784
HadAsthma                     0.398309
HadSkinCancer                 0.706083
HadCOPD                       0.498504
HadDepressiveDisorder         0.631723
HadKidneyDisease              0.432681
HadArthritis                  0.591510
HadDiabetes                   0.244197
DeafOrHardOfHearing           4.638399
BlindOrVisionDifficulty       4.844406
DifficultyConcentrating       5.445576
DifficultyWalking             5.394355
DifficultyDressingBathing     5.372564
DifficultyErrands             5.763684
SmokerStatus                  7.966626
ECigaretteUsage               8.011107
ChestScan                

In [250]:
cols_up_to_5 = percent_na[percent_na <= 5].index.tolist()
cols_5_to_15 = percent_na[(percent_na > 5) & (percent_na <= 15)].index.tolist()
cols_above_15 = percent_na[percent_na > 15].index.tolist()

for col_name in cols_up_to_5:
    df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

for col_name in cols_5_to_15:
    df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

for col_name in cols_above_15:
    df[col_name] = df[col_name].fillna(df[col_name].mode()[0])



## 2.Data Type Conversion


In [251]:
numeric_cols = [
    'PhysicalHealthDays', 'MentalHealthDays', 'SleepHours',
    'HeightInMeters', 'WeightInKilograms', 'BMI'
]

binary_like_cols = [
    'PhysicalActivities','HadAngina','HadStroke','HadAsthma','HadSkinCancer',
    'HadCOPD','HadDepressiveDisorder','HadKidneyDisease','HadArthritis',
    'HadHeartAttack','FluVaxLast12','PneumoVaxEver',
    'HIVTesting','CovidPos','HighRiskLastYear', "AlcoholDrinkers", "ChestScan"
]

categorical_cols = [
    'State','Sex','GeneralHealth','LastCheckupTime','RaceEthnicityCategory','AgeCategory',
    'SmokerStatus','ECigaretteUsage','TetanusLast10Tdap',
]


In [252]:
for name_col in numeric_cols:
    df[name_col] = df[name_col].astype(float)



In [253]:
for name_col in binary_like_cols:
    print(name_col+ ': ', df[name_col].unique())

PhysicalActivities:  ['No' 'Yes']
HadAngina:  ['No' 'Yes']
HadStroke:  ['No' 'Yes']
HadAsthma:  ['No' 'Yes']
HadSkinCancer:  ['No' 'Yes']
HadCOPD:  ['No' 'Yes']
HadDepressiveDisorder:  ['No' 'Yes']
HadKidneyDisease:  ['No' 'Yes']
HadArthritis:  ['No' 'Yes']
HadHeartAttack:  ['No' 'Yes']
FluVaxLast12:  ['Yes' 'No']
PneumoVaxEver:  ['No' 'Yes']
HIVTesting:  ['No' 'Yes']
CovidPos:  ['No' 'Yes'
 'Tested positive using home test without a health professional']
HighRiskLastYear:  ['No' 'Yes']
AlcoholDrinkers:  ['No' 'Yes']
ChestScan:  ['No' 'Yes']


In [254]:
df['HadDiabetes'].value_counts()

HadDiabetes
No                                         369809
Yes                                         61158
No, pre-diabetes or borderline diabetes     10329
Yes, but only during pregnancy (female)      3836
Name: count, dtype: int64

Хочу сделать наличие диабета бинарным. весь мусор закину в новые фичи и сделаю диабет бинарным

In [255]:
df['prediabetes'] = 0
df['gestational_diabetes'] = 0

mask_prediabetes = df["HadDiabetes"].str.contains('No, pre-diabetes or borderline diabetes')

mask_gestational_diabetes = df["HadDiabetes"].str.contains('Yes, but only during pregnancy (female)', regex=False)

In [256]:
df.loc[mask_prediabetes, "prediabetes"] = 1
df.loc[mask_gestational_diabetes, "gestational_diabetes"] = 1


In [257]:
mask_true = df['HadDiabetes'].isin(['Yes', 'No, pre-diabetes or borderline diabetes'])
df.loc[mask_true, 'HadDiabetes_binary'] = 1

mask_false = df['HadDiabetes'] == 'No'
df.loc[mask_false, 'HadDiabetes_binary'] = 0

print(df['HadDiabetes_binary'].value_counts())
df = df.drop('HadDiabetes', axis=1)


HadDiabetes_binary
0.0    369809
1.0     71487
Name: count, dtype: int64


In [258]:

df['CovidPos'] = df['CovidPos'].replace(
    'Tested positive using home test without a health professional', 'Yes'
)


## 3. Dublicate Removal


In [259]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)


(445132, 41)
(444575, 41)


## 4. Ouliver Removal
Выбросов не нашел


## 5.  Encoding


In [260]:
df['Sex'] = df['Sex'].map({"Male": 1, "Female": 0}).astype(int)


In [261]:
df['GeneralHealth'] = df['GeneralHealth'].map({"Poor": 1, "Fair": 2, 'Good': 3, 'Very good': 4, 'Excellent': 5 }).astype(int)


In [262]:
df['LastCheckupTime'] = df['LastCheckupTime'].map({
    'Within past year (anytime less than 12 months ago)': 1,
    'Within past 2 years (1 year but less than 2 years ago)': 2,
    'Within past 5 years (2 years but less than 5 years ago)': 3,
    '5 or more years ago': 4
}).astype(int)

In [None]:
### Перевожу бинарные колонки

In [263]:
feature_names = df.columns.tolist()
cur_name = feature_names[8]
# print(cur_name + ": ", df[cur_name].unique())
for i in binary_like_cols:
    df[i] = df[i].map({
    'Yes': 1,
    'No': 0,
}).astype(int)
    print(i + ": ", df[i].unique())

PhysicalActivities:  [0 1]
HadAngina:  [0 1]
HadStroke:  [0 1]
HadAsthma:  [0 1]
HadSkinCancer:  [0 1]
HadCOPD:  [0 1]
HadDepressiveDisorder:  [0 1]
HadKidneyDisease:  [0 1]
HadArthritis:  [0 1]
HadHeartAttack:  [0 1]
FluVaxLast12:  [1 0]
PneumoVaxEver:  [0 1]
HIVTesting:  [0 1]
CovidPos:  [0 1]
HighRiskLastYear:  [0 1]
AlcoholDrinkers:  [0 1]
ChestScan:  [0 1]


In [265]:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)


## 6. Feature Scaling


In [278]:
from  sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

## 7. Datetime Feature Engineering
их тут нет