In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression


In [2]:
mydata = pd.read_csv('datasets/heart_2020_cleaned.csv')

In [3]:
mydata.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
mydata.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [5]:
mydata.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [6]:
mydata.shape

(319795, 18)

In [7]:
#stroke = felç, DiffWalking = yurumede zorluk, race = Irk, kidney disease = bobrek hastaligi
print(mydata['HeartDisease'].value_counts())



HeartDisease
No     292422
Yes     27373
Name: count, dtype: int64


In [8]:
print(mydata['HeartDisease'].isnull().sum())


0


In [9]:
mydata['HeartDisease'] = mydata['HeartDisease'].map({'Yes': 1, 'No': 0})


In [10]:
mydata['Smoking'].unique()

array(['Yes', 'No'], dtype=object)

In [11]:
mydata['Smoking'].isnull().sum()

np.int64(0)

In [12]:
#bazi veriler kalp krizinde direkt anlamlidir. 
#Ornegin ssigara icip icmemesi kalp krizini tetikleyen unsurlardandir bunlarda label encoding yeterli olacaktir
mydata['Smoking'] = mydata['Smoking'].map({'Yes': 1, 'No':0})

In [13]:
mydata['AlcoholDrinking'] = mydata['AlcoholDrinking'].map({'Yes': 1, 'No': 0})


In [14]:
mydata['HeartDisease'].value_counts(normalize=True)

HeartDisease
0    0.914405
1    0.085595
Name: proportion, dtype: float64

In [15]:
mydata['Stroke'] = mydata['Stroke'].map({'Yes':1,'No':0}) 

In [16]:
mydata['PhysicalHealth'].describe()


count    319795.00000
mean          3.37171
std           7.95085
min           0.00000
25%           0.00000
50%           0.00000
75%           2.00000
max          30.00000
Name: PhysicalHealth, dtype: float64

In [17]:
mydata['PhysicalHealth'].unique()


array([ 3.,  0., 20., 28.,  6., 15.,  5., 30.,  7.,  1.,  2., 21.,  4.,
       10., 14., 18.,  8., 25., 16., 29., 27., 17., 24., 12., 23., 26.,
       22., 19.,  9., 13., 11.])

In [18]:
mydata['MentalHealth'].describe()


count    319795.000000
mean          3.898366
std           7.955235
min           0.000000
25%           0.000000
50%           0.000000
75%           3.000000
max          30.000000
Name: MentalHealth, dtype: float64

In [19]:
mydata['MentalHealth'].unique()


array([30.,  0.,  2.,  5., 15.,  8.,  4.,  3., 10., 14., 20.,  1.,  7.,
       24.,  9., 28., 16., 12.,  6., 25., 17., 18., 21., 29., 22., 13.,
       23., 27., 26., 11., 19.])

In [20]:
mydata['DiffWalking'].isnull().sum()

np.int64(0)

In [21]:
mydata['DiffWalking'].unique()

array(['No', 'Yes'], dtype=object)

In [22]:
mydata['DiffWalking'] = mydata['DiffWalking'].map({'Yes':1,'No':0})

In [23]:
mydata['Sex'] = mydata['Sex'].map({'Female': 0, 'Male': 1})

In [24]:
mydata['AgeCategory'].unique()


array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
       '60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
      dtype=object)

In [25]:
age_mapping = {
    '18-24': 0,
    '25-29': 1,
    '30-34': 2,
    '35-39': 3,
    '40-44': 4,
    '45-49': 5,
    '50-54': 6,
    '55-59': 7,
    '60-64': 8,
    '65-69': 9,
    '70-74': 10,
    '75-79': 11,
    '80 or older': 12
}
mydata['AgeCategory'] = mydata['AgeCategory'].map(age_mapping)

In [26]:
mydata['Race'].unique()

array(['White', 'Black', 'Asian', 'American Indian/Alaskan Native',
       'Other', 'Hispanic'], dtype=object)

In [27]:
mydata = pd.get_dummies(mydata, columns=['Race'], prefix='Race', drop_first=True)

In [28]:
print(mydata.columns)

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma',
       'KidneyDisease', 'SkinCancer', 'Race_Asian', 'Race_Black',
       'Race_Hispanic', 'Race_Other', 'Race_White'],
      dtype='object')


In [29]:
mydata.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.6,1,0,0,3.0,30.0,0,0,7,...,Very good,5.0,Yes,No,Yes,False,False,False,False,True
1,0,20.34,0,0,1,0.0,0.0,0,0,12,...,Very good,7.0,No,No,No,False,False,False,False,True
2,0,26.58,1,0,0,20.0,30.0,0,1,9,...,Fair,8.0,Yes,No,No,False,False,False,False,True
3,0,24.21,0,0,0,0.0,0.0,0,0,11,...,Good,6.0,No,No,Yes,False,False,False,False,True
4,0,23.71,0,0,0,28.0,0.0,1,0,4,...,Very good,8.0,No,No,No,False,False,False,False,True


In [30]:
mydata['Diabetic'].value_counts()

Diabetic
No                         269653
Yes                         40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: count, dtype: int64

In [31]:
diabetic_mapping = {
    'No': 0,
    'No, borderline diabetes': 1,
    'Yes (during pregnancy)': 2,
    'Yes': 3
}

mydata['Diabetic'] = mydata['Diabetic'].map(diabetic_mapping)

In [32]:
mydata['PhysicalActivity'] = mydata['PhysicalActivity'].map({'Yes': 1, 'No': 0})

In [33]:
mydata['GenHealth'].unique()

array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

In [34]:
mydata['GenHealth'].isnull().sum()

np.int64(0)

In [35]:
mydata['GenHealth'] = mydata['GenHealth'].map({
    'Excellent': 4,
    'Very good': 3,
    'Good': 2,
    'Fair': 1,
    'Poor': 0
})

In [36]:
mydata['SkinCancer'].unique()

array(['Yes', 'No'], dtype=object)

In [37]:
mydata['Asthma'] = mydata['Asthma'].map({'Yes': 1, 'No': 0})
mydata['KidneyDisease'] = mydata['KidneyDisease'].map({'Yes': 1, 'No': 0})
mydata['SkinCancer'] = mydata['SkinCancer'].map({'Yes': 1, 'No': 0})

In [38]:
mydata.isnull().sum()

HeartDisease             0
BMI                      0
Smoking                  0
AlcoholDrinking          0
Stroke                   0
PhysicalHealth           0
MentalHealth             0
DiffWalking              0
Sex                      0
AgeCategory              0
Diabetic                 0
PhysicalActivity         0
GenHealth           113858
SleepTime                0
Asthma                   0
KidneyDisease            0
SkinCancer               0
Race_Asian               0
Race_Black               0
Race_Hispanic            0
Race_Other               0
Race_White               0
dtype: int64

In [39]:
mydata['GenHealth'].unique()

array([nan,  1.,  2.,  0.,  4.])