# Prediction of Heart Disease

## Import Necessary Libraries

In [8]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree


from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics

---

## Setup : Import the Dataset

Dataset from Kaggle: `Personal Key Indicators of Heart Disease`

Source: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease/data 

In [3]:
# Load the raw dataset from the local CSV file, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


#### Explanation of the variables of the dataset
1. HeartDisease : Respondents who ever had heart disease. ( Yes or No )
2. BMI : Body Mass Index (BMI).
3. Smoking : Respondents that smoked at least 100 cigarettes in their entire life. ( Yes or No )
4. AlcoholDrinking : Heavy drinking, defined as adult men having more than 14 drinks per week and adult women having more than 7 drinks per week. ( Yes or No )
5. Stroke : Respondents who ever had stroke. ( Yes or No )
6. PhysicalHealth : Number of days during the past 30 days on which the respondent's physical health (including physical illness and injury) was not good. ( 0-30 days )
7. MentalHealth : Number of days during the past 30 days on which the respondent's mental health was not good. ( 0-30 days )
8. DiffWalking : Difficulty walking or climbing stairs. ( Yes or No )
9. Sex : male or female
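10. AgeCategory : Age category. The source describes fourteen levels; 13 distinct levels are present in this dataset (e.g. `40-44`, `80 or older`).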
11. Race : Imputed race/ethnicity value.
12. Diabetic : Respondents who ever had diabetes. ( `Yes`, `No`, `No, borderline diabetes` or `Yes (during pregnancy)` )
13. PhysicalActivity : Physical activity or exercise during the past 30 days other than their regular job. ( Yes or No )
14. GenHealth : Self-rated general health ("Would you say that in general your health is..."), one of five levels such as Fair, Good or Very good.
15. SleepTime : Average hours of sleep in a 24-hour period.
16. Asthma : Respondents who ever had asthma. ( Yes or No )
17. KidneyDisease : Respondents who ever had kidney disease. ( Yes or No )
18. SkinCancer : Respondents who ever had skin cancer. ( Yes or No )

---

## Dataset Cleaning

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [5]:
# Print the frame's index range plus each column's non-null count and dtype,
# which is a quick way to confirm there are no missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [6]:
# Count the distinct values in every column; columns with exactly two values
# are the candidates for 0/1 encoding.
distinct_counts = df.nunique()
distinct_counts

HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [9]:
# Encode the binary survey answers as integer flags: 1 = Yes / Male, 0 = No / Female.
# The two qualified Diabetic answers are folded into the plain classes:
# 'No, borderline diabetes' -> 0 and 'Yes (during pregnancy)' -> 1.
# Every replacement is an int (not a mix of ints and the strings '0'/'1'), so no
# column is left holding mixed int/str values.
binary_encoding = {
    'Yes': 1, 'No': 0,
    'Male': 1, 'Female': 0,
    'No, borderline diabetes': 0,
    'Yes (during pregnancy)': 1,
}

# Only columns made up entirely of the labels above are encoded. Columns that
# contain any other value (numeric measurements, multi-level categoricals) are
# left untouched, and re-running this cell on already-encoded data is a no-op.
binary_columns = [
    col for col in df.columns
    if df[col].isin(list(binary_encoding)).all()
]

# Map the labels and cast explicitly instead of relying on `DataFrame.replace`
# to silently downcast object columns to integers; that implicit downcast is
# deprecated in recent pandas releases and would leave the columns as object dtype.
df = df.assign(**{
    col: df[col].map(binary_encoding).astype('int64')
    for col in binary_columns
})

---
## Exploratory Analysis