# Heart Failure

In [1]:
import pandas as pd

## Retrieve the Data

**The data was retrieved from Kaggle: [Heart Failure Prediction dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)**

In [2]:
# Read the heart-failure CSV into a pandas DataFrame
path = "resources/heart.csv"
data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows
data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# List every column name to see which fields the dataset provides
data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


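1. Ages range from 28 to 77 years old.
2. `RestingBP` and `Cholesterol` both have a minimum of 0, which is not a plausible measurement; these zeros most likely stand in for missing values and should be checked before modelling.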

In [5]:
# Count the number of rows for each value of Sex
data['Sex'].value_counts()

Sex
M    725
F    193
Name: count, dtype: int64

In [6]:
# Count the number of rows for each value of the HeartDisease target
data['HeartDisease'].value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

This data shows that 508 of the 918 people (about 55%) have heart disease, while 410 do not.

In [7]:
# Fraction of missing values in each column (0.0 means nothing is missing)
data.isna().mean()

Age               0.0
Sex               0.0
ChestPainType     0.0
RestingBP         0.0
Cholesterol       0.0
FastingBS         0.0
RestingECG        0.0
MaxHR             0.0
ExerciseAngina    0.0
Oldpeak           0.0
ST_Slope          0.0
HeartDisease      0.0
dtype: float64

In [17]:
# Columns treated as numeric features
numerical_fields = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'FastingBS',
    'MaxHR',
    'Oldpeak',
]

# Columns holding string categories that need encoding
categorical_fields = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]

In [20]:
# Preview the categorical columns in their raw form, before one-hot encoding.
# Dropping the numerical features and the target leaves only the categorical
# columns; drop() returns a new DataFrame, so `data` itself is not modified.
before_one_hot_encoded = data.drop(columns=[*numerical_fields, 'HeartDisease'])

before_one_hot_encoded

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up
...,...,...,...,...,...
913,M,TA,Normal,N,Flat
914,M,ASY,Normal,N,Flat
915,M,ASY,Normal,Y,Flat
916,F,ATA,LVH,N,Flat


In [21]:
# One-hot encode the columns that hold categorical data
from sklearn.preprocessing import OneHotEncoder

# Columns to one hot encode
columns_to_enocde = categorical_fields

# Encoder that returns a dense array (sparse_output=False) and encodes any
# category not seen during fitting as all zeros (handle_unknown='ignore')
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the unique categories of each column and encode them in one step;
# each unique category becomes a separate binary column
data_one_hot_encoded = enc.fit_transform(data[columns_to_enocde])

# Wrap the encoded array in a DataFrame that shares the original index,
# so it can be joined back onto the original dataframe
data_one_hot_encoded_df = pd.DataFrame(
    data=data_one_hot_encoded,
    index=data.index,
    columns=enc.get_feature_names_out(columns_to_enocde),
)


In [22]:
# Display the one-hot encoded frame: one binary column per category value
data_one_hot_encoded_df

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Splitting the data

In [9]:
# Import module
from sklearn.model_selection import train_test_split

# Create the features DataFrame X: every column except the target.
# DataFrame.drop returns a new frame, so `data` is left untouched and no
# separate copy is needed.
# NOTE(review): X still holds the raw string categorical columns; the
# one-hot encoded frame built earlier is not joined in here — confirm how
# downstream cells are expected to handle them.
X = data.drop(columns=['HeartDisease'])

# Create the target Series y
y = data['HeartDisease']

# Separate the data into training and testing sets.
# A fixed random_state makes the shuffle, and therefore every downstream
# result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

