# Download heart disease dataset heart.csv in Exercise folder and do following, (credits of dataset: https://www.kaggle.com/fedesoriano/heart-failure-prediction)

-Load heart disease dataset in pandas dataframe
-Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
-Convert text columns to numbers using label encoding and one hot encoding
-Apply scaling
-Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
-Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [2]:
# Load the heart disease dataset into a pandas DataFrame
import pandas as pd
df = pd.read_csv("heart.csv")

In [3]:
# Display the first few rows to understand the data structure
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Check the shape of the DataFrame to know the number of rows and columns
df.shape

(918, 12)

In [5]:
# Generate descriptive statistics to get an overview of the data
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
# Filter out outliers in the Cholesterol column based on the mean and standard deviation
df[df.Cholesterol>(df.Cholesterol.mean()+3*df.Cholesterol.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [7]:
# Confirm the shape of the DataFrame remains unchanged as this was just a filtering operation
df.shape

(918, 12)

In [8]:
# Remove outliers in the Cholesterol column and create a new DataFrame df1
df1 = df[df.Cholesterol<=(df.Cholesterol.mean()+3*df.Cholesterol.std())]
df1.shape

(915, 12)

In [9]:
# Filter operations for outliers based on the MaxHR, FastingBS, and Oldpeak columns
df[df.MaxHR>(df.MaxHR.mean()+3*df.MaxHR.std())]
df[df.FastingBS>(df.FastingBS.mean()+3*df.FastingBS.std())]
df[df.Oldpeak>(df.Oldpeak.mean()+3*df.Oldpeak.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


In [10]:
# Remove outliers in the Oldpeak column from df1 and create a new DataFrame df2
df2 = df1[df1.Oldpeak<=(df1.Oldpeak.mean()+3*df1.Oldpeak.std())]
df2.shape

(909, 12)

In [11]:
# Filter operation for outliers in the RestingBP column
df[df.RestingBP>(df.RestingBP.mean()+3*df.RestingBP.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1


In [12]:
# Remove outliers in the RestingBP column from df2 and create a new DataFrame df3
df3 = df2[df2.RestingBP<=(df2.RestingBP.mean()+3*df2.RestingBP.std())]
df3.shape

(902, 12)

In [13]:
# Check unique values in categorical columns to understand the data and prepare for encoding
df.ChestPainType.unique()
df.RestingECG.unique()
df.ExerciseAngina.unique()
df.ST_Slope.unique()


array(['Up', 'Flat', 'Down'], dtype=object)

In [14]:
# Copy df3 to df4 and replace categorical string values with numerical codes for machine learning
df4 = df3.copy()
df4.ExerciseAngina.replace({'N': 0, 'Y': 1}, inplace=True)
df4.ST_Slope.replace({'Down': 1, 'Flat': 2, 'Up': 3}, inplace=True)
df4.RestingECG.replace({'Normal': 1, 'ST': 2, 'LVH': 3}, inplace=True)
df4.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [15]:
# Convert remaining categorical variables into dummy/indicator variables
df5 = pd.get_dummies(df4, drop_first=True)
df5.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0


In [16]:
# Prepare features (X) and target variable (y) for model training
X = df5.drop("HeartDisease", axis='columns')
y = df5.HeartDisease
X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,1,0,1,0


In [17]:
# Standardize the features using StandardScaler for better model performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
# Split the scaled features and target variable into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)


In [19]:
# Train a RandomForestClassifier on the training data and evaluate its accuracy on the test data
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8784530386740331

In [20]:
# Apply PCA for dimensionality reduction while retaining 95% of the variance
from sklearn.decomposition import PCA
pca = PCA(0.95)
X_pca = pca.fit_transform(X)

In [21]:
# Split the PCA-transformed features into training and testing sets
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [22]:
# Train and evaluate another RandomForestClassifier on the PCA-transformed data
model_rf = RandomForestClassifier()
model_rf.fit(X_train_pca, y_train)
model_rf.score(X_test_pca, y_test)

0.7016574585635359