In [1]:
import pandas as pd

df = pd.read_csv("heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [2]:
# Remove outliers using Z-score
import numpy as np
from scipy import stats

In [7]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
z_scores = abs(stats.zscore(df[numeric_cols]))

tol = (z_scores < 3).all(axis = 1)
df_no_outliers = df[tol].reset_index(drop = True)
df_no_outliers

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
894,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
895,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
896,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
897,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [9]:
cat_columns = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']
new_df = pd.get_dummies(df_no_outliers, columns = cat_columns , prefix = cat_columns, drop_first = True)
new_df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,45,110,264,0,132,1.2,1,1,0,0,1,1,0,0,1,0
895,68,144,193,1,141,3.4,1,1,0,0,0,1,0,0,1,0
896,57,130,131,0,115,1.2,1,1,0,0,0,1,0,1,1,0
897,57,130,236,0,174,0.0,1,0,1,0,0,0,0,0,1,0


In [11]:
# Scale the features
X = new_df.drop(['HeartDisease'],axis = 1)
y = new_df.HeartDisease

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.46590022,  0.84963584, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.7455875 , -0.1185065 ,  0.79361247, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ...,  1.21514774,
         1.00111297, -0.88129441],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.8229452 ,
        -0.99888827,  1.13469459]])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 30)

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8611111111111112

In [20]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[ 3.1127753 , -0.41658958,  0.66578215, ...,  0.18819508,
        -0.11003054,  0.27117526],
       [ 0.19020862, -1.07624669,  0.91867491, ...,  1.02192511,
        -0.23292103, -0.16634318],
       [ 1.63879275,  2.18185473,  0.41776913, ..., -0.59357507,
        -2.04145124,  0.13946001],
       ...,
       [-1.77715593, -1.47905475, -0.30823014, ...,  0.09497848,
         0.37862368,  0.12794198],
       [ 1.10463024,  0.20458484,  1.84982607, ...,  1.07153522,
         1.39743338,  0.34416985],
       [ 2.51254528, -0.47243606, -0.90466298, ...,  0.11362542,
        -0.06814383, -0.01807308]])

In [21]:
X_pca.shape

(899, 13)

In [22]:
X_train_pca, X_test_pca, y_train,y_test = train_test_split(X_pca, y, test_size = 0.2 , random_state = 30)

In [24]:
model = LogisticRegression()
model.fit(X_train_pca, y_train)
model.score(X_test_pca, y_test)

0.8666666666666667

In [None]:
# Improved the accuracy slightly while making the computation much easier by eliminating 3 columns