In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

In [2]:
# Read the raw heart-disease table from the working directory and preview
# its first six rows.
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=6)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1


Column Descriptions:

age: age in years

sex: (1 = male, 0 = female)

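cp: chest pain type (1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic). Note: the sample rows above contain the value 0, so this file appears to use a different (possibly 0-based) encoding — verify the mapping against the data source.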

trestbps: resting blood pressure (mm Hg on admission to the hospital)

chol: cholesterol measurement in mg/dl

fbs: whether fasting blood sugar is > 120 mg/dl (1 = true; 0 = false)

restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)

thalach: maximum heart rate achieved

exang: Exercise induced angina (1 = yes; 0 = no)

oldpeak: ST depression induced by exercise relative to rest 

slope: the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)

ca: The number of major vessels (0-3)

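thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect). Note: the sample rows above show the values 1 and 2, so this file appears to use a different encoding — verify the mapping against the data source.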

target: Heart disease (0 = no, 1 = yes)

In [3]:
# Rename the terse original headers to descriptive names.
# An explicit old -> new mapping is used instead of assigning a list to
# `df.columns`: positional assignment would silently attach the wrong label
# to a column if the CSV's column order ever differed. Re-running this cell
# is a no-op because already-renamed columns simply do not match any key.
column_renames = {
    'cp': 'chest_pain',
    'trestbps': 'resting_blood_pressure',
    'chol': 'cholesterol',
    'fbs': 'fasting_blood_sugar',
    'restecg': 'rest_ecg',
    'thalach': 'max_heart_rate_achieved',
    'exang': 'exercise_induced_angina',
    'oldpeak': 'st_depression',
    'slope': 'st_slope',
    'ca': 'num_major_vessels',
    'thal': 'thalassemia',
}
df = df.rename(columns=column_renames)

# `rename` ignores keys that are not present, so verify explicitly that every
# column the rest of the notebook relies on exists after the rename.
expected_columns = ['age', 'sex', 'target'] + list(column_renames.values())
missing_columns = sorted(set(expected_columns) - set(df.columns))
assert not missing_columns, "missing expected columns: %s" % missing_columns

df.head(6)

Unnamed: 0,age,sex,chest_pain,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope,num_major_vessels,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1


In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

age                          int64
sex                          int64
chest_pain                   int64
resting_blood_pressure       int64
cholesterol                  int64
fasting_blood_sugar          int64
rest_ecg                     int64
max_heart_rate_achieved      int64
exercise_induced_angina      int64
st_depression              float64
st_slope                     int64
num_major_vessels            int64
thalassemia                  int64
target                       int64
dtype: object

In [5]:
# Count missing entries per column; a column of zeros means nothing is missing.
df.isna().sum()

age                        0
sex                        0
chest_pain                 0
resting_blood_pressure     0
cholesterol                0
fasting_blood_sugar        0
rest_ecg                   0
max_heart_rate_achieved    0
exercise_induced_angina    0
st_depression              0
st_slope                   0
num_major_vessels          0
thalassemia                0
target                     0
dtype: int64

In [6]:
# Build a frame holding only the four continuous measurements listed below
# (categorical/coded columns are left out) so it can be screened for outliers,
# then show each column's mean.
df_mean = df.loc[:, [
    'resting_blood_pressure',
    'cholesterol',
    'max_heart_rate_achieved',
    'st_depression',
]]
mean = df_mean.mean()
mean

resting_blood_pressure     131.623762
cholesterol                246.264026
max_heart_rate_achieved    149.646865
st_depression                1.039604
dtype: float64

In [7]:
# Preview the first five rows of the continuous-feature subset.
df_mean.head(n=5)

Unnamed: 0,resting_blood_pressure,cholesterol,max_heart_rate_achieved,st_depression
0,145,233,150,2.3
1,130,250,187,3.5
2,130,204,172,1.4
3,120,236,178,0.8
4,120,354,163,0.6


In [8]:
# Absolute z-score of every value, standardised column by column (axis 0),
# i.e. how many standard deviations each entry lies from its column mean.
zfactor = np.absolute(stats.zscore(df_mean, axis=0))
print(zfactor)

[[0.76395577 0.25633371 0.01544279 1.08733806]
 [0.09273778 0.07219949 1.63347147 2.12257273]
 [0.09273778 0.81677269 0.97751389 0.31091206]
 ...
 [0.70684287 1.029353   0.37813176 2.03630317]
 [0.09273778 2.2275329  1.51512489 0.13837295]
 [0.09273778 0.19835726 1.0649749  0.89686172]]


In [9]:
# Locate outliers: positions (row indices, column indices) of every entry
# whose absolute z-score exceeds 3.
print(np.asarray(zfactor > 3).nonzero())

(array([ 28,  85, 204, 220, 221, 223, 246, 248, 272]), array([1, 1, 3, 1, 3, 0, 1, 0, 2]))


In [12]:
# Remove outliers: keep only the rows whose absolute z-score is below 3 in
# every one of the screened columns.
# The mask is computed once and coerced to a plain boolean array so it works
# the same whether `zfactor` is a NumPy array or a pandas object.
keep_rows = np.asarray((zfactor < 3).all(axis=1))
df_mean_o = df_mean[keep_rows]

# Filter the full frame by index label instead of re-applying the boolean
# mask. `df` is overwritten here, so on a second run of this cell a mask sized
# for the original frame would no longer match the already-shortened `df`;
# selecting the surviving labels gives the same result on every run.
df = df.loc[df_mean_o.index]

# Shapes before filtering, after filtering, and of the filtered full frame.
print(df_mean.shape)
print(df_mean_o.shape)
print(df.shape)

(303, 4)
(294, 4)
(294, 14)


In [11]:
# Column means recomputed on the outlier-free subset, for comparison with the
# means of the unfiltered data.
mean_o = df_mean_o.mean(axis="index")
mean_o

resting_blood_pressure     131.057823
cholesterol                243.646259
max_heart_rate_achieved    149.887755
st_depression                0.986054
dtype: float64

In [13]:
# Persist the results: the filtered full table and the filtered
# continuous-feature subset. The row index is written as the first column.
df.to_csv(path_or_buf='DW.csv', index=True)
df_mean_o.to_csv(path_or_buf='DWO.csv', index=True)