In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import *
%matplotlib inline

sns.set_style('ticks')
sns.set_palette('colorblind')

# Import and read dataset

In [3]:
# Load the heart-disease dataset (heart.csv) into a DataFrame.
df = pd.read_csv("heart.csv")

In [38]:
# Preview the first rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Dropping Feature

In [5]:
# RestingECG, Sex and FastingBS look like candidates for removal, so build a
# reduced-feature variant (df_DF) next to the full frame.
feature = ['RestingECG', 'Sex', 'FastingBS']
df_DF = df.drop(columns=feature)

## Outlier, missing value, etc

In [6]:
# Working copies for the two outlier strategies (modify vs. delete), for both
# the full frame and the reduced-feature frame.
df_modifiedOutlier, df_deleteOutlier = df.copy(), df.copy()
df_DF_modifiedOutlier, df_DF_deleteOutlier = df_DF.copy(), df_DF.copy()

In [7]:
# Outlier detection using interquartile range

def detect_outliers_iqr(data, factor=1.5):
    """Return the values of ``data`` that lie outside the interquartile-range fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``IQR = Q3 - Q1`` and the quartiles come from ``np.percentile``.

    Parameters
    ----------
    data : iterable of numbers
        The sample to inspect (for example a DataFrame column).
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    list
        The outlying values in ascending order, duplicates included. An empty
        sample yields an empty list.
    """
    ordered = sorted(data)
    if not ordered:
        # Quartiles are undefined for an empty sample, so there is nothing to flag.
        return []
    q1, q3 = np.percentile(ordered, [25, 75])
    spread = q3 - q1
    lwr_bound = q1 - factor * spread
    upr_bound = q3 + factor * spread
    return [value for value in ordered if value < lwr_bound or value > upr_bound]

# Run the IQR detector once per numeric column of interest.
RestingBP_outliers, Cholesterol_outliers, MaxHR_outliers, Oldpeak_outliers = (
    detect_outliers_iqr(df[column_name])
    for column_name in ('RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak')
)

In [8]:
# Show the detected outlier values, one heading per column.
for heading, outlier_values in (
    ("Outliers in RestingBP: \n", RestingBP_outliers),
    ("Outliers in Cholesterol: \n", Cholesterol_outliers),
    ("Outliers in MaxHR: \n", MaxHR_outliers),
    ("Outliers in Oldpeak: \n", Oldpeak_outliers),
):
    print(heading, outlier_values, "\n")

Outliers in RestingBP: 
 [0, 80, 172, 172, 174, 178, 178, 178, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 185, 190, 190, 192, 200, 200, 200, 200] 

Outliers in Cholesterol: 
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 409, 412, 417, 458, 466, 468, 491, 518, 529, 564, 603] 

Outliers in MaxHR: 
 [60, 63] 

Outliers in Oldpeak: 
 [-2.6, 3.8, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.2, 4.2, 4.4, 5.0, 5.6, 6.2] 



In [9]:
# modify Outlier
def mod_outlier(dataset, maxhr_outliers=(60, 63)):
  """Return a cleaned copy of ``dataset`` with outliers repaired or dropped.

  Steps, in order:
    1. MaxHR values listed in ``maxhr_outliers`` are replaced by ``220 - Age``
       of the same row.
    2. RestingBP readings of 0 are replaced by the mean of the non-zero readings.
    3. Rows whose Cholesterol is in the module-level ``Cholesterol_outliers``
       list are dropped.
    4. Rows with a negative Oldpeak are dropped.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Must contain the Age, MaxHR, RestingBP, Cholesterol and Oldpeak columns.
      It is not modified.
  maxhr_outliers : list-like, default (60, 63)
      MaxHR values treated as implausible.

  Returns
  -------
  pandas.DataFrame
      The repaired and filtered frame; surviving rows keep their index labels.
  """
  # Work on a copy so the caller's frame is left untouched.
  dataset = dataset.copy()

  # Use each row's own age rather than constants tied to two specific patients.
  implausible_hr = dataset['MaxHR'].isin(maxhr_outliers)
  dataset['MaxHR'] = dataset['MaxHR'].mask(implausible_hr, 220 - dataset['Age'])

  # A zero reading is a placeholder, so keep it out of the mean used to fill it.
  resting_bp = dataset['RestingBP']
  missing_bp = resting_bp == 0
  dataset['RestingBP'] = resting_bp.mask(missing_bp, resting_bp[~missing_bp].mean())

  dataset = dataset[~dataset['Cholesterol'].isin(Cholesterol_outliers)] # drop
  dataset = dataset[dataset['Oldpeak'] >= 0] # drop negative values
  return dataset

# Apply the outlier corrections to both the full and the reduced-feature copies.
df_modifiedOutlier = mod_outlier(df_modifiedOutlier)
df_DF_modifiedOutlier = mod_outlier(df_DF_modifiedOutlier)

In [10]:
# remove all outlier
def remove_outlier(dataset):
  """Return only the rows of ``dataset`` that contain no detected outlier.

  A row is kept when its Cholesterol, Oldpeak, MaxHR and RestingBP values are
  all absent from the matching module-level ``*_outliers`` lists and its
  Oldpeak is not negative.
  """
  flagged = (
      dataset['Cholesterol'].isin(Cholesterol_outliers)
      | dataset['Oldpeak'].isin(Oldpeak_outliers)
      | dataset['MaxHR'].isin(MaxHR_outliers)
      | dataset['RestingBP'].isin(RestingBP_outliers)
  )
  non_negative_oldpeak = dataset['Oldpeak'] >= 0
  return dataset[~flagged & non_negative_oldpeak]

# Drop every outlier row from both the full and the reduced-feature copies.
df_deleteOutlier = remove_outlier(df_deleteOutlier)
df_DF_deleteOutlier = remove_outlier(df_DF_deleteOutlier)

## Encoding

In [11]:
# Take fresh copies so the in-place label encoding never touches the source frames.
df_encoded, df_modifiedOutlier_encoded, df_deleteOutlier_encoded = (
    df.copy(), df_modifiedOutlier.copy(), df_deleteOutlier.copy()
)
df_DF_encoded, df_DF_modifiedOutlier_encoded, df_DF_deleteOutlier_encoded = (
    df_DF.copy(), df_DF_modifiedOutlier.copy(), df_DF_deleteOutlier.copy()
)

#### Label encoder

Target columns: 'RestingECG', 'ExerciseAngina', 'ST_Slope'

In [12]:
# One shared encoder instance; it is re-fitted for every column it is applied to.
label_encoder = preprocessing.LabelEncoder()

def label_encode_data(dataset, cols):
  """Replace each column named in ``cols`` with its label-encoded codes.

  ``dataset`` is modified in place and nothing is returned.
  """
  for column_name in cols:
    encoded_values = label_encoder.fit_transform(dataset[column_name])
    dataset[column_name] = encoded_values

In [13]:
# Full-feature frames still carry RestingECG, so it is encoded as well.
for full_frame in (df_encoded, df_modifiedOutlier_encoded, df_deleteOutlier_encoded):
  label_encode_data(full_frame, ['RestingECG', 'ExerciseAngina', 'ST_Slope'])

# Reduced-feature frames no longer have RestingECG.
for reduced_frame in (df_DF_encoded, df_DF_modifiedOutlier_encoded, df_DF_deleteOutlier_encoded):
  label_encode_data(reduced_frame, ['ExerciseAngina', 'ST_Slope'])

#### One hot encoding

Target columns: 'Sex', 'ChestPainType'

In [14]:
# One-hot encode the given columns of a dataset
def onehot_encode_data(dataset, cols):
  """Return ``dataset`` with each column in ``cols`` replaced by indicator columns.

  Columns are processed in order; for each one the indicator columns (named
  ``<column>_<value>``) are appended on the right and the source column is
  removed. The input frame itself is not modified.
  """
  for column_name in cols:
    indicators = pd.get_dummies(dataset[column_name], prefix=column_name)
    widened = pd.concat([dataset, indicators], axis=1)
    dataset = widened.drop(column_name, axis=1)
  return dataset


In [15]:
# Full-feature frames: expand both Sex and ChestPainType into indicator columns.
df_encoded, df_modifiedOutlier_encoded, df_deleteOutlier_encoded = [
    onehot_encode_data(full_frame, ['Sex', 'ChestPainType'])
    for full_frame in (df_encoded, df_modifiedOutlier_encoded, df_deleteOutlier_encoded)
]

In [16]:
# Reduced-feature frames: Sex was dropped earlier, so only ChestPainType is expanded.
df_DF_encoded, df_DF_deleteOutlier_encoded, df_DF_modifiedOutlier_encoded = [
    onehot_encode_data(reduced_frame, ['ChestPainType'])
    for reduced_frame in (df_DF_encoded, df_DF_deleteOutlier_encoded, df_DF_modifiedOutlier_encoded)
]

## Splitting dataset

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of every encoded variant as a test set. random_state=0 makes
# each split reproducible across runs.

# Full-feature variants.
df_encoded_splitTrain, df_encoded_splitTest = train_test_split(df_encoded, test_size=0.2, random_state=0)
df_modifiedOutlier_encoded_splitTrain, df_modifiedOutlier_encoded_splitTest = train_test_split(df_modifiedOutlier_encoded, test_size=0.2, random_state=0)
df_deleteOutlier_encoded_splitTrain, df_deleteOutlier_encoded_splitTest = train_test_split(df_deleteOutlier_encoded, test_size=0.2, random_state=0)

# Reduced-feature variants.
df_DF_encoded_splitTrain, df_DF_encoded_splitTest = train_test_split(df_DF_encoded, test_size=0.2, random_state=0)
df_DF_modifiedOutlier_encoded_splitTrain, df_DF_modifiedOutlier_encoded_splitTest = train_test_split(df_DF_modifiedOutlier_encoded, test_size=0.2, random_state=0)
df_DF_deleteOutlier_encoded_splitTrain, df_DF_deleteOutlier_encoded_splitTest = train_test_split(df_DF_deleteOutlier_encoded, test_size=0.2, random_state=0)

# Scaling

### min-max

In [18]:
# Seed every *_minmaxScaled name with a copy of the matching train/test split.
# The min-max scaling cell further down reassigns these names to scaled frames.

# Full-feature variants: train splits.
df_encoded_splitTrain_minmaxScaled = df_encoded_splitTrain.copy()
df_modifiedOutlier_encoded_splitTrain_minmaxScaled = df_modifiedOutlier_encoded_splitTrain.copy()
df_deleteOutlier_encoded_splitTrain_minmaxScaled = df_deleteOutlier_encoded_splitTrain.copy()

# Full-feature variants: test splits.
df_encoded_splitTest_minmaxScaled = df_encoded_splitTest.copy()
df_modifiedOutlier_encoded_splitTest_minmaxScaled = df_modifiedOutlier_encoded_splitTest.copy()
df_deleteOutlier_encoded_splitTest_minmaxScaled = df_deleteOutlier_encoded_splitTest.copy()


# Reduced-feature variants: train splits.
df_DF_encoded_splitTrain_minmaxScaled = df_DF_encoded_splitTrain.copy()
df_DF_modifiedOutlier_encoded_splitTrain_minmaxScaled = df_DF_modifiedOutlier_encoded_splitTrain.copy()
df_DF_deleteOutlier_encoded_splitTrain_minmaxScaled = df_DF_deleteOutlier_encoded_splitTrain.copy()

# Reduced-feature variants: test splits.
df_DF_encoded_splitTest_minmaxScaled = df_DF_encoded_splitTest.copy()
df_DF_modifiedOutlier_encoded_splitTest_minmaxScaled = df_DF_modifiedOutlier_encoded_splitTest.copy()
df_DF_deleteOutlier_encoded_splitTest_minmaxScaled = df_DF_deleteOutlier_encoded_splitTest.copy()

In [19]:
from sklearn.preprocessing import MinMaxScaler

def minmax_scale_data(dataset):
  """Return a min-max scaled copy of ``dataset``.

  A new MinMaxScaler is fitted on ``dataset`` itself. The result keeps the
  column names but gets a fresh default row index.
  """
  scaled_values = MinMaxScaler().fit_transform(dataset)
  return pd.DataFrame(scaled_values, columns=dataset.columns)

In [20]:
def _minmax_scale_split(train, test):
  """Min-max scale a train/test pair with one scaler fitted on ``train`` only.

  Fitting a separate scaler on the test split would put train and test on
  different scales and let test-set statistics shape the test features.
  Test values outside the training range may therefore fall outside [0, 1].

  Returns the scaled (train, test) frames; both keep their column names and
  get a fresh default row index.
  """
  scaler = MinMaxScaler().fit(train)
  scaled_train = pd.DataFrame(scaler.transform(train), columns=train.columns)
  scaled_test = pd.DataFrame(scaler.transform(test), columns=test.columns)
  return scaled_train, scaled_test


# Full-feature variants. Inputs are the unscaled splits, so re-running this
# cell always gives the same result.
df_encoded_splitTrain_minmaxScaled, df_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_encoded_splitTrain, df_encoded_splitTest)
df_modifiedOutlier_encoded_splitTrain_minmaxScaled, df_modifiedOutlier_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_modifiedOutlier_encoded_splitTrain, df_modifiedOutlier_encoded_splitTest)
df_deleteOutlier_encoded_splitTrain_minmaxScaled, df_deleteOutlier_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_deleteOutlier_encoded_splitTrain, df_deleteOutlier_encoded_splitTest)

# Reduced-feature variants.
df_DF_encoded_splitTrain_minmaxScaled, df_DF_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_DF_encoded_splitTrain, df_DF_encoded_splitTest)
df_DF_modifiedOutlier_encoded_splitTrain_minmaxScaled, df_DF_modifiedOutlier_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_DF_modifiedOutlier_encoded_splitTrain, df_DF_modifiedOutlier_encoded_splitTest)
df_DF_deleteOutlier_encoded_splitTrain_minmaxScaled, df_DF_deleteOutlier_encoded_splitTest_minmaxScaled = _minmax_scale_split(
    df_DF_deleteOutlier_encoded_splitTrain, df_DF_deleteOutlier_encoded_splitTest)

### Standard Scaler

In [21]:
# Seed every *_stdScaled name with a copy of the matching train/test split.
# The standard-scaling cell further down reassigns these names to scaled frames.

# Full-feature variants (train splits, then test splits).
df_encoded_splitTrain_stdScaled = df_encoded_splitTrain.copy()
df_modifiedOutlier_encoded_splitTrain_stdScaled = df_modifiedOutlier_encoded_splitTrain.copy()
df_deleteOutlier_encoded_splitTrain_stdScaled = df_deleteOutlier_encoded_splitTrain.copy()
df_encoded_splitTest_stdScaled = df_encoded_splitTest.copy()
df_modifiedOutlier_encoded_splitTest_stdScaled = df_modifiedOutlier_encoded_splitTest.copy()
df_deleteOutlier_encoded_splitTest_stdScaled = df_deleteOutlier_encoded_splitTest.copy()

# Reduced-feature variants (train splits, then test splits).
df_DF_encoded_splitTrain_stdScaled = df_DF_encoded_splitTrain.copy()
df_DF_modifiedOutlier_encoded_splitTrain_stdScaled = df_DF_modifiedOutlier_encoded_splitTrain.copy()
df_DF_deleteOutlier_encoded_splitTrain_stdScaled = df_DF_deleteOutlier_encoded_splitTrain.copy()
df_DF_encoded_splitTest_stdScaled = df_DF_encoded_splitTest.copy()
df_DF_modifiedOutlier_encoded_splitTest_stdScaled = df_DF_modifiedOutlier_encoded_splitTest.copy()
df_DF_deleteOutlier_encoded_splitTest_stdScaled = df_DF_deleteOutlier_encoded_splitTest.copy()

In [22]:
# import standard scaler
from sklearn.preprocessing import StandardScaler

def std_scale_data(dataset):
  """Return a standardized copy of ``dataset``.

  A new StandardScaler is fitted on ``dataset`` itself. The result keeps the
  column names but gets a fresh default row index.
  """
  standardized_values = StandardScaler().fit_transform(dataset)
  return pd.DataFrame(standardized_values, columns=dataset.columns)

In [23]:
def _std_scale_split(train, test):
  """Standardize a train/test pair with one scaler fitted on ``train`` only.

  Fitting a separate scaler on the test split would put train and test on
  different scales and let test-set statistics shape the test features.

  Returns the scaled (train, test) frames; both keep their column names and
  get a fresh default row index.

  NOTE(review): every column is standardized, including the HeartDisease
  label, exactly as before. Confirm whether downstream consumers expect that.
  """
  scaler = StandardScaler().fit(train)
  scaled_train = pd.DataFrame(scaler.transform(train), columns=train.columns)
  scaled_test = pd.DataFrame(scaler.transform(test), columns=test.columns)
  return scaled_train, scaled_test


# Full-feature variants. Inputs are the unscaled splits, so re-running this
# cell always gives the same result.
df_encoded_splitTrain_stdScaled, df_encoded_splitTest_stdScaled = _std_scale_split(
    df_encoded_splitTrain, df_encoded_splitTest)
df_modifiedOutlier_encoded_splitTrain_stdScaled, df_modifiedOutlier_encoded_splitTest_stdScaled = _std_scale_split(
    df_modifiedOutlier_encoded_splitTrain, df_modifiedOutlier_encoded_splitTest)
df_deleteOutlier_encoded_splitTrain_stdScaled, df_deleteOutlier_encoded_splitTest_stdScaled = _std_scale_split(
    df_deleteOutlier_encoded_splitTrain, df_deleteOutlier_encoded_splitTest)

# Reduced-feature variants.
df_DF_encoded_splitTrain_stdScaled, df_DF_encoded_splitTest_stdScaled = _std_scale_split(
    df_DF_encoded_splitTrain, df_DF_encoded_splitTest)
df_DF_modifiedOutlier_encoded_splitTrain_stdScaled, df_DF_modifiedOutlier_encoded_splitTest_stdScaled = _std_scale_split(
    df_DF_modifiedOutlier_encoded_splitTrain, df_DF_modifiedOutlier_encoded_splitTest)
df_DF_deleteOutlier_encoded_splitTrain_stdScaled, df_DF_deleteOutlier_encoded_splitTest_stdScaled = _std_scale_split(
    df_DF_deleteOutlier_encoded_splitTrain, df_DF_deleteOutlier_encoded_splitTest)

In [27]:
# import knn from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Train on the standardized features of the reduced-feature split.
# HeartDisease is the label, so it is removed from the feature matrix;
# leaving it in would hand the answer to the model.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_train_features = df_DF_encoded_splitTrain_stdScaled.drop(columns='HeartDisease')
# Take the 0/1 labels from the unscaled split. The scaled frame keeps the same
# row order but has a fresh index, so the labels are aligned by position.
knn_train_labels = df_DF_encoded_splitTrain['HeartDisease'].to_numpy()
# Features are no longer integer-cast for fitting, so the model is trained and
# queried on the same representation.
knn_model.fit(knn_train_features, knn_train_labels)

# Predict the class for the matching test split, using the same feature columns.
knn_test_features = df_DF_encoded_splitTest_stdScaled.drop(columns='HeartDisease')
predicted_class = knn_model.predict(knn_test_features)

In [33]:
# Preview the integer-cast training frame. astype('int') drops the fractional
# part (truncation toward zero); it does not round to the nearest integer.
df_DF_encoded_splitTrain_stdScaled.astype('int').head()

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,1,0,-1,0,1,1,0,0,0,0,0,0
1,0,0,-1,0,1,0,0,0,0,0,0,0
2,1,1,1,0,0,0,1,-1,-1,0,1,0
3,1,0,0,-1,1,0,0,0,0,0,0,0
4,0,0,-1,0,1,0,0,0,0,0,0,0


# Saving dataset

In [35]:
# Training frames to export, keyed by the file stem used when saving.
# The standard-scaled variants are currently disabled (commented out).
train_data = {
  # 'df_encoded_splitTrain_stdScaled' : df_encoded_splitTrain_stdScaled,
  # 'df_modifiedOutlier_encoded_splitTrain_stdScaled' : df_modifiedOutlier_encoded_splitTrain_stdScaled,
  # 'df_deleteOutlier_encoded_splitTrain_stdScaled' : df_deleteOutlier_encoded_splitTrain_stdScaled,
  # 'df_DF_encoded_splitTrain_stdScaled' : df_DF_encoded_splitTrain_stdScaled,
  # 'df_DF_modifiedOutlier_encoded_splitTrain_stdScaled' : df_DF_modifiedOutlier_encoded_splitTrain_stdScaled,
  # 'df_DF_deleteOutlier_encoded_splitTrain_stdScaled' : df_DF_deleteOutlier_encoded_splitTrain_stdScaled,
  'df_encoded_splitTrain_minmaxScaled' : df_encoded_splitTrain_minmaxScaled,
  'df_modifiedOutlier_encoded_splitTrain_minmaxScaled' : df_modifiedOutlier_encoded_splitTrain_minmaxScaled,
  'df_deleteOutlier_encoded_splitTrain_minmaxScaled' : df_deleteOutlier_encoded_splitTrain_minmaxScaled,
  'df_DF_encoded_splitTrain_minmaxScaled' : df_DF_encoded_splitTrain_minmaxScaled,
  'df_DF_modifiedOutlier_encoded_splitTrain_minmaxScaled' : df_DF_modifiedOutlier_encoded_splitTrain_minmaxScaled,
  'df_DF_deleteOutlier_encoded_splitTrain_minmaxScaled' : df_DF_deleteOutlier_encoded_splitTrain_minmaxScaled,
  
  # Integer-cast copy of the standard-scaled frame; astype('int') truncates rather than rounds.
  'df_DF_encoded_splitTrain_stdScaled_rounded' : df_DF_encoded_splitTrain_stdScaled.astype('int'),
  
  # not scaled
  'df_encoded_splitTrain' : df_encoded_splitTrain,
  'df_DF_encoded_splitTrain' : df_DF_encoded_splitTrain,
  'df_modifiedOutlier_encoded_splitTrain' : df_modifiedOutlier_encoded_splitTrain,
  'df_DF_modifiedOutlier_encoded_splitTrain' : df_DF_modifiedOutlier_encoded_splitTrain,
  'df_deleteOutlier_encoded_splitTrain' : df_deleteOutlier_encoded_splitTrain,
  'df_DF_deleteOutlier_encoded_splitTrain' : df_DF_deleteOutlier_encoded_splitTrain,
}

# Test frames to export, keyed by the file stem used when saving.
# The standard-scaled variants are currently disabled (commented out).
test_data = {
  # 'df_encoded_splitTest_stdScaled' : df_encoded_splitTest_stdScaled,
  # 'df_modifiedOutlier_encoded_splitTest_stdScaled' : df_modifiedOutlier_encoded_splitTest_stdScaled,
  # 'df_deleteOutlier_encoded_splitTest_stdScaled' : df_deleteOutlier_encoded_splitTest_stdScaled,
  # 'df_DF_encoded_splitTest_stdScaled' : df_DF_encoded_splitTest_stdScaled,
  # 'df_DF_modifiedOutlier_encoded_splitTest_stdScaled' : df_DF_modifiedOutlier_encoded_splitTest_stdScaled,
  # 'df_DF_deleteOutlier_encoded_splitTest_stdScaled' : df_DF_deleteOutlier_encoded_splitTest_stdScaled,
  'df_encoded_splitTest_minmaxScaled' : df_encoded_splitTest_minmaxScaled,
  'df_modifiedOutlier_encoded_splitTest_minmaxScaled' : df_modifiedOutlier_encoded_splitTest_minmaxScaled,
  'df_deleteOutlier_encoded_splitTest_minmaxScaled' : df_deleteOutlier_encoded_splitTest_minmaxScaled,
  'df_DF_encoded_splitTest_minmaxScaled' : df_DF_encoded_splitTest_minmaxScaled,
  'df_DF_modifiedOutlier_encoded_splitTest_minmaxScaled' : df_DF_modifiedOutlier_encoded_splitTest_minmaxScaled,
  'df_DF_deleteOutlier_encoded_splitTest_minmaxScaled' : df_DF_deleteOutlier_encoded_splitTest_minmaxScaled,

  # Integer-cast copy of the standard-scaled frame; astype('int') truncates rather than rounds.
  'df_DF_encoded_splitTest_stdScaled_rounded' : df_DF_encoded_splitTest_stdScaled.astype('int'),
  
  # not scaled
  'df_encoded_splitTest' : df_encoded_splitTest,
  'df_DF_encoded_splitTest' : df_DF_encoded_splitTest,
  'df_modifiedOutlier_encoded_splitTest' : df_modifiedOutlier_encoded_splitTest,
  'df_DF_modifiedOutlier_encoded_splitTest' : df_DF_modifiedOutlier_encoded_splitTest,
  'df_deleteOutlier_encoded_splitTest' : df_deleteOutlier_encoded_splitTest,
  'df_DF_deleteOutlier_encoded_splitTest' : df_DF_deleteOutlier_encoded_splitTest,
}

In [37]:
# Save train and test data, one CSV per entry, named after its dictionary key.
output_dir = Path('./Dataset')
# to_csv does not create missing folders, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)

for key, frame in train_data.items():
  frame.to_csv(output_dir / (key + '.csv'), index=False)

for key, frame in test_data.items():
  frame.to_csv(output_dir / (key + '.csv'), index=False)


  