## Dependencies loading

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

### Project setup

In [2]:
raw_input_data_path = "../data/input"
preprocessed_output_data_path = "../data/output"

## Load dataset

In [3]:
df_train = pd.read_csv(f"{preprocessed_output_data_path}/train.csv", index_col=0)
df_test = pd.read_csv(f"{preprocessed_output_data_path}/test.csv", index_col=0)

In [4]:
df_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
166,56,Male,asymptomatic,132,184,False,left ventricular hypertrophy,105,yes,2.1,flat,1,fixed defect,0
41,54,Male,non-anginal pain,120,258,False,left ventricular hypertrophy,147,no,0.4,flat,0,reversable defect,1
763,58,Female,typical angina,150,283,True,left ventricular hypertrophy,162,no,1.0,upsloping,0,normal,1
826,42,Male,non-anginal pain,130,180,False,normal,150,no,0.0,upsloping,0,normal,1
936,43,Male,non-anginal pain,130,315,False,normal,162,no,1.9,upsloping,1,normal,1


In [5]:
df_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
886,61,Male,asymptomatic,120,260,False,normal,140,yes,3.6,flat,1,reversable defect,0
516,60,Male,asymptomatic,130,206,False,left ventricular hypertrophy,132,yes,2.4,flat,2,reversable defect,0
362,43,Female,non-anginal pain,122,213,False,normal,165,no,0.2,flat,0,normal,1
177,64,Male,asymptomatic,120,246,False,left ventricular hypertrophy,96,yes,2.2,downsloping,1,normal,0
11,43,Female,asymptomatic,132,341,True,left ventricular hypertrophy,136,yes,3.0,flat,0,reversable defect,0


## Feature engineering

#### One-Hot Encoding

In [6]:
df_train = pd.get_dummies(df_train, dtype = int)
df_train.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,ca,target,sex_Female,sex_Male,...,restecg_left ventricular hypertrophy,restecg_normal,exang_no,exang_yes,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
166,56,132,184,False,105,2.1,1,0,0,1,...,1,0,0,1,0,1,0,1,0,0
41,54,120,258,False,147,0.4,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
763,58,150,283,True,162,1.0,0,1,1,0,...,1,0,1,0,0,0,1,0,1,0
826,42,130,180,False,150,0.0,0,1,0,1,...,0,1,1,0,0,0,1,0,1,0
936,43,130,315,False,162,1.9,1,1,0,1,...,0,1,1,0,0,0,1,0,1,0


In [7]:
df_test = pd.get_dummies(df_test, dtype = int)
df_test.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,ca,target,sex_Female,sex_Male,...,restecg_left ventricular hypertrophy,restecg_normal,exang_no,exang_yes,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
886,61,120,260,False,140,3.6,1,0,0,1,...,0,1,0,1,0,1,0,0,0,1
516,60,130,206,False,132,2.4,2,0,0,1,...,1,0,0,1,0,1,0,0,0,1
362,43,122,213,False,165,0.2,0,1,1,0,...,0,1,1,0,0,1,0,0,1,0
177,64,120,246,False,96,2.2,1,0,0,1,...,1,0,0,1,1,0,0,0,1,0
11,43,132,341,True,136,3.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


In [8]:
print(df_train.shape, df_test.shape)

(814, 25) (204, 25)


#### Feature normalization

In this step we will normalize all the numeric feature in the range of 0 to 1

In [9]:
# training data
scaler = MinMaxScaler()
df_train[['age','trestbps','chol','thalach','oldpeak','ca']] = scaler.fit_transform(df_train[['age','trestbps','chol','thalach','oldpeak','ca']])
df_train.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,ca,target,sex_Female,sex_Male,...,restecg_left ventricular hypertrophy,restecg_normal,exang_no,exang_yes,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
166,0.5625,0.358491,0.13242,False,0.259542,0.33871,0.25,0,0,1,...,1,0,0,1,0,1,0,1,0,0
41,0.520833,0.245283,0.30137,False,0.580153,0.064516,0.0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
763,0.604167,0.528302,0.358447,True,0.694656,0.16129,0.0,1,1,0,...,1,0,1,0,0,0,1,0,1,0
826,0.270833,0.339623,0.123288,False,0.603053,0.0,0.0,1,0,1,...,0,1,1,0,0,0,1,0,1,0
936,0.291667,0.339623,0.431507,False,0.694656,0.306452,0.25,1,0,1,...,0,1,1,0,0,0,1,0,1,0


In [10]:
# test data
scaler = MinMaxScaler()
df_test[['age','trestbps','chol','thalach','oldpeak','ca']] = scaler.fit_transform(df_test[['age','trestbps','chol','thalach','oldpeak','ca']])
df_test.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak,ca,target,sex_Female,sex_Male,...,restecg_left ventricular hypertrophy,restecg_normal,exang_no,exang_yes,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
886,0.666667,0.302326,0.451049,False,0.526718,0.818182,0.25,0,0,1,...,0,1,0,1,0,1,0,0,0,1
516,0.645833,0.418605,0.262238,False,0.465649,0.545455,0.5,0,0,1,...,1,0,0,1,0,1,0,0,0,1
362,0.291667,0.325581,0.286713,False,0.717557,0.045455,0.0,1,1,0,...,0,1,1,0,0,1,0,0,1,0
177,0.729167,0.302326,0.402098,False,0.19084,0.5,0.25,0,0,1,...,1,0,0,1,1,0,0,0,1,0
11,0.291667,0.44186,0.734266,True,0.496183,0.681818,0.0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
