## Dependencies loading

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

### Project setup

In [2]:
raw_input_data_path = "../data/input"
preprocessed_output_data_path = "../data/output"

## Load dataset

In [3]:
df_train = pd.read_csv(f"{preprocessed_output_data_path}/train.csv", index_col=0)
df_test = pd.read_csv(f"{preprocessed_output_data_path}/test.csv", index_col=0)

In [4]:
df_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
166,56,Male,asymptomatic,132,184,0,left ventricular hypertrophy,105,1,2.1,flat,1,fixed defect,0
41,54,Male,non-anginal pain,120,258,0,left ventricular hypertrophy,147,0,0.4,flat,0,reversable defect,1
763,58,Female,typical angina,150,283,1,left ventricular hypertrophy,162,0,1.0,upsloping,0,normal,1
826,42,Male,non-anginal pain,130,180,0,normal,150,0,0.0,upsloping,0,normal,1
936,43,Male,non-anginal pain,130,315,0,normal,162,0,1.9,upsloping,1,normal,1


In [5]:
df_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
886,61,Male,asymptomatic,120,260,0,normal,140,1,3.6,flat,1,reversable defect,0
516,60,Male,asymptomatic,130,206,0,left ventricular hypertrophy,132,1,2.4,flat,2,reversable defect,0
362,43,Female,non-anginal pain,122,213,0,normal,165,0,0.2,flat,0,normal,1
177,64,Male,asymptomatic,120,246,0,left ventricular hypertrophy,96,1,2.2,downsloping,1,normal,0
11,43,Female,asymptomatic,132,341,1,left ventricular hypertrophy,136,1,3.0,flat,0,reversable defect,0


## Feature engineering

#### One-Hot Encoding

In [6]:
df_train = pd.get_dummies(df_train, dtype = int)
df_train.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_Female,...,cp_typical angina,restecg_ST-T wave abnormality,restecg_left ventricular hypertrophy,restecg_normal,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
166,56,132,184,0,105,1,2.1,1,0,0,...,0,0,1,0,0,1,0,1,0,0
41,54,120,258,0,147,0,0.4,0,1,0,...,0,0,1,0,0,1,0,0,0,1
763,58,150,283,1,162,0,1.0,0,1,1,...,1,0,1,0,0,0,1,0,1,0
826,42,130,180,0,150,0,0.0,0,1,0,...,0,0,0,1,0,0,1,0,1,0
936,43,130,315,0,162,0,1.9,1,1,0,...,0,0,0,1,0,0,1,0,1,0


In [7]:
df_test = pd.get_dummies(df_test, dtype = int)
df_test.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_Female,...,cp_typical angina,restecg_ST-T wave abnormality,restecg_left ventricular hypertrophy,restecg_normal,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
886,61,120,260,0,140,1,3.6,1,0,0,...,0,0,0,1,0,1,0,0,0,1
516,60,130,206,0,132,1,2.4,2,0,0,...,0,0,1,0,0,1,0,0,0,1
362,43,122,213,0,165,0,0.2,0,1,1,...,0,0,0,1,0,1,0,0,1,0
177,64,120,246,0,96,1,2.2,1,0,0,...,0,0,1,0,1,0,0,0,1,0
11,43,132,341,1,136,1,3.0,0,0,1,...,0,0,1,0,0,1,0,0,0,1


In [8]:
print(df_train.shape, df_test.shape)

(814, 24) (204, 24)


#### Feature normalization

In this step we will normalize all the numeric feature in the range of 0 to 1

In [9]:
# training data
scaler = MinMaxScaler()
df_train[['age','trestbps','chol','thalach','oldpeak','ca']] = scaler.fit_transform(df_train[['age','trestbps','chol','thalach','oldpeak','ca']])
df_train.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_Female,...,cp_typical angina,restecg_ST-T wave abnormality,restecg_left ventricular hypertrophy,restecg_normal,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
166,0.5625,0.358491,0.13242,0,0.259542,1,0.33871,0.25,0,0,...,0,0,1,0,0,1,0,1,0,0
41,0.520833,0.245283,0.30137,0,0.580153,0,0.064516,0.0,1,0,...,0,0,1,0,0,1,0,0,0,1
763,0.604167,0.528302,0.358447,1,0.694656,0,0.16129,0.0,1,1,...,1,0,1,0,0,0,1,0,1,0
826,0.270833,0.339623,0.123288,0,0.603053,0,0.0,0.0,1,0,...,0,0,0,1,0,0,1,0,1,0
936,0.291667,0.339623,0.431507,0,0.694656,0,0.306452,0.25,1,0,...,0,0,0,1,0,0,1,0,1,0


In [10]:
# test data
scaler = MinMaxScaler()
df_test[['age','trestbps','chol','thalach','oldpeak','ca']] = scaler.fit_transform(df_test[['age','trestbps','chol','thalach','oldpeak','ca']])
df_test.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_Female,...,cp_typical angina,restecg_ST-T wave abnormality,restecg_left ventricular hypertrophy,restecg_normal,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
886,0.666667,0.302326,0.451049,0,0.526718,1,0.818182,0.25,0,0,...,0,0,0,1,0,1,0,0,0,1
516,0.645833,0.418605,0.262238,0,0.465649,1,0.545455,0.5,0,0,...,0,0,1,0,0,1,0,0,0,1
362,0.291667,0.325581,0.286713,0,0.717557,0,0.045455,0.0,1,1,...,0,0,0,1,0,1,0,0,1,0
177,0.729167,0.302326,0.402098,0,0.19084,1,0.5,0.25,0,0,...,0,0,1,0,1,0,0,0,1,0
11,0.291667,0.44186,0.734266,1,0.496183,1,0.681818,0.0,0,1,...,0,0,1,0,0,1,0,0,0,1


## Feature selection

In [11]:
# segregating dataset into features i.e., X and target variables i.e., y
X = df_train.drop(['target'],axis=1)
y = df_train['target']

### Univariate feature selection

#### Pearson

In [12]:
num_feats=17

def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

17 selected features


#### Chi2 test

In [13]:
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

17 selected features


### Multivariate feature selection

#### Wrapper method

In [14]:
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

Fitting estimator with 23 features.
17 selected features


#### Embedded feature selection

In [15]:
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2", solver='lbfgs'), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

8 selected features


In [16]:
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, criterion='gini',random_state=5), max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

9 selected features


In [17]:
# put all selection together
feature_name = X.columns
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.count_nonzero(feature_selection_df, axis=1)-1
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,thalach,True,True,True,True,True,5
2,thal_reversable defect,True,True,True,True,True,5
3,oldpeak,True,True,True,True,True,5
4,cp_asymptomatic,True,True,True,True,True,5
5,ca,True,True,True,True,True,5
6,trestbps,True,False,True,True,True,4
7,thal_normal,True,True,True,False,True,4
8,exang,True,True,True,True,False,4
9,slope_upsloping,True,True,True,False,False,3
10,slope_flat,True,True,True,False,False,3


We select 8 features with total "True" greater than 3 and drop other columns and save it for model building

In [18]:
df_train = df_train.drop(columns = ['slope_upsloping', 'chol', 'age', 'thal_fixed defect', 'slope_flat', 'slope_downsloping', 'sex_Male', 'sex_Female', 'restecg_normal', 'restecg_left ventricular hypertrophy', 'restecg_ST-T wave abnormality', 'fbs', 'cp_typical angina', 'cp_non-anginal pain', 'cp_atypical angina'], axis = 1)
df_test = df_test.drop(columns = ['slope_upsloping', 'chol', 'age', 'thal_fixed defect', 'slope_flat', 'slope_downsloping', 'sex_Male', 'sex_Female', 'restecg_normal', 'restecg_left ventricular hypertrophy', 'restecg_ST-T wave abnormality', 'fbs', 'cp_typical angina', 'cp_non-anginal pain', 'cp_atypical angina'], axis = 1)

In [19]:
df_train.head()

Unnamed: 0,trestbps,thalach,exang,oldpeak,ca,target,cp_asymptomatic,thal_normal,thal_reversable defect
166,0.358491,0.259542,1,0.33871,0.25,0,1,0,0
41,0.245283,0.580153,0,0.064516,0.0,1,0,0,1
763,0.528302,0.694656,0,0.16129,0.0,1,0,1,0
826,0.339623,0.603053,0,0.0,0.0,1,0,1,0
936,0.339623,0.694656,0,0.306452,0.25,1,0,1,0


In [20]:
df_test.head()

Unnamed: 0,trestbps,thalach,exang,oldpeak,ca,target,cp_asymptomatic,thal_normal,thal_reversable defect
886,0.302326,0.526718,1,0.818182,0.25,0,1,0,1
516,0.418605,0.465649,1,0.545455,0.5,0,1,0,1
362,0.325581,0.717557,0,0.045455,0.0,1,0,1,0
177,0.302326,0.19084,1,0.5,0.25,0,1,1,0
11,0.44186,0.496183,1,0.681818,0.0,0,1,0,1


In [21]:
df_train.to_csv(f"{preprocessed_output_data_path}/train_fe.csv")
df_test.to_csv(f"{preprocessed_output_data_path}/test_fe.csv")