In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LassoCV, LogisticRegression

In [7]:
dataset = pd.read_csv('F:/My docs/Sem 7/Capstone/Dataset/Dataset/heart.csv')

In [4]:
dataset.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'condition']

In [8]:
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

In [9]:
def fe_creation(df):
    df['age2'] = df['age']//10
    df['trestbps2'] = df['trestbps']//10 #10
    df['chol2'] = df['chol']//40
    df['thalach2'] = df['thalach']//40
    df['oldpeak2'] = df['oldpeak']//0.4
    for i in ['sex', 'age2', 'fbs', 'restecg', 'exang','thal', ]:
        for j in ['cp','trestbps2', 'chol2', 'thalach2', 'oldpeak2', 'slope', 'ca']:
            df[i + "_" + j] = df[i].astype('str') + "_" + df[j].astype('str')
    return df

data = fe_creation(dataset)

In [10]:
categorical_columns = []
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = data.columns.values.tolist()
for col in features:
    if data[col].dtype in numerics: continue
    categorical_columns.append(col)
categorical_columns

['sex_cp',
 'sex_trestbps2',
 'sex_chol2',
 'sex_thalach2',
 'sex_oldpeak2',
 'sex_slope',
 'sex_ca',
 'age2_cp',
 'age2_trestbps2',
 'age2_chol2',
 'age2_thalach2',
 'age2_oldpeak2',
 'age2_slope',
 'age2_ca',
 'fbs_cp',
 'fbs_trestbps2',
 'fbs_chol2',
 'fbs_thalach2',
 'fbs_oldpeak2',
 'fbs_slope',
 'fbs_ca',
 'restecg_cp',
 'restecg_trestbps2',
 'restecg_chol2',
 'restecg_thalach2',
 'restecg_oldpeak2',
 'restecg_slope',
 'restecg_ca',
 'exang_cp',
 'exang_trestbps2',
 'exang_chol2',
 'exang_thalach2',
 'exang_oldpeak2',
 'exang_slope',
 'exang_ca',
 'thal_cp',
 'thal_trestbps2',
 'thal_chol2',
 'thal_thalach2',
 'thal_oldpeak2',
 'thal_slope',
 'thal_ca']

In [13]:
for col in categorical_columns:
    if col in data.columns:
        le = LabelEncoder()
        le.fit(list(data[col].astype(str).values))
        data[col] = le.transform(list(data[col].astype(str).values))

In [14]:
train = data.copy()
target = train.pop('target')

In [15]:
num_features_opt = 25   # the number of features that we need to choose as a result
num_features_max = 35   # the somewhat excessive number of features, which we will choose at each stage
features_best = []

### Pearson correlation

In [16]:
threshold = 0.9

In [20]:
def highlight(value):
    if value > threshold:
        style = 'background-color: pink'
    else:
        style = 'background-color: palegreen'
    return style

# Absolute value correlation matrix
corr_matrix = data.corr().abs().round(2)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.style.format("{:.2f}").applymap(highlight)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age2,trestbps2,chol2,thalach2,oldpeak2,sex_cp,sex_trestbps2,sex_chol2,sex_thalach2,sex_oldpeak2,sex_slope,sex_ca,age2_cp,age2_trestbps2,age2_chol2,age2_thalach2,age2_oldpeak2,age2_slope,age2_ca,fbs_cp,fbs_trestbps2,fbs_chol2,fbs_thalach2,fbs_oldpeak2,fbs_slope,fbs_ca,restecg_cp,restecg_trestbps2,restecg_chol2,restecg_thalach2,restecg_oldpeak2,restecg_slope,restecg_ca,exang_cp,exang_trestbps2,exang_chol2,exang_thalach2,exang_oldpeak2,exang_slope,exang_ca,thal_cp,thal_trestbps2,thal_chol2,thal_thalach2,thal_oldpeak2,thal_slope,thal_ca
age,,0.1,0.07,0.28,0.21,0.12,0.12,0.4,0.1,0.21,0.17,0.28,0.07,0.23,0.95,0.28,0.2,0.35,0.2,0.12,0.01,0.05,0.22,0.04,0.16,0.04,0.92,0.92,0.94,0.92,0.91,0.92,0.94,0.06,0.2,0.16,0.04,0.24,0.02,0.23,0.14,0.04,0.08,0.21,0.01,0.16,0.01,0.06,0.17,0.12,0.03,0.18,0.02,0.19,0.04,0.14,0.11,0.03,0.15,0.01,0.16
sex,,,0.05,0.06,0.2,0.05,0.06,0.04,0.14,0.1,0.03,0.12,0.21,0.28,0.12,0.05,0.18,0.04,0.1,0.87,0.94,0.93,0.94,0.85,0.91,0.89,0.13,0.14,0.13,0.14,0.09,0.13,0.09,0.01,0.01,0.02,0.02,0.1,0.02,0.09,0.07,0.07,0.07,0.06,0.0,0.06,0.01,0.12,0.11,0.12,0.13,0.15,0.13,0.17,0.19,0.2,0.23,0.2,0.23,0.2,0.25
cp,,,,0.05,0.08,0.09,0.04,0.3,0.39,0.15,0.12,0.18,0.16,0.43,0.08,0.05,0.09,0.24,0.14,0.45,0.02,0.09,0.04,0.1,0.0,0.12,0.2,0.05,0.1,0.03,0.1,0.05,0.11,0.64,0.12,0.04,0.19,0.01,0.15,0.01,0.47,0.07,0.01,0.11,0.01,0.08,0.02,0.17,0.33,0.4,0.33,0.35,0.35,0.42,0.26,0.14,0.21,0.1,0.21,0.12,0.22
trestbps,,,,,0.12,0.18,0.11,0.05,0.07,0.19,0.12,0.1,0.06,0.14,0.25,0.99,0.14,0.06,0.2,0.03,0.25,0.0,0.08,0.03,0.1,0.0,0.26,0.41,0.27,0.24,0.27,0.22,0.25,0.16,0.51,0.22,0.13,0.22,0.1,0.19,0.08,0.16,0.07,0.13,0.04,0.15,0.07,0.1,0.37,0.1,0.05,0.13,0.01,0.1,0.08,0.33,0.11,0.04,0.12,0.02,0.1
chol,,,,,,0.01,0.15,0.01,0.07,0.05,0.0,0.07,0.1,0.09,0.2,0.13,0.98,0.03,0.05,0.21,0.16,0.04,0.2,0.13,0.18,0.13,0.17,0.2,0.31,0.2,0.19,0.2,0.2,0.03,0.05,0.26,0.0,0.05,0.01,0.04,0.17,0.12,0.05,0.16,0.11,0.14,0.12,0.03,0.1,0.25,0.06,0.08,0.07,0.09,0.06,0.11,0.24,0.09,0.1,0.1,0.1
fbs,,,,,,,0.08,0.01,0.03,0.01,0.06,0.14,0.03,0.03,0.11,0.16,0.03,0.02,0.0,0.09,0.09,0.05,0.04,0.06,0.02,0.1,0.13,0.11,0.1,0.1,0.1,0.09,0.13,0.83,0.92,0.92,0.9,0.74,0.86,0.88,0.03,0.04,0.08,0.08,0.05,0.1,0.03,0.08,0.07,0.03,0.02,0.04,0.0,0.08,0.01,0.03,0.0,0.03,0.0,0.05,0.03
restecg,,,,,,,,0.04,0.07,0.06,0.09,0.07,0.01,0.14,0.1,0.11,0.18,0.1,0.06,0.03,0.08,0.09,0.02,0.09,0.02,0.08,0.09,0.11,0.12,0.09,0.12,0.08,0.11,0.04,0.1,0.11,0.04,0.12,0.03,0.1,0.9,0.95,0.94,0.96,0.88,0.94,0.93,0.05,0.09,0.09,0.04,0.1,0.03,0.09,0.01,0.04,0.04,0.01,0.06,0.02,0.04
thalach,,,,,,,,,0.38,0.34,0.39,0.21,0.1,0.42,0.39,0.06,0.01,0.9,0.32,0.11,0.05,0.06,0.27,0.22,0.12,0.13,0.3,0.35,0.38,0.2,0.43,0.3,0.39,0.16,0.02,0.02,0.38,0.25,0.19,0.11,0.17,0.04,0.04,0.27,0.11,0.18,0.04,0.23,0.36,0.36,0.06,0.45,0.21,0.42,0.03,0.13,0.14,0.15,0.24,0.03,0.18
exang,,,,,,,,,,0.29,0.26,0.12,0.21,0.44,0.11,0.08,0.09,0.33,0.28,0.07,0.17,0.18,0.02,0.28,0.03,0.17,0.0,0.11,0.13,0.03,0.18,0.05,0.13,0.2,0.06,0.07,0.12,0.22,0.11,0.08,0.23,0.05,0.04,0.15,0.08,0.15,0.02,0.84,0.94,0.95,0.93,0.9,0.9,0.92,0.04,0.24,0.26,0.12,0.33,0.12,0.25
oldpeak,,,,,,,,,,,0.58,0.22,0.21,0.43,0.22,0.21,0.07,0.32,0.99,0.01,0.15,0.08,0.02,0.53,0.15,0.18,0.18,0.25,0.22,0.17,0.44,0.1,0.27,0.08,0.07,0.0,0.13,0.57,0.29,0.11,0.12,0.01,0.08,0.15,0.36,0.26,0.02,0.22,0.32,0.26,0.19,0.62,0.04,0.34,0.15,0.26,0.23,0.12,0.55,0.02,0.28


In [21]:
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]
features_filtered = data.drop(columns = collinear_features)
print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
features_best.append(features_filtered.columns.tolist())

The number of features that passed the collinearity threshold:  29


### Linear SVC

In [23]:
lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(train, target)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train)
X_selected_df = pd.DataFrame(X_new, columns=[train.columns[i] for i in range(len(train.columns)) if model.get_support()[i]])
features_best.append(X_selected_df.columns.tolist())



### Lasso

In [24]:
lasso = LassoCV(cv=3).fit(train, target)
model = SelectFromModel(lasso, prefit=True)
X_new = model.transform(train)
X_selected_df = pd.DataFrame(X_new, columns=[train.columns[i] for i in range(len(train.columns)) if model.get_support()[i]])
features_best.append(X_selected_df.columns.tolist())

  model = cd_fast.enet_coordinate_descent_gram(


### SelectKBest using Chi2

In [25]:
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(train, target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']  #naming the dataframe columns
features_best.append(featureScores.nlargest(num_features_max,'Score')['Feature'].tolist())
print(featureScores.nlargest(len(dfcolumns),'Score')) 

              Feature       Score
50     exang_oldpeak2  660.648362
52           exang_ca  260.067472
22       sex_oldpeak2  252.018554
57      thal_oldpeak2  249.956960
47    exang_trestbps2  243.633903
7             thalach  188.320472
17           oldpeak2  173.655779
36       fbs_oldpeak2  168.263772
29      age2_oldpeak2  144.589877
48        exang_chol2  133.138795
54     thal_trestbps2  117.651546
55         thal_chol2  115.347416
59            thal_ca   89.759404
19      sex_trestbps2   76.862612
24             sex_ca   74.517786
9             oldpeak   72.644253
11                 ca   66.440765
2                  cp   62.598098
31            age2_ca   56.744328
26     age2_trestbps2   55.832959
39         restecg_cp   52.800287
38             fbs_ca   41.644681
8               exang   38.914377
27         age2_chol2   38.830584
20          sex_chol2   35.577403
32             fbs_cp   32.163397
42   restecg_thalach2   30.466393
49     exang_thalach2   27.389828
4             

### Regressive Feature Elimination using Logistic Regression

In [28]:
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_features_max, step=10, verbose=5)
rfe_selector.fit(train, target)
rfe_support = rfe_selector.get_support()
rfe_feature = train.loc[:,rfe_support].columns.tolist()
features_best.append(rfe_feature)

Fitting estimator with 60 features.
Fitting estimator with 50 features.
Fitting estimator with 40 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Variance Threshold

In [29]:
selector = VarianceThreshold(threshold=10)
np.shape(selector.fit_transform(data))
features_best.append(list(np.array(data.columns)[selector.get_support(indices=False)]))

In [30]:
features_best

[['age',
  'sex',
  'cp',
  'trestbps',
  'chol',
  'fbs',
  'restecg',
  'thalach',
  'exang',
  'oldpeak',
  'slope',
  'ca',
  'thal',
  'target',
  'thalach2',
  'sex_cp',
  'sex_oldpeak2',
  'sex_ca',
  'fbs_cp',
  'fbs_thalach2',
  'fbs_oldpeak2',
  'fbs_slope',
  'fbs_ca',
  'restecg_cp',
  'restecg_oldpeak2',
  'exang_cp',
  'exang_oldpeak2',
  'exang_slope',
  'thal_oldpeak2'],
 ['age',
  'trestbps',
  'chol',
  'thalach',
  'ca',
  'sex_ca',
  'age2_cp',
  'age2_ca',
  'fbs_cp',
  'restecg_cp',
  'exang_chol2',
  'thal_cp',
  'thal_chol2',
  'thal_oldpeak2',
  'thal_slope'],
 ['age',
  'trestbps',
  'chol',
  'thalach',
  'sex_oldpeak2',
  'sex_ca',
  'age2_cp',
  'age2_oldpeak2',
  'age2_ca',
  'fbs_trestbps2',
  'restecg_trestbps2',
  'exang_ca',
  'thal_cp',
  'thal_chol2',
  'thal_oldpeak2',
  'thal_slope'],
 ['exang_oldpeak2',
  'exang_ca',
  'sex_oldpeak2',
  'thal_oldpeak2',
  'exang_trestbps2',
  'thalach',
  'oldpeak2',
  'fbs_oldpeak2',
  'age2_oldpeak2',
  'exang_c

In [32]:
main_cols = []
main_cols_opt = {feature_name : 0 for feature_name in data.columns.tolist()}
for i in range(len(features_best)):
    for feature_name in features_best[i]:
        main_cols_opt[feature_name] += 1
df_main_cols_opt = pd.DataFrame.from_dict(main_cols_opt, orient='index', columns=['Num'])
df_main_cols_opt.sort_values(by=['Num'], ascending=False).head(num_features_opt)

Unnamed: 0,Num
thal_oldpeak2,6
age,5
sex_oldpeak2,5
trestbps,5
chol,5
thal_chol2,5
thalach,5
age2_ca,5
sex_ca,5
exang_chol2,4


In [33]:
main_cols = df_main_cols_opt.nlargest(num_features_opt, 'Num').index.tolist()
if not 'target' in main_cols:
    main_cols.append('target')
main_cols

['thal_oldpeak2',
 'age',
 'trestbps',
 'chol',
 'thalach',
 'sex_oldpeak2',
 'sex_ca',
 'age2_ca',
 'thal_chol2',
 'ca',
 'age2_cp',
 'age2_oldpeak2',
 'fbs_cp',
 'fbs_oldpeak2',
 'restecg_cp',
 'restecg_trestbps2',
 'exang_chol2',
 'cp',
 'age2_chol2',
 'fbs_ca',
 'exang_cp',
 'exang_oldpeak2',
 'exang_slope',
 'exang_ca',
 'thal_cp',
 'target']

In [34]:
dataset.to_csv("Dataset_after_FS.csv")