In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LassoCV, LogisticRegression

In [5]:
data = pd.read_csv('/content/drive/MyDrive/Heart Dataset/heart.csv')

In [None]:
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

In [6]:
categorical_columns = []
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = data.columns.values.tolist()
for col in features:
    if data[col].dtype in numerics: continue
    categorical_columns.append(col)
categorical_columns

[]

In [7]:
for col in categorical_columns:
    if col in data.columns:
        le = LabelEncoder()
        le.fit(list(data[col].astype(str).values))
        data[col] = le.transform(list(data[col].astype(str).values))

In [8]:
train = data.copy()
target = train.pop('target')

In [9]:
num_features_opt = 10   # the number of features that we need to choose as a result
num_features_max = 13   # the somewhat excessive number of features, which we will choose at each stage
features_best = []

### Pearson correlation

In [10]:
threshold = 0.9

In [11]:
def highlight(value):
    if value > threshold:
        style = 'background-color: red'
    else:
        style = 'background-color: black'
    return style

# Absolute value correlation matrix
corr_matrix = data.corr().abs().round(2)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.style.format("{:.2f}").applymap(highlight)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,,0.1,0.07,0.28,0.21,0.12,0.12,0.4,0.1,0.21,0.17,0.28,0.07,0.23
sex,,,0.05,0.06,0.2,0.05,0.06,0.04,0.14,0.1,0.03,0.12,0.21,0.28
cp,,,,0.05,0.08,0.09,0.04,0.3,0.39,0.15,0.12,0.18,0.16,0.43
trestbps,,,,,0.12,0.18,0.11,0.05,0.07,0.19,0.12,0.1,0.06,0.14
chol,,,,,,0.01,0.15,0.01,0.07,0.05,0.0,0.07,0.1,0.09
fbs,,,,,,,0.08,0.01,0.03,0.01,0.06,0.14,0.03,0.03
restecg,,,,,,,,0.04,0.07,0.06,0.09,0.07,0.01,0.14
thalach,,,,,,,,,0.38,0.34,0.39,0.21,0.1,0.42
exang,,,,,,,,,,0.29,0.26,0.12,0.21,0.44
oldpeak,,,,,,,,,,,0.58,0.22,0.21,0.43


In [12]:
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]
features_filtered = data.drop(columns = collinear_features)
print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
features_best.append(features_filtered.columns.tolist())

The number of features that passed the collinearity threshold:  14


### Linear SVC

In [13]:
lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(train, target)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train)
X_selected_df = pd.DataFrame(X_new, columns=[train.columns[i] for i in range(len(train.columns)) if model.get_support()[i]])
features_best.append(X_selected_df.columns.tolist())

### Lasso

In [14]:
lasso = LassoCV(cv=3).fit(train, target)
model = SelectFromModel(lasso, prefit=True)
X_new = model.transform(train)
X_selected_df = pd.DataFrame(X_new, columns=[train.columns[i] for i in range(len(train.columns)) if model.get_support()[i]])
features_best.append(X_selected_df.columns.tolist())

### SelectKBest using Chi2

In [15]:
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(train, target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']  #naming the dataframe columns
features_best.append(featureScores.nlargest(num_features_max,'Score')['Feature'].tolist())
print(featureScores.nlargest(len(dfcolumns),'Score')) 

     Feature       Score
7    thalach  188.320472
9    oldpeak   72.644253
11        ca   66.440765
2         cp   62.598098
8      exang   38.914377
4       chol   23.936394
0        age   23.286624
3   trestbps   14.823925
10     slope    9.804095
1        sex    7.576835
12      thal    5.791853
6    restecg    2.978271
5        fbs    0.202934


### Regressive Feature Elimination using Logistic Regression

In [16]:
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_features_max, step=10, verbose=5)
rfe_selector.fit(train, target)
rfe_support = rfe_selector.get_support()
rfe_feature = train.loc[:,rfe_support].columns.tolist()
features_best.append(rfe_feature)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Variance Threshold

In [17]:
selector = VarianceThreshold(threshold=10)
np.shape(selector.fit_transform(data))
features_best.append(list(np.array(data.columns)[selector.get_support(indices=False)]))

In [18]:
features_best

[['age',
  'sex',
  'cp',
  'trestbps',
  'chol',
  'fbs',
  'restecg',
  'thalach',
  'exang',
  'oldpeak',
  'slope',
  'ca',
  'thal',
  'target'],
 ['age',
  'sex',
  'cp',
  'trestbps',
  'chol',
  'restecg',
  'thalach',
  'exang',
  'oldpeak',
  'slope',
  'ca',
  'thal'],
 ['age',
  'sex',
  'cp',
  'trestbps',
  'chol',
  'restecg',
  'thalach',
  'exang',
  'oldpeak',
  'slope',
  'ca',
  'thal'],
 ['thalach',
  'oldpeak',
  'ca',
  'cp',
  'exang',
  'chol',
  'age',
  'trestbps',
  'slope',
  'sex',
  'thal',
  'restecg',
  'fbs'],
 ['age',
  'sex',
  'cp',
  'trestbps',
  'chol',
  'fbs',
  'restecg',
  'thalach',
  'exang',
  'oldpeak',
  'slope',
  'ca',
  'thal'],
 ['age', 'trestbps', 'chol', 'thalach']]

In [19]:
main_cols = []
main_cols_opt = {feature_name : 0 for feature_name in data.columns.tolist()}
for i in range(len(features_best)):
    for feature_name in features_best[i]:
        main_cols_opt[feature_name] += 1
df_main_cols_opt = pd.DataFrame.from_dict(main_cols_opt, orient='index', columns=['Num'])
df_main_cols_opt.sort_values(by=['Num'], ascending=False).head(num_features_opt)

Unnamed: 0,Num
age,6
trestbps,6
chol,6
thalach,6
sex,5
cp,5
restecg,5
exang,5
oldpeak,5
slope,5


In [20]:
main_cols = df_main_cols_opt.nlargest(num_features_opt, 'Num').index.tolist()
if not 'target' in main_cols:
    main_cols.append('target')
main_cols

['age',
 'trestbps',
 'chol',
 'thalach',
 'sex',
 'cp',
 'restecg',
 'exang',
 'oldpeak',
 'slope',
 'target']

In [21]:
dataset.to_csv("Dataset_after_FS.csv")

In [None]:
test0 = dataset.sample(frac = 0.2)

train0 = dataset.drop(test0.index)

test0.to_csv("/content/drive/MyDrive/Heart Dataset/test1.csv")
train0.to_csv("/content/drive/MyDrive/Heart Dataset/train1.csv")