![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Exercise 03 - Hearth Disease Classification </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/Exercises/Exercise03_Classification.ipynb)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rc('font', **{'size' : 16})
# np.random.seed(1)

### Load the dataset:
https://www.kaggle.com/cherngs/heart-disease-cleveland-uci?select=heart_cleveland_upload.csv

In [3]:
dData = pd.read_csv('heart_cleveland_upload.csv')
dData

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


### Display target balance:

In [4]:
lFullName = ['Age', 'Sex', 'ChestPainType', 'RestingBloodPressure', 'Cholesterol', 'FastingBloodSugar', 'RestECG', 'MaxHeartRateAchieved', 'ExerciseInducedAngina', 'StDepression', 'StSlope', 'NumMajorVessels', 'Thalassemia', 'Target']
space     = max(map(len, dData.columns))
for (shortName, fullName) in zip(dData.columns, lFullName):
    print(f'{shortName:{space}s} = {fullName}')

age       = Age
sex       = Sex
cp        = ChestPainType
trestbps  = RestingBloodPressure
chol      = Cholesterol
fbs       = FastingBloodSugar
restecg   = RestECG
thalach   = MaxHeartRateAchieved
exang     = ExerciseInducedAngina
oldpeak   = StDepression
slope     = StSlope
ca        = NumMajorVessels
thal      = Thalassemia
condition = Target


### Get data and normalization:

In [5]:
mX = dData.drop(columns='condition').values
vY = dData['condition'].values

mX.shape, vY.shape

((297, 13), (297,))

In [6]:
#-- Normalize data:
mX -= mX.mean(0)
mX /= mX.std(0)

### Basic classification:
Let us try several default classifiers

In [7]:
from sklearn.linear_model    import LogisticRegression
from sklearn.svm             import SVC
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, KFold

#-- Classifiers:
#-- You can set the hyper-parameters as well
lModels  = []
lModels += [(SVC                   (C=1), 'SVM'                )]
lModels += [(LogisticRegression    (),    'Logistic Regression')]
lModels += [(KNeighborsClassifier  (),    'KNN'                )]
lModels += [(DecisionTreeClassifier(),    'Tree'               )]


N = len(vY)
for (oClassifier, name) in lModels:
    vHatY    = cross_val_predict(oClassifier, mX, vY, cv=KFold(N))
    accuracy = np.mean(vY == vHatY)
    print(f'{name:19s} = {100*accuracy:2.2f}%')

SVM                 = 82.83%
Logistic Regression = 83.16%
KNN                 = 82.49%
Tree                = 69.70%


### Exercise A:
* Use `KNeighborsClassifier` and get above 84% leave-one-out cross validation accuracy.  
(Play with the hyper-parameters)

In [8]:
#-- Your solution
# https://medium.com/@erikgreenj/k-neighbors-classifier-with-gridsearchcv-basics-3c445ddeb657
from sklearn.model_selection import GridSearchCV

grid_params = {
    'n_neighbors' : np.arange(1, 21, 2),
    'metric' : ['euclidean', 'manhattan'],
}

N = len(vY)
gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=2,
    #cv = KFold(N), 
    n_jobs = -1)

gs_result = gs.fit(mX, vY)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [9]:
print(f'{100*gs_result.best_score_:2.2f}%')
print(gs_result.best_estimator_)
print(gs_result.best_params_)

83.18%
KNeighborsClassifier(metric='euclidean', n_neighbors=17)
{'metric': 'euclidean', 'n_neighbors': 17}


In [10]:
dData2         = dData.copy()
dData2.columns = lFullName
dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


### Meaningful categorical data:
(Based on the data description)

In [11]:
dData2.loc[dData2['Sex']                   == 0, 'Sex']                   = 'female'
dData2.loc[dData2['Sex']                   == 1, 'Sex']                   = 'male'
  
dData2.loc[dData2['ChestPainType']         == 0, 'ChestPainType']         = 'asymptomatic'
dData2.loc[dData2['ChestPainType']         == 1, 'ChestPainType']         = 'atypical angina'
dData2.loc[dData2['ChestPainType']         == 2, 'ChestPainType']         = 'non-anginal pain'
dData2.loc[dData2['ChestPainType']         == 3, 'ChestPainType']         = 'typical angina'

dData2.loc[dData2['FastingBloodSugar']     == 0, 'FastingBloodSugar']     = 'lower than 120mg/ml'
dData2.loc[dData2['FastingBloodSugar']     == 1, 'FastingBloodSugar']     = 'greater than 120mg/ml'

dData2.loc[dData2['RestECG']               == 0, 'RestECG']               = 'normal'
dData2.loc[dData2['RestECG']               == 1, 'RestECG']               = 'ST-T wave abnormality'
dData2.loc[dData2['RestECG']               == 2, 'RestECG']               = 'left ventricular hypertrophy'

dData2.loc[dData2['ExerciseInducedAngina'] == 0, 'ExerciseInducedAngina'] = 'no'
dData2.loc[dData2['ExerciseInducedAngina'] == 1, 'ExerciseInducedAngina'] = 'yes'

dData2.loc[dData2['StSlope']               == 0, 'StSlope']               = 'upsloping'
dData2.loc[dData2['StSlope']               == 1, 'StSlope']               = 'flat'
dData2.loc[dData2['StSlope']               == 2, 'StSlope']               = 'downsloping'

dData2.loc[dData2['Thalassemia']           == 0, 'Thalassemia']           = 'normal'
dData2.loc[dData2['Thalassemia']           == 1, 'Thalassemia']           = 'fixed defect'
dData2.loc[dData2['Thalassemia']           == 2, 'Thalassemia']           = 'reversable defect'

dData2.loc[dData2['Target']                == 0, 'Target']                = 'no heart disease'
dData2.loc[dData2['Target']                == 1, 'Target']                = 'heart disease'
dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,male,asymptomatic,160,234,greater than 120mg/ml,left ventricular hypertrophy,131,no,0.1,flat,1,normal,no heart disease
1,69,female,asymptomatic,140,239,lower than 120mg/ml,normal,151,no,1.8,upsloping,2,normal,no heart disease
2,66,female,asymptomatic,150,226,lower than 120mg/ml,normal,114,no,2.6,downsloping,0,normal,no heart disease
3,65,male,asymptomatic,138,282,greater than 120mg/ml,left ventricular hypertrophy,174,no,1.4,flat,1,normal,heart disease
4,64,male,asymptomatic,110,211,lower than 120mg/ml,left ventricular hypertrophy,144,yes,1.8,flat,0,normal,no heart disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,male,typical angina,152,223,lower than 120mg/ml,normal,181,no,0.0,upsloping,0,reversable defect,heart disease
293,39,male,typical angina,118,219,lower than 120mg/ml,normal,140,no,1.2,flat,0,reversable defect,heart disease
294,35,male,typical angina,120,198,lower than 120mg/ml,normal,130,yes,1.6,flat,0,reversable defect,heart disease
295,35,female,typical angina,138,183,lower than 120mg/ml,normal,182,no,1.4,upsloping,0,normal,no heart disease


### Convert non-numeric features to dummy features:

In [12]:
dData3 = pd.get_dummies(dData2, drop_first=True)
dData3

Unnamed: 0,Age,RestingBloodPressure,Cholesterol,MaxHeartRateAchieved,StDepression,NumMajorVessels,Sex_male,ChestPainType_atypical angina,ChestPainType_non-anginal pain,ChestPainType_typical angina,FastingBloodSugar_lower than 120mg/ml,RestECG_left ventricular hypertrophy,RestECG_normal,ExerciseInducedAngina_yes,StSlope_flat,StSlope_upsloping,Thalassemia_normal,Thalassemia_reversable defect,Target_no heart disease
0,69,160,234,131,0.1,1,1,0,0,0,0,1,0,0,1,0,1,0,1
1,69,140,239,151,1.8,2,0,0,0,0,1,0,1,0,0,1,1,0,1
2,66,150,226,114,2.6,0,0,0,0,0,1,0,1,0,0,0,1,0,1
3,65,138,282,174,1.4,1,1,0,0,0,0,1,0,0,1,0,1,0,0
4,64,110,211,144,1.8,0,1,0,0,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,152,223,181,0.0,0,1,0,0,1,1,0,1,0,0,1,0,1,0
293,39,118,219,140,1.2,0,1,0,0,1,1,0,1,0,1,0,0,1,0
294,35,120,198,130,1.6,0,1,0,0,1,1,0,1,1,1,0,0,1,0
295,35,138,183,182,1.4,0,0,0,0,1,1,0,1,0,0,1,1,0,1


In [13]:
dData3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Age                                    297 non-null    int64  
 1   RestingBloodPressure                   297 non-null    int64  
 2   Cholesterol                            297 non-null    int64  
 3   MaxHeartRateAchieved                   297 non-null    int64  
 4   StDepression                           297 non-null    float64
 5   NumMajorVessels                        297 non-null    int64  
 6   Sex_male                               297 non-null    uint8  
 7   ChestPainType_atypical angina          297 non-null    uint8  
 8   ChestPainType_non-anginal pain         297 non-null    uint8  
 9   ChestPainType_typical angina           297 non-null    uint8  
 10  FastingBloodSugar_lower than 120mg/ml  297 non-null    uint8  
 11  RestEC

### Exercise B:
1. Use `dData3` and get above 85% leave-one-out cross validation accuracy.
2. You are allowed you to only 6 features (from the given 18).  
Get more than 80% accuracy with only 6 features.  
What is the right approach for this task?

In [14]:
#-- Your solution to 1
# https://scikit-learn.org/stable/modules/feature_selection.html
mX3  = dData3.drop(columns='Target_no heart disease')
mX3 -= mX3.mean(0)
mX3 /= mX3.std(0)
mX3.shape

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose = 3,
    #cv = KFold(N, shuffle=True),
    n_jobs = -1)
gs_result = gs.fit(mX3, vY)
print(f'{100*gs_result.best_score_:2.2f}%')
print(gs_result.best_estimator_)
print(gs_result.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
79.17%
KNeighborsClassifier(metric='manhattan', n_neighbors=17)
{'metric': 'manhattan', 'n_neighbors': 17}


In [15]:
#-- Your solution to 2

In [16]:
# Try 1 - Using SelectFromModel : failed since the KNeighborsClassifier does not have any `coef_` or `feature_importances_` attribute

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

# from sklearn.feature_selection import SelectFromModel

# selector = SelectFromModel(estimator=gs_result.best_estimator_, max_features=6).fit(mX3, vY)
# selector.estimator_
# selector.fit_transform(mX3, vY)

# ValueError: when `importance_getter=='auto'`, the underlying estimator KNeighborsClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [17]:
# Try 2 - Using scikit-learn SequentialFeatureSelector

# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
# https://towardsdatascience.com/new-features-of-scikit-learn-fbbfe7652bfb
from sklearn.feature_selection import SequentialFeatureSelector
n_features = 6
sfs_forward = SequentialFeatureSelector(gs_result.best_estimator_, 
                                n_features_to_select = n_features, 
                                direction='forward',
                                #cv = KFold(N, shuffle=True)
                                )
sfs_forward.fit(mX3, vY)
print("Top {} features selected by forward sequential selection:{}"\
      .format(n_features, list(mX3.columns[sfs_forward.get_support()])))

gs_result = gs.fit(sfs_forward.transform(mX3), vY)
print(f'{100*gs_result.best_score_:2.2f}%')
print(gs_result.best_estimator_)
print(gs_result.best_params_)




Top 6 features selected by forward sequential selection:['Cholesterol', 'MaxHeartRateAchieved', 'StDepression', 'NumMajorVessels', 'Thalassemia_normal', 'Thalassemia_reversable defect']
Fitting 5 folds for each of 20 candidates, totalling 100 fits
81.51%
KNeighborsClassifier(metric='euclidean', n_neighbors=13)
{'metric': 'euclidean', 'n_neighbors': 13}


In [18]:
sfs_backward = SequentialFeatureSelector(gs_result.best_estimator_, 
                                n_features_to_select = n_features, 
                                direction='backward',
                                #cv = KFold(N, shuffle=True)
                                )
sfs_backward.fit(mX3, vY)

print("Top {} features selected by baclward sequential selection:{}"\
      .format(n_features, list(mX3.columns[sfs_forward.get_support()])))


gs_result = gs.fit(sfs_backward.transform(mX3), vY)
print(f'{100*gs_result.best_score_:2.2f}%')
print(gs_result.best_estimator_)
print(gs_result.best_params_)

Top 6 features selected by baclward sequential selection:['Cholesterol', 'MaxHeartRateAchieved', 'StDepression', 'NumMajorVessels', 'Thalassemia_normal', 'Thalassemia_reversable defect']
Fitting 5 folds for each of 20 candidates, totalling 100 fits
83.86%
KNeighborsClassifier(metric='euclidean', n_neighbors=11)
{'metric': 'euclidean', 'n_neighbors': 11}


In [19]:
# Try 3 - Using the mlxtend SequentialFeatureSelector (enables to get the score without additional fit)

# Finding the score WITHOUT perfomring with only one fit

# https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/

# pip install mlxtend

from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as mlxtend_sfs


In [20]:
sfs1_forward = mlxtend_sfs(gs_result.best_estimator_,
           k_features=6,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           #cv=KFold(N, shuffle=True),
           )

sfs1_forward = sfs1_forward.fit(mX3, vY)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.3s finished

[2021-07-15 21:44:29] Features: 1/6 -- score: 0.7109604519774011[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.3s finished

[2021-07-15 21:44:29] Features: 2/6 -- score: 0.7410169491525423[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.2s finished

[2021-07-15 21:44:29] Features: 3/6 -- score: 0.7613559322033898[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [21]:
print(sfs1_forward.k_score_)
print(sfs1_forward.k_feature_names_)

0.815084745762712
('Cholesterol', 'MaxHeartRateAchieved', 'StDepression', 'NumMajorVessels', 'ExerciseInducedAngina_yes', 'Thalassemia_normal')


In [25]:
sfs1_backward = mlxtend_sfs(gs_result.best_estimator_,
           k_features=6,
           forward=False,
           floating=False,
           verbose=2,
           scoring='accuracy',
           #cv=KFold(N, shuffle=True),
           )
sfs1_backward = sfs1_backward.fit(mX3, vY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.3s finished

[2021-07-15 21:46:04] Features: 17/6 -- score: 0.801638418079096[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.2s finished

[2021-07-15 21:46:05] Features: 16/6 -- score: 0.8216949152542373[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.3s finished

[2021-07-15 21:46:05] Features: 15/6 -- score: 0.8217514124293783[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

In [26]:
print(sfs1_backward.k_score_)
print(sfs1_backward.k_feature_names_)

0.8251977401129944
('StDepression', 'NumMajorVessels', 'ChestPainType_typical angina', 'RestECG_left ventricular hypertrophy', 'ExerciseInducedAngina_yes', 'Thalassemia_normal')


In [135]:
# Try 4 - Old School

from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import accuracy_score

# Initialztion:
dfAccuracy = pd.DataFrame(columns=['Feature', 'Accuracy'])
reduced_feature_list = []
feature_list_for_prediction = mX3.columns

while len(reduced_feature_list) != n_features:
    dfAccuracy = dfAccuracy.iloc[0:0]
    for index, col in enumerate(feature_list_for_prediction):
        current_cols = [col] + reduced_feature_list
        y_pred = cross_val_predict(gs_result.best_estimator_, mX3[current_cols], vY)
        dfAccuracy.loc[index] =[col, accuracy_score(vY, y_pred)]
    print(current_cols, dfAccuracy.Accuracy.max())
    max_feature = dfAccuracy.Feature.iloc[dfAccuracy.Accuracy.idxmax()]
    feature_list_for_prediction = feature_list_for_prediction.drop(max_feature)
    reduced_feature_list.append(max_feature)
    


['Thalassemia_reversable defect'] 0.7441077441077442
['Thalassemia_reversable defect', 'NumMajorVessels'] 0.7676767676767676
['Thalassemia_reversable defect', 'NumMajorVessels', 'Sex_male'] 0.7878787878787878
['Thalassemia_reversable defect', 'NumMajorVessels', 'Sex_male', 'StSlope_upsloping'] 0.8080808080808081
['Thalassemia_normal', 'NumMajorVessels', 'Sex_male', 'StSlope_upsloping', 'Thalassemia_reversable defect'] 0.8215488215488216
['Thalassemia_normal', 'NumMajorVessels', 'Sex_male', 'StSlope_upsloping', 'Thalassemia_reversable defect', 'ChestPainType_typical angina'] 0.8451178451178452


Unnamed: 0,ExerciseInducedAngina_yes,StDepression,NumMajorVessels,Thalassemia_normal,MaxHeartRateAchieved,Cholesterol
0,-0.695246,-0.819430,0.344243,0.899025,-0.810726,-0.256746
1,-0.695246,0.638393,1.409246,0.899025,0.061054,-0.160588
2,-0.695246,1.324427,-0.720760,0.899025,-1.551739,-0.410599
3,-0.695246,0.295376,0.344243,0.899025,1.063601,0.666374
4,1.433497,0.638393,-0.720760,0.899025,-0.244069,-0.699074
...,...,...,...,...,...,...
292,-0.695246,-0.905184,-0.720760,-1.108572,1.368724,-0.468294
293,-0.695246,0.123867,-0.720760,-1.108572,-0.418425,-0.545221
294,1.433497,0.466884,-0.720760,-1.108572,-0.854315,-0.949086
295,-0.695246,0.295376,-0.720760,0.899025,1.412313,-1.237561


In [29]:
dData3.columns

Index(['Age', 'RestingBloodPressure', 'Cholesterol', 'MaxHeartRateAchieved',
       'StDepression', 'NumMajorVessels', 'Sex_male',
       'ChestPainType_atypical angina', 'ChestPainType_non-anginal pain',
       'ChestPainType_typical angina', 'FastingBloodSugar_lower than 120mg/ml',
       'RestECG_left ventricular hypertrophy', 'RestECG_normal',
       'ExerciseInducedAngina_yes', 'StSlope_flat', 'StSlope_upsloping',
       'Thalassemia_normal', 'Thalassemia_reversable defect',
       'Target_no heart disease'],
      dtype='object')

In [37]:
dData3

Unnamed: 0,Age,RestingBloodPressure,Cholesterol,MaxHeartRateAchieved,StDepression,NumMajorVessels,Sex_male,ChestPainType_atypical angina,ChestPainType_non-anginal pain,ChestPainType_typical angina,FastingBloodSugar_lower than 120mg/ml,RestECG_left ventricular hypertrophy,RestECG_normal,ExerciseInducedAngina_yes,StSlope_flat,StSlope_upsloping,Thalassemia_normal,Thalassemia_reversable defect,Target_no heart disease
0,69,160,234,131,0.1,1,1,0,0,0,0,1,0,0,1,0,1,0,1
1,69,140,239,151,1.8,2,0,0,0,0,1,0,1,0,0,1,1,0,1
2,66,150,226,114,2.6,0,0,0,0,0,1,0,1,0,0,0,1,0,1
3,65,138,282,174,1.4,1,1,0,0,0,0,1,0,0,1,0,1,0,0
4,64,110,211,144,1.8,0,1,0,0,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,152,223,181,0.0,0,1,0,0,1,1,0,1,0,0,1,0,1,0
293,39,118,219,140,1.2,0,1,0,0,1,1,0,1,0,1,0,0,1,0
294,35,120,198,130,1.6,0,1,0,0,1,1,0,1,1,1,0,0,1,0
295,35,138,183,182,1.4,0,0,0,0,1,1,0,1,0,0,1,1,0,1


In [30]:
list(sfs1_backward.k_feature_names_)

['StDepression',
 'NumMajorVessels',
 'ChestPainType_typical angina',
 'RestECG_left ventricular hypertrophy',
 'ExerciseInducedAngina_yes',
 'Thalassemia_normal']

In [32]:
#-- Choose 6 features
lCol   = list(sfs1_backward.k_feature_names_)
dData4 = dData3[lCol]
dData4

Unnamed: 0,StDepression,NumMajorVessels,ChestPainType_typical angina,RestECG_left ventricular hypertrophy,ExerciseInducedAngina_yes,Thalassemia_normal
0,0.1,1,0,1,0,1
1,1.8,2,0,0,0,1
2,2.6,0,0,0,0,1
3,1.4,1,0,1,0,1
4,1.8,0,0,1,1,1
...,...,...,...,...,...,...
292,0.0,0,1,0,0,0
293,1.2,0,1,0,0,0
294,1.6,0,1,0,1,0
295,1.4,0,1,0,0,1


In [32]:
mX3  = dData3.drop(columns='Target_no heart disease')
mX3 -= mX3.mean(0)
mX3 /= mX3.std(0)
mX3.shape

(297, 18)

### ELI5 (Explain Like I'm Five):
https://eli5.readthedocs.io/en/latest/index.html

In [23]:
import eli5

oLinearSVM = SVC(kernel='linear', C=3).fit(mX3, vY)
eli5.show_weights(oLinearSVM, feature_names=dData3.columns[:-1].tolist(), target_names=['no hearth disease', 'hearth disease'])

Weight?,Feature
0.954,NumMajorVessels
0.804,ChestPainType_typical angina
0.473,Sex_male
0.427,ChestPainType_atypical angina
0.321,StDepression
0.313,RestingBloodPressure
0.301,Thalassemia_reversable defect
0.255,StSlope_flat
0.253,ExerciseInducedAngina_yes
0.233,FastingBloodSugar_lower than 120mg/ml


### Grid Search:

In [35]:
from sklearn.model_selection import GridSearchCV

dSvmParams = {'kernel': ('linear',),
              'C'     : np.linspace(0.01, 10, 21)}
oSearch    = GridSearchCV(SVC(), dSvmParams, cv=KFold(N, shuffle=True)).fit(mX3, vY)

print(f'Linear best parameters CV score = {oSearch.best_score_}')
print(oSearch.best_params_)

Linear best parameters CV score = 0.8518518518518519
{'C': 3.0069999999999997, 'kernel': 'linear'}


In [39]:
from sklearn.model_selection import GridSearchCV

vσ         = np.linspace(30, 50, 11)
dSvmParams = {'kernel': ('rbf',),
              'C'     : np.linspace(30, 40, 3),
              'gamma' : 1 / (2 * vσ**2)}
oSearch    = GridSearchCV(SVC(), dSvmParams, cv=KFold(N, shuffle=True)).fit(mX3, vY)

print(f'RBF best parameters CV score = {oSearch.best_score_}')
print(oSearch.best_params_)

RBF best parameters CV score = 0.8518518518518519
{'C': 30.0, 'gamma': 0.0002834467120181406, 'kernel': 'rbf'}
