![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Exercise 03 - Hearth Disease Classification </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/Exercises/Exercise03_Classification.ipynb)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))



In [2]:
import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rc('font', **{'size' : 16})

pd.set_option('display.max_colwidth', 0)

# np.random.seed(1)

### Load the dataset:
https://www.kaggle.com/cherngs/heart-disease-cleveland-uci?select=heart_cleveland_upload.csv

In [3]:
dData = pd.read_csv('heart_cleveland_upload.csv')
dData

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


### Display target balance:

In [4]:
lFullName = ['Age', 'Sex', 'ChestPainType', 'RestingBloodPressure', 'Cholesterol', 'FastingBloodSugar', 'RestECG', 'MaxHeartRateAchieved', 'ExerciseInducedAngina', 'StDepression', 'StSlope', 'NumMajorVessels', 'Thalassemia', 'Target']
space     = max(map(len, dData.columns))
for (shortName, fullName) in zip(dData.columns, lFullName):
    print(f'{shortName:{space}s} = {fullName}')

age       = Age
sex       = Sex
cp        = ChestPainType
trestbps  = RestingBloodPressure
chol      = Cholesterol
fbs       = FastingBloodSugar
restecg   = RestECG
thalach   = MaxHeartRateAchieved
exang     = ExerciseInducedAngina
oldpeak   = StDepression
slope     = StSlope
ca        = NumMajorVessels
thal      = Thalassemia
condition = Target


### Get data and normalization:

In [5]:
mX = dData.drop(columns='condition').values
vY = dData['condition'].values

N = len(vY)

mX.shape, vY.shape

((297, 13), (297,))

In [6]:
#-- Normalize data:
mX -= mX.mean(0)
mX /= mX.std(0)

### Basic classification:
Let us try several default classifiers

In [7]:
from sklearn.linear_model    import LogisticRegression
from sklearn.svm             import SVC
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, KFold

# https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
svc_grid_params = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]


# https://www.kaggle.com/enespolat/grid-search-with-logistic-regression
lr_grid_params=  {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge


knn_grid_params = {
    'n_neighbors' : np.arange(1, 21, 2),
    'metric' : ['euclidean', 'manhattan'],
}

# https://stackoverflow.com/questions/38709690/scikit-learn-using-gridsearchcv-on-decisiontreeclassifier

dtc_grid_params = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}



#-- Classifiers:
#-- You can set the hyper-parameters as well
lModels  = []
lModels += [(SVC                   (C=1), 'SVM'                 ,svc_grid_params)]
lModels += [(LogisticRegression    (),    'Logistic Regression' , lr_grid_params)]
lModels += [(KNeighborsClassifier  (),    'KNN'                , knn_grid_params)]
lModels += [(DecisionTreeClassifier(),    'Tree'               , dtc_grid_params)]


#N = len(vY)
for (oClassifier, name, params) in lModels:
    vHatY    = cross_val_predict(oClassifier, mX, vY, cv=KFold(N))
    accuracy = np.mean(vY == vHatY)
    print(f'{name:19s} = {100*accuracy:2.2f}%')

SVM                 = 82.83%
Logistic Regression = 83.16%
KNN                 = 82.49%
Tree                = 72.39%


In [8]:
lModels

[(SVC(C=1),
  'SVM',
  [{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]},
   {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]),
 (LogisticRegression(),
  'Logistic Regression',
  {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
   'penalty': ['l1', 'l2']}),
 (KNeighborsClassifier(),
  'KNN',
  {'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
   'metric': ['euclidean', 'manhattan']}),
 (DecisionTreeClassifier(),
  'Tree',
  {'criterion': ['gini', 'entropy'],
   'max_depth': [4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    15,
    20,
    30,
    40,
    50,
    70,
    90,
    120,
    150]})]

### Exercise A:
* Use `KNeighborsClassifier` and get above 84% leave-one-out cross validation accuracy.  
(Play with the hyper-parameters)

In [9]:
#-- Your solution

dfAnswers = pd.DataFrame(columns=['Exercise', 'Decription', 'Answer_1', 'Answer_2', 'Answer_3'])
dfAnswers.style.set_properties(**{'text-align': 'left'})
dfAnswers.style.highlight_max(color = 'lightgreen', axis = 0)


cv = KFold(N, shuffle=True)
# cv = KFold(5, shuffle=True)

# https://medium.com/@erikgreenj/k-neighbors-classifier-with-gridsearchcv-basics-3c445ddeb657

from sklearn.model_selection import GridSearchCV

knn_grid_params = {
    'n_neighbors' : np.arange(1, 21, 2),
    'metric' : ['euclidean', 'manhattan'],
}


gs = GridSearchCV(
    KNeighborsClassifier(),
    knn_grid_params,
    verbose=2,
    cv=cv, 
    n_jobs=-1)

gs_result = gs.fit(mX, vY)


Fitting 297 folds for each of 20 candidates, totalling 5940 fits


In [10]:

dfAnswers.loc[len(dfAnswers)] = [
    'A',
    'KNeighborsClassifier',
    'Original Data',
    f'Score: {100*gs_result.best_score_:2.2f}%'
    '',
    '',
    ]

In [11]:
dfAnswers

Unnamed: 0,Exercise,Decription,Answer_1,Answer_2,Answer_3
0,A,KNeighborsClassifier,Original Data,Score: 84.85%,


In [12]:
dData2         = dData.copy()
dData2.columns = lFullName
dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


### Meaningful categorical data:
(Based on the data description)

In [13]:
dData2.loc[dData2['Sex']                   == 0, 'Sex']                   = 'female'
dData2.loc[dData2['Sex']                   == 1, 'Sex']                   = 'male'
  
dData2.loc[dData2['ChestPainType']         == 0, 'ChestPainType']         = 'asymptomatic'
dData2.loc[dData2['ChestPainType']         == 1, 'ChestPainType']         = 'atypical angina'
dData2.loc[dData2['ChestPainType']         == 2, 'ChestPainType']         = 'non-anginal pain'
dData2.loc[dData2['ChestPainType']         == 3, 'ChestPainType']         = 'typical angina'

dData2.loc[dData2['FastingBloodSugar']     == 0, 'FastingBloodSugar']     = 'lower than 120mg/ml'
dData2.loc[dData2['FastingBloodSugar']     == 1, 'FastingBloodSugar']     = 'greater than 120mg/ml'

dData2.loc[dData2['RestECG']               == 0, 'RestECG']               = 'normal'
dData2.loc[dData2['RestECG']               == 1, 'RestECG']               = 'ST-T wave abnormality'
dData2.loc[dData2['RestECG']               == 2, 'RestECG']               = 'left ventricular hypertrophy'

dData2.loc[dData2['ExerciseInducedAngina'] == 0, 'ExerciseInducedAngina'] = 'no'
dData2.loc[dData2['ExerciseInducedAngina'] == 1, 'ExerciseInducedAngina'] = 'yes'

dData2.loc[dData2['StSlope']               == 0, 'StSlope']               = 'upsloping'
dData2.loc[dData2['StSlope']               == 1, 'StSlope']               = 'flat'
dData2.loc[dData2['StSlope']               == 2, 'StSlope']               = 'downsloping'

dData2.loc[dData2['Thalassemia']           == 0, 'Thalassemia']           = 'normal'
dData2.loc[dData2['Thalassemia']           == 1, 'Thalassemia']           = 'fixed defect'
dData2.loc[dData2['Thalassemia']           == 2, 'Thalassemia']           = 'reversable defect'

dData2.loc[dData2['Target']                == 0, 'Target']                = 'no heart disease'
dData2.loc[dData2['Target']                == 1, 'Target']                = 'heart disease'
dData2

Unnamed: 0,Age,Sex,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestECG,MaxHeartRateAchieved,ExerciseInducedAngina,StDepression,StSlope,NumMajorVessels,Thalassemia,Target
0,69,male,asymptomatic,160,234,greater than 120mg/ml,left ventricular hypertrophy,131,no,0.1,flat,1,normal,no heart disease
1,69,female,asymptomatic,140,239,lower than 120mg/ml,normal,151,no,1.8,upsloping,2,normal,no heart disease
2,66,female,asymptomatic,150,226,lower than 120mg/ml,normal,114,no,2.6,downsloping,0,normal,no heart disease
3,65,male,asymptomatic,138,282,greater than 120mg/ml,left ventricular hypertrophy,174,no,1.4,flat,1,normal,heart disease
4,64,male,asymptomatic,110,211,lower than 120mg/ml,left ventricular hypertrophy,144,yes,1.8,flat,0,normal,no heart disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,male,typical angina,152,223,lower than 120mg/ml,normal,181,no,0.0,upsloping,0,reversable defect,heart disease
293,39,male,typical angina,118,219,lower than 120mg/ml,normal,140,no,1.2,flat,0,reversable defect,heart disease
294,35,male,typical angina,120,198,lower than 120mg/ml,normal,130,yes,1.6,flat,0,reversable defect,heart disease
295,35,female,typical angina,138,183,lower than 120mg/ml,normal,182,no,1.4,upsloping,0,normal,no heart disease


### Convert non-numeric features to dummy features:

In [14]:
dData3 = pd.get_dummies(dData2, drop_first=True)
dData3

Unnamed: 0,Age,RestingBloodPressure,Cholesterol,MaxHeartRateAchieved,StDepression,NumMajorVessels,Sex_male,ChestPainType_atypical angina,ChestPainType_non-anginal pain,ChestPainType_typical angina,FastingBloodSugar_lower than 120mg/ml,RestECG_left ventricular hypertrophy,RestECG_normal,ExerciseInducedAngina_yes,StSlope_flat,StSlope_upsloping,Thalassemia_normal,Thalassemia_reversable defect,Target_no heart disease
0,69,160,234,131,0.1,1,1,0,0,0,0,1,0,0,1,0,1,0,1
1,69,140,239,151,1.8,2,0,0,0,0,1,0,1,0,0,1,1,0,1
2,66,150,226,114,2.6,0,0,0,0,0,1,0,1,0,0,0,1,0,1
3,65,138,282,174,1.4,1,1,0,0,0,0,1,0,0,1,0,1,0,0
4,64,110,211,144,1.8,0,1,0,0,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,152,223,181,0.0,0,1,0,0,1,1,0,1,0,0,1,0,1,0
293,39,118,219,140,1.2,0,1,0,0,1,1,0,1,0,1,0,0,1,0
294,35,120,198,130,1.6,0,1,0,0,1,1,0,1,1,1,0,0,1,0
295,35,138,183,182,1.4,0,0,0,0,1,1,0,1,0,0,1,1,0,1


In [15]:
dData3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Age                                    297 non-null    int64  
 1   RestingBloodPressure                   297 non-null    int64  
 2   Cholesterol                            297 non-null    int64  
 3   MaxHeartRateAchieved                   297 non-null    int64  
 4   StDepression                           297 non-null    float64
 5   NumMajorVessels                        297 non-null    int64  
 6   Sex_male                               297 non-null    uint8  
 7   ChestPainType_atypical angina          297 non-null    uint8  
 8   ChestPainType_non-anginal pain         297 non-null    uint8  
 9   ChestPainType_typical angina           297 non-null    uint8  
 10  FastingBloodSugar_lower than 120mg/ml  297 non-null    uint8  
 11  RestEC

### Exercise B:
1. Use `dData3` and get above 85% leave-one-out cross validation accuracy.
2. You are allowed you to only 6 features (from the given 18).  
Get more than 80% accuracy with only 6 features.  
What is the right approach for this task?

In [16]:
#-- Your solution to 1

# https://scikit-learn.org/stable/modules/feature_selection.html

mX3  = dData3.drop(columns='Target_no heart disease')
mX3 -= mX3.mean(0)
mX3 /= mX3.std(0)
mX3.shape

lGsModels  = []

# cv = KFold(5, shuffle=True)

for (oClassifier, name, grid_params) in lModels:
    gs_res = GridSearchCV(
                        oClassifier,
                        grid_params,
                        verbose=3,
                        cv=cv,
                        n_jobs=-1
                    ).fit(mX3, vY)
    print(f'{name:19s} = {100*gs_res.best_score_:2.2f}, {gs_res.best_params_} %')
    lGsModels += [gs_res.best_estimator_]
    dfAnswers.loc[len(dfAnswers)] = [
        'B-1',
        name,
        'dummies',
        f'Score: {100*gs_res.best_score_:2.2f}%',
        gs_res.best_params_,
    ]


Fitting 297 folds for each of 12 candidates, totalling 3564 fits
SVM                 = 85.19, {'C': 10, 'kernel': 'linear'} %
Fitting 297 folds for each of 14 candidates, totalling 4158 fits
        nan 0.83838384        nan 0.83838384        nan 0.83838384
        nan 0.83501684]
Logistic Regression = 84.18, {'C': 0.1, 'penalty': 'l2'} %
Fitting 297 folds for each of 20 candidates, totalling 5940 fits
KNN                 = 83.50, {'metric': 'euclidean', 'n_neighbors': 9} %
Fitting 297 folds for each of 36 candidates, totalling 10692 fits
Tree                = 77.44, {'criterion': 'gini', 'max_depth': 4} %


In [17]:
dfAnswers

Unnamed: 0,Exercise,Decription,Answer_1,Answer_2,Answer_3
0,A,KNeighborsClassifier,Original Data,Score: 84.85%,
1,B-1,SVM,dummies,Score: 85.19%,"{'C': 10, 'kernel': 'linear'}"
2,B-1,Logistic Regression,dummies,Score: 84.18%,"{'C': 0.1, 'penalty': 'l2'}"
3,B-1,KNN,dummies,Score: 83.50%,"{'metric': 'euclidean', 'n_neighbors': 9}"
4,B-1,Tree,dummies,Score: 77.44%,"{'criterion': 'gini', 'max_depth': 4}"


In [18]:
# Try 1 - Using SelectFromModel : failed since the KNeighborsClassifier does not have any `coef_` or `feature_importances_` attribute

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

# from sklearn.feature_selection import SelectFromModel

# selector = SelectFromModel(estimator=gs_result.best_estimator_, max_features=6).fit(mX3, vY)
# selector.estimator_
# selector.fit_transform(mX3, vY)

# ValueError: when `importance_getter=='auto'`, the underlying estimator KNeighborsClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [19]:
#-- Your solution to 2

In [20]:
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
# https://towardsdatascience.com/new-features-of-scikit-learn-fbbfe7652bfb

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as mlxtend_sfs

n_features = 6
# cv = KFold(5, shuffle=True)

def classication_with_sfs(model, direction, cv):
    sfs = SequentialFeatureSelector(lGsModel, 
                                n_features_to_select=n_features, 
                                direction=direction,
                                cv=cv,
                                )

    sfs.fit(mX3, vY)
    vHatY    = cross_val_predict(lGsModel, sfs.transform(mX3), vY, cv=cv)
    score = 100*accuracy_score(vY, vHatY)
    print(f'lGsModel {model} {direction}: Score: {score:2.2f}%')
    dfAnswers.loc[len(dfAnswers)] = [
            'B-2',
            'sfs, ' + direction,
            model,
            f'{score:2.2f}%',
            np.arange(sfs.n_features_in_)[sfs.get_support()],
            ]

def classication_with_mlxtend_sfs(model, forward, cv):
    xsfs = mlxtend_sfs(
        model,
        k_features=n_features,
        forward=forward,
        floating=False,
        verbose=0,
        scoring='accuracy',
        cv=cv,
        ).fit(mX3,vY)
    
    print(f'lGsModel (mlextend): {lGsModel} {forward}: Score: {100*xsfs.k_score_:2.2f}%')
    forward_str = 'forward' if forward else 'backward'
    dfAnswers.loc[len(dfAnswers)] = [
            'B-2',
            'xsfs, ' + forward_str,
            model,
            f'{100*xsfs.k_score_:2.2f}%',
            xsfs.k_feature_idx_,
            ]

from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import accuracy_score

def calc_explicit_loop(model):
    dfAccuracy = pd.DataFrame(columns=['Feature', 'Accuracy'])
    reduced_feature_list = []
    feature_list_for_prediction = mX3.columns

    while len(reduced_feature_list) != n_features:
        dfAccuracy = dfAccuracy.iloc[0:0]
        for index, col in enumerate(feature_list_for_prediction):
            current_cols = [col] + reduced_feature_list
            y_pred = cross_val_predict(model, mX3[current_cols], vY, cv=cv)
            dfAccuracy.loc[index] =[col, accuracy_score(vY, y_pred)]
            max_feature = dfAccuracy.Feature.iloc[dfAccuracy.Accuracy.idxmax()]
        feature_list_for_prediction = feature_list_for_prediction.drop(max_feature)
        reduced_feature_list.append(max_feature)
        print(f'Model {model}, Score: {100*accuracy_score(vY, y_pred):2.2f}%')
        dfAnswers.loc[len(dfAnswers)] = [
            'B-2',
            'Explicit loop',
            model,
            f'{100*accuracy_score(vY, y_pred):2.2f}%',
            [mX3.columns.get_loc(col) for col in current_cols],
            ]


for lGsModel in lGsModels:
    classication_with_sfs(lGsModel, 'forward', cv)
    classication_with_sfs(lGsModel, 'backward', cv)

    classication_with_mlxtend_sfs(lGsModel, True, cv)
    classication_with_mlxtend_sfs(lGsModel, False, cv)

    calc_explicit_loop(lGsModel)

lGsModel SVC(C=10, kernel='linear') forward: Score: 80.13%
lGsModel SVC(C=10, kernel='linear') backward: Score: 86.53%
lGsModel (mlextend): SVC(C=10, kernel='linear') True: Score: 80.13%
lGsModel (mlextend): SVC(C=10, kernel='linear') False: Score: 86.53%
Model SVC(C=10, kernel='linear'), Score: 74.41%
Model SVC(C=10, kernel='linear'), Score: 76.43%
Model SVC(C=10, kernel='linear'), Score: 76.43%
Model SVC(C=10, kernel='linear'), Score: 76.43%
Model SVC(C=10, kernel='linear'), Score: 76.43%
Model SVC(C=10, kernel='linear'), Score: 76.43%
lGsModel LogisticRegression(C=0.1) forward: Score: 85.19%
lGsModel LogisticRegression(C=0.1) backward: Score: 85.86%
lGsModel (mlextend): LogisticRegression(C=0.1) True: Score: 85.19%
lGsModel (mlextend): LogisticRegression(C=0.1) False: Score: 85.86%
Model LogisticRegression(C=0.1), Score: 74.41%
Model LogisticRegression(C=0.1), Score: 76.43%
Model LogisticRegression(C=0.1), Score: 79.12%
Model LogisticRegression(C=0.1), Score: 84.18%
Model LogisticRe

In [21]:
dfAnswers

Unnamed: 0,Exercise,Decription,Answer_1,Answer_2,Answer_3
0,A,KNeighborsClassifier,Original Data,Score: 84.85%,
1,B-1,SVM,dummies,Score: 85.19%,"{'C': 10, 'kernel': 'linear'}"
2,B-1,Logistic Regression,dummies,Score: 84.18%,"{'C': 0.1, 'penalty': 'l2'}"
3,B-1,KNN,dummies,Score: 83.50%,"{'metric': 'euclidean', 'n_neighbors': 9}"
4,B-1,Tree,dummies,Score: 77.44%,"{'criterion': 'gini', 'max_depth': 4}"
5,B-2,"sfs, forward","SVC(C=10, kernel='linear')",80.13%,"[0, 1, 2, 3, 5, 16]"
6,B-2,"sfs, backward","SVC(C=10, kernel='linear')",86.53%,"[5, 6, 9, 13, 15, 17]"
7,B-2,"xsfs, forward","SVC(C=10, kernel='linear')",80.13%,"(0, 1, 2, 3, 5, 16)"
8,B-2,"xsfs, backward","SVC(C=10, kernel='linear')",86.53%,"(5, 6, 9, 13, 15, 17)"
9,B-2,Explicit loop,"SVC(C=10, kernel='linear')",74.41%,[17]


In [26]:
dfAnswers[dfAnswers.Exercise=='B-2'].sort_values('Answer_2', ascending=False)

Unnamed: 0,Exercise,Decription,Answer_1,Answer_2,Answer_3
8,B-2,"xsfs, backward","SVC(C=10, kernel='linear')",86.53%,"(5, 6, 9, 13, 15, 17)"
6,B-2,"sfs, backward","SVC(C=10, kernel='linear')",86.53%,"[5, 6, 9, 13, 15, 17]"
18,B-2,"xsfs, backward",LogisticRegression(C=0.1),85.86%,"(3, 5, 9, 13, 15, 17)"
16,B-2,"sfs, backward",LogisticRegression(C=0.1),85.86%,"[1, 5, 9, 10, 15, 16]"
15,B-2,"sfs, forward",LogisticRegression(C=0.1),85.19%,"[5, 6, 7, 8, 9, 16]"
17,B-2,"xsfs, forward",LogisticRegression(C=0.1),85.19%,"(5, 6, 7, 8, 9, 16)"
42,B-2,Explicit loop,DecisionTreeClassifier(max_depth=4),84.85%,"[17, 16, 9, 5]"
37,B-2,"xsfs, forward",DecisionTreeClassifier(max_depth=4),84.85%,"(5, 7, 8, 9, 16, 17)"
35,B-2,"sfs, forward",DecisionTreeClassifier(max_depth=4),84.85%,"[5, 7, 8, 9, 16, 17]"
26,B-2,"sfs, backward","KNeighborsClassifier(metric='euclidean', n_neighbors=9)",84.85%,"[5, 9, 13, 14, 15, 16]"


In [22]:
col_idx = (dfAnswers[dfAnswers.Exercise=='B-2'].sort_values('Answer_2', ascending=False)).iloc[0].Answer_3

In [23]:
#-- Choose 6 features
lCol   = dData3.columns.values[np.asarray(col_idx)]
dData4 = dData3[lCol]
dData4

Unnamed: 0,NumMajorVessels,Sex_male,ChestPainType_typical angina,ExerciseInducedAngina_yes,StSlope_upsloping,Thalassemia_reversable defect
0,1,1,0,0,0,0
1,2,0,0,0,1,0
2,0,0,0,0,0,0
3,1,1,0,0,0,0
4,0,1,0,1,0,0
...,...,...,...,...,...,...
292,0,1,1,0,1,1
293,0,1,1,0,0,1
294,0,1,1,1,0,1
295,0,0,1,0,1,0


### ELI5 (Explain Like I'm Five):
https://eli5.readthedocs.io/en/latest/index.html

In [24]:
pip install eli5

Note: you may need to restart the kernel to use updated packages.


In [25]:
import eli5

oLinearSVM = SVC(kernel='linear', C=3).fit(mX3, vY)
eli5.show_weights(oLinearSVM, feature_names=dData3.columns[:-1].tolist(), target_names=['no hearth disease', 'hearth disease'])

AttributeError: type object 'h5py.h5.H5PYConfig' has no attribute '__reduce_cython__'

### Grid Search:

In [None]:
from sklearn.model_selection import GridSearchCV

dSvmParams = {'kernel': ('linear',),
              'C'     : np.linspace(0.01, 10, 21)}
oSearch    = GridSearchCV(SVC(), dSvmParams, cv=KFold(N, shuffle=True)).fit(mX3, vY)

print(f'Linear best parameters CV score = {oSearch.best_score_}')
print(oSearch.best_params_)

In [None]:
from sklearn.model_selection import GridSearchCV

vσ         = np.linspace(30, 50, 11)
dSvmParams = {'kernel': ('rbf',),
              'C'     : np.linspace(30, 40, 3),
              'gamma' : 1 / (2 * vσ**2)}
oSearch    = GridSearchCV(SVC(), dSvmParams, cv=KFold(N, shuffle=True)).fit(mX3, vY)

print(f'RBF best parameters CV score = {oSearch.best_score_}')
print(oSearch.best_params_)