In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from warnings import filterwarnings

In [2]:
# Load the cleaned heart-disease table from the working directory; the file
# already holds indicator columns such as Sex_F / Sex_M alongside the
# numeric features and the HeartDisease target.
data = pd.read_csv('Heart_disease_cleaned.csv')
# Bare expression so the notebook renders the frame.
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,45,110,264,0,132,1.2,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
893,68,144,193,1,141,3.4,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
894,57,130,131,0,115,1.2,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
895,57,130,236,0,174,0.0,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0


In [5]:
# Target vector: the HeartDisease column.
labels = data['HeartDisease']
# Feature matrix: every column except the target.
train_data = data.drop(columns=['HeartDisease'])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, labels, test_size=0.25, random_state=41
)
# Display the shape of each piece as a sanity check on the split sizes.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((672, 20), (225, 20), (672,), (225,))

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split
# only, and standardize the training features with them.
x_train_n = sc.fit_transform(x_train)
# Apply the *training* statistics to the test split. Calling fit_transform
# here would refit the scaler on the test rows, so train and test would be
# standardized with different parameters and test-set information would leak
# into preprocessing.
test_data = sc.transform(x_test)
x_train_n

array([[-0.6512643 ,  1.53649857,  0.92994557, ..., -0.2515773 ,
         1.0059702 , -0.89263175],
       [ 0.80203137, -0.7016785 , -1.85625793, ..., -0.2515773 ,
         1.0059702 , -0.89263175],
       [-1.89694631, -0.7016785 ,  0.0395094 , ..., -0.2515773 ,
         1.0059702 , -0.89263175],
       ...,
       [ 0.80203137, -0.42190637,  0.93952015, ..., -0.2515773 ,
        -0.99406523,  1.1202828 ],
       [ 0.2829972 , -0.98145064, -1.85625793, ..., -0.2515773 ,
        -0.99406523,  1.1202828 ],
       [ 0.2829972 ,  0.9769543 ,  0.18312814, ..., -0.2515773 ,
         1.0059702 , -0.89263175]])

## Using K Fold Cross validation
Manually try supplying models with different parameters to the `cross_val_score` function with 5-fold cross-validation.

In [21]:
# `svm` must be imported before its first use in this cell; without this line
# the cell raises NameError when the notebook is run top to bottom on a
# fresh kernel.
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Accuracy of one hand-picked configuration (linear kernel, C=10) on each of
# the 5 cross-validation folds of the training split.
# NOTE(review): this scores the unscaled x_train; the standardized x_train_n
# built in the scaling cell is not used here - confirm that is intended.
cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'),x_train,y_train, cv=5)

array([0.86666667, 0.86666667, 0.86567164, 0.85074627, 0.90298507])

In [23]:
# Hand-rolled grid search: mean 5-fold accuracy for every kernel/C pairing,
# keyed as "<kernel>_<C>".
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {
    f'{kval}_{cval}': np.average(
        cross_val_score(
            svm.SVC(kernel=kval, C=cval, gamma='auto'),
            x_train,
            y_train,
            cv=5,
        )
    )
    for kval in kernels   # outer loop: kernel
    for cval in C         # inner loop: regularization strength
}

avg_scores

{'rbf_1': 0.5729132117191817,
 'rbf_10': 0.5788833609729132,
 'rbf_20': 0.5788833609729132,
 'linear_1': 0.8631066887783305,
 'linear_10': 0.870547263681592,
 'linear_20': 0.8541846323935876}

## Using GridSearchCV
GridSearchCV does exactly the same thing as the for loop above, but in a single line of code.

In [24]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over the same C x kernel grid as the manual loop.
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)
clf.fit(x_train, y_train)
# Raw per-candidate timing and score arrays collected during the search.
clf.cv_results_

{'mean_fit_time': array([5.60617447e-02, 3.52040539e+00, 5.05882263e-02, 2.94665753e+01,
        4.00897026e-02, 6.22521341e+01]),
 'std_fit_time': array([4.00889032e-03, 9.49194777e-01, 5.68980649e-03, 4.40983273e+00,
        1.19089138e-02, 1.53393626e+01]),
 'mean_score_time': array([0.01590276, 0.00458755, 0.01336226, 0.00537467, 0.00892305,
        0.00422039]),
 'std_score_time': array([0.00556012, 0.00404697, 0.00422806, 0.0045865 , 0.00238476,
        0.00263079]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'ker

In [25]:
# Tabulate the search results: one row per parameter combination, with the
# per-split test scores, their mean/std and the resulting rank.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.056062,0.004009,0.015903,0.00556,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.592593,0.555556,0.559701,0.589552,0.567164,0.572913,0.015317,6
1,3.520405,0.949195,0.004588,0.004047,1,linear,"{'C': 1, 'kernel': 'linear'}",0.851852,0.866667,0.865672,0.835821,0.895522,0.863107,0.019704,2
2,0.050588,0.00569,0.013362,0.004228,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.6,0.548148,0.529851,0.619403,0.597015,0.578883,0.033957,4
3,29.466575,4.409833,0.005375,0.004586,10,linear,"{'C': 10, 'kernel': 'linear'}",0.866667,0.866667,0.865672,0.850746,0.902985,0.870547,0.01731,1
4,0.04009,0.011909,0.008923,0.002385,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.6,0.548148,0.529851,0.619403,0.597015,0.578883,0.033957,4
5,62.252134,15.339363,0.00422,0.002631,20,linear,"{'C': 20, 'kernel': 'linear'}",0.844444,0.851852,0.865672,0.843284,0.865672,0.854185,0.009829,3


In [26]:
# Parameter combination that ranked first in the grid search above.
clf.best_params_

{'C': 10, 'kernel': 'linear'}

In [27]:
# Mean cross-validated test score achieved by the top-ranked combination.
clf.best_score_


0.870547263681592

## Different models with different hyperparameters

In [14]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate estimators and the hyperparameter grid to search for each one.
# Every entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the param_grid handed to GridSearchCV for that estimator
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # Seeded so the forest's cross-validation scores (and therefore the
        # selected n_estimators) are reproducible between runs; 41 matches
        # the seed already used for the train/test split.
        'model': RandomForestClassifier(random_state=41),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # multi_class is omitted: 'auto' is what the estimator uses when the
        # argument is left out, and passing it explicitly is deprecated in
        # recent scikit-learn releases (emits a FutureWarning).
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [16]:
# Run one 5-fold grid search per entry in model_params and record each
# model's best mean score together with the parameters that produced it.
scores = []

for model_name, mp in model_params.items():
    # fit() returns the fitted search object, so `clf` remains bound to the
    # most recently fitted search once the loop finishes.
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    ).fit(x_train, y_train)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# One summary row per model, columns in a fixed order.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.870547,"{'C': 10, 'kernel': 'linear'}"
1,random_forest,0.846733,{'n_estimators': 5}
2,logistic_regression,0.861614,{'C': 1}


In [17]:
# `clf` is whatever search the preceding loop fitted last (the final entry of
# model_params), so this table covers that one model only.
# Note: this rebinds `df`, replacing the per-model summary built above.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014281,0.004718,0.002931,0.004973,1,{'C': 1},0.851852,0.866667,0.865672,0.828358,0.895522,0.861614,0.021877,1
1,0.011199,0.001862,0.00528,0.002939,5,{'C': 5},0.851852,0.866667,0.865672,0.835821,0.88806,0.861614,0.017332,2
2,0.007995,0.004611,0.002245,0.00449,10,{'C': 10},0.851852,0.866667,0.865672,0.835821,0.88806,0.861614,0.017332,2


In [18]:
# Best parameters of the last search fitted in the loop (not the overall
# winner across models).
clf.best_params_

{'C': 1}