In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from warnings import filterwarnings

In [2]:
filterwarnings('ignore')

In [3]:
data = pd.read_csv('heart.csv')

In [4]:
num_col = ['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']
cat_col = list(data.columns)
for col in num_col:
    cat_col.remove(col)

In [5]:
data_no_outliers = data
for col in num_col:
    min_thresh = data_no_outliers[col].mean() - 3 * data_no_outliers[col].std()
    max_thresh = data_no_outliers[col].mean() + 3 * data_no_outliers[col].std()
    data_no_outliers = data_no_outliers[(data_no_outliers[col] > min_thresh) & (data_no_outliers[col] < max_thresh)]

data = data_no_outliers

In [6]:
le = LabelEncoder()
for col in cat_col:
    data[col] = le.fit_transform(data[col])

In [7]:
target = data[['HeartDisease']]
data = data.drop(['HeartDisease'],axis='columns')

In [8]:
data = pd.get_dummies(data,columns=cat_col[:-1],dtype=int)

In [9]:
ss = StandardScaler()
data = ss.fit_transform(data)
target = target.to_numpy()

In [10]:
pca = PCA(.95)
data_pca = pca.fit_transform(data)
data_pca.shape

(899, 13)

In [11]:
model_params = {

    'svm': {
        'model': SVC(),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [1,10,20,50],
            'gamma': ['auto', 'scale']
        }
    },


    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy']
        }
    },


    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10,20,50,100],
            'criterion': ['gini','entropy','log_loss']
        }
    },


    'log_reg': {
        'model': LogisticRegression(),
        'params': {
            'penalty': ['l1','l2','elasticnet'],
            'solver': ['lbfgs','sag','saga']
        }
    },


    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {

        }
    },

}

In [12]:
stats = []
for model_name, model_config in model_params.items():
    gcv = GridSearchCV(model_config['model'],model_config['params'],cv=10,return_train_score=False)
    gcv.fit(data,target)
    stats.append(
        {'model_name': model_name,
         'best_params': gcv.best_params_,
         'best_score': gcv.best_score_})

In [13]:
pd.DataFrame(stats)

Unnamed: 0,model_name,best_params,best_score
0,svm,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",0.849725
1,decision_tree,{'criterion': 'entropy'},0.77965
2,random_forest,"{'criterion': 'entropy', 'n_estimators': 50}",0.856454
3,log_reg,"{'penalty': 'l2', 'solver': 'lbfgs'}",0.850861
4,gaussian_nb,{},0.848677


In [14]:
stats = []
for model_name, model_config in model_params.items():
    gcv = GridSearchCV(model_config['model'],model_config['params'],cv=10,return_train_score=False)
    gcv.fit(data_pca,target)
    stats.append(
        {'model_name': model_name,
         'best_params': gcv.best_params_,
         'best_score': gcv.best_score_})

In [15]:
pd.DataFrame(stats)

Unnamed: 0,model_name,best_params,best_score
0,svm,"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.844145
1,decision_tree,{'criterion': 'gini'},0.790799
2,random_forest,"{'criterion': 'gini', 'n_estimators': 50}",0.848677
3,log_reg,"{'penalty': 'l1', 'solver': 'saga'}",0.848627
4,gaussian_nb,{},0.837516
