In [1]:
import numpy as np
import pandas as pd
from sklearn import tree, metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy import misc
import collections
from matplotlib import pyplot as plt

In [2]:
def model_report(test, predict, model_name): 
    print('Accuracy Score of ' + model_name, accuracy_score(test, predict))
    fpr, tpr, thresholds = metrics.roc_curve(test, predict, pos_label=1)
    print('AUC Score of ' + model_name, metrics.auc(fpr, tpr))
    print(classification_report(test, predict))
    
def logistic_regression_model(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression()
    
    # Get the best parameter
    params = {'penalty':['l1','l2'],
              'solver': ['liblinear'],
             'C':[0.01, 0.1, 1, 10, 100],
             'class_weight':['balanced',None]}
    model = GridSearchCV(log, param_grid=params, cv=10, iid=True)
    
    # result of the model
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print('The best parameter is ', model.best_params_)
    model_report(predict, y_test, "Logistic Regression")
    
    
def svc_model(X_train, X_test, y_train, y_test):
    from sklearn.svm import SVC
    svc = SVC()
    
    # Get the best parameter
    params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'gamma': ['auto']}
    model = GridSearchCV(svc, param_grid=params, cv=10, iid=True)
    
    # result of the model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('The best parameter is ', model.best_params_)
    model_report(y_pred, y_test, "SVM")
    
    
def decision_tree_model(X_train, X_test, y_train, y_test):
    params = {"max_depth": [1, 2, 3, 4, None],
              "max_features": [1, 2, 3, 4, None],
              "min_samples_leaf": np.arange(1, 9),
              "criterion": ["gini", "entropy"]}
    tree = DecisionTreeClassifier()
    model = GridSearchCV(tree, param_grid=params, cv=10, iid=True)
    
    # result of the model
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print('The best parameter is ', model.best_params_)
    model_report(predict, y_test, "Decision Tree")



In [3]:
def run_all_model():
    data = pd.read_csv('heart_1.csv')
    X = data.drop('target', axis = 1)
    Y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
    print("********************Logistic Regression*****************************")
    logistic_regression_model(X_train, X_test, y_train, y_test)
    print("********************SVM*****************************")
    svc_model(X_train, X_test, y_train, y_test)
    print("********************Decision Tree*****************************")
    decision_tree_model(X_train, X_test, y_train, y_test)


In [4]:
def run_all_model_with_preprocessing():
    data = pd.read_csv('heart_1.csv')
    data['cp'] = data['cp'].astype('object')
    data['slope'] = data['slope'].astype('object')
    data['thal'] = data['thal'].astype('object')
    data['restecg'] = data['restecg'].astype('object')
    data = pd.get_dummies(data)
    X = data.drop('target', axis = 1)
    Y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
    # Use StandardScaler to preprocess
    from sklearn.preprocessing import StandardScaler
    standard_scaler = StandardScaler()

    X_train_2 = standard_scaler.fit_transform(X_train)
    
    X_train_2 = pd.DataFrame(X_train_2)

    X_test_2 = standard_scaler.transform(X_test)
    X_test_2 = pd.DataFrame(X_test_2)
    print("********************Logistic Regression*****************************")
    logistic_regression_model(X_train_2, X_test_2, y_train, y_test)
    print("********************SVM*****************************")
    svc_model(X_train_2, X_test_2, y_train, y_test)
    print("********************Decision Tree*****************************")
    decision_tree_model(X_train_2, X_test_2, y_train, y_test)

In [5]:
run_all_model()

********************Logistic Regression*****************************
The best parameter is  {'C': 1, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy Score of Logistic Regression 0.8131868131868132
AUC Score of Logistic Regression 0.811764705882353
              precision    recall  f1-score   support

           0       0.78      0.80      0.79        40
           1       0.84      0.82      0.83        51

    accuracy                           0.81        91
   macro avg       0.81      0.81      0.81        91
weighted avg       0.81      0.81      0.81        91

********************SVM*****************************
The best parameter is  {'gamma': 'auto', 'kernel': 'linear'}
Accuracy Score of SVM 0.8131868131868132
AUC Score of SVM 0.811764705882353
              precision    recall  f1-score   support

           0       0.78      0.80      0.79        40
           1       0.84      0.82      0.83        51

    accuracy                           0.81     

In [6]:
run_all_model_with_preprocessing()

********************Logistic Regression*****************************
The best parameter is  {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy Score of Logistic Regression 0.8571428571428571
AUC Score of Logistic Regression 0.8558994197292069
              precision    recall  f1-score   support

           0       0.88      0.82      0.85        44
           1       0.84      0.89      0.87        47

    accuracy                           0.86        91
   macro avg       0.86      0.86      0.86        91
weighted avg       0.86      0.86      0.86        91

********************SVM*****************************
The best parameter is  {'gamma': 'auto', 'kernel': 'sigmoid'}
Accuracy Score of SVM 0.8131868131868132
AUC Score of SVM 0.8112244897959184
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        42
           1       0.82      0.84      0.83        49

    accuracy                           0.8

In [7]:
def calculate_feature_importance():
    # Calcuate feature importance using logistic regression model
    data = pd.read_csv('heart_1.csv')
    data['cp'] = data['cp'].astype('object')
    data['slope'] = data['slope'].astype('object')
    data['thal'] = data['thal'].astype('object')
    data['restecg'] = data['restecg'].astype('object')
    data = pd.get_dummies(data)
    print(data.columns)
    X = data.drop('target', axis = 1)
    Y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
    # Use StandardScaler to preprocess
    from sklearn.preprocessing import StandardScaler
    standard_scaler = StandardScaler()
    X_train = standard_scaler.fit_transform(X_train)
    X_train = pd.DataFrame(X_train)

    X_test = standard_scaler.transform(X_test)
    X_test = pd.DataFrame(X_test)
    
    #use best parameters we get from previous steps
    best_log = LogisticRegression(
        penalty='l2',
        C=0.01,
        solver='liblinear',
        class_weight=None,
    )
    best_log.fit(X_train, y_train)
    feature_importance = abs(best_log.coef_[0])
    sorted_index = np.argsort(feature_importance)
    sorted_features = []
    for i in sorted_index:
        sorted_features.append(X.columns[i])
    print("Sort features ascendingly based on feauture importance: ")
    print(sorted_features)

In [8]:
calculate_feature_importance()

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'ca', 'target', 'cp_0', 'cp_1', 'cp_2', 'cp_3', 'restecg_0',
       'restecg_1', 'restecg_2', 'slope_0', 'slope_1', 'slope_2', 'thal_0',
       'thal_1', 'thal_2', 'thal_3'],
      dtype='object')
Sort features ascendingly based on feauture importance: 
['thal_1', 'slope_0', 'trestbps', 'chol', 'thal_0', 'fbs', 'cp_1', 'restecg_0', 'restecg_2', 'age', 'cp_3', 'restecg_1', 'slope_1', 'slope_2', 'thalach', 'sex', 'cp_2', 'oldpeak', 'exang', 'cp_0', 'thal_2', 'thal_3', 'ca']
