In [1]:
# Our code imports
from Models import Model
from Metrics import accuracy, fmeasure, roc_auc
from Processing import Processor
from Pipelines import ModelGridBuilder, AnalysisPipeline

# Standard lib imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans

# General Testing

In [2]:
heart = pd.read_csv('https://www.dropbox.com/s/jpnyx41u7wpa41m/heart_attack_clean.csv?dl=1')
heart_X = heart.drop(columns='output')
heart_Y = heart['output']

In [3]:
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=[.25, .5, 1, 1.5, 2])
svcSearchBuilder = ModelGridBuilder('SVC', parameters=[.25, .5, 1, 1.5, 2])
ldaSearchBuilder = ModelGridBuilder('LDA')
models = logisticSearchBuilder.get_models() + svcSearchBuilder.get_models() + ldaSearchBuilder.get_models()

modelAnalysis = AnalysisPipeline(heart_X, heart_Y, models, roc_auc)
modelAnalysis.process_data(split_type='random', train_prop=.8, dev_prop=.1)
modelAnalysis.fit_models()

In [None]:
modelAnalysis

In [4]:
# Declare a processor object
processor = Processor(heart_X, heart_Y)
# Standardize dummify, and pad with B0. Numeric data is infered if no column names provided
processor.process_data(numeric=[])
# Grab train, dev, and test set indices
processor.calculate_train_dev_test_split()
train_X, train_Y, dev_X, dev_Y, test_X, test_Y = processor.get_train_dev_test_sets(ret_numpy=True)
train_X.shape, dev_X.shape, test_X.shape, train_Y.shape, dev_Y.shape, test_Y.shape

((218, 14), (28, 14), (27, 14), (218,), (28,), (27,))

**Test Logistic**

In [5]:
logistic = Model('Logistic')
logistic.fit(train_X, train_Y, etas=[.01])

Gradient converged w/ 26 iterations and eta = 0.01


In [29]:
logistic.coef_

array([ 0.43049373, -0.24494049, -0.35145458, -0.30755228,  0.80157621,
        1.32409691, -0.89360318, -1.44110098,  0.80462346,  0.59638294,
        0.47058831,  0.14408393,  0.48099673, -0.19458693])

In [21]:
compute_metrics(logistic.predict(heart_X), heart_Y)

Accuracy: 0.799; Precision: 0.801; Recall: 0.829; f1: 0.815; ROC-AUC: 0.796


In [22]:
clf = LogisticRegression(fit_intercept=False, penalty='none')
clf.fit(heart_X, heart_Y)
clf.coef_, clf.score(heart_X, heart_Y)

(array([[ 0.37354234, -0.2427223 , -0.35312769, -0.31360559,  0.79589884,
          1.3091975 , -0.93565516, -1.46690184,  0.78711929,  0.57430834,
          0.47901655,  0.26171764,  0.59844848, -0.48662378]]),
 0.7985347985347986)

**Test Logistic Lasso**

In [9]:
logistic_lasso = Model('Logistic Lasso', lamb=1)
logistic_lasso.fit(heart_X, heart_Y)

Eta: 0.1; Iterations: 75000
Eta: 0.01; Iterations: 75000
Eta: 0.001; Iterations: 75000
Gradient converged w/ 1999 iterations and eta = 0.0001


In [11]:
logistic_lasso.coef_

array([ 0.51786504, -0.24378054, -0.32628711, -0.28039243,  0.75477116,
        1.11543986, -0.79737482, -1.33575536,  0.5307092 ,  0.57387384,
        0.14963736,  0.00987222,  0.35943055, -0.25103774])

In [13]:
compute_metrics(logistic_lasso.predict(heart_X), heart_Y)

Accuracy: 0.802; Precision: 0.799; Recall: 0.842; f1: 0.82; ROC-AUC: 0.799


In [25]:
# Check against sklearn
clf = LogisticRegression(fit_intercept=False, penalty='l2')
clf.fit(heart_X, heart_Y)
clf.coef_, clf.score(heart_X, heart_Y)

(array([[ 0.37698672, -0.22992372, -0.32117822, -0.27393535,  0.76172782,
          1.17964603, -0.80265931, -1.30033705,  0.70206826,  0.58295456,
          0.39230095,  0.12310419,  0.45560806, -0.20172553]]),
 0.7985347985347986)

**Test SVC**

In [4]:
svc = Model('SVC', lamb=1)
svc.fit(heart_X, heart_Y), svc.coef_

Eta: 0.1; Iterations: 1206
Gradient converged w/ 123 iterations and eta = 0.01


(None,
 array([ 0.0601239 , -0.24464632, -0.14514566, -0.08528606,  0.39045293,
         0.16411657, -0.12738526, -0.26793471,  0.09151584,  0.16312756,
         0.00323745, -0.07152445,  0.11616785, -0.03130467]))

In [5]:
svc.predict(heart_X)

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1])

In [36]:
compute_metrics(svc.predict(heart_X), heart_Y)

Accuracy: 0.736; Precision: 0.768; Recall: 0.726; f1: 0.746; ROC-AUC: 0.737


In [41]:
# Check against sklearn
clf = SVC(kernel='linear', C=1, degree=0)
clf.fit(heart_X, heart_Y)
clf.coef_, clf.intercept_, clf.score(heart_X, heart_Y)

(array([[-7.77156117e-16, -1.32571390e-01, -3.51887594e-01,
         -2.44453562e-01,  5.41043423e-01,  8.27604167e-01,
         -8.27604167e-01, -1.24547361e+00,  5.36317391e-01,
          4.06879039e-01,  3.02277181e-01,  5.48845951e-03,
          4.40598727e-02, -4.95483322e-02]]),
 array([0.8322204]),
 0.8021978021978022)

**Test LDA**

In [3]:
lda = Model('LDA')
lda.fit(heart_X, heart_Y)
lda.coef_

(array([-0.22330363, -0.31878649, -0.24470013,  0.7353733 ,  7.5546875 ,
         6.1640625 , -1.96484375,  0.98828125,  0.77734375,  0.58984375,
         2.625     ,  3.1875    ,  2.0625    ]),
 8.796416881801354)

In [4]:
compute_metrics(lda.predict(heart_X), heart_Y)

Accuracy: 0.78; Precision: 0.803; Recall: 0.781; f1: 0.792; ROC-AUC: 0.78


In [5]:
# Check against sklearn
clf = LinearDiscriminantAnalysis(solver='svd')
clf.fit(heart_X.T[1:].T, heart_Y)
clf.coef_, clf.intercept_, clf.score(heart_X.T[1:].T, heart_Y)

(array([[-0.22330363, -0.31878649, -0.24470013,  0.7353733 ,  1.04828114,
         -1.04828114, -1.70068584,  0.94194233,  0.8034785 ,  0.65550057,
         -0.14629208,  0.20392029, -0.9575273 ]]),
 array([0.94102802]),
 0.7948717948717948)