In [1]:
from importlib import reload
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
import fca
from datetime import datetime
from sklearn.linear_model import LogisticRegression
import copy

In [2]:
#https://archive.ics.uci.edu/ml/datasets/Heart+Disease
data = pd.read_csv('heart-disease-uci.zip')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Attribute Information:
1. age
2. sex
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholestoral in mg/dl
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

# Scaling

In [3]:
data = pd.read_csv('heart-disease-uci.zip')
data = data.dropna()

intervals = 5
for attr in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']:
    min_val = data[attr].min()
    max_val = data[attr].max()
    gap = max_val - min_val
    k = 0
    for i in np.linspace(min_val + gap / intervals, max_val - gap / intervals, intervals):
        data[attr + '_' + str(k)] = (data[attr] >= i).astype(int)
        k += 1
    data = data.drop(attr, axis=1)
    
for attr in ['cp', 'restecg', 'slope', 'ca', 'thal']:
    for i in data[attr].unique():
        data[attr + '_' + str(i)] = (data[attr] == i).astype(int)
    data = data.drop(attr, axis=1)

In [4]:
data.head()

Unnamed: 0,sex,fbs,exang,target,age_0,age_1,age_2,age_3,age_4,trestbps_0,...,slope_1,ca_0,ca_2,ca_1,ca_3,ca_4,thal_1,thal_2,thal_3,thal_0
0,1,1,0,1,1,1,1,1,0,1,...,0,1,0,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
2,0,0,0,1,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,1,0,0,1,1,1,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,0,0,1,1,1,1,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0


# Lazy classifying

In [5]:
results = {'accuracy': [], 'precision': [], 'recall': [], 'seconds': [], 'params': []}
reload(fca)

columns = list(data.columns)
columns.remove('target')

X = data.loc[:, columns]
y = data.target
for i in ['random', 'true', 'false']:
    for j in np.linspace(0.1, 0.9, 5):
        results = {'accuracy': [], 'precision': [], 'recall': [], 'seconds': []}
        clf = fca.LazyClassifier(threshold=j, bias=i, random=True, sample_share=0.3)
        
        print()
        print('Parameters:', clf)
        print()
        
        for k in range(10):

            X_train, X_test, y_train, y_test = model_selection\
                .train_test_split(X, y, test_size=0.33, random_state=k)

            clf.fit(X_train, y_train)

            s = datetime.now()
            y_pred = clf.predict(X_test)
            f = datetime.now()

            results['accuracy'].append(metrics.accuracy_score(y_test, y_pred))
            results['precision'].append(metrics.precision_score(y_test, y_pred))
            results['recall'].append(metrics.recall_score(y_test, y_pred))
            results['seconds'].append((f - s).seconds)
           

        print(pd.DataFrame(results))
        print()
        print('Accuracy:', pd.DataFrame(results)['accuracy'].mean())


Parameters: LazyClassifier(bias='random', random=True, sample_share=0.3, threshold=0.1)

   accuracy  precision    recall  seconds
0      0.85   0.911111  0.788462        6
1      0.73   0.829268  0.629630        6
2      0.92   0.901639  0.964912        4
3      0.87   0.866667  0.912281        5
4      0.83   0.901961  0.793103        6
5      0.81   0.948718  0.685185        6
6      0.83   0.816327  0.833333        5
7      0.78   0.788462  0.788462        5
8      0.78   0.895833  0.716667        5
9      0.74   0.860000  0.693548        4

Accuracy: 0.8140000000000001

Parameters: LazyClassifier(bias='random', random=True, sample_share=0.3,
               threshold=0.30000000000000004)

   accuracy  precision    recall  seconds
0      0.82   0.925000  0.711538        5
1      0.71   0.931034  0.500000        5
2      0.86   0.864407  0.894737        4
3      0.81   0.851852  0.807018        4
4      0.79   0.911111  0.706897        5
5      0.76   0.941176  0.592593        5
6  

   accuracy  precision    recall  seconds
0      0.81   0.851064  0.769231        4
1      0.71   0.820513  0.592593        4
2      0.85   0.920000  0.807018        4
3      0.84   0.847458  0.877193        5
4      0.80   0.895833  0.741379        5
5      0.81   0.948718  0.685185        4
6      0.83   0.844444  0.791667        4
7      0.84   0.833333  0.865385        5
8      0.74   0.925000  0.616667        5
9      0.76   0.839286  0.758065        4

Accuracy: 0.799


In [6]:
reload(fca)

columns = list(data.columns)
columns.remove('target')

X = data.loc[:, columns]
y = data.target

results = {'accuracy': [], 'precision': [], 'recall': [], 'seconds': []}
clf = fca.LazyClassifier(threshold=0.9, bias='false')
for k in range(10):

    X_train, X_test, y_train, y_test = model_selection\
        .train_test_split(X, y, test_size=0.33, random_state=k)

    clf.fit(X_train, y_train)

    s = datetime.now()
    y_pred = clf.predict(X_test)
    f = datetime.now()

    results['accuracy'].append(metrics.accuracy_score(y_test, y_pred))
    results['precision'].append(metrics.precision_score(y_test, y_pred))
    results['recall'].append(metrics.recall_score(y_test, y_pred))
    results['seconds'].append((f - s).seconds)

print(pd.DataFrame(results))
print()
print('Accuracy:', pd.DataFrame(results)['accuracy'].mean())

   accuracy  precision    recall  seconds
0      0.83   0.906977  0.750000       16
1      0.75   0.853659  0.648148       16
2      0.85   0.920000  0.807018       17
3      0.83   0.884615  0.807018       20
4      0.80   0.895833  0.741379       16
5      0.83   0.930233  0.740741       16
6      0.84   0.863636  0.791667       18
7      0.85   0.849057  0.865385       17
8      0.79   0.933333  0.700000       17
9      0.72   0.826923  0.693548       17

Accuracy: 0.8089999999999999


# Classical logistic regression

In [7]:
data = pd.read_csv('heart-disease-uci.zip')
data = data.dropna()

columns = list(data.columns)
columns.remove('target')

results = {'accuracy': [], 'precision': [], 'recall': [], 'seconds': []}
reload(fca)
clf = LogisticRegression(solver='lbfgs', random_state=0)
for k in range(10):

    X_train, X_test, y_train, y_test = model_selection\
        .train_test_split(X, y, test_size=0.33, random_state=k)

    clf.fit(X_train, y_train)

    s = datetime.now()
    y_pred = clf.predict(X_test)
    f = datetime.now()

    results['accuracy'].append(metrics.accuracy_score(y_test, y_pred))
    results['precision'].append(metrics.precision_score(y_test, y_pred))
    results['recall'].append(metrics.recall_score(y_test, y_pred))
    results['seconds'].append((f - s).seconds)

print(pd.DataFrame(results))
print()
print('Accuracy:', pd.DataFrame(results)['accuracy'].mean())

   accuracy  precision    recall  seconds
0      0.83   0.843137  0.826923        0
1      0.79   0.823529  0.777778        0
2      0.87   0.854839  0.929825        0
3      0.88   0.868852  0.929825        0
4      0.81   0.830508  0.844828        0
5      0.85   0.842105  0.888889        0
6      0.83   0.792453  0.875000        0
7      0.81   0.789474  0.865385        0
8      0.81   0.872727  0.800000        0
9      0.82   0.866667  0.838710        0

Accuracy: 0.8300000000000001
