In [1]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score,precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.colors import LinearSegmentedColormap

In [2]:
from model import *

In [3]:
# Two-stop colour ramp used as the `cmap` of the confusion-matrix heatmap:
# white for the lowest values, salmon for the highest.
colors = ['#FFFFFF', '#E68B81']
color_map = LinearSegmentedColormap.from_list(name='custom_colormap', colors=colors)

In [4]:
def eval(clf, X, y_true, prefix='', labels=['Healthy Control', 'Lung Cancer'], plot=False):
    """Print classification metrics for ``clf`` on ``(X, y_true)``.

    The predicted class of each sample is the column index of its largest
    ``predict_proba`` value, so ``y_true`` is assumed to be integer-coded as
    ``0 .. n_classes - 1`` in the same order as the probability columns
    (NOTE(review): confirm against the classifier in use).

    Two probability columns (binary case): accuracy, precision and recall are
    derived from the confusion-matrix counts with class 1 as the positive
    class; ROC AUC uses the class-1 probabilities; F1 is the *weighted* F1.
    Any other number of columns: weighted precision/recall/F1 and one-vs-rest
    ROC AUC.

    NOTE(review): this function shadows the builtin ``eval``; the name is kept
    because existing cells call it.

    Parameters
    ----------
    clf : object exposing ``predict_proba(X)`` returning a 2-D array.
    X : feature matrix forwarded to ``clf.predict_proba``.
    y_true : ground-truth labels, one per row of ``X``.
    prefix : str
        Tag printed in front of every metric and embedded in the plot filename.
    labels : sequence of str
        Tick labels for the heatmap. Only applied when there is exactly one
        label per class; otherwise seaborn's default tick labels are kept.
        The default list is never mutated.
    plot : bool
        When truthy, draw the confusion matrix as a heatmap and save it as
        ``<timestamp>_confusion_matrix_<prefix>.svg`` in the working directory.

    Returns
    -------
    None. All results are printed.
    """
    y_proba = np.asarray(clf.predict_proba(X))
    n_classes = y_proba.shape[1]
    y_pred = np.argmax(y_proba, axis=1)

    # Compute the matrix once. Fixing `labels` keeps it n_classes x n_classes
    # even when a class is missing from this split, so the 2x2 unpacking below
    # cannot fail with an IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes))
    print(f'{prefix} :confusion_matrix: \n', cm)

    if n_classes == 2:
        tn, fp, fn, tp = cm.ravel()
        print(f'{prefix}: accuracy: ', (tp+tn)/(tp+tn+fp+fn))
        print(f'{prefix}: precision: ', tp/(tp+fp))
        print(f'{prefix}: recall: ', tp/(tp+fn))
        if np.unique(y_true).size < 2:
            # ROC AUC is undefined when only one class is present in y_true.
            roc = float('nan')
        else:
            roc = roc_auc_score(y_true, y_proba[:, 1], average='macro')
        print(f'{prefix}: roc: ', roc)
        # NOTE(review): weighted F1, whereas precision/recall above are for the
        # positive class only. Kept as-is so reported numbers do not change.
        print(f'{prefix}: f1:', f1_score(y_true, y_pred, average='weighted'))
    else:
        print(f'{prefix}: accuracy: ', accuracy_score(y_true, y_pred))
        print(f'{prefix}: precision: ', precision_score(y_true, y_pred, average='weighted'))
        print(f'{prefix}: recall: ', recall_score(y_true, y_pred, average='weighted'))
        print(f'{prefix}: roc: ', roc_auc_score(y_true, y_proba, multi_class='ovr'))
        print(f'{prefix}: f1:', f1_score(y_true, y_pred, average='weighted'))

    if plot:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap=color_map, ax=ax)

        # Setting a label list whose length differs from the number of ticks
        # raises in matplotlib, so only apply it when it matches the classes.
        if len(labels) == n_classes:
            ax.set_xticklabels(labels)
            ax.set_yticklabels(labels)
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        ax.set_title('Confusion Matrix')

        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f"{timestamp}_confusion_matrix_{prefix}.svg"
        fig.savefig(filename, format='svg')


In [16]:
# Read the table, using the first CSV column as the row index, then separate
# the predictors from the target.
data = pd.read_csv('data.csv', index_col=0)
X = data.iloc[:, :550]            # first 550 columns, kept as a DataFrame
y = data.iloc[:, -1].to_numpy()   # last column, converted to a NumPy array

In [8]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,541,542,543,544,545,546,547,548,549,550
195,0.001015,0.000945,0.000871,0.000805,0.000735,0.00069,0.000609,0.000561,0.000487,0.000441,...,0.000435,0.000456,0.000475,0.000496,0.000524,0.000549,0.000565,0.000585,0.000618,0.000647
18,0.001029,0.000903,0.000856,0.000725,0.000774,0.000699,0.000659,0.000512,0.000491,0.00045,...,0.000371,0.000367,0.000397,0.000401,0.000442,0.000458,0.000458,0.000506,0.000502,0.000547
93,0.00105,0.000975,0.000898,0.000829,0.000747,0.000676,0.000618,0.000546,0.000472,0.000421,...,0.000433,0.000448,0.000475,0.000497,0.000527,0.000542,0.00057,0.000588,0.000626,0.000647
67,0.000992,0.000925,0.000864,0.000798,0.000735,0.000677,0.000595,0.000552,0.000483,0.000428,...,0.000486,0.000507,0.000527,0.000552,0.000577,0.000599,0.000617,0.00065,0.000686,0.000711
45,0.000975,0.000925,0.000823,0.000776,0.00069,0.000605,0.000532,0.000489,0.000411,0.000388,...,0.000384,0.000414,0.000432,0.000446,0.000468,0.000486,0.000518,0.000534,0.000572,0.000584


In [10]:
# Peek at the first five target values.
y[0:5]

array([1, 1, 0, 1, 1])

In [23]:
# Base learners, in order: two random forests, two extra-trees ensembles
# (gini / entropy each), two 3-nearest-neighbour models (uniform / distance
# weighting) and one gradient-boosted model.
clfs = [
    *(
        RandomForestClassifier(n_estimators=65, n_jobs=-1, criterion=criterion, random_state=seed)
        for criterion, seed in (('gini', 99), ('entropy', 666))
    ),
    *(
        ExtraTreesClassifier(n_estimators=65, n_jobs=-1, criterion=criterion, random_state=seed)
        for criterion, seed in (('gini', 42), ('entropy', 666))
    ),
    *(
        KNeighborsClassifier(n_neighbors=3, n_jobs=-1, weights=weighting)
        for weighting in ('uniform', 'distance')
    ),
    xgb.XGBClassifier(n_estimators=65, learning_rate=0.1, max_depth=3),
]

# `MultiStack` comes from the star-imported local `model` module; the base
# learners are passed first and a logistic regression second.
# NOTE(review): presumably nStack / cv / nRepeat set the number of stacking
# levels, CV folds and repeats - confirm against model.py.
msclf = MultiStack(clfs, LogisticRegression(), nStack=2, cv=5, nRepeat=3)


In [24]:
# Hold out 30% of the samples (stratified on y, fixed seed) and fit the stack
# on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
msclf.fit(X_train, y_train)


X.shape (156, 550)
y_m.shape (156, 7)
X.shape (156, 557)
y_m.shape (156, 7)
X.shape (156, 564)
y_m.shape (156, 7)


In [25]:
# Report metrics for the fitted stack on the held-out split; plotting is off,
# so no SVG is written.
eval(
    msclf,
    X_test,
    y_test,
    prefix='Test',
    labels=['Healthy Control', 'Lung Cancer'],
    plot=False,
)

Test :confusion_matrix: 
 [[13  4]
 [ 0 50]]
Test: accuracy:  0.9402985074626866
Test: precision:  0.9259259259259259
Test: recall:  1.0
Test: roc:  0.9529411764705882
Test: f1: 0.9374665135859165
