# Attempt to replicate performance
In this notebook, we replicate the machine learning comparison performed in the paper. Unfortunately, we are not able to replicate the performance reported. In fact, only two methods are able to consistently outperform random guessing. Moreover, these two methods show an AUC of ~0.55 at best, far below the performance found in the paper of AUC 0.79.

In [92]:
import pathlib
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier, LogisticRegression, ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

In [3]:
# Load patient class labels
case_control_df = pd.read_excel('../controlcase.xlsx')
patient_id_to_case = case_control_df[['DummyID', 'Class']].set_index('DummyID')['Class'].to_dict()

In [4]:
# Path to feature summary statistic data
data_path = pathlib.Path('../data/secure/')

In [5]:
# Arbitrary path to features directory in order to extract names
# of relevant features
feature_path = next(data_path.glob('*/*/*/feature/'))

feature_names = list()
for filename in feature_path.glob('*.nii.gz'):
    feature_name = re.search('(?<=_norm_).+(?=\.nii\.gz)', filename.name).group()
    feature_names.append(feature_name)
    
feature_names = sorted(feature_names)

In [6]:
# Aggregate the data into feature vectors
# Note: Median was not used in the paper
feature_vectors = []
classes = []

for sheet_path in data_path.glob('*/*/sheet/'):
    patient_id = int(sheet_path.parent.parent.name)
    
    mean_path = next(sheet_path.glob('*_mean.csv'))
#     median_path = next(sheet_path.glob('*_median.csv'))
    std_path = next(sheet_path.glob('*_std.csv'))
    
    mean_df = pd.read_csv(mean_path)
#     median_df = pd.read_csv(median_path)
    std_df = pd.read_csv(std_path)
    
    mean_df.dropna(inplace=True)
    std_df.dropna(inplace=True)
    if len(mean_df) == 0 or len(std_df) == 0:
        continue
    
    # Order feature vector as mean, std with all values in the same order
    feat = np.concatenate((
        mean_df.iloc[0,:][feature_names].values.flatten(), 
        std_df.iloc[0,:][feature_names].values.flatten()
    ))
    feature_vectors.append(feat)
    classes.append(patient_id_to_case[patient_id])

In [7]:
# Combine data into two numpy arrays
x = np.zeros((len(feature_vectors), len(feature_vectors[0])))
for i, v in enumerate(feature_vectors):
    x[i] = v
y = np.array(classes)

In [112]:
# Train/test split. Verify that its roughly stratified.
# Can't shuffle because some patients have multiple samples
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

In [9]:
y_train.sum() / len(y_train)

0.18900343642611683

In [10]:
y_test.sum() / len(y_test)

0.2328767123287671

## Naive machine learning methods
The following pipeline gave the best and most robust performance: [min/max scaling, PCA, gradient boosting classifier].

In [114]:
for model in [LogisticRegression(), SGDClassifier(), 
              RandomForestClassifier(n_estimators=20), 
              GradientBoostingClassifier(), AdaBoostClassifier()]:
    print(model)
    mmscaler = MinMaxScaler()
    scaler = PCA()
    pipe = Pipeline([('minmax', mmscaler), ('PCA', scaler), ('ML model', model)])

    pipe.fit(X_train, y_train)
    accuracy = pipe.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

    preds = pipe.predict(X_test)
    auc = roc_auc_score(y_test, preds)
    print(f"AUC: {auc}\n")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy: 0.771689497716895
AUC: 0.5098039215686274

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Accuracy: 0.410958904109589
AUC: 0.5273109243697479

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, 



Accuracy: 0.776255707762557
AUC: 0.5469187675070027

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy: 0.7168949771689498
AUC: 0.4877450980392157



## Elastic net feature selection
Unlike in the paper, elastic net narrowed to 10 features. Only after reducing the L1 penalty by 50% do 12 features appear. 

See regressor docs:
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [95]:
elnet = ElasticNet(alpha=0.5)
elnet.fit(X_train, y_train)

selected_coefs = list()
for i, coef in enumerate(elnet.coef_):
    if np.abs(coef) > 1e-9:
        if i < len(feature_names):
            name = feature_names[i]
            print(coef, name)
        else:
            name = feature_names[i % len(feature_names)]
            print(coef, name)
        selected_coefs.append(name)
        
print(len(selected_coefs))

1.3294124516672958e-07 cooccurrence_win_97_sliding_97_numbin_128_offset_17_clusterProminence
2.2654627417148872e-05 cooccurrence_win_97_sliding_97_numbin_128_offset_17_clusterShade
-2.2188121038542043e-08 cooccurrence_win_97_sliding_97_numbin_128_offset_17_correlation
-8.120668485534963e-05 cooccurrence_win_97_sliding_97_numbin_128_offset_17_energy
0.00013606320831356636 cooccurrence_win_97_sliding_97_numbin_128_offset_17_entropy
4.8163651208789366e-05 cooccurrence_win_97_sliding_97_numbin_128_offset_17_haralickCorrelation
-1.7441680255954657e-07 cooccurrence_win_97_sliding_97_numbin_128_offset_17_inertia
2.3826922148237065e-05 cooccurrence_win_97_sliding_97_numbin_128_offset_17_inverseDifferenceMoment
-2.952618964204289e-09 edgeenhance_win_97_sliding_97_Eta_10_epsi_10_radius_edge_8_radius_8_edge_enhance
-3.658555851167091e-05 graylevel_win_97_sliding_97_numbin_128_5th
0.00025342873072403005 graylevel_win_97_sliding_97_numbin_128_5thmean
2.0116432573701287e-05 graylevel_win_97_sliding_

## Predict using elastic net selected features

No method is able to replicate the performance of the MinMax/PCA/GradientBoosting pipeline above, which used all available features as input.

In [96]:
# Cut down train/test data to those features selected by elastic net
X_train = X_train[:, np.abs(elnet.coef_) > 1e-9]
X_test = X_test[:, np.abs(elnet.coef_) > 1e-9]

In [110]:
# ML models without any scaling or preprocessing
for model in [LogisticRegression(), SGDClassifier(), 
              RandomForestClassifier(n_estimators=20), 
              GradientBoostingClassifier(), AdaBoostClassifier()]:
    
    print(model)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

    preds = model.predict(X_test)
    auc = roc_auc_score(y_test, preds)
    print(f"AUC: {auc}\n")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy: 0.7671232876712328
AUC: 0.5

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Accuracy: 0.7671232876712328
AUC: 0.5

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
 



Accuracy: 0.7579908675799086
AUC: 0.507703081232493



In [98]:
# ML pipeline with MinMaxScaler scaling
for model in [LogisticRegression(), SGDClassifier(), 
              RandomForestClassifier(n_estimators=20), 
              GradientBoostingClassifier(), AdaBoostClassifier()]:
    print(model)
    scaler = MinMaxScaler()
    pipe = Pipeline([('scale', scaler), ('ML model', model)])

    pipe.fit(X_train, y_train)
    accuracy = pipe.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

    preds = pipe.predict(X_test)
    auc = roc_auc_score(y_test, preds)
    print(f"AUC: {auc}\n")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy: 0.7671232876712328
AUC: 0.5

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Accuracy: 0.2328767123287671
AUC: 0.5

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
 



Accuracy: 0.7579908675799086
AUC: 0.507703081232493

