In [1]:
import afq_transform as afqt
import itertools
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
nodes = pd.read_csv('data/nodes.csv')
targets = pd.read_csv('data/subjects.csv', index_col='subjectID').drop(['Unnamed: 0'], axis='columns')
targets

Unnamed: 0_level_0,ALSFRS,ALSFRSbulbar,age,class,diseaseduration,gender
subjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
subject_000,33,12,54,ALS,10,F
subject_001,18,8,69,ALS,36,M
subject_002,27,9,55,ALS,23,F
subject_003,32,12,52,ALS,15,M
subject_004,19,4,58,ALS,6,F
subject_005,33,8,67,ALS,48,F
subject_006,21,5,43,ALS,5,M
subject_007,38,8,67,ALS,3,M
subject_008,29,9,65,ALS,15,M
subject_009,25,9,72,ALS,1,F


In [3]:
transformer = afqt.AFQFeatureTransformer()
x, groups, cols = transformer.transform(nodes)

In [4]:
label_sets = afqt.multicol2sets(cols)
label_sets

array([{'Callosum Forceps Major', 'ad', '0'},
       {'Callosum Forceps Major', '1', 'ad'},
       {'Callosum Forceps Major', '2', 'ad'}, ...,
       {'volume', 'Right Uncinate', 'Uncinate', '97'},
       {'volume', 'Right Uncinate', 'Uncinate', '98'},
       {'volume', 'Right Uncinate', 'Uncinate', '99'}], dtype=object)

In [5]:
metric_idx = cols.names.index('metric')
tract_idx = cols.names.index('tractID')

tracts_and_metrics = list(
    itertools.product(cols.levels[metric_idx], cols.levels[tract_idx])
)

tracts_only = [(x, ) for x in cols.levels[tract_idx]]

handed_tracts = [x for x in tracts_only if 'Left' in x[0] or 'Right' in x[0]]

tracts_symm = list(set([
    (x[0].replace('Left ', '').replace('Right ', ''), )
    for x in handed_tracts
]))

group_labels = tracts_and_metrics + tracts_symm + tracts_only

# Get the feature group importances for classifying the subject class

In [6]:
y = targets['class']
y = y.map(lambda h: int(h == 'ALS')).values
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [7]:
importances = afqt.get_random_forest_group_scores(
    x, y, group_labels, label_sets, type='classifier'
)

100%|██████████| 100/100 [06:24<00:00,  3.85s/it]


In [8]:
for f, s in importances:
    print('{i:+7.5f} : {f!s}'.format(i=s, f=f))

+0.18714 : ('Corticospinal',)
+0.15180 : ('Right Corticospinal',)
+0.04361 : ('cl', 'Right Corticospinal')
+0.03262 : ('fa', 'Right Corticospinal')
+0.01339 : ('Left SLF',)
+0.01186 : ('rd', 'Right Corticospinal')
+0.00652 : ('torsion', 'Left IFOF')
+0.00618 : ('fa', 'Left Cingulum Cingulate')
+0.00522 : ('fa', 'Left Arcuate')
+0.00516 : ('torsion', 'Left Cingulum Cingulate')
+0.00508 : ('curvature', 'Left Arcuate')
+0.00501 : ('torsion', 'Right ILF')
+0.00491 : ('torsion', 'Left Uncinate')
+0.00488 : ('curvature', 'Right Cingulum Cingulate')
+0.00324 : ('torsion', 'Left Thalamic Radiation')
+0.00313 : ('ad', 'Right Uncinate')
+0.00248 : ('volume', 'Left Corticospinal')
+0.00211 : ('curvature', 'Left SLF')
+0.00206 : ('curvature', 'Right Arcuate')
+0.00195 : ('md', 'Right Cingulum Cingulate')
+0.00188 : ('fa', 'Right ILF')
+0.00181 : ('torsion', 'Left ILF')
+0.00166 : ('md', 'Right Corticospinal')
+0.00141 : ('ad', 'Right Corticospinal')
+0.00125 : ('cl', 'Left Cingulum Hippocampus')
+

# Restrict the feature groups to the non-overlapping tract-metric groups

In [9]:
tract_metric_importances = afqt.get_random_forest_group_scores(
    x, y, tracts_and_metrics, label_sets
)

100%|██████████| 100/100 [05:19<00:00,  3.20s/it]


In [14]:
for idx, (f, s) in enumerate(tract_metric_importances):
    print('{idx:03d}. {i:+7.5f} : {f!s}'.format(idx=idx+1, i=s, f=f))

001. +0.04307 : ('fa', 'Right Corticospinal')
002. +0.03956 : ('cl', 'Right Corticospinal')
003. +0.02490 : ('rd', 'Right Corticospinal')
004. +0.01778 : ('rd', 'Left Corticospinal')
005. +0.01240 : ('cl', 'Left Corticospinal')
006. +0.01234 : ('curvature', 'Left SLF')
007. +0.01112 : ('curvature', 'Left IFOF')
008. +0.00994 : ('curvature', 'Left Arcuate')
009. +0.00870 : ('md', 'Left Cingulum Hippocampus')
010. +0.00786 : ('fa', 'Left Corticospinal')
011. +0.00759 : ('fa', 'Right Uncinate')
012. +0.00697 : ('ad', 'Right Uncinate')
013. +0.00677 : ('ad', 'Left Cingulum Hippocampus')
014. +0.00674 : ('curvature', 'Callosum Forceps Minor')
015. +0.00669 : ('curvature', 'Left ILF')
016. +0.00644 : ('curvature', 'Right Thalamic Radiation')
017. +0.00627 : ('torsion', 'Left SLF')
018. +0.00621 : ('torsion', 'Right ILF')
019. +0.00602 : ('fa', 'Right Arcuate')
020. +0.00602 : ('md', 'Left Arcuate')
021. +0.00552 : ('torsion', 'Callosum Forceps Major')
022. +0.00549 : ('curvature', 'Left Unci

# Perform cross-validation to select the best $N$ non-overlapping groups

In [11]:
important_labels = [
    pair[0] for pair in tract_metric_importances
]

extractor = afqt.TopNGroupsExtractor()

rf = RandomForestClassifier(
    n_estimators=100,
)

steps = [
    ('extract', afqt.TopNGroupsExtractor()),
    ('forest', rf)
]

classify_pipeline = Pipeline(steps)

## Fit the gridsearch using accuracy

In [17]:
param_grid = {
    'extract__all_labels': [label_sets],
    'extract__labels_by_importance': [important_labels],
    'extract__top_n': np.arange(10, 161, 10),
    'forest__criterion': ['gini', 'entropy'],
    'forest__max_features': [10, 100, None],
    'forest__max_depth': [10, None],
}

scoring = {
    'AUC': 'roc_auc',
    'Accuracy': 'accuracy',
}

forest_classify_cv = GridSearchCV(
    classify_pipeline, param_grid=param_grid,
    scoring=scoring, cv=5, refit='Accuracy',
    verbose=1
)

In [18]:
forest_classify_cv.fit(x, y)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed: 30.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('extract', TopNGroupsExtractor(all_labels=None, labels_by_importance=None, top_n=10)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impur...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'extract__labels_by_importance': [[('fa', 'Right Corticospinal'), ('cl', 'Right Corticospinal'), ('rd', 'Right Corticospinal'), ('rd', 'Left Corticospinal'), ('cl', 'Left Corticospinal'), ('curvature', 'Left SLF'), ('curvature', 'Left IFOF'), ('curvature', 'Left Arcuate'), ('md', 'Left C...99'}], dtype=object)], 'forest__max_features': [10, None], 'forest__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit='Accuracy',
    

In [19]:
forest_classify_cv.best_params_

{'extract__all_labels': array([{'Callosum Forceps Major', 'ad', '0'},
        {'Callosum Forceps Major', '1', 'ad'},
        {'Callosum Forceps Major', '2', 'ad'}, ...,
        {'volume', 'Right Uncinate', 'Uncinate', '97'},
        {'volume', 'Right Uncinate', 'Uncinate', '98'},
        {'volume', 'Right Uncinate', 'Uncinate', '99'}], dtype=object),
 'extract__labels_by_importance': [('fa', 'Right Corticospinal'),
  ('cl', 'Right Corticospinal'),
  ('rd', 'Right Corticospinal'),
  ('rd', 'Left Corticospinal'),
  ('cl', 'Left Corticospinal'),
  ('curvature', 'Left SLF'),
  ('curvature', 'Left IFOF'),
  ('curvature', 'Left Arcuate'),
  ('md', 'Left Cingulum Hippocampus'),
  ('fa', 'Left Corticospinal'),
  ('fa', 'Right Uncinate'),
  ('ad', 'Right Uncinate'),
  ('ad', 'Left Cingulum Hippocampus'),
  ('curvature', 'Callosum Forceps Minor'),
  ('curvature', 'Left ILF'),
  ('curvature', 'Right Thalamic Radiation'),
  ('torsion', 'Left SLF'),
  ('torsion', 'Right ILF'),
  ('fa', 'Right Arcua

In [22]:
forest_classify_cv.best_score_

0.85416666666666663

In [23]:
np.max(forest_classify_cv.cv_results_['mean_test_Accuracy'])

0.85416666666666663