In [1]:
import afqinsight as afqi
import cloudknot as ck
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.pipeline import Pipeline

%matplotlib inline



In [2]:
# Read the two input tables. subjects.csv is indexed by 'subjectID' and its
# leftover 'Unnamed: 0' column is dropped before use.
nodes = pd.read_csv('../afqinsight/data/nodes.csv')
targets = (
    pd.read_csv('../afqinsight/data/subjects.csv', index_col='subjectID')
    .drop(columns=['Unnamed: 0'])
)
targets

Unnamed: 0_level_0,ALSFRS,ALSFRSbulbar,age,class,diseaseduration,gender
subjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
subject_000,33,12,54,ALS,10,F
subject_001,18,8,69,ALS,36,M
subject_002,27,9,55,ALS,23,F
subject_003,32,12,52,ALS,15,M
subject_004,19,4,58,ALS,6,F
subject_005,33,8,67,ALS,48,F
subject_006,21,5,43,ALS,5,M
subject_007,38,8,67,ALS,3,M
subject_008,29,9,65,ALS,15,M
subject_009,25,9,72,ALS,1,F


In [3]:
# Run the afqinsight feature transformer on the nodes table. It returns three
# values: `x` (used below as the model input), `groups`, and `cols` (the
# column labels, which expose `.names` and `.levels`).
# NOTE(review): exact shapes/types are defined by afqinsight — confirm there.
transformer = afqi.AFQFeatureTransformer()
x, groups, cols = transformer.transform(nodes)

In [4]:
# Convert the column labels into an array of label sets (displayed below);
# presumably one set per feature column — confirm against afqinsight.
label_sets = afqi.multicol2sets(cols)
label_sets

array([{'0', 'ad', 'Callosum Forceps Major'},
       {'ad', 'Callosum Forceps Major', '1'},
       {'2', 'ad', 'Callosum Forceps Major'}, ...,
       {'volume', 'Right Uncinate', 'Uncinate', '97'},
       {'volume', 'Right Uncinate', 'Uncinate', '98'},
       {'volume', 'Right Uncinate', '99', 'Uncinate'}], dtype=object)

In [5]:
# Positions of the 'metric' and 'tractID' levels within the column labels.
metric_idx = cols.names.index('metric')
tract_idx = cols.names.index('tractID')

# Every (metric, tract) combination.
tracts_and_metrics = list(
    itertools.product(cols.levels[metric_idx], cols.levels[tract_idx])
)

# One-element tuples, one per tract. The loop variables below are deliberately
# not called `x`, because `x` is the feature matrix used by later cells.
tracts_only = [(tract, ) for tract in cols.levels[tract_idx]]

# Tracts whose name contains a hemisphere marker.
handed_tracts = [
    label for label in tracts_only
    if 'Left' in label[0] or 'Right' in label[0]
]

# Hemisphere-agnostic tract names, de-duplicated. `sorted` gives a stable
# order; iterating a plain set of strings can differ between kernel sessions.
tracts_symm = sorted({
    (label[0].replace('Left ', '').replace('Right ', ''), )
    for label in handed_tracts
})

# All candidate feature groups. These overlap: a tract-only group contains the
# features of every (metric, tract) group for that tract.
group_labels = tracts_and_metrics + tracts_symm + tracts_only

# Get the feature group importances for classifying the subject class

In [6]:
# Binary target as a NumPy array: 1 where the subject's class is 'ALS',
# 0 otherwise.
y = targets['class'].eq('ALS').astype('int64').values
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [7]:
# Score every group in `group_labels` with afqinsight's random-forest helper
# in classifier mode. This is slow: the recorded run below took about
# 15 minutes for 200 splits.
# NOTE(review): no random seed is passed, so scores may differ between runs —
# confirm whether the helper accepts one.
importances = afqi.get_random_forest_group_scores(
    x, y, group_labels, label_sets, type='classifier', n_splits=200
)

100%|██████████| 200/200 [15:10<00:00,  4.55s/it]


In [8]:
# Print each (group, score) pair as "<signed score> : <group>".
for group, score in importances:
    print('{i:+7.5f} : {f!s}'.format(i=score, f=group))

+0.09837 : ('Corticospinal',)
+0.07356 : ('Right Corticospinal',)
+0.03378 : ('cl', 'Right Corticospinal')
+0.02843 : ('fa', 'Right Corticospinal')
+0.02014 : ('rd', 'Left Corticospinal')
+0.01157 : ('rd', 'Right Corticospinal')
+0.01061 : ('SLF',)
+0.00923 : ('md', 'Right Corticospinal')
+0.00868 : ('fa', 'Left SLF')
+0.00810 : ('rd', 'Left SLF')
+0.00740 : ('cl', 'Left Corticospinal')
+0.00720 : ('md', 'Left Thalamic Radiation')
+0.00640 : ('fa', 'Left Corticospinal')
+0.00608 : ('curvature', 'Left SLF')
+0.00408 : ('volume', 'Callosum Forceps Major')
+0.00404 : ('md', 'Left Arcuate')
+0.00360 : ('torsion', 'Left Uncinate')
+0.00323 : ('curvature', 'Left IFOF')
+0.00309 : ('volume', 'Right Cingulum Hippocampus')
+0.00300 : ('curvature', 'Left Uncinate')
+0.00282 : ('rd', 'Left ILF')
+0.00277 : ('curvature', 'Left Cingulum Hippocampus')
+0.00257 : ('cl', 'Right Arcuate')
+0.00202 : ('torsion', 'Right SLF')
+0.00182 : ('md', 'Callosum Forceps Minor')
+0.00095 : ('rd', 'Right SLF')
+0.0

# Restrict the feature groups to the non-overlapping tract-metric groups

In [9]:
# Same scoring as above, restricted to the (metric, tract) groups only.
# NOTE(review): unlike the earlier call, `type` is not passed here, so the
# helper's default is used — confirm that the default is the classifier.
tract_metric_importances = afqi.get_random_forest_group_scores(
    x, y, tracts_and_metrics, label_sets, n_splits=200
)

100%|██████████| 200/200 [11:11<00:00,  3.36s/it]


In [10]:
# Print the groups with a 1-based, zero-padded rank in front of each score.
for rank, (group, score) in enumerate(tract_metric_importances, start=1):
    print('{idx:03d}. {i:+7.5f} : {f!s}'.format(idx=rank, i=score, f=group))

001. +0.01931 : ('cl', 'Right Corticospinal')
002. +0.01013 : ('fa', 'Right Corticospinal')
003. +0.00744 : ('rd', 'Right Corticospinal')
004. +0.00546 : ('md', 'Left Arcuate')
005. +0.00534 : ('ad', 'Right Cingulum Cingulate')
006. +0.00513 : ('md', 'Left Thalamic Radiation')
007. +0.00474 : ('ad', 'Right Uncinate')
008. +0.00458 : ('volume', 'Right Cingulum Cingulate')
009. +0.00425 : ('volume', 'Right Arcuate')
010. +0.00402 : ('rd', 'Left Corticospinal')
011. +0.00341 : ('cl', 'Right Uncinate')
012. +0.00339 : ('fa', 'Left Cingulum Cingulate')
013. +0.00269 : ('fa', 'Callosum Forceps Major')
014. +0.00239 : ('torsion', 'Right ILF')
015. +0.00221 : ('curvature', 'Left Uncinate')
016. +0.00219 : ('volume', 'Right Uncinate')
017. +0.00208 : ('md', 'Left Uncinate')
018. +0.00207 : ('curvature', 'Right Thalamic Radiation')
019. +0.00194 : ('torsion', 'Left Cingulum Hippocampus')
020. +0.00189 : ('md', 'Left Corticospinal')
021. +0.00182 : ('ad', 'Left SLF')
022. +0.00154 : ('ad', 'Left 

# Perform cross-validation to select the best $N$ non-overlapping groups

In [13]:
def grid_search_groups_results(x, y, top_n, label_sets, important_labels,
                               random_state=None):
    """Grid-search a random forest on the `top_n` most important groups.

    Builds a two-step pipeline (``afqi.TopNGroupsExtractor`` followed by a
    ``RandomForestClassifier``) and runs a ``GridSearchCV`` over the forest's
    ``max_features`` and ``max_depth`` using repeated stratified 3-fold
    cross-validation (40 repeats), scored by ROC AUC and accuracy.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Class labels passed to ``GridSearchCV.fit``.
    top_n : int
        Value for the extractor's ``top_n`` parameter.
    label_sets : sequence
        Value for the extractor's ``all_labels`` parameter.
    important_labels : sequence
        Value for the extractor's ``labels_by_importance`` parameter.
    random_state : int, RandomState instance or None, optional
        Seed handed to both the forest and the CV splitter. The default
        (None) keeps the original unseeded behaviour; pass an int to make
        the results reproducible.

    Returns
    -------
    dict
        The ``cv_results_`` attribute of the fitted ``GridSearchCV``.
    """
    # The imports live inside the function because it is handed to
    # ``ck.Knot(func=...)``; presumably it runs in a separate environment
    # where the notebook's top-level imports are not available.
    import afqinsight as afqi
    from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline

    classify_pipeline = Pipeline([
        ('extract', afqi.TopNGroupsExtractor()),
        ('forest', RandomForestClassifier(
            n_estimators=100, criterion='entropy', random_state=random_state
        )),
    ])

    # The three extractor entries are single-element lists: they are fixed
    # settings routed through the grid, not values being searched over.
    param_grid = {
        'extract__all_labels': [label_sets],
        'extract__labels_by_importance': [important_labels],
        'extract__top_n': [top_n],
        'forest__max_features': ['log2', 'sqrt'],
        'forest__max_depth': [5, 10],
    }

    scoring = {
        'AUC': 'roc_auc',
        'Accuracy': 'accuracy',
    }

    cv_generator = RepeatedStratifiedKFold(
        n_splits=3, n_repeats=40, random_state=random_state
    )

    # With several scorers, `refit` names the one used to pick the best model.
    forest_classify_cv = GridSearchCV(
        classify_pipeline, param_grid=param_grid,
        scoring=scoring, cv=cv_generator, refit='Accuracy',
        return_train_score=True, verbose=1
    )

    forest_classify_cv.fit(x, y)

    return forest_classify_cv.cv_results_

In [46]:
# Select the 'us-east-2' region for cloudknot before creating any resources.
ck.set_region('us-east-2')

In [49]:
# Wrap the grid-search function in a cloudknot Knot so it can be mapped over
# many argument tuples.
knot = ck.Knot(
    func=grid_search_groups_results,
    name='gridsearch_cv_groups',
    # A single URL string (parentheses without a trailing comma do not make a
    # tuple, so the value passed is a plain str).
    image_github_installs='https://github.com/richford/AFQ-Insight.git',
)

In [50]:
# Hold out 20% of the samples as a test set, stratified on the class label.
# NOTE(review): no random_state is given, so the split changes on every run.
# NOTE(review): tract_metric_importances was computed above from the full
# x, y (before this split), so the held-out samples influenced the group
# ranking used below — confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)

In [51]:
# Keep only the group label from each (label, score) pair, preserving the
# order of tract_metric_importances.
important_labels = [label for label, _score in tract_metric_importances]

In [52]:
# One argument tuple per candidate top_n, for top_n = 1 .. 160; each tuple
# matches the positional parameters of grid_search_groups_results.
args = [
    (x_train, y_train, top_n, label_sets, important_labels)
    for top_n in np.arange(1, 161)
]

In [53]:
# Submit the knot's function once per tuple in `args`. The returned object
# exposes running(), done() and result(), which the cells below use.
# NOTE(review): starmap=True presumably unpacks each tuple into positional
# arguments — confirm against the cloudknot documentation.
cvgrids = knot.map(args, starmap=True)

In [54]:
# Keep a handle on the first job recorded on the knot so its status can be polled.
cvjob = knot.jobs[0]

In [73]:
# Show the job's status dictionary; re-run this cell to poll for progress.
cvjob.status

{'arrayProperties': {'size': 160,
  'statusSummary': {'FAILED': 0,
   'PENDING': 0,
   'RUNNABLE': 0,
   'RUNNING': 0,
   'STARTING': 0,
   'SUBMITTED': 0,
   'SUCCEEDED': 160}},
 'attempts': [],
 'status': 'SUCCEEDED',
 'statusReason': None}

In [89]:
# Report whether the mapped computation is still running or has finished.
print('Running: ', cvgrids.running())
print('Done:    ', cvgrids.done())

Running:  False
Done:     True


In [90]:
# Collect the results of the mapped calls; the cells below iterate over them
# and wrap each one in a DataFrame.
# NOTE(review): result() presumably blocks until the jobs finish — confirm.
all_cv_grids = cvgrids.result()

In [None]:
# Sanity check on the number of collected results.
# NOTE(review): expected to match the 160 argument tuples submitted — confirm.
len(all_cv_grids)

In [97]:
# Turn each collected result into a DataFrame, then stack them into a single
# table with a fresh 0..N-1 index.
df_list = [pd.DataFrame(cv_result) for cv_result in all_cv_grids]
cv_df = pd.concat(df_list, ignore_index=True)
cv_df

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_AUC,mean_test_Accuracy,mean_train_AUC,mean_train_Accuracy,param_extract__all_labels,param_extract__labels_by_importance,param_extract__top_n,param_forest__max_depth,...,split9_test_AUC,split9_test_Accuracy,split9_train_AUC,split9_train_Accuracy,std_fit_time,std_score_time,std_test_AUC,std_test_Accuracy,std_train_AUC,std_train_Accuracy
0,0.129105,0.018709,0.892779,0.788816,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",1,5,...,1.000000,1.000000,1.0,1.0,0.007742,0.000620,0.081915,0.099797,1.755417e-17,0.0
1,0.169899,0.025670,0.888048,0.783553,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",1,5,...,1.000000,1.000000,1.0,1.0,0.045699,0.006711,0.085755,0.099311,1.013490e-17,0.0
2,0.234329,0.038271,0.891197,0.787500,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",1,10,...,0.979592,0.928571,1.0,1.0,0.038498,0.017190,0.087836,0.107023,2.026981e-17,0.0
3,0.235339,0.037265,0.888033,0.784868,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",1,10,...,1.000000,0.928571,1.0,1.0,0.031951,0.004852,0.085147,0.105858,1.013490e-17,0.0
4,0.257103,0.042874,0.912704,0.819079,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",2,5,...,1.000000,0.928571,1.0,1.0,0.026858,0.005416,0.070778,0.098019,1.755417e-17,0.0
5,0.261318,0.041331,0.902820,0.809211,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",2,5,...,1.000000,0.928571,1.0,1.0,0.035410,0.005191,0.082589,0.094796,1.755417e-17,0.0
6,0.273970,0.043768,0.916949,0.817105,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",2,10,...,1.000000,0.928571,1.0,1.0,0.021445,0.009931,0.068040,0.098152,1.755417e-17,0.0
7,0.260483,0.043876,0.908145,0.807237,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {1, Callosum...","[(cl, Right Corticospinal), (fa, Right Cortico...",2,10,...,1.000000,1.000000,1.0,1.0,0.033576,0.006711,0.074269,0.090888,0.000000e+00,0.0
8,0.153097,0.030939,0.905592,0.809211,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {ad, Callosu...","[(cl, Right Corticospinal), (fa, Right Cortico...",3,5,...,0.734694,0.571429,1.0,1.0,0.029206,0.007594,0.074786,0.098880,1.755417e-17,0.0
9,0.239304,0.049709,0.895081,0.791447,1.0,1.0,"[{0, ad, Callosum Forceps Major}, {ad, Callosu...","[(cl, Right Corticospinal), (fa, Right Cortico...",3,5,...,0.724490,0.642857,1.0,1.0,0.044481,0.007510,0.081040,0.101046,2.482534e-17,0.0


In [98]:
# Drop the two bulky parameter columns, which repeat the same label
# collections on every row. Rebinding with errors='ignore' (rather than
# inplace=True) makes the cell safe to re-run: a second run is a no-op
# instead of raising KeyError for columns that are already gone.
cv_df = cv_df.drop(
    columns=['param_extract__all_labels', 'param_extract__labels_by_importance'],
    errors='ignore',
)

In [None]:
# Optional: uncomment to save the combined CV results to a pickle file so they
# can be reloaded later without collecting the job results again.
# cv_df.to_pickle('rf_classify_top_n_cv_results_160.pkl')

KeyboardInterrupt: 

In [None]:
# Select the rows whose `max_features` setting is missing (None/NaN), so each
# top_n contributes rows from a single max_features setting.
# `Series.isna()` is element-wise. The previous `... is None` compared the
# Series object itself to None, which is always the scalar False, so
# `cv_df[False]` was a column lookup rather than a row filter.
# NOTE(review): the parameter grid defined in this notebook only contains
# 'log2' and 'sqrt' for max_features, so with those results this mask selects
# no rows — confirm which setting should be plotted
# (e.g. `cv_df['param_forest__max_features'] == 'sqrt'`).
mask = cv_df['param_forest__max_features'].isna()
cv_df[mask].plot(x='param_extract__top_n', y='mean_test_AUC', label='Test AUC')
cv_df[mask].plot(x='param_extract__top_n', y='mean_test_Accuracy', label='Test Accuracy')