In [1]:
import afq_transform as afqt
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.pipeline import Pipeline



In [2]:
nodes = pd.read_csv('data/nodes.csv')
targets = pd.read_csv('data/subjects.csv', index_col='subjectID').drop(['Unnamed: 0'], axis='columns')
targets

Unnamed: 0_level_0,ALSFRS,ALSFRSbulbar,age,class,diseaseduration,gender
subjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
subject_000,33,12,54,ALS,10,F
subject_001,18,8,69,ALS,36,M
subject_002,27,9,55,ALS,23,F
subject_003,32,12,52,ALS,15,M
subject_004,19,4,58,ALS,6,F
subject_005,33,8,67,ALS,48,F
subject_006,21,5,43,ALS,5,M
subject_007,38,8,67,ALS,3,M
subject_008,29,9,65,ALS,15,M
subject_009,25,9,72,ALS,1,F


In [3]:
transformer = afqt.AFQFeatureTransformer()
x, groups, cols = transformer.transform(nodes)

In [4]:
label_sets = afqt.multicol2sets(cols)
label_sets

array([{'0', 'Callosum Forceps Major', 'ad'},
       {'Callosum Forceps Major', 'ad', '1'},
       {'2', 'Callosum Forceps Major', 'ad'}, ...,
       {'Right Uncinate', 'Uncinate', 'volume', '97'},
       {'Right Uncinate', 'Uncinate', 'volume', '98'},
       {'Right Uncinate', 'Uncinate', 'volume', '99'}], dtype=object)

In [5]:
metric_idx = cols.names.index('metric')
tract_idx = cols.names.index('tractID')

tracts_and_metrics = list(
    itertools.product(cols.levels[metric_idx], cols.levels[tract_idx])
)

tracts_only = [(x, ) for x in cols.levels[tract_idx]]

handed_tracts = [x for x in tracts_only if 'Left' in x[0] or 'Right' in x[0]]

tracts_symm = list(set([
    (x[0].replace('Left ', '').replace('Right ', ''), )
    for x in handed_tracts
]))

group_labels = tracts_and_metrics + tracts_symm + tracts_only

# Get the feature group importances for classifying the subject class

In [6]:
y = targets['ALSFRS'].values
y = y[y > 0]
x = x[y > 0]
y

  app.launch_new_instance()


array([33, 18, 27, 32, 19, 33, 21, 38, 29, 25, 33, 30, 21, 20, 18, 37, 33,
       29, 15, 24, 25, 34, 34, 35])

In [9]:
importances = afqt.get_random_forest_group_scores(
    x, y, group_labels, label_sets, type='regressor', n_splits=100
)

100%|██████████| 100/100 [06:57<00:00,  4.18s/it]


In [10]:
for idx, (f, s) in enumerate(importances):
    print('{idx:03d}. {i:+7.5f} : {f!s}'.format(idx=idx+1, i=s, f=f))

001. +0.02770 : ('Corticospinal',)
002. +0.01998 : ('SLF',)
003. +0.01940 : ('IFOF',)
004. +0.01428 : ('Left Corticospinal',)
005. +0.01413 : ('ILF',)
006. +0.01406 : ('Left IFOF',)
007. +0.01305 : ('Left SLF',)
008. +0.01079 : ('Arcuate',)
009. +0.00896 : ('Right SLF',)
010. +0.00734 : ('Right Corticospinal',)
011. +0.00595 : ('rd', 'Left Corticospinal')
012. +0.00585 : ('md', 'Left IFOF')
013. +0.00580 : ('ad', 'Left IFOF')
014. +0.00489 : ('Right Arcuate',)
015. +0.00442 : ('Left Cingulum Hippocampus',)
016. +0.00419 : ('Right Thalamic Radiation',)
017. +0.00418 : ('rd', 'Left ILF')
018. +0.00397 : ('Left ILF',)
019. +0.00389 : ('curvature', 'Left IFOF')
020. +0.00365 : ('curvature', 'Right SLF')
021. +0.00365 : ('cl', 'Left SLF')
022. +0.00360 : ('md', 'Left Arcuate')
023. +0.00359 : ('md', 'Left Corticospinal')
024. +0.00355 : ('Thalamic Radiation',)
025. +0.00312 : ('ad', 'Left SLF')
026. +0.00302 : ('Right ILF',)
027. +0.00297 : ('torsion', 'Left ILF')
028. +0.00290 : ('md', 'Le

# Restrict the feature groups to the non-overlapping tract-metric groups

In [None]:
tract_metric_importances = afqt.get_random_forest_group_scores(
    x, y, tracts_and_metrics, label_sets, type='regressor'
)

In [None]:
for idx, (f, s) in enumerate(tract_metric_importances):
    print('{idx:03d}. {i:+7.5f} : {f!s}'.format(idx=idx+1, i=s, f=f))

# Perform cross-validation to select the best $N$ non-overlapping groups

In [None]:
important_labels = [
    pair[0] for pair in tract_metric_importances
]

extractor = afqt.TopNGroupsExtractor()

rf = RandomForestRegressor(
    n_estimators=100,
)

steps = [
    ('extract', afqt.TopNGroupsExtractor()),
    ('forest', rf)
]

classify_pipeline = Pipeline(steps)

## Fit the gridsearch using accuracy

In [None]:
param_grid = {
    'extract__all_labels': [label_sets],
    'extract__labels_by_importance': [important_labels],
    'extract__top_n': np.arange(10, 161, 10),
    'forest__max_features': [10, 100, None],
    'forest__max_depth': [10, None],
}

scoring = {
    'MSE': 'neg_mean_squared_error',
    'R2': 'r2',
}

forest_regress_cv = GridSearchCV(
    classify_pipeline, param_grid=param_grid,
    scoring=scoring, cv=5, refit='R2',
    verbose=2
)

In [None]:
forest_regress_cv.fit(x, y)

In [None]:
forest_regress_cv.best_params_

In [None]:
forest_regress_cv.best_score_

In [None]:
y_pred = forest_regress_cv.best_estimator_.predict(x)

In [None]:
plt.plot(y, color='blue')
plt.plot(y_pred, color='red')