In [1]:
import os
import numpy as np
import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
from sklearn import manifold
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import ShuffleSplit, LeaveOneOut
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import permutation_test_score

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [2]:
# Training cohort: the voxel-count table and the behavioural table, both indexed
# by the first column of their file. The printed value is the fraction of rows
# whose index labels agree between the two tables.
voxels = pd.read_csv("fszstatcope2_nvoxels_nz.csv", index_col=0)
behav = pd.read_csv("behav.txt", sep='\t', index_col=0)
print(np.mean(behav.index == voxels.index)) #should be 1.0

# Keep only the columns whose value in the second row (position 1) exceeds 27.
size_mask = (voxels.iloc[1, :] > 27).values
voxels_g27 = voxels.iloc[:, size_mask]

# Column positions whose name contains any of these substrings are dropped.
excluded_tags = ["Vent", "Stem", "Cerebellum", "CSF", "White", "plexus"]
remove_feature_idx = np.unique([
    position
    for position, column_name in enumerate(voxels_g27.columns)
    if any(tag in column_name for tag in excluded_tags)
])
print(voxels_g27.shape[1] - remove_feature_idx.shape[0]) # should be 58

kept_positions = np.setdiff1d(np.arange(voxels_g27.shape[1]), remove_feature_idx)
voxels_data = voxels_g27.iloc[:, kept_positions]

# The ROI-mean table is restricted to (and ordered like) the retained columns.
roi_columns = voxels_data.columns
roi_data = pd.read_csv("fszstatcope2_means_nz.csv", index_col=0).loc[:, roi_columns]
print(np.all(roi_data.columns == voxels_data.columns)) # should be true

# validation set: same column selection applied to the held-out files
vset_roi_data = pd.read_csv("fszstatcope2_means_nz_valset.csv", index_col=0).loc[:, roi_columns]
vset_voxel_data = pd.read_csv("fszstatcope2_nvoxels_nz_valset.csv", index_col=0).loc[:, roi_columns]
vbehav = pd.read_csv("valset_behav.csv", index_col=0)

1.0
58
True


In [3]:
def projection(data, covars):
    """Remove the linear effect of ``covars`` from every column of ``data``.

    With C the covariate matrix, this forms the matrix
    ``P = I - C (C^T C)^+ C^T`` (``^+`` is the Moore-Penrose pseudo-inverse)
    and returns ``P X``, where X holds the values of ``data``.

    Rows of ``data`` and ``covars`` are matched by position, not by index
    label, and no intercept column is added to the covariates.

    Parameters
    ----------
    data : pandas.DataFrame
        Values to residualise, one row per observation.
    covars : pandas.DataFrame
        Covariates, with the same number of rows as ``data``.

    Returns
    -------
    pandas.DataFrame
        Residualised values carrying the index and columns of ``data``.
    """
    covar_matrix = covars.values
    n_rows = covar_matrix.shape[0]

    gram_pinv = np.linalg.pinv(covar_matrix.T @ covar_matrix)
    hat_matrix = covar_matrix @ gram_pinv @ covar_matrix.T
    residual_maker = np.eye(n_rows) - hat_matrix

    return pd.DataFrame(
        residual_maker @ data.values,
        index=data.index,
        columns=data.columns,
    )

# Each cohort is residualised against its own covariate columns; the validation
# table uses the column name "sex" where the training table uses "gender".
train_covars = behav[["gender", "iq", "composite_motion"]]
valset_covars = vbehav[["sex","iq", "composite_motion"]]

data = projection(roi_data, train_covars)
vset_data = projection(vset_roi_data, valset_covars)

In [4]:
def evensplit_resampler(y, n_resamples, random_state=None):
    """Draw class-balanced bootstrap index sets from binary labels.

    Every resample contains ``len(y)`` indices: ``len(y) // 2`` are drawn with
    replacement from the positions where ``y == 1`` and the remaining ones
    from the positions where ``y == 0``. The indices are returned in random
    order.

    Parameters
    ----------
    y : array-like
        Labels; only entries equal to 1 or 0 are ever sampled.
    n_resamples : int
        Number of index sets to generate.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) uses NumPy's global random state, as before.
        An int seeds a private generator; a ``RandomState`` is used as is.

    Returns
    -------
    list of numpy.ndarray
        ``n_resamples`` one-dimensional integer arrays of length ``len(y)``.

    Raises
    ------
    ValueError
        Propagated from ``choice`` when a class that must be sampled has no
        members in ``y``.
    """
    y = np.asarray(y)

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    n_total = y.shape[0]
    n_class_one = n_total // 2
    n_class_two = n_total - n_class_one  # gets the extra draw when N is odd

    # The class memberships do not change between resamples, so look them up once.
    class_one_idx = np.where(y == 1)[0]
    class_two_idx = np.where(y == 0)[0]

    resamples = []
    for _ in range(int(n_resamples)):
        # concatenate (rather than np.array([...]).ravel()) so that draws of
        # unequal length, which happen for odd N, still give one flat array.
        drawn = np.concatenate([
            rng.choice(class_one_idx, size=n_class_one, replace=True),
            rng.choice(class_two_idx, size=n_class_two, replace=True),
        ])
        # A single permutation is already uniformly random.
        resamples.append(rng.permutation(drawn))
    return resamples

In [5]:
# Feature matrix from the residualised training table; labels are the
# "young_kid" column of the behavioural table.
X = data.values
y = behav["young_kid"].values

# One list per quantity recorded in the cross-validation loop.
results = {
    key: []
    for key in ("pred", "ytrue", "nf", "feature_names", "valset_score")
}

In [6]:
# Leave-one-out evaluation: in every fold a random forest ranks the features,
# the importance threshold is raised until at most `max_selected` features
# remain, and a forest fitted on those features predicts the held-out sample
# and the validation set.
thresh = 0.01        # starting importance threshold for SelectFromModel
thresh_step = 0.01   # increment applied while too many features survive
max_selected = 10    # upper bound on the number of retained features
cv = LeaveOneOut()
# NOTE(review): no random_state is set, so results vary between runs.
clf = RandomForestClassifier()

for idx, (train, test) in enumerate(cv.split(X, y)):
    sfm = SelectFromModel(clf, threshold=thresh)
    sfm.fit(X[train], y[train])

    # Always build this fold's training matrix here. Previously it was only
    # assigned inside the while loop, so a fold that started with <= 10
    # features reused the matrix of an earlier fold (or hit a NameError).
    X_transform = sfm.transform(X[train])
    n_features = X_transform.shape[1]

    while n_features > max_selected:
        sfm.threshold += thresh_step
        X_transform = sfm.transform(X[train])
        n_features = X_transform.shape[1]

    if n_features == 0:
        # Raising the threshold removed every feature; nothing to fit on.
        print("fold %d skipped: no feature importance reaches %.2f"
              % (idx, sfm.threshold))
        continue

    try:
        clf.fit(X_transform, y[train])
        fold_pred = clf.predict(sfm.transform(X[test]))
        # NOTE(review): the AUC below is computed from hard class labels,
        # not from predicted probabilities.
        fold_valset_score = roc_auc_score(
            vbehav.young_kid.values,
            clf.predict(sfm.transform(vset_data.values))
        )
    except ValueError as err:
        # Keep the run going, but report the failure instead of hiding it.
        print("fold %d skipped: %s" % (idx, err))
        continue

    # Append only once everything succeeded so all lists stay the same length.
    results["pred"].append(fold_pred)
    results["ytrue"].append(y[test])
    results["nf"].append(n_features)
    results["feature_names"].append(data.columns.values[sfm.get_support()])
    results["valset_score"].append(fold_valset_score)


In [7]:
# ROC AUC over the pooled held-out predictions of all recorded folds.
loo_truth = np.ravel(results["ytrue"])
loo_pred = np.ravel(results["pred"])
roc_auc_score(loo_truth, loo_pred)

0.5713725490196079

In [8]:
from collections import Counter

In [9]:
# How many folds selected each feature name.
Counter(
    feature_name
    for fold_features in results["feature_names"]
    for feature_name in fold_features
)

Counter({'ctx_rh_G_oc-temp_med-Parahip.nii.gz': 9,
         'ctx_lh_G_temp_sup-Plan_polar.nii.gz': 7,
         'Right-Amygdala.nii.gz': 21,
         'ctx_rh_G_front_middle.nii.gz': 36,
         'ctx_rh_G_precentral.nii.gz': 45,
         'ctx_lh_S_suborbital.nii.gz': 19,
         'ctx_rh_S_orbital-H_Shaped.nii.gz': 43,
         'ctx_lh_S_temporal_transverse.nii.gz': 29,
         'ctx_rh_Lat_Fis-post.nii.gz': 9,
         'ctx_rh_G_postcentral.nii.gz': 61,
         'Right-Hippocampus.nii.gz': 4,
         'ctx_lh_G_and_S_cingul-Ant.nii.gz': 21,
         'ctx_rh_G_orbital.nii.gz': 16,
         'ctx_rh_G_subcallosal.nii.gz': 4,
         'ctx_lh_S_temporal_inf.nii.gz': 10,
         'Left-Putamen.nii.gz': 8,
         'ctx_lh_Lat_Fis-post.nii.gz': 16,
         'ctx_lh_G_oc-temp_med-Parahip.nii.gz': 6,
         'ctx_lh_G_front_middle.nii.gz': 8,
         'Right-Caudate.nii.gz': 4,
         'ctx_rh_G_temporal_middle.nii.gz': 8,
         'ctx_lh_G_precentral.nii.gz': 14,
         'ctx_rh_G_and_S_t

In [10]:
# Average validation-set ROC AUC across the recorded folds.
valset_scores = results["valset_score"]
np.mean(valset_scores)

0.4531119465329991