In [1]:
import os
import h5py
import numpy as np
import pandas as pd
from collections import Counter

% matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import LeaveOneOut, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

In [27]:
regions = dict(
    
    lfp = [
        "ctx_lh_S_front_sup.nii.gz",
        "ctx_lh_G_precentral.nii.gz",
        "ctx_lh_G_front_middle.nii.gz",
        "ctx_lh_G_and_S_cingul-Ant.nii.gz"
    ],
    
    rfp = [
        "ctx_rh_G_front_sup.nii.gz",
        "ctx_rh_G_postcentral.nii.gz",
        "ctx_rh_G_front_middle.nii.gz",
        "ctx_rh_G_front_inf-Orbital.nii.gz",
        "ctx_rh_G_and_S_transv_frontopol.nii.gz",
        "ctx_rh_G_precentral.nii.gz",
        "ctx_rh_S_front_middle.nii.gz",
        "ctx_rh_G_and_S_cingul-Ant.nii.gz"
    ],
    
    lt = [
        "ctx_lh_S_temporal_inf.nii.gz",
        "ctx_lh_G_temp_sup-G_T_transv.nii.gz",
        "ctx_lh_G_temporal_middle.nii.gz",
        "ctx_lh_G_temp_sup-Plan_polar.nii.gz",
        "ctx_lh_G_temp_sup-Lateral.nii.gz",
        "ctx_lh_G_temporal_inf.nii.gz",
        "ctx_lh_G_temp_sup-Plan_tempo.nii.gz",
        "ctx_lh_S_temporal_sup.nii.gz"
    ],
    
    rt = [
        "ctx_rh_G_temp_sup-Lateral.nii.gz",
        "ctx_rh_S_temporal_transverse.nii.gz",
        "ctx_rh_G_temp_sup-Plan_polar.nii.gz",
        "ctx_rh_G_temporal_middle.nii.gz",
        "ctx_rh_S_temporal_sup.nii.gz",
        "ctx_rh_G_temp_sup-Plan_tempo.nii.gz",
        "ctx_rh_G_temporal_inf.nii.gz",
        "ctx_rh_S_temporal_inf.nii.gz",
        "ctx_rh_G_temp_sup-G_T_transv.nii.gz"
    ],
    
    sc = [
        "Right-Thalamus-Proper.nii.gz",
        "Left-Thalamus-Proper.nii.gz",
        "Right-Accumbens-area.nii.gz",
        "Left-Accumbens-area.nii.gz",
        "Right-Caudate.nii.gz",
        "Left-Caudate.nii.gz",
        "Right-Putamen.nii.gz",
        "Left-Putamen.nii.gz",
        "Right-Amygdala.nii.gz",
        "Left-Amygdala.nii.gz",
        "Right-Hippocampus.nii.gz",
        "Left-Hippocampus.nii.gz"
    ]
    
)

In [19]:
voxels = pd.read_csv("fszstatcope2_nvoxels_nz.csv", index_col=0)
behav = pd.read_csv("behav.txt", sep='\t', index_col=0)
print(np.mean(behav.index == voxels.index)) #should be 1.0
behav2g = behav[np.logical_or(behav.young_kid == 1, behav.adult == 1)]
data2g = voxels[np.logical_or(behav.young_kid == 1, behav.adult == 1)]

1.0


In [4]:
def projection(data , covars):
    """for removing linear effect of covariates"""
    X, C = data.values, covars.values
    P = np.eye(C.shape[0]) - C.dot(np.linalg.pinv(C.T.dot(C))).dot(C.T)
    return pd.DataFrame(P.dot(X), columns=data.columns, index=data.index)

In [5]:
def random_forest_model(data, y, cv, clf):
    X, P = data.values, data.shape[1]
    results = dict(pred=[], ytrue=[], fi=[], fidx=[])
    results_null = dict(pred=[], ytrue=[], fi=[], fidx=[])
    
    for idx, (train, test) in enumerate(cv.split(X, y)):   
        clf.fit(X[train], y[train])
        results["pred"].append(clf.predict(X[test]))
        results["ytrue"].append(y[test])
        results["fi"].append(clf.feature_importances_)
        results["fidx"].append([True for i in range(P)])
        
        # this is to compute the null model
        try:
            y_shuff = np.copy(y[train])
            np.random.shuffle(y_shuff)
            clf.fit(X[train], y_shuff)
            results_null["pred"].append(clf.predict(X[test]))
            results_null["ytrue"].append(y[test])
            results_null["fi"].append(clf.feature_importances_)
            results_null["fidx"].append([True for i in range(P)])
        except:
            print("couldn't compute null model")
        
    
    roc = roc_auc_score(np.array(results["ytrue"]).ravel(), np.array(results["pred"]).ravel())
    
    # this is to compute the null model
    try:
        roc_null = roc_auc_score(
            np.array(results_null["ytrue"]).ravel(),
            np.array(results_null["pred"]).ravel()
        )
    except:
        print("couldn't compute null roc score")
        
    return results, roc, results_null, roc_null

(47, 118)

In [28]:
data_sets = {}

for key, group in regions.items():
    data_sets[key] = projection(
        data2g.loc[:, group], 
        behav2g.loc[:, ["gender", "iq", "composite_motion"]]
    )

print(data_sets.keys())

dict_keys(['lfp', 'rfp', 'lt', 'rt', 'sc'])


In [32]:
niters=10
rocs = {"lfp":[], "rfp":[], "lt":[], "rt":[], "sc":[]}
rocs_null = {"lfp":[], "rfp":[], "lt":[], "rt":[], "sc":[]}
res = {"lfp":[], "rfp":[], "lt":[], "rt":[], "sc":[]}
res_null = {"lfp":[], "rfp":[], "lt":[], "rt":[], "sc":[]}
 
for key, val in data_sets.items():
    for i_iter in np.arange(niters):
        cv=LeaveOneOut()
        clf=RandomForestClassifier(n_estimators=1000)
        
        res_g, roc_g, resnull_g, rocnull_g = random_forest_model(
            val, 
            behav2g.young_kid.values,
            cv,
            clf
        )
        
        rocs[key].append(roc_g)
        rocs_null[key].append(rocnull_g)
        res[key].append(res_g)
        res_null[key].append(resnull_g)

In [41]:
rocs_null

{'lfp': [0.44999999999999996,
  0.5272727272727273,
  0.6554545454545453,
  0.43272727272727274,
  0.5672727272727274,
  0.44454545454545447,
  0.5072727272727273,
  0.48454545454545456,
  0.6181818181818183,
  0.4218181818181818],
 'rfp': [0.5527272727272727,
  0.4645454545454546,
  0.47272727272727266,
  0.5472727272727274,
  0.5781818181818182,
  0.5700000000000001,
  0.4245454545454545,
  0.4872727272727273,
  0.5527272727272727,
  0.55],
 'lt': [0.48454545454545456,
  0.580909090909091,
  0.4645454545454546,
  0.5527272727272727,
  0.5927272727272728,
  0.6072727272727273,
  0.43909090909090914,
  0.600909090909091,
  0.3990909090909091,
  0.42727272727272725],
 'rt': [0.3936363636363637,
  0.3590909090909091,
  0.55,
  0.4645454545454546,
  0.4818181818181818,
  0.5072727272727273,
  0.4645454545454546,
  0.4954545454545454,
  0.44454545454545447,
  0.51],
 'sc': [0.6527272727272727,
  0.5327272727272727,
  0.49909090909090914,
  0.5327272727272727,
  0.5872727272727273,
  0.5527

In [35]:
data_sets.keys()

dict_keys(['lfp', 'rfp', 'lt', 'rt', 'sc'])

In [42]:
cv = LeaveOneOut()
clf = RandomForestClassifier(n_estimators=1000)
ypred = []
ytrue = []

X = data_sets["lfp"].values
y = behav2g.young_kid.values.copy()
for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    ypred.append(clf.predict(X[test]))
    ytrue.append(y[test])

In [43]:
print(
    roc_auc_score(
        np.array(ytrue).ravel(), 
        np.array(ypred).ravel()
    )
)

0.6754545454545453


In [44]:
cv = LeaveOneOut()
clf = RandomForestClassifier(n_estimators=1000)
ypred = []
ytrue = []

X = data_sets["rfp"].values
y = behav2g.young_kid.values.copy()

for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    ypred.append(clf.predict(X[test]))
    ytrue.append(y[test])
    
print(
    roc_auc_score(
        np.array(ytrue).ravel(), 
        np.array(ypred).ravel()
    )
)

0.6754545454545453


In [45]:
cv = LeaveOneOut()
clf = RandomForestClassifier(n_estimators=1000)
ypred = []
ytrue = []

X = data_sets["lt"].values
y = behav2g.young_kid.values.copy()

for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    ypred.append(clf.predict(X[test]))
    ytrue.append(y[test])
    
print(
    roc_auc_score(
        np.array(ytrue).ravel(), 
        np.array(ypred).ravel()
    )
)

0.6754545454545453


In [46]:
cv = LeaveOneOut()
clf = RandomForestClassifier(n_estimators=1000)
ypred = []
ytrue = []

X = data_sets["rt"].values
y = behav2g.young_kid.values.copy()

for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    ypred.append(clf.predict(X[test]))
    ytrue.append(y[test])
    
print(
    roc_auc_score(
        np.array(ytrue).ravel(), 
        np.array(ypred).ravel()
    )
)

0.6754545454545453


In [47]:
cv = LeaveOneOut()
clf = RandomForestClassifier(n_estimators=1000)
ypred = []
ytrue = []

X = data_sets["sc"].values
y = behav2g.young_kid.values.copy()

for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    ypred.append(clf.predict(X[test]))
    ytrue.append(y[test])
    
print(
    roc_auc_score(
        np.array(ytrue).ravel(), 
        np.array(ypred).ravel()
    )
)

0.6754545454545453


In [48]:
for key, val in data_sets.items():
    print(key, "\t", val.shape)

lfp 	 (47, 4)
rfp 	 (47, 8)
lt 	 (47, 8)
rt 	 (47, 9)
sc 	 (47, 12)


In [55]:
np.array(res["lfp"][0]["fi"]).mean(0)

array([0.24980795, 0.24923482, 0.25047727, 0.25047995])

In [56]:
np.array(res["rfp"][0]["fi"]).mean(0)

array([0.12510914, 0.12562666, 0.12444553, 0.12474343, 0.12428653,
       0.12534908, 0.12557131, 0.12486831])

In [57]:
np.array(res["lt"][0]["fi"]).mean(0)

array([0.12570894, 0.12456601, 0.12540947, 0.12450691, 0.12459881,
       0.12457208, 0.12568254, 0.12495524])

In [58]:
np.array(res["rt"][0]["fi"]).mean(0)

array([0.10977668, 0.11120313, 0.11143813, 0.11147274, 0.11126023,
       0.11164601, 0.1108153 , 0.11071083, 0.11167695])

In [59]:
np.array(res["sc"][0]["fi"]).mean(0)

array([0.0828093 , 0.08318331, 0.08328274, 0.0836781 , 0.08349906,
       0.08294614, 0.08366214, 0.08359392, 0.08316498, 0.08311352,
       0.08359311, 0.08347369])