### Importing Python Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import robust_scale
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import zscore
from tqdm import tqdm
from IPython.display import display as printdf
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)

### Stub to initialize folder paths and define helper functions

In [2]:
path = "../ABCD 3.0/"
to_path = '../data/'
read_file = lambda file_name: pd.read_csv(file_name, delimiter='\t', skiprows=[1], index_col='subjectkey', low_memory=False)
baseline = lambda df: df[df['eventname']=='baseline_year_1_arm_1']
followup = lambda df: df[df['eventname']=='1_year_follow_up_y_arm_1']

### Code to extract and pre-process Structural MRI ABCD 3.0 Data

In [3]:
smrip101 = baseline(read_file(path+'abcd_smrip101.txt'))
a = pd.Series(smrip101.columns)
cols = a[a.str.contains('smri_vol_cdk_')].values
smrip101 = smrip101[cols]
print('smrip101 shape:', smrip101.shape)

smrip201 = baseline(read_file(path+'abcd_smrip201.txt'))
a = pd.Series(smrip201.columns)
cols = a[a.str.contains('smri_vol_scs_')].values
smrip201 = smrip201[cols]
print('smrip201 shape:', smrip201.shape)

fsqc = baseline(read_file(path + 'freesqc01.txt'))[['fsqc_qc']]
print('fsqc shape:', fsqc.shape)

d1 = smrip101.merge(smrip201, on='subjectkey', how='outer', validate='1:1')
print('structural union shape:', d1.shape)
d2 = d1.merge(fsqc, on='subjectkey', how='inner', validate='1:1')
d2['fsqc_qc'] = d2['fsqc_qc'].fillna(0)
structural = d2.drop(['smri_vol_scs_lesionlh', 'smri_vol_scs_lesionrh', 'smri_vol_scs_wmhintlh', 'smri_vol_scs_wmhintrh'], axis=1)
print('final shape:', structural.shape)

structural = structural[structural['fsqc_qc'] == 1].drop(['fsqc_qc'], axis=1)
structural = structural.astype('float', errors='ignore')
print('structural post freesurfer qc shape:', structural.shape)

smrip101 shape: (11736, 71)
smrip201 shape: (11736, 46)
fsqc shape: (11761, 1)
structural union shape: (11736, 117)
final shape: (11736, 114)
structural post freesurfer qc shape: (11261, 113)


### Code to extract and pre-process Functional MRI ABCD 3.0 Data

In [4]:
mrirstv02 = baseline(read_file(path+'abcd_mrirstv02.txt'))[['rsfmri_var_meanmotion', 'rsfmri_var_ntpoints']]
print('mrirstv02 shape:', mrirstv02.shape)

betnet02 = baseline(read_file(path+'abcd_betnet02.txt')).iloc[:,21:-2]
features = []
for col in betnet02.columns:
    elements = col.split('_')
    if 'n' in elements:
        continue
    else:
        features.append(col)
betnet02 = betnet02[features]
print('betnet02 shape:', betnet02.shape)

subset = ['aglh', 'agrh', 'hplh', 'hprh', 'aalh', 'aarh', 'ptlh', 'ptrh', 'cdelh', 'cderh']
mrirscor02 = baseline(read_file(path + 'mrirscor02.txt'))
features = []
for col in list(mrirscor02.columns):
    elements = col.split('_')
    if('none' in elements):
        continue
    for region in subset:
        if(region in elements):
            features.append(col)
mrirscor02 = mrirscor02[features]

d1 = mrirstv02.merge(betnet02, on='subjectkey', how='outer', validate='1:1')
d2 = d1.merge(mrirscor02, on='subjectkey', how='outer', validate='1:1')
print('functional union shape:', d2.shape)
d3 = d2.merge(fsqc, on='subjectkey', how='inner', validate='1:1')
d3['fsqc_qc'] = d3['fsqc_qc'].fillna(0)
functional = d3
print('final shape:', functional.shape)

functional = functional[functional['fsqc_qc'] == 1].drop(['fsqc_qc'], axis=1)
print('functional post freesurfer qc shape:', functional.shape)

exclude_subjects = set()
fmriqc01 = baseline(read_file(path+'fmriqc01.txt'))[['fmri_postqc_b0warp', 'fmri_postqc_imgqual', 'fmri_postqc_cutoff']]
SK = set(fmriqc01.index.values)
imputer = SimpleImputer(strategy='constant')
fmriqc01[:] = imputer.fit_transform(fmriqc01)
sk = set(fmriqc01[(fmriqc01['fmri_postqc_b0warp']<=1.5) & (fmriqc01['fmri_postqc_imgqual']<=1.5) & (fmriqc01['fmri_postqc_cutoff']<=1.5)].index.values)
excluded_subjects = SK - sk
exclude_subjects = exclude_subjects.union(excluded_subjects)

mrirstv02 = baseline(read_file(path+'abcd_mrirstv02.txt'))[['rsfmri_var_meanmotion', 'rsfmri_var_ntpoints']]
SK = set(mrirstv02.index.values)
sk = set(mrirstv02[mrirstv02['rsfmri_var_ntpoints']>375].index.values)
excluded_subjects = SK - sk
exclude_subjects = exclude_subjects.union(excluded_subjects)

indexes_to_keep = list(set(functional.index.values) - exclude_subjects)
functional = functional.loc[indexes_to_keep]
functional = functional.astype('float', errors='ignore')
print('functional post other qc filtering shape:', functional.shape)

mrirstv02 shape: (11309, 2)
betnet02 shape: (11309, 144)
functional union shape: (11309, 266)
final shape: (11307, 267)
functional post freesurfer qc shape: (10880, 266)
functional post other qc filtering shape: (9375, 266)


### Code to extract and pre-process Non-Brain ABCD 3.0 Data

In [5]:
ant01 = baseline(read_file(path+'abcd_ant01.txt'))[['anthroweightcalc','anthroheightcalc']]
m = ant01['anthroweightcalc']
h2 = ant01['anthroheightcalc']**2
ant01['bmi'] = (m/h2)*703
ant01 = ant01[['bmi']]
print('ant01 shape:', ant01.shape)

ppdms01 = baseline(read_file(path+'abcd_ppdms01.txt'))
ppdms01 = ppdms01.replace(999.0, 0)
ppdms01 = ppdms01.replace(np.nan, 0)
ppdms01['pubertal_score'] = ppdms01.apply(lambda x : x['pds_1_p'] + x['pds_2_p'] + x['pds_3_p'] + x['pds_m4_p'] + x['pds_m5_p']  if (x['pubertal_sex_p']==1.0) else x['pds_1_p'] + x['pds_2_p'] + x['pds_3_p'] + x['pds_f4_p'] + x['pds_f5b_p'], axis=1, result_type='reduce')
ppdms01 = ppdms01[['pubertal_score']].astype('int')
ppdms01 = ppdms01.replace(0, np.nan)
print('ppdms01 shape:', ppdms01.shape)

medsy01 = baseline(read_file(path+'medsy01.txt'))
a = pd.Series(medsy01.columns)
cols = a[a.str.contains('_24')].values
medsy01 = medsy01[cols]
medsy01 = medsy01.replace(999.0, 0)
medsy01 = medsy01.fillna(0)
f = lambda x: 1 if x>=1 else 0
cols = medsy01.columns
a = pd.Series(medsy01.columns)
rx_cols = a[a.str.contains('rx')].values
otc_cols = a[a.str.contains('otc')].values
medsy01['rx_24']=medsy01[rx_cols].sum(axis=1).apply(f)
medsy01['otc_24']=medsy01[otc_cols].sum(axis=1).apply(f)
medsy01['caff_24'] = medsy01['caff_24'].astype(int)
medsy01 = medsy01[['rx_24', 'otc_24', 'caff_24']]
print('medsy01 shape:', medsy01.shape)

pdem02 = baseline(read_file(path+'pdem02.txt'))
cols = ['demo_brthdat_v2','demo_ed_v2',
        'demo_race_a_p___10','demo_race_a_p___11','demo_race_a_p___12',
        'demo_race_a_p___13','demo_race_a_p___14','demo_race_a_p___15',
        'demo_race_a_p___16','demo_race_a_p___17','demo_race_a_p___18',
        'demo_race_a_p___19','demo_race_a_p___20','demo_race_a_p___21',
        'demo_race_a_p___22','demo_race_a_p___23', 
        'demo_prnt_marital_v2','demo_prnt_ed_v2','demo_prnt_income_v2',
        'demo_prnt_prtnr_v2','demo_prtnr_ed_v2','demo_comb_income_v2']
pdem02['race_white'] = pdem02['demo_race_a_p___10']
pdem02['race_mixed'] = pdem02[['demo_race_a_p___11','demo_race_a_p___12','demo_race_a_p___13',
                               'demo_race_a_p___14','demo_race_a_p___15','demo_race_a_p___16',
                               'demo_race_a_p___17','demo_race_a_p___18','demo_race_a_p___19',
                               'demo_race_a_p___20','demo_race_a_p___21','demo_race_a_p___22',
                               'demo_race_a_p___23']].sum(axis=1)
pdem02['race_mixed'] = pdem02['race_mixed'].apply(f)
pdem02['demo_prnt_ed_v2'] = pdem02['demo_prnt_ed_v2'].replace(999, 0) 
pdem02['demo_prnt_ed_v2'] = pdem02['demo_prnt_ed_v2'].replace(777, 0)
pdem02['demo_prnt_ed_v2'] = pdem02['demo_prnt_ed_v2'].replace(np.nan, 0)
pdem02['demo_prtnr_ed_v2'] = pdem02['demo_prtnr_ed_v2'].replace(999, 0) 
pdem02['demo_prtnr_ed_v2'] = pdem02['demo_prtnr_ed_v2'].replace(777, 0)
pdem02['demo_prtnr_ed_v2'] = pdem02['demo_prtnr_ed_v2'].replace(np.nan, 0)
pdem02['parent_edu_max'] = pdem02[['demo_prnt_ed_v2','demo_prtnr_ed_v2']].max(axis=1)
pdem02['parent_edu_max'] = pdem02['parent_edu_max'].replace(0, np.nan)
pdem02 = pdem02[['demo_brthdat_v2','demo_ed_v2','race_white','race_mixed',
                 'demo_prnt_marital_v2','parent_edu_max','demo_prnt_prtnr_v2',
                 'demo_comb_income_v2']]
pdem02 = pdem02.replace(999.0, np.nan)
pdem02 = pdem02.replace(777.0, np.nan)
print('pdem02 shape:', pdem02.shape)

sds01 = baseline(read_file(path+'abcd_sds01.txt'))[['sleepdisturb1_p']]
sds01 = sds01.reset_index().drop_duplicates().set_index('subjectkey')
print('sds01 shape:', sds01.shape)

stq01 = baseline(read_file(path+'stq01.txt'))[['screentime2_p_hours']]
print('stq01 shape:', stq01.shape)

fes02 = baseline(read_file(path+'fes02.txt'))[['fam_enviro1_p','fam_enviro2r_p', 'fam_enviro3_p',
                                                'fam_enviro4r_p','fam_enviro5_p', 'fam_enviro6_p',                                                
                                                'fam_enviro7r_p','fam_enviro8_p', 'fam_enviro9r_p']]
fes02['fam_enviro_sum'] = fes02.sum(axis=1)
fes02 = fes02[['fam_enviro_sum']].astype('int')
fes02 = fes02.reset_index().drop_duplicates().set_index('subjectkey')
print('fes02 shape:', fes02.shape)

lt01 = baseline(read_file(path + 'abcd_lt01.txt'))[['sex']]
f = lambda x: 1 if x=='F' else x
m = lambda x: 0 if x=='M' else x
lt01['sex'] = lt01['sex'].apply(m).apply(f)
print('lt01 shape:', lt01.shape)

# asrs01 = baseline(read_file(path+'abcd_asrs01.txt'))[['asr_scr_anxdep_t']]
# print('asrs01 shape:', asrs01.shape)

fhxp102 = baseline(read_file(path+'fhxp102.txt'))[['fam_history_q6a_depression', 'fam_history_q6d_depression']]
fhxp102 = fhxp102.replace(np.nan, 0)
fhxp102 = fhxp102.replace(999.0, np.nan)
fhxp102 = fhxp102.reset_index().drop_duplicates().set_index('subjectkey')
fhxp102['fam_history_depression'] = np.logical_or(fhxp102['fam_history_q6a_depression'], fhxp102['fam_history_q6d_depression'])
fhxp102 = fhxp102.drop(['fam_history_q6a_depression', 'fam_history_q6d_depression'], axis=1).astype('int')
print('fhxp102 shape:', fhxp102.shape)

d1 = ant01.merge(ppdms01, on='subjectkey', how='outer', validate='1:1')
d2 = d1.merge(medsy01, on='subjectkey', how='outer', validate='1:1')
d3 = d2.merge(pdem02, on='subjectkey', how='outer', validate='1:1')
d4 = d3.merge(sds01, on='subjectkey', how='outer', validate='1:1')
d5 = d4.merge(stq01, on='subjectkey', how='outer', validate='1:1') 
d6 = d5.merge(lt01, on='subjectkey', how='outer', validate='1:1')
# d7 = d6.merge(asrs01, on='subjectkey', how='outer', validate='1:1')
d7 = d6.merge(fhxp102, on='subjectkey', how='outer', validate='1:1')
covariates = d7.merge(fes02, on='subjectkey', how='outer', validate='1:1') 

covariates = covariates.astype('float', errors='ignore')
print('final shape:', covariates.shape)

ant01 shape: (11878, 1)
ppdms01 shape: (11878, 1)
medsy01 shape: (11878, 3)
pdem02 shape: (11878, 8)
sds01 shape: (11878, 1)
stq01 shape: (11878, 1)
fes02 shape: (11878, 1)
lt01 shape: (11878, 1)
fhxp102 shape: (11878, 1)
final shape: (11878, 18)


### Code to remove twins & triplets from the analyses

In [6]:
acspsw03 = baseline(read_file(path+'acspsw03.txt'))[['rel_family_id', 'rel_group_id']]
use_keys = []
unique_family = acspsw03['rel_family_id'].unique()
for uf in tqdm(unique_family):
    use_keys = use_keys + list(acspsw03[acspsw03['rel_family_id']==uf].reset_index().groupby(['rel_group_id']).min()['subjectkey'])
print("# subjects (with no twins or triplets):",len(use_keys))

smri = structural.copy(deep=True)
fmri = functional.copy(deep=True)
covr = covariates.copy(deep=True)
smri_covr = structural.merge(covariates, on='subjectkey', how='inner', validate='1:1')
fmri_covr = functional.merge(covariates, on='subjectkey', how='inner', validate='1:1')
smri_fmri_covr = covariates.merge(structural, on='subjectkey', how='inner', validate='1:1')
smri_fmri_covr = smri_fmri_covr.merge(functional, on='subjectkey', how='inner', validate='1:1')

print('\n--- shapes with twins and triplets ---\n')
print('smri shape           :', smri.shape)
print('fmri shape           :', fmri.shape)
print('covr shape           :', covr.shape)
print('smri_cov shape       :', smri_covr.shape)
print('fmri_covr shape      :', fmri_covr.shape)
print('smri_fmri_covr shape :', smri_fmri_covr.shape)

smri = smri.loc[smri.index.intersection(use_keys), :]; smri.index.name = 'subjectkey'
fmri = fmri.loc[fmri.index.intersection(use_keys), :]; fmri.index.name = 'subjectkey'
covr = covr.loc[covr.index.intersection(use_keys), :]; covr.index.name = 'subjectkey'
smri_covr = smri_covr.loc[smri_covr.index.intersection(use_keys), :]; smri_covr.index.name = 'subjectkey'
fmri_covr = fmri_covr.loc[fmri_covr.index.intersection(use_keys), :]; fmri_covr.index.name = 'subjectkey'
smri_fmri_covr = smri_fmri_covr.loc[smri_fmri_covr.index.intersection(use_keys), :]; smri_fmri_covr.index.name = 'subjectkey'

print('\n--- shapes without twins and triplets ---\n')
print('smri shape           :', smri.shape)
print('fmri shape           :', fmri.shape)
print('covr shape           :', covr.shape)
print('smri_cov shape       :', smri_covr.shape)
print('fmri_covr shape      :', fmri_covr.shape)
print('smri_fmri_covr shape :', smri_fmri_covr.shape)

100%|██████████| 9856/9856 [00:30<00:00, 326.41it/s]


# subjects (with no twins or triplets): 10789

--- shapes with twins and triplets ---

smri shape           : (11261, 113)
fmri shape           : (9375, 266)
covr shape           : (11878, 18)
smri_cov shape       : (11261, 131)
fmri_covr shape      : (9375, 284)
smri_fmri_covr shape : (9375, 397)

--- shapes without twins and triplets ---

smri shape           : (10219, 113)
fmri shape           : (8511, 266)
covr shape           : (10789, 18)
smri_cov shape       : (10219, 131)
fmri_covr shape      : (8511, 284)
smri_fmri_covr shape : (8511, 397)


### Preparing combined "Baseline" data for ML-based analysis to a specified path

In [7]:
dependent = baseline(read_file(path+'abcd_cbcls01.txt'))[['cbcl_scr_dsm5_depress_t']].dropna()

baseline_smri = smri.merge(dependent, on='subjectkey', how='inner', validate='1:1')
baseline_fmri = fmri.merge(dependent, on='subjectkey', how='inner', validate='1:1')
baseline_covr = covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
baseline_smri_covr = smri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
baseline_fmri_covr = fmri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
baseline_smri_fmri_covr = smri_fmri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')

print('--- shapes with dependent variable ---\n')
print('baseline_smri shape           :', baseline_smri.shape)
print('baseline_fmri shape           :', baseline_fmri.shape)
print('baseline_covr shape           :', baseline_covr.shape)
print('baseline_smri_cov shape       :', baseline_smri_covr.shape)
print('baseline_fmri_covr shape      :', baseline_fmri_covr.shape)
print('baseline_smri_fmri_covr shape :', baseline_smri_fmri_covr.shape)

--- shapes with dependent variable ---

baseline_smri shape           : (10214, 114)
baseline_fmri shape           : (8507, 267)
baseline_covr shape           : (10782, 19)
baseline_smri_cov shape       : (10214, 132)
baseline_fmri_covr shape      : (8507, 285)
baseline_smri_fmri_covr shape : (8507, 398)


### Preparing combined "Follow-up" data for ML-based analysis to a specified path

In [8]:
dependent = followup(read_file(path+'abcd_cbcls01.txt'))[['cbcl_scr_dsm5_depress_t']].dropna()

followup_smri = smri.merge(dependent, on='subjectkey', how='inner', validate='1:1')
followup_fmri = fmri.merge(dependent, on='subjectkey', how='inner', validate='1:1')
followup_covr = covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
followup_smri_covr = smri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
followup_fmri_covr = fmri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')
followup_smri_fmri_covr = smri_fmri_covr.merge(dependent, on='subjectkey', how='inner', validate='1:1')

print('--- shapes with dependent variable ---\n')
print('followup_smri shape           :', followup_smri.shape)
print('followup_fmri shape           :', followup_fmri.shape)
print('followup_covr shape           :', followup_covr.shape)
print('followup_smri_cov shape       :', followup_smri_covr.shape)
print('followup_fmri_covr shape      :', followup_fmri_covr.shape)
print('followup_smri_fmri_covr shape :', followup_smri_fmri_covr.shape)

--- shapes with dependent variable ---

followup_smri shape           : (9597, 114)
followup_fmri shape           : (7998, 267)
followup_covr shape           : (10137, 19)
followup_smri_cov shape       : (9597, 132)
followup_fmri_covr shape      : (7998, 285)
followup_smri_fmri_covr shape : (7998, 398)


### Finding Common Subjects Between Baseline & Followup Data

In [9]:
common_subjects = list(set(baseline_smri.index).intersection(set(followup_smri.index)))
baseline_smri = baseline_smri.loc[common_subjects, :]
followup_smri = followup_smri.loc[common_subjects, :]

common_subjects = list(set(baseline_fmri.index).intersection(set(followup_fmri.index)))
baseline_fmri = baseline_fmri.loc[common_subjects, :]
followup_fmri = followup_fmri.loc[common_subjects, :]

common_subjects = list(set(baseline_covr.index).intersection(set(followup_covr.index)))
baseline_covr = baseline_covr.loc[common_subjects, :]
followup_covr = followup_covr.loc[common_subjects, :]

common_subjects = list(set(baseline_smri_covr.index).intersection(set(followup_smri_covr.index)))
baseline_smri_covr = baseline_smri_covr.loc[common_subjects, :]
followup_smri_covr = followup_smri_covr.loc[common_subjects, :]

common_subjects = list(set(baseline_fmri_covr.index).intersection(set(followup_fmri_covr.index)))
baseline_fmri_covr = baseline_fmri_covr.loc[common_subjects, :]
followup_fmri_covr = followup_fmri_covr.loc[common_subjects, :]

common_subjects = list(set(baseline_smri_fmri_covr.index).intersection(set(followup_smri_fmri_covr.index)))
baseline_smri_fmri_covr = baseline_smri_fmri_covr.loc[common_subjects, :]
followup_smri_fmri_covr = followup_smri_fmri_covr.loc[common_subjects, :]

print('--- shapes with dependent variable ---\n')
print('baseline_smri shape           :', baseline_smri.shape)
print('baseline_fmri shape           :', baseline_fmri.shape)
print('baseline_covr shape           :', baseline_covr.shape)
print('baseline_smri_cov shape       :', baseline_smri_covr.shape)
print('baseline_fmri_covr shape      :', baseline_fmri_covr.shape)
print('baseline_smri_fmri_covr shape :', baseline_smri_fmri_covr.shape)

print('--- shapes with dependent variable ---\n')
print('followup_smri shape           :', followup_smri.shape)
print('followup_fmri shape           :', followup_fmri.shape)
print('followup_covr shape           :', followup_covr.shape)
print('followup_smri_cov shape       :', followup_smri_covr.shape)
print('followup_fmri_covr shape      :', followup_fmri_covr.shape)
print('followup_smri_fmri_covr shape :', followup_smri_fmri_covr.shape)

--- shapes with dependent variable ---

baseline_smri shape           : (9593, 114)
baseline_fmri shape           : (7995, 267)
baseline_covr shape           : (10132, 19)
baseline_smri_cov shape       : (9593, 132)
baseline_fmri_covr shape      : (7995, 285)
baseline_smri_fmri_covr shape : (7995, 398)
--- shapes with dependent variable ---

followup_smri shape           : (9593, 114)
followup_fmri shape           : (7995, 267)
followup_covr shape           : (10132, 19)
followup_smri_cov shape       : (9593, 132)
followup_fmri_covr shape      : (7995, 285)
followup_smri_fmri_covr shape : (7995, 398)


### Train - Test Split followed by Median Imputation and Robust Scaling (IQR Based)

In [10]:
%%capture 

def split_impute_function(df):
    X = df.drop('cbcl_scr_dsm5_depress_t', axis=1)
    y = df['cbcl_scr_dsm5_depress_t']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=17)
    
    imputer = SimpleImputer(strategy='median')
    X_train.loc[:, :] = imputer.fit_transform(X_train)
    X_train.loc[:, :] = robust_scale(X_train)
    
    imputer = SimpleImputer(strategy='median')
    X_test.loc[:, :] = imputer.fit_transform(X_test)
    X_test.loc[:, :] = robust_scale(X_test)
    
    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    
    return df_train, df_test


train_baseline_smri, test_baseline_smri = split_impute_function(baseline_smri)
train_baseline_fmri, test_baseline_fmri = split_impute_function(baseline_fmri)
train_baseline_covr, test_baseline_covr = split_impute_function(baseline_covr)
train_baseline_smri_covr, test_baseline_smri_covr = split_impute_function(baseline_smri_covr)
train_baseline_fmri_covr, test_baseline_fmri_covr = split_impute_function(baseline_fmri_covr)
train_baseline_smri_fmri_covr, test_baseline_smri_fmri_covr = split_impute_function(baseline_smri_fmri_covr)

train_followup_smri, test_followup_smri = split_impute_function(followup_smri)
train_followup_fmri, test_followup_fmri = split_impute_function(followup_fmri)
train_followup_covr, test_followup_covr = split_impute_function(followup_covr)
train_followup_smri_covr, test_followup_smri_covr = split_impute_function(followup_smri_covr)
train_followup_fmri_covr, test_followup_fmri_covr = split_impute_function(followup_fmri_covr)
train_followup_smri_fmri_covr, test_followup_smri_fmri_covr = split_impute_function(followup_smri_fmri_covr)

### Finding Residuals for Structural and Functional MRI Variables

In [11]:
smri_vars = list(structural.columns)
fmri_vars = list(set(functional.columns) - set(['rsfmri_var_meanmotion','rsfmri_var_ntpoints']))

In [12]:
mri01 = baseline(read_file(path+'abcd_mri01.txt'))[['mri_info_manufacturer', 'mri_info_deviceserialnumber']].astype('category', errors='ignore')
mri01['mri_info_manufacturer'] = mri01['mri_info_manufacturer'].cat.codes
mri01['mri_info_deviceserialnumber'] = mri01['mri_info_deviceserialnumber'].cat.codes
mri01 = mri01[~mri01.index.duplicated(keep='last')]

lt01 = baseline(read_file(path + 'abcd_lt01.txt'))[['site_id_l']]
s = lambda x: int(x.split('site')[1])
lt01['site_id_l'] = lt01['site_id_l'].apply(s)

other_parameters = mri01.merge(lt01, on='subjectkey', how='inner', validate='1:1')

train_baseline_smri = other_parameters.merge(train_baseline_smri, on='subjectkey', how='right', validate='1:1')
train_baseline_fmri = other_parameters.merge(train_baseline_fmri, on='subjectkey', how='right', validate='1:1')
train_baseline_smri_covr = other_parameters.merge(train_baseline_smri_covr, on='subjectkey', how='right', validate='1:1')
train_baseline_fmri_covr = other_parameters.merge(train_baseline_fmri_covr, on='subjectkey', how='right', validate='1:1')
train_baseline_smri_fmri_covr = other_parameters.merge(train_baseline_smri_fmri_covr, on='subjectkey', how='right', validate='1:1')
test_baseline_smri = other_parameters.merge(test_baseline_smri, on='subjectkey', how='right', validate='1:1')
test_baseline_fmri = other_parameters.merge(test_baseline_fmri, on='subjectkey', how='right', validate='1:1')
test_baseline_smri_covr = other_parameters.merge(test_baseline_smri_covr, on='subjectkey', how='right', validate='1:1')
test_baseline_fmri_covr = other_parameters.merge(test_baseline_fmri_covr, on='subjectkey', how='right', validate='1:1')
test_baseline_smri_fmri_covr = other_parameters.merge(test_baseline_smri_fmri_covr, on='subjectkey', how='right', validate='1:1')

imputer = SimpleImputer(strategy='most_frequent')
train_baseline_smri[:] = imputer.fit_transform(train_baseline_smri)
train_baseline_fmri[:] = imputer.fit_transform(train_baseline_fmri)
train_baseline_smri_covr[:] = imputer.fit_transform(train_baseline_smri_covr)
train_baseline_fmri_covr[:] = imputer.fit_transform(train_baseline_fmri_covr)
train_baseline_smri_fmri_covr[:] = imputer.fit_transform(train_baseline_smri_fmri_covr)
test_baseline_smri[:] = imputer.fit_transform(test_baseline_smri)
test_baseline_fmri[:] = imputer.fit_transform(test_baseline_fmri)
test_baseline_smri_covr[:] = imputer.fit_transform(test_baseline_smri_covr)
test_baseline_fmri_covr[:] = imputer.fit_transform(test_baseline_fmri_covr)
test_baseline_smri_fmri_covr[:] = imputer.fit_transform(test_baseline_smri_fmri_covr)

for smri_var in smri_vars:
    function = smri_var + ' ~ mri_info_manufacturer + mri_info_deviceserialnumber + site_id_l'
    train_baseline_smri[smri_var] = smf.ols(function, train_baseline_smri).fit().resid
    train_baseline_smri_covr[smri_var] = smf.ols(function, train_baseline_smri_covr).fit().resid
    train_baseline_smri_fmri_covr[smri_var] = smf.ols(function, train_baseline_smri_fmri_covr).fit().resid
    test_baseline_smri[smri_var] = smf.ols(function, test_baseline_smri).fit().resid
    test_baseline_smri_covr[smri_var] = smf.ols(function, test_baseline_smri_covr).fit().resid
    test_baseline_smri_fmri_covr[smri_var] = smf.ols(function, test_baseline_smri_fmri_covr).fit().resid
    
for fmri_var in fmri_vars:
    function = fmri_var + ' ~ mri_info_manufacturer + mri_info_deviceserialnumber + site_id_l + rsfmri_var_meanmotion + rsfmri_var_ntpoints'
    train_baseline_fmri[fmri_var] = smf.ols(function, train_baseline_fmri).fit().resid
    train_baseline_fmri_covr[fmri_var] = smf.ols(function, train_baseline_fmri_covr).fit().resid
    train_baseline_smri_fmri_covr[fmri_var] = smf.ols(function, train_baseline_smri_fmri_covr).fit().resid
    test_baseline_fmri[fmri_var] = smf.ols(function, test_baseline_fmri).fit().resid
    test_baseline_fmri_covr[fmri_var] = smf.ols(function, test_baseline_fmri_covr).fit().resid
    test_baseline_smri_fmri_covr[fmri_var] = smf.ols(function, test_baseline_smri_fmri_covr).fit().resid
    
train_baseline_smri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
train_baseline_smri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
test_baseline_smri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
test_baseline_smri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
train_baseline_fmri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
train_baseline_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
train_baseline_smri_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_baseline_fmri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_baseline_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_baseline_smri_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)

In [13]:
train_followup_smri = other_parameters.merge(train_followup_smri, on='subjectkey', how='right', validate='1:1')
train_followup_fmri = other_parameters.merge(train_followup_fmri, on='subjectkey', how='right', validate='1:1')
train_followup_smri_covr = other_parameters.merge(train_followup_smri_covr, on='subjectkey', how='right', validate='1:1')
train_followup_fmri_covr = other_parameters.merge(train_followup_fmri_covr, on='subjectkey', how='right', validate='1:1')
train_followup_smri_fmri_covr = other_parameters.merge(train_followup_smri_fmri_covr, on='subjectkey', how='right', validate='1:1')
test_followup_smri = other_parameters.merge(test_followup_smri, on='subjectkey', how='right', validate='1:1')
test_followup_fmri = other_parameters.merge(test_followup_fmri, on='subjectkey', how='right', validate='1:1')
test_followup_smri_covr = other_parameters.merge(test_followup_smri_covr, on='subjectkey', how='right', validate='1:1')
test_followup_fmri_covr = other_parameters.merge(test_followup_fmri_covr, on='subjectkey', how='right', validate='1:1')
test_followup_smri_fmri_covr = other_parameters.merge(test_followup_smri_fmri_covr, on='subjectkey', how='right', validate='1:1')

imputer = SimpleImputer(strategy='most_frequent')
train_followup_smri[:] = imputer.fit_transform(train_followup_smri)
train_followup_fmri[:] = imputer.fit_transform(train_followup_fmri)
train_followup_smri_covr[:] = imputer.fit_transform(train_followup_smri_covr)
train_followup_fmri_covr[:] = imputer.fit_transform(train_followup_fmri_covr)
train_followup_smri_fmri_covr[:] = imputer.fit_transform(train_followup_smri_fmri_covr)
test_followup_smri[:] = imputer.fit_transform(test_followup_smri)
test_followup_fmri[:] = imputer.fit_transform(test_followup_fmri)
test_followup_smri_covr[:] = imputer.fit_transform(test_followup_smri_covr)
test_followup_fmri_covr[:] = imputer.fit_transform(test_followup_fmri_covr)
test_followup_smri_fmri_covr[:] = imputer.fit_transform(test_followup_smri_fmri_covr)

for smri_var in smri_vars:
    function = smri_var + ' ~ mri_info_manufacturer + mri_info_deviceserialnumber + site_id_l'
    train_followup_smri[smri_var] = smf.ols(function, train_followup_smri).fit().resid
    train_followup_smri_covr[smri_var] = smf.ols(function, train_followup_smri_covr).fit().resid
    train_followup_smri_fmri_covr[smri_var] = smf.ols(function, train_followup_smri_fmri_covr).fit().resid
    test_followup_smri[smri_var] = smf.ols(function, test_followup_smri).fit().resid
    test_followup_smri_covr[smri_var] = smf.ols(function, test_followup_smri_covr).fit().resid
    test_followup_smri_fmri_covr[smri_var] = smf.ols(function, test_followup_smri_fmri_covr).fit().resid
    
for fmri_var in fmri_vars:
    function = fmri_var + ' ~ mri_info_manufacturer + mri_info_deviceserialnumber + site_id_l + rsfmri_var_meanmotion + rsfmri_var_ntpoints'
    train_followup_fmri[fmri_var] = smf.ols(function, train_followup_fmri).fit().resid
    train_followup_fmri_covr[fmri_var] = smf.ols(function, train_followup_fmri_covr).fit().resid
    train_followup_smri_fmri_covr[fmri_var] = smf.ols(function, train_followup_smri_fmri_covr).fit().resid
    test_followup_fmri[fmri_var] = smf.ols(function, test_followup_fmri).fit().resid
    test_followup_fmri_covr[fmri_var] = smf.ols(function, test_followup_fmri_covr).fit().resid
    test_followup_smri_fmri_covr[fmri_var] = smf.ols(function, test_followup_smri_fmri_covr).fit().resid
    
train_followup_smri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
train_followup_smri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
test_followup_smri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
test_followup_smri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l'], axis=1, inplace=True)
train_followup_fmri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
train_followup_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
train_followup_smri_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_followup_fmri.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_followup_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)
test_followup_smri_fmri_covr.drop(['mri_info_manufacturer', 'mri_info_deviceserialnumber', 'site_id_l', 'rsfmri_var_meanmotion', 'rsfmri_var_ntpoints'], axis=1, inplace=True)

### Exporting all combinations of train and test data

In [14]:
print('--- shapes with dependent variable ---\n')
print('train_baseline_smri shape           :', train_baseline_smri.shape)
print('train_baseline_fmri shape           :', train_baseline_fmri.shape)
print('train_baseline_covr shape           :', train_baseline_covr.shape)
print('train_baseline_smri_cov shape       :', train_baseline_smri_covr.shape)
print('train_baseline_fmri_covr shape      :', train_baseline_fmri_covr.shape)
print('train_baseline_smri_fmri_covr shape :', train_baseline_smri_fmri_covr.shape)

print('\n--- shapes with dependent variable ---\n')
print('train_followup_smri shape           :', train_followup_smri.shape)
print('train_followup_fmri shape           :', train_followup_fmri.shape)
print('train_followup_covr shape           :', train_followup_covr.shape)
print('train_followup_smri_cov shape       :', train_followup_smri_covr.shape)
print('train_followup_fmri_covr shape      :', train_followup_fmri_covr.shape)
print('train_followup_smri_fmri_covr shape :', train_followup_smri_fmri_covr.shape)

print('\n--- shapes with dependent variable ---\n')
print('test_baseline_smri shape           :', test_baseline_smri.shape)
print('test_baseline_fmri shape           :', test_baseline_fmri.shape)
print('test_baseline_covr shape           :', test_baseline_covr.shape)
print('test_baseline_smri_cov shape       :', test_baseline_smri_covr.shape)
print('test_baseline_fmri_covr shape      :', test_baseline_fmri_covr.shape)
print('test_baseline_smri_fmri_covr shape :', test_baseline_smri_fmri_covr.shape)

print('\n--- shapes with dependent variable ---\n')
print('test_followup_smri shape           :', test_followup_smri.shape)
print('test_followup_fmri shape           :', test_followup_fmri.shape)
print('test_followup_covr shape           :', test_followup_covr.shape)
print('test_followup_smri_cov shape       :', test_followup_smri_covr.shape)
print('test_followup_fmri_covr shape      :', test_followup_fmri_covr.shape)
print('test_followup_smri_fmri_covr shape :', test_followup_smri_fmri_covr.shape)

train_baseline_smri.to_pickle(to_path+'train/'+'baseline_smri')
train_baseline_fmri.to_pickle(to_path+'train/'+'baseline_fmri')
train_baseline_covr.to_pickle(to_path+'train/'+'baseline_covr')
train_baseline_smri_covr.to_pickle(to_path+'train/'+'baseline_smri_covr')
train_baseline_fmri_covr.to_pickle(to_path+'train/'+'baseline_fmri_covr')
train_baseline_smri_fmri_covr.to_pickle(to_path+'train/'+'baseline_smri_fmri_covr')

test_baseline_smri.to_pickle(to_path+'test/'+'baseline_smri')
test_baseline_fmri.to_pickle(to_path+'test/'+'baseline_fmri')
test_baseline_covr.to_pickle(to_path+'test/'+'baseline_covr')
test_baseline_smri_covr.to_pickle(to_path+'test/'+'baseline_smri_covr')
test_baseline_fmri_covr.to_pickle(to_path+'test/'+'baseline_fmri_covr')
test_baseline_smri_fmri_covr.to_pickle(to_path+'test/'+'baseline_smri_fmri_covr')

train_followup_smri.to_pickle(to_path+'train/'+'followup_smri')
train_followup_fmri.to_pickle(to_path+'train/'+'followup_fmri')
train_followup_covr.to_pickle(to_path+'train/'+'followup_covr')
train_followup_smri_covr.to_pickle(to_path+'train/'+'followup_smri_covr')
train_followup_fmri_covr.to_pickle(to_path+'train/'+'followup_fmri_covr')
train_followup_smri_fmri_covr.to_pickle(to_path+'train/'+'followup_smri_fmri_covr')

test_followup_smri.to_pickle(to_path+'test/'+'followup_smri')
test_followup_fmri.to_pickle(to_path+'test/'+'followup_fmri')
test_followup_covr.to_pickle(to_path+'test/'+'followup_covr')
test_followup_smri_covr.to_pickle(to_path+'test/'+'followup_smri_covr')
test_followup_fmri_covr.to_pickle(to_path+'test/'+'followup_fmri_covr')
test_followup_smri_fmri_covr.to_pickle(to_path+'test/'+'followup_smri_fmri_covr')

--- shapes with dependent variable ---

train_baseline_smri shape           : (7194, 114)
train_baseline_fmri shape           : (5996, 265)
train_baseline_covr shape           : (7599, 19)
train_baseline_smri_cov shape       : (7194, 132)
train_baseline_fmri_covr shape      : (5996, 283)
train_baseline_smri_fmri_covr shape : (5996, 396)

--- shapes with dependent variable ---

train_followup_smri shape           : (7194, 114)
train_followup_fmri shape           : (5996, 265)
train_followup_covr shape           : (7599, 19)
train_followup_smri_cov shape       : (7194, 132)
train_followup_fmri_covr shape      : (5996, 283)
train_followup_smri_fmri_covr shape : (5996, 396)

--- shapes with dependent variable ---

test_baseline_smri shape           : (2399, 114)
test_baseline_fmri shape           : (1999, 265)
test_baseline_covr shape           : (2533, 19)
test_baseline_smri_cov shape       : (2399, 132)
test_baseline_fmri_covr shape      : (1999, 283)
test_baseline_smri_fmri_covr shape :