# Load data
- load the sample
- load the CT files for these subjects
- load the seed files for these subjects
- regress nuisance covariates
- build subtypes

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import patsy as pat
import nibabel as nib
from sklearn import linear_model as sln

In [2]:
# Project root; every other location is derived from it
root_p = '/home/surchs/sim_big/PROJECT/abide_hps/'

# Input locations: phenotype table, CT directory, seed-map directory, mask
sample_p = os.path.join(root_p, 'pheno', 'psm_abide1.csv')
mask_p = os.path.join(root_p, 'mask', 'MIST_mask.nii.gz')
seed_p = os.path.join(root_p, 'seed', 'MIST_20')
ct_p = os.path.join(root_p, 'ct')

# File name templates, filled in per subject with str.format
sd_t = 'sub_{}_mist_20.npy'
ct_t = '{}+{:07}_{}+{}_native_rms_rsl_tlaplace_30mm_{}.txt'

# Output locations, both inside the shared residual directory
_residual_dir = os.path.join(root_p, 'residual')
sd_out_p = os.path.join(_residual_dir, 'sd_30mm_residual_psm')
ct_out_p = os.path.join(_residual_dir, 'ct_30mm_residual_psm')

In [3]:
# Get the sample (phenotype table read from sample_p)
sample = pd.read_csv(sample_p)
# Get the mask and turn it into a boolean array.
# `get_data()` was deprecated in nibabel 3.0 and removed in 5.0;
# `np.asanyarray(img.dataobj)` is the documented equivalent and returns the
# stored values without forcing a conversion to float.
mask_i = nib.load(mask_p)
mask = np.asanyarray(mask_i.dataobj).astype(bool)

## Regress nuisance
The seed maps are organized in 2D arrays; we need to flatten them for regression.

In [4]:
# Residual array: one row per subject, one column per True voxel in the
# mask, one slice per seed
n_sub = sample.shape[0]
n_vox = np.sum(mask)
n_seed = 20
resid_seed = np.zeros((n_sub, n_vox, n_seed))

# Nuisance design matrix built from the AGE_AT_SCAN, FD_scrubbed and Site
# columns of the sample
dmatrix = pat.dmatrix('AGE_AT_SCAN + FD_scrubbed + Site', data=sample)
# One estimator is enough: fit() re-estimates everything on each call.
# The `normalize` argument is no longer passed because it was removed in
# scikit-learn 1.2; ordinary least-squares predictions do not depend on the
# scaling of the regressors, so the residuals are unaffected.
mod = sln.LinearRegression(fit_intercept=True, n_jobs=-1)

# Read every subject file exactly once (instead of once per seed) and park
# the raw maps in resid_seed. Rows are addressed by position, not by the
# DataFrame index label, so a non-default index cannot misplace a subject.
# NOTE(review): assumes each file holds at least n_seed maps along its first
# axis, each with n_vox values - confirm against the seed-map generation.
for pos, (_, row) in enumerate(sample.iterrows()):
    p = os.path.join(seed_p, sd_t.format(row['SUB_ID']))
    d = np.load(p)
    for sid in range(n_seed):
        resid_seed[pos, :, sid] = d[sid, ...]

# Replace the raw maps with their residuals, one seed at a time
for sid in range(n_seed):
    # Contiguous copy, so the fit never aliases the array being overwritten
    sub_seed = np.ascontiguousarray(resid_seed[..., sid])
    res = mod.fit(dmatrix, sub_seed)
    resid = sub_seed - res.predict(dmatrix)
    resid_seed[..., sid] = resid

In [5]:
# Generate the CT residuals: first collect the CT values of every subject.
# Rows are gathered in iteration order rather than by the DataFrame index
# label, so the result does not depend on the sample having a 0..n-1 index.
ct_rows = []
for _, row in sample.iterrows():
    hemi_vals = []
    # Left hemisphere first, then right: this is the concatenation order
    for hemi in ('left', 'right'):
        p_hemi = os.path.join(
            ct_p,
            ct_t.format(row['Site'], row['Subject'], row['Session'],
                        row['Run'], hemi))
        # The first column of the headerless text file is used
        hemi_vals.append(pd.read_csv(p_hemi, header=None)[0].values)
    # Combine left and right
    ct_rows.append(np.concatenate(hemi_vals))

# np.vstack raises a clear error here if the sample is empty or if subjects
# differ in length, instead of failing later in the regression.
# The cast keeps the float64 dtype the array always had.
sub_ct = np.vstack(ct_rows).astype(float)
n_vert = sub_ct.shape[1]

In [6]:
# Regress from CT: design matrix from the AGE_AT_SCAN and Site columns
dmatrix = pat.dmatrix('AGE_AT_SCAN + Site', data=sample)
# `normalize` is no longer passed: it was removed in scikit-learn 1.2, and
# ordinary least-squares predictions do not depend on the scaling of the
# regressors, so the residuals are unaffected.
mod = sln.LinearRegression(fit_intercept=True, n_jobs=-1)
res = mod.fit(dmatrix, sub_ct)
# Residual = observed CT minus the part explained by the nuisance model
resid_ct = sub_ct - res.predict(dmatrix)

In [7]:
# Save the output.
# Create the target directories first so a missing folder cannot make the
# save fail after all the computation is done.
os.makedirs(os.path.dirname(sd_out_p), exist_ok=True)
os.makedirs(os.path.dirname(ct_out_p), exist_ok=True)
# np.save appends the '.npy' extension because the paths carry none
np.save(sd_out_p, resid_seed)
np.save(ct_out_p, resid_ct)