In [None]:
import numpy as np
import os
import pickle
from sklearn.linear_model import LinearRegression
# from nilearn.signal import butterworth

In [2]:
# Directory holding the per-run ROI time-series files; file names are appended to it directly.
path = 'resting_state_denoised_cortex-subcortex_254_rois_awscli/'     

In [3]:
# List the directory once (the original re-listed it for every element) and sort it:
# os.listdir returns entries in arbitrary order, while the cells below rely on the
# runs of each subject being contiguous and in subject-ID order.
filenames = sorted(os.listdir(path))

In [4]:
# Number of entries found in `path`.
len(filenames)

4072

In [5]:
# First entry of the file listing (shows the file-naming pattern).
filenames[0]

'100206_rfMRI_REST1_LR.1D'

In [None]:
# Subjects with all four rs-fMRI runs (list taken from the HCP webpage .csv file).
# IDs are read as strings so they can be compared with the file-name prefixes.
import pandas as pd
hcp_subjects = pd.read_csv(
    'hcp_subjects_with_all_rsfMRI.txt',
    sep=' ',
    header=None,
    dtype=str,
)

In [7]:
# Keep only the first column, as a plain Python list.
hcp_subjects = list(hcp_subjects[0])

In [8]:
# First subject ID in the list.
hcp_subjects[0]

'100206'

In [9]:
# Check that the downloaded files match the subject list and are ordered the same way:
# positions 4*i .. 4*i+3 of `filenames` must all start with hcp_subjects[i].
n_runs = 4  # runs per subject
n_expected = n_runs * len(hcp_subjects)
# Sized to cover both lists, so a missing or surplus file stays 0 and is counted as a
# mismatch instead of raising IndexError or going unnoticed.
filename_match = np.zeros(max(len(filenames), n_expected))
for idx, fname in enumerate(filenames[:n_expected]):
    filename_match[idx] = fname.startswith(hcp_subjects[idx // n_runs])

In [10]:
# Number of positions that did not match; 0 means every file lines up with its subject.
len(filename_match) - sum(filename_match)

0.0

In [11]:
# check shapes of full data
# Iterate over `filenames` (not a fresh os.listdir call) so that datas[k] is always
# the array loaded from filenames[k].
datas = [np.loadtxt(path + fname) for fname in filenames]

In [None]:
# One boolean per loaded run: True when the array has the expected (1200, 254) shape.
check_data_shape = [datas[run_idx].shape == (1200, 254) for run_idx in range(len(datas))]

In [13]:
# Indices of runs whose shape check failed.
file_id = [run_idx for run_idx in range(len(datas)) if not check_data_shape[run_idx]]

## Training data for model selection ($K, D$)

In [15]:
# Model-selection set: the first 100 subjects, i.e. the first 400 run files.
datas = [np.loadtxt(path + filenames[run_idx]) for run_idx in range(400)]

In [17]:
# One tag per run: each of the first 100 subject IDs repeated four times (four runs each).
tags = [hcp_subjects[subj_idx] for subj_idx in range(100) for _ in range(4)]

## Preprocessing of model-selection data

In [17]:
# Reorder the cortical columns (54..253) from 7-network to 17-network order;
# the cortex part of Tian's mask is stored in 7-network order.
# NOTE: pickle.load executes arbitrary code from the file - only use a trusted file.
with open('7-17_transformation_200roi', 'rb') as f:
    pi = pickle.load(f)
for run_ts in datas:
    # Fancy indexing on the right-hand side makes a copy before the in-place write.
    run_ts[:, 54:254] = run_ts[:, 54:254][:, pi]

In [None]:
# 1. normalize; 2. global signal removal 

def regress_GS(ts):
    """Regress the global signal out of every ROI time series.

    The global signal is the mean across columns at each row. Every column is
    fit by ordinary least squares (intercept plus one slope on the global
    signal) and replaced by its residual. The fit is done in closed form for
    all columns at once instead of one model fit per column.

    Parameters
    ----------
    ts : array-like, shape (n_timepoints, n_rois)
        Rows are time points, columns are ROIs.

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as `ts` holding the residuals.

    Raises
    ------
    ValueError
        If `ts` contains NaN or infinite values (e.g. after dividing by a zero
        standard deviation upstream).
    """
    ts = np.asarray(ts)
    # Work in floating point so residuals of integer input are not truncated.
    if not np.issubdtype(ts.dtype, np.floating):
        ts = ts.astype(float)
    if not np.isfinite(ts).all():
        raise ValueError("ts contains NaN or infinite values")

    gs = ts.mean(axis=1, keepdims=True)                 # (n_timepoints, 1)
    gs_centered = gs - gs.mean()
    ts_centered = ts - ts.mean(axis=0, keepdims=True)

    gs_power = float((gs_centered ** 2).sum())
    if gs_power == 0.0:
        # Constant global signal: only the intercept can be fit.
        return ts_centered

    # OLS slope per column: cov(gs, roi) / var(gs); the intercept is absorbed by centering.
    slopes = (gs_centered.T @ ts_centered) / gs_power   # (1, n_rois)
    return ts_centered - gs_centered @ slopes

# z-score each ROI over time, then regress out the global signal
for run_idx, run_ts in enumerate(datas):
    roi_mean = run_ts.mean(axis=0, keepdims=True)
    roi_std = run_ts.std(axis=0, keepdims=True)
    datas[run_idx] = regress_GS((run_ts - roi_mean) / roi_std)

In [None]:
## only for no-GSR processing
#for i in range(len(datas)):
#    ts = datas[i]

#    ts = (ts - np.expand_dims(ts.mean(axis=0), axis=0))/np.expand_dims(ts.std(axis=0), axis=0)
        
#    datas[i] = ts

#with open('roi_timeseries_no_GSR_rsfMRI_HCP_model_selection','wb') as f:
#    pickle.dump(datas, f)    

In [24]:
# Persist the model-selection time series.
with open('roi_timeseries_rsfMRI_HCP_model_selection', 'wb') as out_file:
    pickle.dump(datas, out_file)

In [26]:
# Persist the matching per-run subject tags.
with open('tags_rsfMRI_HCP_model_selection', 'wb') as out_file:
    pickle.dump(tags, out_file)

## Preprocessing of held-out data

In [18]:
# held-out participants = all subjects minus the 100 used for model selection
len(hcp_subjects) - 100

918

In [None]:
# Load the held-out runs (everything after the first 400 model-selection runs)
# straight from disk. Slicing `datas[400:4072]` only works if `datas` still holds
# the full 4072-run load; after the model-selection cells `datas` contains just the
# 400 preprocessed runs, so that slice is empty on a top-to-bottom run.
held_out_datas = [np.loadtxt(path + fname) for fname in filenames[400:4072]]

In [47]:
# Reorder the cortical columns (54..253) of every held-out run with the stored permutation.
# NOTE: pickle.load executes arbitrary code from the file - only use a trusted file.
with open('7-17_transformation_200roi', 'rb') as f:
    pi = pickle.load(f)
for run_ts in held_out_datas:
    # Fancy indexing on the right-hand side makes a copy before the in-place write.
    run_ts[:, 54:254] = run_ts[:, 54:254][:, pi]

In [27]:
# One tag per held-out run: subject IDs 100..1017, each repeated four times (four runs each).
tags = [hcp_subjects[subj_idx] for subj_idx in range(100, 1018) for _ in range(4)]

In [32]:
# peek at the first ten tags
tags[:10]

['122620',
 '122620',
 '122620',
 '122620',
 '122822',
 '122822',
 '122822',
 '122822',
 '123117',
 '123117']

In [None]:
# 1. normalize; 2. global signal removal 

def regress_GS(ts):
    """Regress the global signal out of every ROI time series.

    The global signal is the mean across columns at each row. Every column is
    fit by ordinary least squares (intercept plus one slope on the global
    signal) and replaced by its residual. The fit is done in closed form for
    all columns at once instead of one model fit per column.

    Parameters
    ----------
    ts : array-like, shape (n_timepoints, n_rois)
        Rows are time points, columns are ROIs.

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as `ts` holding the residuals.

    Raises
    ------
    ValueError
        If `ts` contains NaN or infinite values (e.g. after dividing by a zero
        standard deviation upstream).
    """
    ts = np.asarray(ts)
    # Work in floating point so residuals of integer input are not truncated.
    if not np.issubdtype(ts.dtype, np.floating):
        ts = ts.astype(float)
    if not np.isfinite(ts).all():
        raise ValueError("ts contains NaN or infinite values")

    gs = ts.mean(axis=1, keepdims=True)                 # (n_timepoints, 1)
    gs_centered = gs - gs.mean()
    ts_centered = ts - ts.mean(axis=0, keepdims=True)

    gs_power = float((gs_centered ** 2).sum())
    if gs_power == 0.0:
        # Constant global signal: only the intercept can be fit.
        return ts_centered

    # OLS slope per column: cov(gs, roi) / var(gs); the intercept is absorbed by centering.
    slopes = (gs_centered.T @ ts_centered) / gs_power   # (1, n_rois)
    return ts_centered - gs_centered @ slopes

# z-score each ROI over time, then regress out the global signal
for run_idx, run_ts in enumerate(held_out_datas):
    roi_mean = run_ts.mean(axis=0, keepdims=True)
    roi_std = run_ts.std(axis=0, keepdims=True)
    held_out_datas[run_idx] = regress_GS((run_ts - roi_mean) / roi_std)

In [49]:
# Persist the held-out time series.
with open('roi_timeseries_rsfMRI_HCP_held_out', 'wb') as out_file:
    pickle.dump(held_out_datas, out_file)

In [50]:
# Persist the matching per-run subject tags.
with open('tags_rsfMRI_HCP_held_out', 'wb') as out_file:
    pickle.dump(tags, out_file)