# Combine Pheno
Combine the ABIDE phenotype information with the QC information.
Also figure out for which ABIDE 1 subjects I have QC-passing CT/GWR information.

In [1]:
import os
import glob
import numpy as np
import pandas as pd

In [25]:
# Paths: every input and output lives below one project root
root_p = '/home/surchs/sim_big/PROJECT/abide_hps/'
qc_dir = os.path.join(root_p, 'qc')
pheno_dir = os.path.join(root_p, 'pheno')

# QC tables
gleb_qc = os.path.join(qc_dir, 'gleb_QC_full.csv')
abide_1_qc_p = os.path.join(qc_dir, 'abide_niak_qc_report.csv')
abide_1_motion_p = os.path.join(qc_dir, 'quant_qc_info.csv')

# Phenotype tables
abide_1_psm_p = os.path.join(pheno_dir, 'abide_paper_consensus_model.csv')
abide_1_pheno_p = os.path.join(pheno_dir, 'abide1_pheno.txt')
abide_2_pheno = os.path.join(pheno_dir, 'abide2_pheno.txt')
brain_volume_p = os.path.join(pheno_dir, 'brain_mask_volumes.csv')

# Data directories and file name templates.
# ct_t is filled with (site, subject ID, session, run), fc_t with the subject ID;
# '{:07}' zero-pads the numeric subject ID to seven digits.
ct_p = os.path.join(root_p, 'ct')
fc_p = os.path.join(root_p, 'fc')
ct_t = '{}+{:07}_{}+{}_native_rms_rsl_tlaplace_30mm_left.txt'
fc_t = 'fmri_{:07}_session_1_run1.nii.gz'

# Output table
abide_full_out = os.path.join(pheno_dir, 'abide_1_complete.csv')

In [3]:
# Load the PSM sample table and the motion table, in that order
psm_pheno, motion = (pd.read_csv(csv_path)
                     for csv_path in (abide_1_psm_p, abide_1_motion_p))

In [4]:
# Brain volumes: a headerless, tab-separated table. Column 0 holds an ID whose
# second '+'-separated field is the subject number; column 2 becomes 'BV'.
bv = pd.read_csv(brain_volume_p, header=None, delimiter='\t')
# Get the subject ID (vectorised instead of looping over rows)
bv['SUB_ID'] = bv[0].str.split('+').str[1].astype(int)
bv = bv.rename(columns={2: 'BV'})

# Get the FC QC information
fc_qc = pd.read_csv(abide_1_qc_p)
# Kill surrounding whitespace in column headers FIRST, so the lookups below do
# not depend on the exact padding of the raw header (e.g. 'id_subject ')
fc_qc = fc_qc.rename(columns=lambda x: x.strip())
# Get the X off of the subject ID; regex=False treats 'X' as a literal
fc_qc['SUB_ID'] = fc_qc['id_subject'].str.replace('X', '', regex=False).astype(int)
# Get rid of leading/trailing spaces in the status cell
fc_qc['status'] = fc_qc['status'].str.strip(' ')

In [5]:
# Preview the first rows of the brain-volume table
bv.head()

Unnamed: 0,0,1,BV,SUB_ID
0,Caltech+0051456,session_1+anat_1,1218639.0,51456
1,Caltech+0051457,session_1+anat_1,1430570.0,51457
2,Caltech+0051458,session_1+anat_1,1222213.0,51458
3,Caltech+0051459,session_1+anat_1,1264665.0,51459
4,Caltech+0051460,session_1+anat_1,1158337.0,51460


In [6]:
# Preview the first rows of the PSM sample table
psm_pheno.head()

Unnamed: 0,SUB_ID,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,FIQ,...,comment_func,frames_scrubbed,frames_OK,FD,FD_scrubbed,session,run,ADOS_sb_sev,ADOS_sc_sev,biv
0,50772,KKI,KKI_0050772,Control,Control,12.77,Male,R,100.0,98.0,...,,15.0,141.0,0.165359,0.133364,1,1,,,1230207.099
1,50774,KKI,KKI_0050774,Control,Control,10.64,Male,R,56.0,124.0,...,,9.0,147.0,0.16271,0.153141,1,1,,,1297837.11
2,50776,KKI,KKI_0050776,Control,Control,9.3,Male,R,100.0,102.0,...,,51.0,105.0,0.346581,0.218636,1,1,,,1086355.503
3,50777,KKI,KKI_0050777,Control,Control,8.39,Male,R,100.0,125.0,...,minor IC_1,4.0,152.0,0.162699,0.159246,1,1,,,1340981.412
4,50779,KKI,KKI_0050779,Control,Control,9.41,Male,R,83.0,105.0,...,,33.0,123.0,0.234864,0.184077,1,1,,,1012924.868


In [7]:
# Preview the first rows of the motion table
motion.head()

Unnamed: 0,SUB_ID,SITE_ID,frames_scrubbed,frames_OK,FD,FD_scrubbed
0,51160,Stanford,70.0,110.0,0.300555,0.181696
1,51161,Stanford,140.0,40.0,0.803036,0.189346
2,51162,Stanford,0.0,180.0,0.144961,0.144961
3,51163,Stanford,32.0,148.0,0.205591,0.137707
4,51164,Stanford,52.0,128.0,0.257335,0.170774


In [8]:
# Attach the motion metrics to the FC QC table.
# The inner join keeps only subjects that appear in both tables.
motion_cols = ['SUB_ID', 'frames_scrubbed', 'frames_OK', 'FD', 'FD_scrubbed']
fc_qc_full = fc_qc.merge(motion[motion_cols], on='SUB_ID', how='inner')

In [9]:
# Per-column lookup tables that turn the numeric codes into readable labels.
# Values without an entry are left as they are.
value_labels = {
    'DX_GROUP': {1: 'Autism', 2: 'Control'},
    'DSM_IV_TR': {0: 'Control', 1: 'Autism', 2: 'Aspergers',
                  3: 'PDD-NOS', 4: 'Aspergers or PDD-NOS'},
    'CURRENT_MED_STATUS': {0: 'No medication', 1: 'taking medication'},
    'EYE_STATUS_AT_SCAN': {1: 'Open', 2: 'Closed'},
}
# Read the ABIDE 1 phenotype file and apply the recoding in one step
abide_pheno = pd.read_csv(abide_1_pheno_p).replace(value_labels)

In [10]:
# Preview the first rows of the recoded phenotype table
abide_pheno.head()

Unnamed: 0,SITE_ID,SUB_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,FIQ,VIQ,...,WISC_IV_BLK_DSN_SCALED,WISC_IV_PIC_CON_SCALED,WISC_IV_MATRIX_SCALED,WISC_IV_DIGIT_SPAN_SCALED,WISC_IV_LET_NUM_SCALED,WISC_IV_CODING_SCALED,WISC_IV_SYM_SCALED,EYE_STATUS_AT_SCAN,AGE_AT_MPRAGE,BMI
0,CALTECH,51456,Autism,Aspergers or PDD-NOS,55.4,1,R,,126.0,118.0,...,,,,,,,,Closed,,
1,CALTECH,51457,Autism,Aspergers or PDD-NOS,22.9,1,Ambi,,107.0,119.0,...,,,,,,,,Closed,,
2,CALTECH,51458,Autism,Autism,39.2,1,R,,93.0,80.0,...,,,,,,,,Closed,,
3,CALTECH,51459,Autism,Autism,22.8,1,R,,106.0,94.0,...,,,,,,,,Closed,,
4,CALTECH,51460,Autism,Autism,34.6,2,Ambi,,133.0,135.0,...,,,,,,,,Closed,,


In [11]:
def _rate_anat_qc(qc):
    """Translate one free-text manual rating into a numeric code.

    The rating code is searched for in the text before the first '(' only,
    so that digits inside the parenthesised comment (e.g. 'V1') cannot be
    mistaken for a rating. The 'no images' marker is searched for in the
    whole string.

    Parameters
    ----------
    qc : object
        Raw cell value of the manual rating column.

    Returns
    -------
    int or float
        3 for '-+1'; 1 for '-1' together with 'no images'; 2 for any other
        '-1'; 4 for '1'; 0 for '0'; NaN if the value is not a string or
        contains none of these codes.
    """
    if not isinstance(qc, str):
        # Missing cells arrive as float NaN; keep them as missing ratings
        return np.nan
    code = qc.split('(', 1)[0]
    # Order matters: '-+1' and '-1' both contain '1'
    if '-+1' in code:
        return 3
    if '-1' in code:
        return 1 if 'no images' in qc.lower() else 2
    if '1' in code:
        return 4
    if '0' in code:
        return 0
    return np.nan


anat_qc = pd.read_csv(gleb_qc)
# Split the '+'-separated ID in column '0' into its four components
anat_qc[['Site', 'Subject', 'Session', 'Run']] = anat_qc['0'].str.split('+', expand=True)
anat_qc['SUB_ID'] = anat_qc['Subject'].astype(int)
# Make something of use out of the anatomical ratings: exactly one entry per
# row, so the list always lines up with the table
qc_l = [_rate_anat_qc(qc) for qc in anat_qc['1']]
# Report the ratings that could not be interpreted (stored as NaN)
for qc, rating in zip(anat_qc['1'], qc_l):
    if pd.isna(rating):
        print(qc)
anat_qc['Ratings'] = qc_l
# Rename the headerless columns
anat_qc.rename(columns={'0':'Civet_ID', '1':'Manual_Rating'}, inplace=True)
# Retain only useful columns
anat_useful_qc = anat_qc[['Civet_ID', 'Manual_Rating',
                          'Site', 'Subject',
                          'Session', 'Run',
                          'SUB_ID', 'Ratings']]

In [12]:
# Preview the first rows of the reduced anatomical QC table
anat_useful_qc.head()

Unnamed: 0,Civet_ID,Manual_Rating,Site,Subject,Session,Run,SUB_ID,Ratings
0,Caltech+0051456+session_1+anat_1,-1 (gradient too high around sensorimotor str...,Caltech,51456,session_1,anat_1,51456,2
1,Caltech+0051457+session_1+anat_1,-1 (gradient too high around sm strip and to ...,Caltech,51457,session_1,anat_1,51457,2
2,Caltech+0051458+session_1+anat_1,-1 (slightly too high gradient on sm strip,Caltech,51458,session_1,anat_1,51458,2
3,Caltech+0051459+session_1+anat_1,-1 (more problematic in V1 than in sm strip; ...,Caltech,51459,session_1,anat_1,51459,2
4,Caltech+0051460+session_1+anat_1,-1 (V1 worse than sm strip),Caltech,51460,session_1,anat_1,51460,2


In [13]:
# Look at the last rows of the full anatomical QC table
anat_qc.tail()

Unnamed: 0,Civet_ID,Manual_Rating,2,3,4,Site,Subject,Session,Run,SUB_ID,Ratings
2289,ABIDEII-UPSM_Long+50049+followup_1+anat_1,-+1 (a bit better),,,,ABIDEII-UPSM_Long,50049,followup_1,anat_1,50049,3
2290,ABIDEII-UPSM_Long+50050+baseline+anat_1,-+1 (quite reasonable),,,,ABIDEII-UPSM_Long,50050,baseline,anat_1,50050,3
2291,ABIDEII-UPSM_Long+50050+followup_1+anat_1,-+1 (just a tiny bit worse),,,,ABIDEII-UPSM_Long,50050,followup_1,anat_1,50050,3
2292,ABIDEII-UPSM_Long+50051+baseline+anat_1,-1 (hi grad in left occpol and right tpol and...,,,,ABIDEII-UPSM_Long,50051,baseline,anat_1,50051,2
2293,ABIDEII-UPSM_Long+50051+followup_1+anat_1,-+1 (about same and pattern of hi grad differ...,,,,ABIDEII-UPSM_Long,50051,followup_1,anat_1,50051,3


# QC overlap of CT and FC in ABIDE 1

In [14]:
# Inner-join the anatomical (CT) and functional (FC) QC tables on the subject ID
merged_qc = anat_useful_qc.merge(fc_qc_full, on='SUB_ID', how='inner')
# Flag rows whose Civet_ID mentions 'ABIDEII' but not 'baseline'
# (duplicates, i.e. follow up sessions in ABIDE)
is_extra_session = [('ABIDEII' in civet_id) and ('baseline' not in civet_id)
                    for civet_id in merged_qc['Civet_ID']]
ind = np.where(is_extra_session)[0]
# And remove them
merged_qc.drop(ind, inplace=True)

In [15]:
# Preview the first rows of the combined CT/FC QC table
merged_qc.head()

Unnamed: 0,Civet_ID,Manual_Rating,Site,Subject,Session,Run,SUB_ID,Ratings,id_subject,status,anat,comment_anat,func,comment_func,frames_scrubbed,frames_OK,FD,FD_scrubbed
0,Caltech+0051456+session_1+anat_1,-1 (gradient too high around sensorimotor str...,Caltech,51456,session_1,anat_1,51456,2,X0051456,Fail,OK,,Fail,major IC_4,95.0,55.0,0.434593,0.328121
1,Caltech+0051457+session_1+anat_1,-1 (gradient too high around sm strip and to ...,Caltech,51457,session_1,anat_1,51457,2,X0051457,Fail,OK,,Fail,major IC_4,16.0,134.0,0.20716,0.170893
2,Caltech+0051458+session_1+anat_1,-1 (slightly too high gradient on sm strip,Caltech,51458,session_1,anat_1,51458,2,X0051458,Fail,OK,,Fail,major IC_4,18.0,132.0,0.322805,0.288872
3,Caltech+0051459+session_1+anat_1,-1 (more problematic in V1 than in sm strip; ...,Caltech,51459,session_1,anat_1,51459,2,X0051459,Maybe,OK,,Maybe,medium IC_4,0.0,150.0,0.152963,0.152963
4,Caltech+0051460+session_1+anat_1,-1 (V1 worse than sm strip),Caltech,51460,session_1,anat_1,51460,2,X0051460,Fail,OK,,Fail,major IC_4,0.0,150.0,0.207277,0.207277


In [16]:
# Step 1: join the phenotype table onto the combined QC table
tmp = merged_qc.merge(abide_pheno, on='SUB_ID', how='inner')
# Step 2: add the brain volume (BV) column.
# Both joins are inner joins, so only subjects present in every table remain.
a1_full = tmp.merge(bv[['SUB_ID', 'BV']], on='SUB_ID', how='inner')

In [17]:
# Build the full path templates for the CT and FC files
ctp = os.path.join(ct_p, ct_t)
fcp = os.path.join(fc_p, fc_t)
# A subject counts as available when the formatted path matches at least one
# file on disk
ct_available = [
    bool(glob.glob(ctp.format(site, sub_id, session, run)))
    for site, sub_id, session, run in zip(a1_full['Site'], a1_full['SUB_ID'],
                                          a1_full['Session'], a1_full['Run'])
]
fc_available = [bool(glob.glob(fcp.format(sub_id)))
                for sub_id in a1_full['SUB_ID']]
# Store this information
a1_full['ct_available'] = ct_available
a1_full['fc_available'] = fc_available

In [34]:
# Subject IDs that belong to the PSM sample
psm_list = psm_pheno['SUB_ID'].values
# One boolean per row: is this subject's ID among the PSM IDs?
psm_ind = a1_full['SUB_ID'].isin(psm_list).tolist()
# Store this information in the table as well
a1_full['in_psm'] = psm_ind

In [35]:
# Preview the first rows of the complete table
a1_full.head()

Unnamed: 0,Civet_ID,Manual_Rating,Site,Subject,Session,Run,SUB_ID,Ratings,id_subject,status,...,WISC_IV_LET_NUM_SCALED,WISC_IV_CODING_SCALED,WISC_IV_SYM_SCALED,EYE_STATUS_AT_SCAN,AGE_AT_MPRAGE,BMI,BV,ct_available,fc_available,in_psm
0,Caltech+0051456+session_1+anat_1,-1 (gradient too high around sensorimotor str...,Caltech,51456,session_1,anat_1,51456,2,X0051456,Fail,...,,,,Closed,,,1218639.0,True,True,False
1,Caltech+0051457+session_1+anat_1,-1 (gradient too high around sm strip and to ...,Caltech,51457,session_1,anat_1,51457,2,X0051457,Fail,...,,,,Closed,,,1430570.0,True,True,False
2,Caltech+0051458+session_1+anat_1,-1 (slightly too high gradient on sm strip,Caltech,51458,session_1,anat_1,51458,2,X0051458,Fail,...,,,,Closed,,,1222213.0,True,True,False
3,Caltech+0051459+session_1+anat_1,-1 (more problematic in V1 than in sm strip; ...,Caltech,51459,session_1,anat_1,51459,2,X0051459,Maybe,...,,,,Closed,,,1264665.0,True,True,False
4,Caltech+0051460+session_1+anat_1,-1 (V1 worse than sm strip),Caltech,51460,session_1,anat_1,51460,2,X0051460,Fail,...,,,,Closed,,,1158337.0,True,True,False


In [36]:
# List all column names of the complete table
a1_full.columns

Index(['Civet_ID', 'Manual_Rating', 'Site', 'Subject', 'Session', 'Run',
       'SUB_ID', 'Ratings', 'id_subject', 'status', 'anat', 'comment_anat',
       'func', 'comment_func', 'frames_scrubbed', 'frames_OK', 'FD',
       'FD_scrubbed', 'SITE_ID', 'DX_GROUP', 'DSM_IV_TR', 'AGE_AT_SCAN', 'SEX',
       'HANDEDNESS_CATEGORY', 'HANDEDNESS_SCORES', 'FIQ', 'VIQ', 'PIQ',
       'FIQ_TEST_TYPE', 'VIQ_TEST_TYPE', 'PIQ_TEST_TYPE',
       'ADI_R_SOCIAL_TOTAL_A', 'ADI_R_VERBAL_TOTAL_BV', 'ADI_RRB_TOTAL_C',
       'ADI_R_ONSET_TOTAL_D', 'ADI_R_RSRCH_RELIABLE', 'ADOS_MODULE',
       'ADOS_TOTAL', 'ADOS_COMM', 'ADOS_SOCIAL', 'ADOS_STEREO_BEHAV',
       'ADOS_RSRCH_RELIABLE', 'ADOS_GOTHAM_SOCAFFECT', 'ADOS_GOTHAM_RRB',
       'ADOS_GOTHAM_TOTAL', 'ADOS_GOTHAM_SEVERITY', 'SRS_VERSION',
       'SRS_RAW_TOTAL', 'SRS_AWARENESS', 'SRS_COGNITION', 'SRS_COMMUNICATION',
       'SRS_MOTIVATION', 'SRS_MANNERISMS', 'SCQ_TOTAL', 'AQ_TOTAL',
       'COMORBIDITY', 'CURRENT_MED_STATUS', 'MEDICATION_NAME',
       

In [37]:
# Save this matrix as CSV; index=False leaves the row index out of the file
a1_full.to_csv(abide_full_out, index=False)

In [28]:
# Show where the table was written
abide_full_out

'/home/surchs/sim_big/PROJECT/abide_hps/pheno/abide_1_complete.csv'