In [128]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

local_dir = '/home/will/Projects/PVS'
data_dir = os.path.join(local_dir, 'wscore_data')

In [129]:
# load labelset data
label_df = pd.read_excel(os.path.join(local_dir,'Schaefer2018_200Parcels_17Networks_order.xlsx'))
label_names = list(label_df['label_abbrev_name'])
label_full_names = ['17Networks_' + x for x in label_names]
label_full_names[0:10]

['17Networks_LH_VisCent_ExStr_1',
 '17Networks_LH_VisCent_ExStr_2',
 '17Networks_LH_VisCent_Striate_1',
 '17Networks_LH_VisCent_ExStr_3',
 '17Networks_LH_VisCent_ExStr_4',
 '17Networks_LH_VisCent_ExStr_5',
 '17Networks_LH_VisPeri_ExStrInf_1',
 '17Networks_LH_VisPeri_ExStrInf_2',
 '17Networks_LH_VisPeri_ExStrInf_3',
 '17Networks_LH_VisPeri_StriCal_1']

## Create spreadsheet

In [130]:
# first combine all the dataframes
df_list = []
pvs_df = pd.DataFrame()
for fname in os.listdir(data_dir):
    fpath = os.path.join(data_dir, fname)
    df = pd.read_csv(fpath)
    # add session id as column
    ses_id = fname.split('ses-')[1].split('_')[0]
    df.insert(0, 'SessionID', ses_id)
    # pivot dataframe
    pvtd = df.pivot(index='SessionID', columns='label_full_name', values='w-score')
    # add subject id as column
    sub_id = fname.split('sub-')[1].split('_')[0]
    pvtd.insert(0, 'INDDID', sub_id)
    
    df_list.append(pvtd)

pvs_df = pd.concat(df_list)
pvs_df.head()

label_full_name,INDDID,17Networks_LH_ContA_Cingm_1,17Networks_LH_ContA_IPS_1,17Networks_LH_ContA_IPS_2,17Networks_LH_ContA_IPS_3,17Networks_LH_ContA_PFCd_1,17Networks_LH_ContA_PFCl_1,17Networks_LH_ContA_PFCl_2,17Networks_LH_ContA_PFCl_3,17Networks_LH_ContA_PFClv_1,...,17Networks_RH_VisCent_ExStr_3,17Networks_RH_VisCent_ExStr_4,17Networks_RH_VisCent_ExStr_5,17Networks_RH_VisCent_Striate_1,17Networks_RH_VisPeri_ExStrInf_1,17Networks_RH_VisPeri_ExStrInf_2,17Networks_RH_VisPeri_ExStrSup_1,17Networks_RH_VisPeri_ExStrSup_2,17Networks_RH_VisPeri_ExStrSup_3,17Networks_RH_VisPeri_StriCal_1
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115562x20170309x3T,115562,-0.6944,2.321097,1.294149,0.981595,2.273557,1.476498,1.53053,1.369574,2.312892,...,1.965991,2.492056,1.702256,0.567938,1.523095,1.199445,1.40606,1.764025,1.783238,1.530509
116707x20190117x3T,116707,0.954576,2.575742,2.154192,1.745228,1.977333,2.407527,2.38777,2.741372,2.418698,...,2.49418,2.851573,2.569641,1.488172,2.2343,0.711866,2.009206,2.298937,2.593222,1.691183
124288x20190829x3T,124288,0.118939,-0.300105,0.478074,-0.321055,-0.128036,-0.660108,-0.273322,0.167206,-1.672911,...,0.722583,0.760961,0.250942,0.552577,1.486216,-0.048384,0.74697,0.872201,-0.060272,1.033843
124868x20190822x3T,124868,-1.772232,-0.167389,0.136095,-1.004691,-1.113349,-0.45538,0.446414,-0.068461,0.359397,...,0.399515,-0.33856,0.578593,0.708675,0.580514,0.266053,-0.412246,-0.022279,-0.277576,0.181376
119035x20161110x3T,119035,0.304676,0.786516,1.186541,1.281763,0.313387,0.810012,1.546734,0.989584,1.379923,...,1.154637,1.569618,1.354581,1.399546,0.548016,-0.342485,0.473402,1.232933,0.960576,1.385054


In [131]:
pvs_df = pvs_df.reindex(['INDDID'] + label_full_names, axis=1)

In [132]:
pvs_df.head()

label_full_name,INDDID,17Networks_LH_VisCent_ExStr_1,17Networks_LH_VisCent_ExStr_2,17Networks_LH_VisCent_Striate_1,17Networks_LH_VisCent_ExStr_3,17Networks_LH_VisCent_ExStr_4,17Networks_LH_VisCent_ExStr_5,17Networks_LH_VisPeri_ExStrInf_1,17Networks_LH_VisPeri_ExStrInf_2,17Networks_LH_VisPeri_ExStrInf_3,...,17Networks_RH_DefaultB_AntTemp_1,17Networks_RH_DefaultB_PFCd_1,17Networks_RH_DefaultB_PFCv_1,17Networks_RH_DefaultC_IPL_1,17Networks_RH_DefaultC_Rsp_1,17Networks_RH_DefaultC_PHC_1,17Networks_RH_TempPar_1,17Networks_RH_TempPar_2,17Networks_RH_TempPar_3,17Networks_RH_TempPar_4
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115562x20170309x3T,115562,1.937467,0.86529,0.304432,1.894185,2.757598,2.196862,2.073014,1.088571,2.403711,...,1.827341,1.307841,2.623699,2.756166,1.201103,-0.057126,2.297526,1.692659,2.801574,2.59545
116707x20190117x3T,116707,1.537478,1.356918,-0.205439,2.195686,2.415308,2.577049,2.282898,1.579406,1.198403,...,1.598266,2.386526,2.787714,2.394199,1.925642,0.751033,1.22074,2.141935,2.743767,2.464402
124288x20190829x3T,124288,1.116862,1.06812,0.906187,0.229293,0.771403,0.559947,0.999771,0.995084,0.018267,...,0.294485,0.023493,0.213185,-0.309003,-0.482059,-0.050471,0.807194,-0.119892,0.624068,0.022494
124868x20190822x3T,124868,0.256561,0.252578,0.084412,0.782695,-0.130224,0.082596,0.541404,0.379766,-0.915315,...,-0.084277,-0.778188,0.854505,0.008148,-0.09758,-0.286652,-0.167386,0.398563,-0.043057,-0.41333
119035x20161110x3T,119035,1.029089,0.912492,0.780854,1.434159,1.459308,1.20531,1.100022,0.780125,-1.272066,...,-0.063832,1.250027,0.52382,1.159019,1.021646,-0.213767,-0.282603,0.273786,1.304768,0.884312


In [133]:
# now add in ClinicalRead and AgeAtMRI from original spreadsheet
orig_spreadsheet = os.path.join(local_dir, 'PVS_subjects.xlsx')
orig_df = pd.read_excel(orig_spreadsheet)
orig_df.head()

# go through original df and get Age and Read from each subject, then add those to a dictionary of sid: {age, read}
d = {}
for row in orig_df.itertuples(index=False):
    d[row.INDDID] = [row.AgeatMRI, row.Clinical_Read]

d[122975]

[72, 'Negative']

In [136]:
for k in d.keys():
    # k is INDDID; assign k's Age and Read to respective columns in main dataframe
    pvs_df.loc[(pvs_df.INDDID==str(k)),'AgeAtMRI']=d[k][0]
    pvs_df.loc[(pvs_df.INDDID==str(k)),'Clinical_Read']=d[k][1]
    
    # shift columns to first position
    age = pvs_df.pop('AgeAtMRI')
    read = pvs_df.pop('Clinical_Read')
    pvs_df.insert(1, 'AgeAtMRI', age)
    pvs_df.insert(2, 'Clinical_Read', read)
    
pvs_df.to_csv(os.path.join(local_dir, 'PVS_data.csv'))
pvs_df.head()

label_full_name,INDDID,AgeAtMRI,Clinical_Read,17Networks_LH_VisCent_ExStr_1,17Networks_LH_VisCent_ExStr_2,17Networks_LH_VisCent_Striate_1,17Networks_LH_VisCent_ExStr_3,17Networks_LH_VisCent_ExStr_4,17Networks_LH_VisCent_ExStr_5,17Networks_LH_VisPeri_ExStrInf_1,...,17Networks_RH_DefaultB_AntTemp_1,17Networks_RH_DefaultB_PFCd_1,17Networks_RH_DefaultB_PFCv_1,17Networks_RH_DefaultC_IPL_1,17Networks_RH_DefaultC_Rsp_1,17Networks_RH_DefaultC_PHC_1,17Networks_RH_TempPar_1,17Networks_RH_TempPar_2,17Networks_RH_TempPar_3,17Networks_RH_TempPar_4
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115562x20170309x3T,115562,89.0,Positive,1.937467,0.86529,0.304432,1.894185,2.757598,2.196862,2.073014,...,1.827341,1.307841,2.623699,2.756166,1.201103,-0.057126,2.297526,1.692659,2.801574,2.59545
116707x20190117x3T,116707,86.0,Positive,1.537478,1.356918,-0.205439,2.195686,2.415308,2.577049,2.282898,...,1.598266,2.386526,2.787714,2.394199,1.925642,0.751033,1.22074,2.141935,2.743767,2.464402
124288x20190829x3T,124288,67.0,Negative,1.116862,1.06812,0.906187,0.229293,0.771403,0.559947,0.999771,...,0.294485,0.023493,0.213185,-0.309003,-0.482059,-0.050471,0.807194,-0.119892,0.624068,0.022494
124868x20190822x3T,124868,65.0,Negative,0.256561,0.252578,0.084412,0.782695,-0.130224,0.082596,0.541404,...,-0.084277,-0.778188,0.854505,0.008148,-0.09758,-0.286652,-0.167386,0.398563,-0.043057,-0.41333
119035x20161110x3T,119035,75.0,Negative,1.029089,0.912492,0.780854,1.434159,1.459308,1.20531,1.100022,...,-0.063832,1.250027,0.52382,1.159019,1.021646,-0.213767,-0.282603,0.273786,1.304768,0.884312


## Reviewing quantities 

In [8]:
#ax = sns.boxplot(x="Clinical_Read", y="17Networks_LH_DefaultB_Temp_1", data=pvs_df)

In [137]:
# break down INDDID list into amyloid positive/negative patients
pos_inddid = list(pvs_df[pvs_df['Clinical_Read']=='Positive']['INDDID'])
neg_inddid = list(pvs_df[pvs_df['Clinical_Read']=='Negative']['INDDID'])

# expect some INDDID duplicates because of longitudinal sessions
print(len(pos_inddid))
print(len(neg_inddid))
print(len(set(pos_inddid)))
print(len(set(neg_inddid)))

82
93
60
61


In [138]:
# Code snippet for getting scores from a particular region.
pos_temp_wscores = list(pvs_df[pvs_df['Clinical_Read']=='Positive']['17Networks_LH_DefaultB_Temp_2'])
neg_temp_wscores = list(pvs_df[pvs_df['Clinical_Read']=='Negative']['17Networks_LH_DefaultB_Temp_2'])
# import scipy
# from scipy import stats
# scipy.stats.ttest_ind(pos_temp_wscores, neg_temp_wscores)

# Should all be unique
print(len(pos_temp_wscores))
print(len(set(pos_temp_wscores)))
print(len(neg_temp_wscores))
print(len(set(neg_temp_wscores)))

82
82
93
93


## Weight w-scores by ROI volume

Manually copy weights from /home/will/Projects/healthy-t1-dataset/labels/Schaefer2018_200Parcels_17Networks_order.xlsx to PVS_data.csv.

In [139]:
vol_list = list(label_df['volume'])
vol_list[0:10]

[9225, 6346, 7116, 6267, 6246, 6376, 4634, 4835, 1239, 8181]

In [158]:
# go through every column of pvs_data (i.e. every ROI) and multiply by corresponding volume
# gives a tuple of column name and series for each column in the dataframe
w_pvs_df = pvs_df.copy()
for (column_name, column_data) in pvs_df.items():
    if '17Networks' in column_name:
        abbrev_name = column_name.split('17Networks_')[1]
        volume = label_df.loc[(label_df.label_abbrev_name==abbrev_name),'volume']
        
        wscores = np.array([float(x) for x in column_data.values])
        weighted_wscores = np.multiply(wscores, int(volume))
        #print("{} | {} | {} | {}".format(column_data[0], abbrev_name, int(volume), weighted_wscores[0])
        
        # assign weighted values to new column
        new_col_name = 'weighted_' + column_name
        w_pvs_df[new_col_name] = weighted_wscores
                

In [159]:
# calculate weighted averages for all regions, frontal regions, etc. and insert into dataframe
wa_total_list = []
wa_pfron_list = []
for index, row in w_pvs_df.iterrows():

    sum_weights = np.sum(label_df['volume']) # be careful of order
    weighted_cols = [col for col in row.index if col.startswith('weighted')]

    # calculate weighted average for all 200 regions
    weighted_wscores = row[weighted_cols]
    sum_weighted = np.sum(weighted_wscores)
    wa_total = np.divide(sum_weighted, sum_weights)
    
    # calculate weighted average for frontal regions
    pfron_cols = [col for col in weighted_cols if 'PFC' in col or 'OFC' in col]
    pfron_weighted_wscores = row[pfron_cols]
    pfron_weighted = np.sum(pfron_weighted_wscores)
    wa_pfron = np.divide(pfron_weighted, sum_weights)
    
    wa_total_list.append(wa_total)
    wa_pfron_list.append(wa_pfron)
    

# assign weighted averages to new column(s)
w_pvs_df.insert(len(w_pvs_df.columns), 'total_weighted_average', wa_total_list, allow_duplicates=False)
w_pvs_df.insert(len(w_pvs_df.columns), 'prefrontal_weighted_average', wa_pfron_list, allow_duplicates=False)

w_pvs_df.to_csv(os.path.join(local_dir, 'weighted_PVS_data.csv'))

In [160]:
w_pvs_df.head()

label_full_name,INDDID,AgeAtMRI,Clinical_Read,17Networks_LH_VisCent_ExStr_1,17Networks_LH_VisCent_ExStr_2,17Networks_LH_VisCent_Striate_1,17Networks_LH_VisCent_ExStr_3,17Networks_LH_VisCent_ExStr_4,17Networks_LH_VisCent_ExStr_5,17Networks_LH_VisPeri_ExStrInf_1,...,weighted_17Networks_RH_DefaultB_PFCv_1,weighted_17Networks_RH_DefaultC_IPL_1,weighted_17Networks_RH_DefaultC_Rsp_1,weighted_17Networks_RH_DefaultC_PHC_1,weighted_17Networks_RH_TempPar_1,weighted_17Networks_RH_TempPar_2,weighted_17Networks_RH_TempPar_3,weighted_17Networks_RH_TempPar_4,total_weighted_average,prefrontal_weighted_average
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115562x20170309x3T,115562,89.0,Positive,1.937467,0.86529,0.304432,1.894185,2.757598,2.196862,2.073014,...,24568.316148,18088.715474,4850.051989,-319.905954,14361.835414,6613.219582,11239.913677,8837.507281,1.895779,0.541931
116707x20190117x3T,116707,86.0,Positive,1.537478,1.356918,-0.205439,2.195686,2.415308,2.577049,2.282898,...,26104.14982,15713.126473,7775.743461,4205.786703,7630.842623,8368.540131,11007.995171,8391.28983,2.156533,0.691856
124288x20190829x3T,124288,67.0,Negative,1.116862,1.06812,0.906187,0.229293,0.771403,0.559947,0.999771,...,1996.266629,-2027.984322,-1946.555105,-282.639022,5045.767012,-468.416483,2503.760738,76.592486,0.25325,0.025977
124868x20190822x3T,124868,65.0,Negative,0.256561,0.252578,0.084412,0.782695,-0.130224,0.082596,0.541404,...,8001.589059,53.475662,-394.02874,-1605.253393,-1046.33115,1557.1851,-172.744248,-1407.389392,-0.108497,-0.026737
119035x20161110x3T,119035,75.0,Negative,1.029089,0.912492,0.780854,1.434159,1.459308,1.20531,1.100022,...,4905.046409,7606.643615,4125.407888,-1197.094772,-1766.552586,1069.681778,5234.728559,3011.082462,0.728156,0.258001
