In [None]:
import pandas as pd

# Data normalization

### Tau_with_Demographics.csv


Here, the Tau values of the left and right cerebellum cortex are averaged for each row. 
This average is subtracted from every brain-region column, and all negative values are set to 0.

APOE4 normalization: centers the values on 0, mapping [0, 2] to [-1, 1], by subtracting 1 from the column.

In [None]:
# Normalizes ./data/Tau_with_Demographics.csv and writes the result to
# ./data/Tau_with_Demographics_Normalized.csv.
#
# Steps, in order:
#   1. subtract the per-row cerebellum-cortex average from every region column
#      and clip negatives to 0
#   2. shift APOE4 by -1
#   3. drop bookkeeping, duplicate, NaN-heavy and non-numeric columns
#   4. drop rows that still contain NaNs
#   5. scale the remaining non-region columns whose max exceeds 1 by their max
#   6. drop the weighted summary columns and save

# 'Unnamed: 0' is the name pandas gives a header-less CSV column (typically a
# saved index). errors='ignore' keeps the cell runnable when the file has no
# such column; any remaining 'Unnamed' columns are removed further down.
twd_df = pd.read_csv("./data/Tau_with_Demographics.csv").drop(columns=['Unnamed: 0'], errors='ignore')

# Region column names are built from two base lists so that both hemispheres
# always stay in sync. The resulting order matches the original hand-written
# list: ctx-rh-*, ctx-lh-*, Left-*, Right-*.
cortical_names = [
    'bankssts', 'caudalanteriorcingulate', 'caudalmiddlefrontal', 'cuneus', 'entorhinal', 'fusiform',
    'inferiorparietal', 'inferiortemporal', 'isthmuscingulate', 'lateraloccipital', 'lateralorbitofrontal',
    'lingual', 'medialorbitofrontal', 'middletemporal', 'parahippocampal', 'paracentral', 'parsopercularis',
    'parsorbitalis', 'parstriangularis', 'pericalcarine', 'postcentral', 'posteriorcingulate', 'precentral',
    'precuneus', 'rostralanteriorcingulate', 'rostralmiddlefrontal', 'superiorfrontal', 'superiorparietal',
    'superiortemporal', 'supramarginal', 'frontalpole', 'temporalpole', 'transversetemporal', 'insula',
]
subcortical_names = [
    'Cerebellum-Cortex', 'Thalamus-Proper', 'Caudate', 'Putamen', 'Pallidum', 'Hippocampus', 'Amygdala',
    'Accumbens-area', 'VentralDC',
]
regions = (
    [f'ctx-rh-{name}' for name in cortical_names]
    + [f'ctx-lh-{name}' for name in cortical_names]
    + [f'Left-{name}' for name in subcortical_names]
    + [f'Right-{name}' for name in subcortical_names]
)
assert len(regions) == 86, "expected 2 x 34 cortical + 2 x 9 subcortical region columns"

# Per-row mean of the two cerebellum-cortex columns, used as the reference
# value. It is kept as a standalone Series so no temporary column has to be
# added to (and later removed from) the frame.
cerebellum_avg = twd_df[['Left-Cerebellum-Cortex', 'Right-Cerebellum-Cortex']].mean(axis=1)

# Subtract the reference from every region column, then set negatives to 0.
# Columns are replaced (df[cols] = ...) rather than written through
# .loc[:, cols], so a dtype change never triggers pandas' "incompatible dtype"
# setitem warning/error.
# (An additional per-column max scaling of the regions was tried by the
# original author and left disabled; it is intentionally not applied.)
twd_df[regions] = twd_df[regions].subtract(cerebellum_avg, axis=0).clip(lower=0)

# APOE4 normalization [0,2] -> [-1,1]
twd_df['APOE4'] = twd_df['APOE4'] - 1

# Remove unnamed columns (names of the form 'Unnamed: N')
remove = [col for col in twd_df.columns if 'Unnamed' in col.split(':')]
twd_df = twd_df.drop(columns=remove)

# Remove merge columns (names with 'merge' as an underscore-separated token)
remove = [col for col in twd_df.columns if 'merge' in col.split('_')]
twd_df = twd_df.drop(columns=remove)

# Remove duplicate columns (names with a '.1' component)
remove = [col for col in twd_df.columns if '1' in col.split('.')]
twd_df = twd_df.drop(columns=remove)

# Remove the columns known to contain NaNs. A blanket dropna(axis=1) is not
# used because, per the original author's note, it also removed APOE4.
remove = ['FDG', 'PIB', 'AV45', 'ABETA', 'TAU', 'PTAU']
twd_df = twd_df.drop(columns=remove)

# TODO: add one-hot encoding for sex and education level (not implemented).

# Remove columns that are not numeric. The check is dtype-based instead of
# inspecting only the first row, so a text column whose first cell is NaN is
# dropped as well (it would otherwise break the max() comparison below), and
# an empty frame no longer raises IndexError.
str_idx = [i for i, dtype in enumerate(twd_df.dtypes) if not pd.api.types.is_numeric_dtype(dtype)]
twd_df = twd_df.drop(columns=twd_df.columns[str_idx])

# Remove rows with NaNs
twd_df = twd_df.dropna(axis=0)

# Scale the remaining columns by their maximum (except 'ml_stage', which the
# original comment says is used in nexis, the brain regions, and 'RID').
# Only columns whose max exceeds 1 are touched. Dividing by the max yields
# values in [0, 1] only when the column has no negative entries.
adjust = [
    col for col in twd_df.columns
    if col != 'ml_stage' and col != 'RID' and col not in regions and twd_df[col].max() > 1
]
if adjust:
    # Column replacement lets integer columns become float without an
    # in-place upcast.
    twd_df[adjust] = twd_df[adjust] / twd_df[adjust].max()

# Remove the weighted summary columns to minimize data leakage.
# NOTE(review): the original comment also mentions removing the ID, but 'RID'
# is not dropped here - confirm whether it should be.
twd_df = twd_df.drop(columns=['W_average_hippo','W_ADAS11','W_average_tau','W_average_frontal','W_average_temporal','W_average_parietal','W_average_occipital'])

# Save to CSV. The row index is written too (to_csv default), so the file
# gets an unnamed leading column holding the surviving original row labels.
twd_df.to_csv('./data/Tau_with_Demographics_Normalized.csv')
twd_df



### connectome_mean80_fibercount.csv

All values are normalized to between 0 and 1 by dividing by the maximum value in the DataFrame.

In [None]:
# Load the connectome table and rescale every entry by the single largest
# value found anywhere in the frame (column-wise max, then max of those).
# NOTE(review): assumes every column read from the CSV is numeric connectivity
# data; an index/label column would take part in the max - confirm.
connectome = pd.read_csv("./data/connectome_mean80_fibercount.csv")
global_peak = connectome.max().max()
connectome = connectome / global_peak

# Write the scaled table next to the input. The row index is written as well
# (to_csv default).
connectome.to_csv('./data/connectome_mean80_fibercount_normalized.csv')