# Data Cleaning / Wrangling

In [2]:
import os.path
import numpy as np
import networkx as nx
import pickle as pkl
import importlib
import sys
from collections import defaultdict, Counter

import controllability as ctrb
importlib.reload(ctrb)
import Towlson_group_code.data_io as myFunc
importlib.reload(myFunc)
import Prevent.PREVENT_functions as prev_fct
importlib.reload(prev_fct)

# Record the interpreter version alongside the notebook output.
print(sys.version)

# Locations used by the cells below. All are relative paths, so they resolve
# against the notebook's current working directory.
# Folder holding the cached pickles that loadBrainCSV reads and writes.
PICKLE_PATH = '../../PREVENT_Study/pickles/'
# Folder that loadBrainCSV walks for per-file connectome pickles.
CONNECTOME_DATA_PATH = '../../PREVENT_Study/data/Individual_VolumeNormalized_dataframes/'
# Alternative (disabled) data source.
# NOTE(review): this path starts with '../' while the active one uses '../../' - confirm before re-enabling.
# CONNECTOME_DATA_PATH = '../PREVENT_Study/data/Individual_NONnormalized_dataframes/'
# Output folder for figures; not referenced by the cells visible here.
FIGURE_PATH = '../../PREVENT_Study/figures/'

3.8.12 (default, Mar  2 2022, 12:59:08) 
[Clang 13.0.0 (clang-1300.0.27.3)]


# Create Networks from Volume Normalized Connectomes
- [x] Calculate controllability values for each network
- [x] Record network metric values into graphs

In [None]:
def loadBrainCSV(time, stabilize=True):
    """Load, or build and cache, the cleaned connectome graphs for one time tag.

    If ``{time}_connectomes.pkl`` already exists under ``PICKLE_PATH`` the two
    dictionaries stored in it are returned as-is. Otherwise every ``.pickle``
    file found under ``CONNECTOME_DATA_PATH`` whose name contains ``time`` is
    loaded, the unwanted regions are dropped, the matrix is turned into a
    networkx graph, average and modal controllability are attached as the
    node attributes ``'avgCtrb'`` and ``'modCtrb'``, and the result is written
    to the cache pickle. While building, everything printed (per-file
    problems and the final summary) goes to
    ``../../PREVENT_Study/data/{time}_error_log.txt`` instead of the console.

    Args:
        time: Tag that must appear in a file name for the file to be used
            (the notebook calls this with 'bl', 'Y1', 'Y3' and 'Y5').
        stabilize: When True, divide each adjacency matrix by its mean
            non-zero weight before building the graph; files whose matrix
            is entirely zero are skipped.

    Returns:
        A tuple ``(hc_data, p_data, bad_regions)``. ``hc_data`` and ``p_data``
        map the ID taken from the file name to its graph, for files whose
        third ``_``-separated name field is not / is ``'tia'`` respectively.
        ``bad_regions`` lists the regions reported by
        ``prev_fct.rank_nodes`` for rejected files; it is always empty when
        the data came from the cache, because the cache does not store it.
    """
    # Imported here so the function does not rely on an extra import cell.
    from contextlib import redirect_stdout

    hc_data = {}
    p_data = {}
    bad_regions = []
    pklPath = os.path.join(PICKLE_PATH, f'{time}_connectomes.pkl')

    # Fast path: reuse the cached result of an earlier run.
    # NOTE: pickle must only be used on trusted, locally produced files.
    if os.path.exists(pklPath):
        with open(pklPath, 'rb') as f:
            hc_data = pkl.load(f)
            p_data = pkl.load(f)
        return hc_data, p_data, bad_regions

    # Regions removed from both axes of every connectome before analysis.
    delete_regions = [
        'Optic-Chiasm', '3rd-Ventricle', 'CSF', '4th-Ventricle',
        'Left-vessel', 'Right-vessel', 'Left-Lateral-Ventricle',
        'Left-Inf-Lat-Vent', 'Right-Inf-Lat-Vent', 'Right-Lateral-Ventricle',
        'Brain-Stem', 'CC_Posterior', 'CC_Mid_Posterior', 'CC_Central',
        'CC_Mid_Anterior', 'CC_Anterior', 'Left-Cerebral-White-Matter',
        'Left-Cerebellum-White-Matter', 'Left-Cerebellum-Cortex',
        'Right-Cerebral-White-Matter', 'Right-Cerebellum-White-Matter',
        'Right-Cerebellum-Cortex', 'Left-VentralDC', 'Left-choroid-plexus',
        'Right-VentralDC', 'Right-choroid-plexus', 'WM-hypointensities',
        'ctx-lh-unknown', 'ctx-rh-unknown',
        'Right-Pallidum', 'Left-Pallidum',
        'ctx_lh_G_insular_short', 'ctx_rh_G_insular_short',
    ]

    # File-name fragments to exclude by hand; currently none.
    # Previously used: ['174_bl_con', '087_Y5_con', '044_Y5_tia']
    remove = []

    bad_files = 0
    hc_pop = 0
    tia_pop = 0
    log_path = f'../../PREVENT_Study/data/{time}_error_log.txt'

    # The context managers close the log and restore sys.stdout even if a
    # file fails to load part-way through the walk.
    with open(log_path, 'w') as log_file, redirect_stdout(log_file):
        for root, _dirs, files in os.walk(CONNECTOME_DATA_PATH):
            for file in files:
                if not file.endswith('.pickle') or time not in file:
                    continue
                if any(r in file for r in remove):
                    continue

                # Needed by the "no data" message below, so derive it up front.
                patID = file.split("_")[0]

                # os.walk roots have no trailing separator for sub-folders,
                # so the path must be joined rather than concatenated.
                with open(os.path.join(root, file), 'rb') as f:
                    cdf = pkl.load(f)

                # Clean up DF connectome
                adjMatrix = cdf.drop(delete_regions, axis=0)
                adjMatrix = adjMatrix.drop(delete_regions, axis=1)
                if stabilize:
                    adjMat = adjMatrix.to_numpy()
                    nonZero = np.count_nonzero(adjMat)
                    if nonZero == 0:
                        print("No brain data #", patID)
                        continue
                    meanWeight = adjMat.sum() / nonZero
                    adjMatrix = adjMatrix.div(meanWeight)

                G = nx.from_pandas_adjacency(adjMatrix)
                badGraph, G, badRegions = prev_fct.rank_nodes(G)
                if badGraph:
                    # Rejected file: log it with its offending regions and skip.
                    print(file[:10])
                    bad_files += 1
                    for b in badRegions:
                        bad_regions.append(b)
                        print('\t*', b)
                    continue

                # Avg Controllability calc and ranking
                avgCtrbDict = ctrb.avg_control(G)
                nx.set_node_attributes(G, avgCtrbDict, name='avgCtrb')
                badGraph, G, badRegions = prev_fct.rank_nodes(G, attr='avgCtrb')
                if badGraph:
                    print(f"Could not rank avg ctrb for {file}")

                # Modal controllability calc and ranking
                modalCtrbDict = ctrb.modal_control(G)
                nx.set_node_attributes(G, modalCtrbDict, name='modCtrb')
                badGraph, G, badRegions = prev_fct.rank_nodes(G, attr='modCtrb')
                if badGraph:
                    print(f"Could not rank mod ctrb for {file}")

                # The third name field decides which group the graph joins.
                info = file.split("_")
                if info[2] == 'tia':
                    p_data[patID] = G
                    tia_pop += 1
                else:
                    hc_data[patID] = G
                    hc_pop += 1

        with open(pklPath, 'wb') as f:
            pkl.dump(hc_data, f)
            pkl.dump(p_data, f)

        n_population = hc_pop + tia_pop
        print('\n-----------------------------------')
        print(f'Summary for {time} data set: ')
        print(f'There is a total of {n_population} good files.')
        print(f'   * {hc_pop} are Control')
        print(f'   * {tia_pop} are TIA')
        print(f'There are {bad_files} bad files that contained isolated nodes (hence not saved to cleaned data pickle).')
        print(f'In total the frequency of bad regions are: ')
        # most_common() orders by descending count, ties in first-seen order.
        freq = Counter(bad_regions).most_common()
        print(*freq, sep="\n")

    return hc_data, p_data, bad_regions

Once we have a pickle for each time frame, save them together in a single pickle.

In [None]:
# Load (or build and cache) the graphs for each time tag, in the order
# 'bl', 'Y1', 'Y3', 'Y5'.
per_time = {tag: loadBrainCSV(time=tag) for tag in ('bl', 'Y1', 'Y3', 'Y5')}
hc_bl_data, p_bl_data, bl_bad_regions = per_time['bl']
hc_y1_data, p_y1_data, y1_bad_regions = per_time['Y1']
hc_y3_data, p_y3_data, y3_bad_regions = per_time['Y3']
hc_y5_data, p_y5_data, y5_bad_regions = per_time['Y5']

# Bundle every group/time combination under one key each and store the lot
# in a single pickle.
individual_data = {
    'HCbl': hc_bl_data,
    'Pbl': p_bl_data,
    'HCy1': hc_y1_data,
    'Py1': p_y1_data,
    'HCy3': hc_y3_data,
    'Py3': p_y3_data,
    'HCy5': hc_y5_data,
    'Py5': p_y5_data,
}
myFunc.save_to_pickle(individual_data, PICKLE_PATH, 'Normalized_Connectomes.pkl')

# For FSLeyes visualization data prep

In [None]:
# Build `mapping`: a dict from an abbreviated region key to the label found in
# the second field of each line of the atlas mapping file.
with open('../../PREVENT_Study/Brain_Atlas/abbrev_to_label_mapping.txt') as f:
    lines = f.readlines()

mapping = {}
for line in lines:
    # Fields are separated by single spaces.
    # NOTE(review): presumably <index> <label> <...> <abbreviation> - confirm against the file.
    s = line.split(' ')
    # Stop at the first line whose leading field is '95'; that line and
    # everything after it are left out of the mapping.
    if s[0] == '95':
        break
    # Suffix = text after the last '_' in the label
    # (presumably a hemisphere tag - TODO confirm).
    lr = s[1].split('_')[-1]
    # Key = fourth field (trailing newline stripped) + '.' + suffix.
    abbrv = s[3].rstrip('\n') + "." + lr
    # print(abbrv, s[1])
    mapping[abbrv] = s[1]

# Volume Data
- Store as pandas Dataframe pickles.
- Rename columns to correspond to our network's node names

The volume data sheets contain more region names than there are nodes in our brain networks.
Load each volume data sheet and select only the regions that match the nodes in our brain networks.

In [8]:
metadata, node_list = prev_fct.load_meta_data()


def _volume_column(node):
    """Return the volume-sheet column name that corresponds to *node*."""
    if node.startswith('ctx'):
        # Drop the first four characters, swap "_and_" for "&" and add the
        # "_volume" suffix.
        return node[4:].replace("_and_", "&") + "_volume"
    if 'Thalamus' in node:
        return node + '-Proper'
    # Every other node name is used unchanged.
    return node


# One column name per node, in node_list order.
column_list = [_volume_column(n) for n in node_list]

158


In [53]:
# Read the Y5 sheet from the region-volume folder.
vol_df = myFunc.import_XLSX('../../PREVENT_Study/data/Region Volume data for participants/', 'Y5.xlsx')

# Discard every column that has no counterpart in column_list.
remove_columns = list(set(vol_df.columns).difference(column_list))
vol_df = vol_df.drop(columns=remove_columns)

# column_list[i] was derived from node_list[i], so pairing the two lists
# position by position gives the sheet-name -> node-name rename map.
rename_columns = dict(zip(column_list, node_list))
vol_df = vol_df.rename(columns=rename_columns)

In [62]:
# NOTE(review): ad-hoc sum; what 159 and 179 count is not stated in this
# notebook - name the operands or remove the cell.
print(159+179)

338
