In [9]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.io as sio
import sys
from tqdm import tqdm


import holoviews as hv
from holoviews.operation.datashader import datashade
from holoviews import opts, dim
hv.extension('bokeh')

from colorcet import fire
from scipy import stats

In [10]:
def parse_epoch_filename(filename):
    """Extract ``(pid, begin, end)`` from an epoch file name.

    The extension is dropped and the remaining stem is split on ``'_'``.
    The first field is the patient id; the third field is an index range
    written as ``'<begin>-<end>'``. All three values are returned as ``int``.
    """
    stem, _extension = os.path.splitext(filename)
    fields = stem.split('_')

    patient_field = fields[0]
    bounds = fields[2].split('-')
    first_idx, last_idx = bounds[0], bounds[1]

    return int(patient_field), int(first_idx), int(last_idx)

# Read the epoch file names (single unnamed column, no header row) and turn
# each one into a (pid, begin, end) row.
epoch_files = pd.read_csv('/media/dan/Data/data/connectivity/epochs.txt', header=None)[0]
parsed_epochs = [parse_epoch_filename(name) for name in epoch_files]
epochs = pd.DataFrame(parsed_epochs, columns=['pid', 'begin', 'end'])

In [11]:
# Display the parsed epoch table (columns: pid, begin, end).
epochs

Unnamed: 0,pid,begin,end
0,1,0,512
1,1,512,1024
2,1,1024,1536
3,1,1536,2048
4,1,2048,2560
...,...,...,...
43364,113,611328,612352
43365,113,612352,613376
43366,113,613376,614400
43367,113,614400,615424


In [12]:
# Root folder of the output files; it holds one sub-folder per entry of `subdir`.
base = '/media/dan/Data/data/connectivity/downloads-dump/BCT/outputs'

# Sub-folder names under `base`; the cells below iterate over them in this order.
subdir = [
    'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5',
    'cov_EmpiricalCovariance',
    'pdist_cosine',
    'mi_gaussian',
]


In [13]:
# Group the output file names as shapes[pid][sub] -> [file names], where pid is
# the second '~'-separated field of the file name (kept as a string).
shapes = {}
for sub in subdir:
    path = os.path.join(base, sub)

    for f in sorted(os.listdir(path)):
        pid = f.split('~')[1]
        shapes.setdefault(pid, {}).setdefault(sub, []).append(f)


In [14]:
# Print every patient whose sub-folders do not all hold the same number of files.
for pid, files_by_sub in shapes.items():
    lens = [len(file_names) for file_names in files_by_sub.values()]
    if any(count != lens[0] for count in lens):
        print(pid, lens)

111 [600, 602, 602, 602]


# check collinearity

In [15]:
# Build `new_df`: for every patient file in the "electrodes_used" folder, keep
# the metadata rows of the electrodes listed in that file and tag each row with
# the electrode's position in the file (`electrode_idx`). Patients whose kept
# rows have no entry flagged in the `soz` column are left out entirely.
electrodes_dir = "/media/dan/Data/data/electrodes_used"
metadata_df = pd.read_csv('/media/dan/Data/FULL_composite_patient_info.csv')

# Collect the per-patient frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on every
# iteration, and seeding it with an empty DataFrame relies on pandas ignoring
# empty entries when it picks the result dtypes.
per_patient = []
for file in sorted(os.listdir(electrodes_dir)):
    tmp = pd.read_csv(os.path.join(electrodes_dir, file))
    pid = int(file.split('_')[0])  # the file name starts with "<pid>_"
    electrodes = tmp['0'].values
    # match electrodes to metadata_df
    subset = metadata_df[(metadata_df['pid'] == pid) & (metadata_df['electrode'].isin(electrodes))].copy()
    if subset['soz'].sum() == 0:
        continue
    # Position of each electrode in the file; if a name is listed more than
    # once the last position wins, and unmatched rows keep index 0.
    position = {e: i for i, e in enumerate(electrodes)}
    subset['electrode_idx'] = subset['electrode'].map(position).fillna(0).astype(int)
    per_patient.append(subset)

# pd.concat rejects an empty list, so fall back to an empty frame.
new_df = pd.concat(per_patient) if per_patient else pd.DataFrame()


In [31]:
# Build one DataFrame per output file (one row per electrode) holding that
# file's measures plus the electrode's x / y / z / soz values from `new_df`.

# Resolve the metadata once per (pid, electrode_idx) instead of re-filtering
# `new_df` for every electrode of every file.
electrode_meta = {}
for (meta_pid, meta_idx), rows in new_df.groupby(['pid', 'electrode_idx']):
    # Same expression as the former in-loop lookup, so the stored values are
    # unchanged; the first matching row is used.
    electrode_meta[(int(meta_pid), int(meta_idx))] = next(iter(rows[['x', 'y', 'z', 'soz']].T.to_dict().values()))
known_pids = {meta_pid for meta_pid, _ in electrode_meta}

results = []
# `new_df` leaves out patients without any flagged `soz` row. Their files used
# to abort this loop with a bare StopIteration; they are now skipped and
# reported once the loop has finished.
skipped_pids = set()
for sub in subdir:
    calcs_path = os.path.join(base, sub)
    for f in tqdm(sorted(os.listdir(calcs_path))):
        # File names are '~'-separated; field 1 is the patient id, field 2 the time index.
        name_fields = os.path.basename(f).split('~')
        pid = int(name_fields[1])
        time = int(name_fields[2])
        if pid == 111:  # its sub-folders hold differing numbers of files
            continue
        if pid not in known_pids:
            skipped_pids.add(pid)
            continue

        mat = sio.loadmat(os.path.join(calcs_path, f), struct_as_record=False, squeeze_me=True, simplify_cells=True)

        # parse mat features into a dataframe
        electrodes = {}       # electrode position -> {feature name: value}
        singular_values = {}  # features stored as a single value for the whole file
        for key, measures in mat['out'].items():
            if key == 'timing':
                continue

            for keys, value in measures.items():
                feature_name = f"{sub}~{key}~{keys}"
                if isinstance(value, np.ndarray):
                    # array-valued feature: one entry per electrode position
                    for i, val in enumerate(value):
                        electrodes.setdefault(i, {})[feature_name] = val
                else:
                    singular_values[feature_name] = value

        for x in electrodes.keys():
            if (pid, x) not in electrode_meta:
                raise KeyError(f"no metadata in new_df for pid={pid}, electrode_idx={x} (file {f})")
            electrodes[x].update(singular_values)
            electrodes[x].update(electrode_meta[(pid, x)])
            electrodes[x].update({'pid': pid, 'time': time, 'electrode_idx': x})

        df = pd.DataFrame(electrodes).T.reset_index(drop=True)
        results.append(df)

if skipped_pids:
    print(f"skipped files of {len(skipped_pids)} patient(s) with no rows in new_df: {sorted(skipped_pids)}")
    


  1%|          | 435/41547 [00:37<58:22, 11.74it/s]


KeyboardInterrupt: 

In [28]:
from functools import reduce

# Combine the per-file frames in `results` into one table with a row per
# electrode / time window and a column per feature.
merge_keys = ['x', 'y', 'z', 'soz', 'pid', 'time', 'electrode_idx']

# Frames built from the same measure share their feature columns (named
# "<sub>~<key>~<keys>"), so they are stacked row-wise first. Merging them
# pairwise treats the shared feature columns as clashes and produces
# "_x" / "_y" suffixed copies instead of additional rows.
frames_by_measure = {}
for frame in results:
    measures = frozenset(str(col).split('~', 1)[0] for col in frame.columns if col not in merge_keys)
    frames_by_measure.setdefault(measures, []).append(frame)
stacked = [pd.concat(frames, ignore_index=True) for frames in frames_by_measure.values()]

# The stacked tables have disjoint feature columns, so an outer merge on the
# key columns places the measures side by side.
merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'), stacked)

In [30]:
# List the column names of the merged table.
merged_df.columns.values

array(['cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~clustering_coefficient~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~clustering_coefficient~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~degrees~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~strength~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~betweenness~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~betweenness~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~eigenvector_centrality~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~eigenvector_centrality~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~participation_coefficient~binary',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~participation_coefficient~weighted',
       'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0

In [18]:
# Collect the "<sub>~<key>~<keys>" feature names found in the first usable file
# (patient id other than 111) of every sub-folder.
features = []
for sub in subdir:
    calcs_path = os.path.join(base, sub)
    for f in sorted(os.listdir(calcs_path)):
        name_fields = os.path.basename(f).split('~')
        pid = int(name_fields[1])
        time = int(name_fields[2])
        if pid != 111:
            mat = sio.loadmat(os.path.join(calcs_path, f), struct_as_record=False, squeeze_me=True,simplify_cells=True)

            out = mat['out']
            for key in out.keys():
                if key != 'timing':
                    for keys in out[key].keys():
                        feature_name = f"{sub}~{key}~{keys}"
                        features.append(feature_name)
            # only the first usable file of each sub-folder is read
            break

In [24]:
# Show the collected feature names.
features

['cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity_Q~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~modularity_Q~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~clustering_coefficient~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~clustering_coefficient~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~density~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~density~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~transitivity~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~transitivity~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~degrees~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~strength~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~betweenness~binary',
 'cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~betweenness~weighted',
 'cohmag_multitaper_mean_fs-1_fmin-0_fm

In [70]:
# Scratch check of a "<key>_<sub>" style feature name.
# NOTE(review): `key` and `sub` are not defined in this cell; it relies on
# whatever values earlier loops left behind, so the result depends on which
# cells ran before it — confirm it is still needed or remove it.
feature_name = f"{key}_{sub}"
feature_name





'modularity_binary'

In [None]:
# Flatten the 'measure' array of every file into one 1-D array per sub-folder:
# features[sub] is the concatenation of mat['measure'].reshape(-1) over the files.
# The cell previously initialised features[sub] but then read and wrote an
# undefined `data[sub]`; everything now goes through `features`.
# NOTE(review): the files inspected elsewhere in this notebook expose an 'out'
# struct — confirm the files read here really carry a 'measure' variable.
features = {}
for sub in subdir:
    path = os.path.join(base, sub)
    # Float seed so the result is a float array, matching concatenation onto an
    # empty list; the chunks are joined once after the loop instead of
    # re-copying the growing array for every file.
    chunks = [np.empty(0)]
    for f in sorted(os.listdir(path)):
        # Compare the patient-id field itself: a substring test for '111' would
        # also drop unrelated files whose name merely contains those digits.
        name_fields = f.split('~')
        pid_field = name_fields[1].split('.')[0] if len(name_fields) > 1 else ''
        if pid_field.isdigit() and int(pid_field) == 111: # these don't have the same shape need to fix but not important for now
            continue
        mat = sio.loadmat(os.path.join(path, f))
        chunks.append(mat['measure'].reshape(-1))
    features[sub] = np.concatenate(chunks, axis=0)



In [4]:
# merge all data

# data = {}
# for sub in subdir:
#     path = os.path.join(base, sub)
#     if sub not in data:
#         data[sub] = []
#     for f in tqdm(list(sorted(os.listdir(path)))):
#         if '111' in f: # these don't have the same shape need to fix but not important for now
#             continue 
#         mat = sio.loadmat(os.path.join(path, f))
#         data[sub] =np.concatenate([data[sub], mat['measure'].reshape(-1)], axis=0)


In [5]:
# plot data

# key1 = subdir[2]
# key2 = subdir[3]
# valid_indices = ~np.isnan(data[key1]) & ~np.isnan(data[key2])
# datashade(hv.Points((data[key1][valid_indices], data[key2][valid_indices])), cmap=fire[50:]).opts(width=1000, 
#                                                           height=600,
#                                                           xlabel=key1,
#                                                           ylabel=key2)

# prepare data

In [6]:
# load electrode labels
soz = {}
sub = subdir[0]
path = os.path.join(base, sub)

for f in tqdm(list(sorted(os.listdir(path)))):
    mat = sio.loadmat(os.path.join(path, f))
    pid = f.split("~")[1].split('.')[0]
    soz[pid] = mat['soz'][0]

100%|██████████| 69/69 [00:04<00:00, 15.60it/s]


In [22]:
# Load a single example output file for interactive inspection.
# NOTE(review): this absolute path points into ~/Downloads rather than under
# `base` — confirm it is the intended copy of the file.
path = "/home/dan/Downloads/BCT/outputs/cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5/cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~001~000059~networkcalcs~lowest~4986.mat"
# simplify_cells=True returns nested structs as plain dicts (see the scipy
# loadmat documentation), which is why later cells index mat['out'][...][...].
mat = sio.loadmat(path, struct_as_record=False, squeeze_me=True,simplify_cells=True)


In [8]:
# Patient ids considered in this notebook.
full_set = {
    1, 11, 13, 14, 19, 22, 31, 34, 35, 39, 40, 47, 62, 64, 69, 77,
    83, 91, 95, 98, 99, 101, 102, 106, 108, 30, 92, 10, 16, 26, 33, 86,
}
# Ids assigned to `val_set`; every other id of `full_set` ends up in `test_set`.
val_set = {62, 106, 1, 64, 77, 34, 40, 33, 31, 10}
test_set = full_set.difference(val_set)



In [None]:
# Build a per-electrode feature matrix from the example file loaded in `mat`.
n = mat['out']['modularity']['binary'].shape[0]  # one row per electrode

names = list(sorted(mat['out'].keys()))
# One column per (measure, variant) pair. Indexing the columns by measure alone
# let the later variant (e.g. 'weighted') overwrite the earlier one ('binary')
# in the same column, and also reserved a column for the skipped 'timing' entry.
feature_columns = [(x, y) for x in names if x != 'timing' for y in sorted(mat['out'][x].keys())]

# The trailing 4 columns are reserved and stay zero in this cell.
features = np.zeros((n, len(feature_columns)+4)) # 4 for electrode number, soz, patient id, and filewindow
for j, (x, y) in enumerate(feature_columns):
    # Per-electrode arrays fill the column element-wise; scalar measures are
    # broadcast to every electrode row.
    features[:, j] = mat['out'][x][y]


AttributeError: 'float' object has no attribute 'size'

In [21]:
# Inspect the 'out' entry of the loaded example file.
mat['out']

{'modularity': {'binary': array([3, 5, 1, 5, 1, 4, 4, 4, 1, 3, 5, 3, 4, 2, 2, 2, 5, 3, 3, 3, 4, 3,
         4, 3, 5, 3, 3, 5, 1, 4, 1, 3, 5, 5, 5, 3, 1, 4, 5, 5, 5, 4, 5, 5,
         3, 3, 3, 3, 3, 3, 3, 3, 5, 3, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, 4,
         1, 1, 1, 5, 5, 5, 5, 5, 3, 5, 3, 2, 4, 3, 5, 5, 5, 5, 5, 5, 5, 3,
         3, 3, 4, 3, 5, 3, 1, 3, 5, 4, 5, 5, 5, 5, 3, 1, 5, 5, 3, 3, 3, 3,
         4, 1, 1, 1, 1, 3, 1, 3], dtype=uint8),
  'weighted': array([4, 3, 2, 3, 2, 1, 1, 1, 2, 4, 3, 4, 1, 5, 5, 5, 3, 4, 4, 4, 1, 4,
         1, 1, 3, 1, 4, 3, 2, 1, 2, 4, 3, 3, 3, 4, 2, 1, 3, 3, 3, 1, 3, 3,
         4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 1,
         2, 2, 2, 3, 3, 3, 3, 3, 4, 3, 4, 5, 1, 4, 3, 3, 3, 3, 3, 3, 3, 4,
         4, 4, 1, 4, 3, 4, 2, 4, 3, 1, 3, 3, 3, 3, 4, 2, 3, 3, 4, 4, 4, 4,
         1, 2, 2, 2, 2, 1, 2, 4], dtype=uint8)},
 'modularity_Q': {'binary': 0.27137091759778315,
  'weighted': 0.2753077343130272},
 'clustering_coefficient': {'bina