In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sb

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import os

In [6]:
import holoviews as hv
from holoviews.operation.datashader import datashade
from holoviews import opts, dim
hv.extension('bokeh')

from colorcet import fire
from scipy import stats

In [3]:
csv = "/media/dan/Data/git/pyspi_testing/prediction/predict_df_4NetMets_20250319.csv"
raw_df = pd.read_csv(csv)

non_predict_columns = ['soz', 'pid', 'time', 'electrode_idx', 'x', 'y', 'z']
predict_columns = [c for c in raw_df.columns if c not in non_predict_columns]

df_modified = raw_df.copy()

# fill nan with median of each column
df_modified[predict_columns] = df_modified[predict_columns].fillna(df_modified[predict_columns].median())

log_features = ['betweenness']
log_columns = []
for x in log_features:
    for col in predict_columns:
        if x in col:
            log_columns.append(col)

# log transform log_columns
# df_modified[log_columns].describe() 
# none of the log_columns have any values less than 0 but they have 0's so +1 to avoid log(0)
df_modified[log_columns] = np.log(df_modified[log_columns] + 1)
cleaned_df = df_modified.copy()

# correlations
print("Calculating correlations...(~4 mins)")
correlations_mat = cleaned_df[predict_columns].corr()

######################################################


correlations_upper = correlations_mat.where(np.triu(np.ones(correlations_mat.shape), k=1).astype(bool))

to_include = copy.deepcopy(predict_columns)
to_drop = []

threshold = 0.65
i = 0
prev_len = -np.inf
while len(to_drop) < len(predict_columns):
    # extract pairs where correlation is high
    high_correlations = correlations_upper.loc[to_include,to_include].abs() >= threshold
    # get x/y positions
    xy_positions = np.where(high_correlations)

    pairs = []
    for x,y in zip(xy_positions[0], xy_positions[1]):
        pairs.append((high_correlations.index[x], high_correlations.index[y]))

    counts = {k:0 for k in predict_columns}
    for x in pairs:
        counts[x[0]] += np.abs(correlations_upper.loc[x[0],x[1]])
        counts[x[1]] += np.abs(correlations_upper.loc[x[0],x[1]])

    counts = {k:v for k,v in counts.items() if v >= threshold}
    if len(counts) == 0:
        print(i,'no new drops')
        break
    # sort by counts
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    print(i,counts)
    to_drop.append(counts[0][0])
    to_drop = list(set(to_drop))
    if len(to_drop) == prev_len:
        print(i,'no new drops')
        break
    prev_len = len(to_drop)
    to_include = [x for x in to_include if x not in to_drop]
    # print(len(to_include))
    i += 1

to_drop.extend(['pid', 'time', 'electrode_idx', 'x', 'y', 'z'])

Calculating correlations...(~4 mins)
0 [('mi_gaussian~global_efficiency~binary', np.float64(14.726213933712556)), ('pdist_cosine~global_efficiency~binary', np.float64(14.453742095216102)), ('mi_gaussian~characteristic_path_length~binary', np.float64(13.939267607629366)), ('cov_EmpiricalCovariance~characteristic_path_length~weighted', np.float64(11.44368482678303)), ('pdist_cosine~characteristic_path_length~weighted', np.float64(11.323425664145507)), ('cov_EmpiricalCovariance~characteristic_path_length~binary', np.float64(10.997268064558432)), ('cov_EmpiricalCovariance~global_efficiency~binary', np.float64(10.971128055682428)), ('mi_gaussian~density~weighted', np.float64(10.41286144455603)), ('mi_gaussian~density~binary', np.float64(10.41286144455603)), ('pdist_cosine~density~weighted', np.float64(10.346969648073802)), ('pdist_cosine~density~binary', np.float64(10.3469696480738)), ('cohmag_multitaper_mean_fs-1_fmin-0_fmax-0-5~characteristic_path_length~weighted', np.float64(10.262527574

In [22]:
dropped_df = cleaned_df.drop(columns=to_drop).copy()

In [23]:
cols = dropped_df.columns[1:]

In [24]:
indexs = [0,1,19]

In [25]:
cols[0]

'pdist_cosine~global_efficiency~weighted'

In [37]:
# plot data
key1 = cols[9]
key2 = cols[19]
datashade(hv.Points((dropped_df[key1], dropped_df[key2])), cmap=fire[50:]).opts(width=1000, 
                                                          height=600,
                                                          xlabel=key1,
                                                          ylabel=key2)

BokehModel(combine_events=True, render_bundle={'docs_json': {'0a07deb9-d2a1-447d-b7bb-1b86b2b6fb88': {'version…

In [39]:
# plot data
key1 = cols[1]
key2 = cols[19]
datashade(hv.Points((dropped_df[key1], dropped_df[key2])), cmap=fire[50:]).opts(width=1000, 
                                                          height=600,
                                                          xlabel=key1,
                                                          ylabel=key2)

BokehModel(combine_events=True, render_bundle={'docs_json': {'50310afc-8335-471c-9d0b-aa2e51b74383': {'version…

In [None]:
# plot data
key1 = cols[1]
key2 = cols[19]
datashade(hv.Points((dropped_df[key1], dropped_df[key2])), cmap=fire[50:]).opts(width=1000, 
                                                          height=600,
                                                          xlabel=key1,
                                                          ylabel=key2)

BokehModel(combine_events=True, render_bundle={'docs_json': {'50310afc-8335-471c-9d0b-aa2e51b74383': {'version…