### Imports

In [64]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.72 ms (started: 2023-05-29 23:55:45 -05:00)


### Parameters

In [65]:
import json

# Load the run configuration. The context manager guarantees the file handle
# is closed as soon as parsing finishes, even if json.load raises; the
# previous bare open() left the handle open for the life of the kernel.
with open("parameters.json") as file:
    params = json.load(file)

# Parent of the current working directory; every input/output path in the
# notebook is built relative to it.
# NOTE: `dir` shadows the Python builtin of the same name. The name is kept
# because the other cells reference it.
dir = os.path.dirname(os.getcwd())
print(dir)

# "method" and "group" are lists in the JSON file; the matching "*_idx" entry
# selects which element this run uses.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

# `group` is indexed as a pair below: group[0] is used in file names and
# group[1] is the number of subgroups.
group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# Subgroup labels are the 1-based indices rendered as strings: "1", "2", ...
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
dimension = dimensions[0]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['pck1^', 2]
Subgroup:	 ['1', '2']
Dimensions:	 [3]
time: 3.02 ms (started: 2023-05-29 23:55:45 -05:00)


In [66]:
# Accumulates a copy of the graph produced at each filtering stage, in
# execution order, so the Results section can print them together.
list_graphs = []

time: 508 µs (started: 2023-05-29 23:55:45 -05:00)


### Get common subgraphs

In [67]:
# Build one weighted graph per subgroup from its preprocessed edge list.
edges_csv_template = "{}/output_preprocessing/edges/{}_edges_{}.csv"

graphs = []
for item in tqdm(subgroups):
    weighted_edges = pd.read_csv(edges_csv_template.format(dir, group[0], item))
    G = nx.from_pandas_edgelist(
        weighted_edges,
        source="source",
        target="target",
        edge_attr="weight",
    )
    # Project helper; the recorded output shows it printing node/edge counts.
    graph_detail(G)
    graphs.append(G)

 50%|█████     | 1/2 [00:03<00:03,  3.42s/it]

Num. nodes: 6235
Num. edges: 1629510



100%|██████████| 2/2 [00:12<00:00,  6.35s/it]

Num. nodes: 6234
Num. edges: 4417355

time: 12.7 s (started: 2023-05-29 23:55:45 -05:00)





In [68]:
# Collapse the per-subgroup graphs into a single graph with the project
# helper (presumably the part common to all subgroups, per the section
# heading; confirm against utils).
SG = get_subgraphs(graphs)
graph_detail(SG)

# Keep an independent snapshot for the final summary, then persist the graph.
list_graphs.append(SG.copy())
gexf_path = "{}/output_greedy/graphs_filter/greedy_{}_graph-filter.gexf".format(dir, group[0])
nx.write_gexf(SG, gexf_path)

100%|██████████| 1/1 [00:00<00:00, 257.65it/s]
100%|██████████| 1/1 [00:12<00:00, 12.46s/it]


Num. nodes: 5710
Num. edges: 842720

time: 37.4 s (started: 2023-05-29 23:55:58 -05:00)


In [69]:
# Edge list of SG as a two-column frame (one row per edge).
# The column labels are supplied at construction time rather than assigned to
# `.columns` afterwards: with an edgeless SG the frame has zero columns, and
# assigning two labels to it raised a length-mismatch ValueError. Built this
# way, an empty graph yields an empty frame that still has both columns.
df_edges_subgraph = pd.DataFrame(list(SG.edges()), columns=["source", "target"])
df_edges_subgraph

Unnamed: 0,source,target
0,520.9177,870.5460
1,520.9177,378.7450
2,520.9177,397.2518
3,520.9177,193.0769
4,520.9177,147.0655
...,...,...
842715,455.0738,455.1677
842716,711.1266,711.1610
842717,687.0800,687.0919
842718,760.3625,760.3770


time: 501 ms (started: 2023-05-29 23:56:35 -05:00)


### Calculate ANOVA (nodes)

In [70]:
""" df_nodes_anova = get_nodes_anova(SG, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)
len(nodes) """

' df_nodes_anova = get_nodes_anova(SG, dir, group)\ndf_nodes_anova\n\n# Filter by ANOVA (p > 0.001)\ndf_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]\ndf_nodes_anova_filter\nnodes = list(df_nodes_anova_filter.index)\nlen(nodes) '

time: 3.09 ms (started: 2023-05-29 23:56:36 -05:00)


### Calculate STD (edges)

In [71]:
""" df_edges_std = get_edges_std(SG, dir, group, subgroups)
df_edges_std

# Filter by std (std <= 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] <= 0.3]
df_edges_std_filter """

' df_edges_std = get_edges_std(SG, dir, group, subgroups)\ndf_edges_std\n\n# Filter by std (std <= 0.3)\ndf_edges_std_filter = df_edges_std[df_edges_std["std"] <= 0.3]\ndf_edges_std_filter '

time: 3.47 ms (started: 2023-05-29 23:56:36 -05:00)


---

### STD-ANOVA

In [72]:
# STD stage on the common subgraph: the project helper returns one row per
# edge with a "std" column (presumably the spread of the edge weight across
# subgroups; confirm against utils). ddof=0 is passed through to it.
df_edges_std = get_edges_std(SG, dir, group, subgroups, ddof=0)

# Keep only the edges whose std is strictly below 0.3.
low_std_mask = df_edges_std["std"] < 0.3
df_edges_std_filter = df_edges_std[low_std_mask]
df_edges_std_filter

100%|██████████| 2/2 [00:08<00:00,  4.37s/it]


Unnamed: 0,source,target,weight,std
608869,59.0137,59.0291,0.974889,0.013730
767402,59.0137,59.0370,0.740804,0.087645
146184,59.0137,60.0171,0.801043,0.096319
767281,59.0137,61.9884,0.684453,0.095850
767342,59.0137,71.0139,0.681361,0.020293
...,...,...,...,...
349710,988.4452,990.5673,0.675599,0.080981
646004,988.5664,990.5673,0.587136,0.016518
839146,989.4484,989.5686,0.632912,0.007138
708391,990.4534,990.5673,0.772224,0.105207


time: 11.2 s (started: 2023-05-29 23:56:36 -05:00)


In [73]:
# Graph made of the edges that survived the STD filter, keeping their weight.
G = nx.from_pandas_edgelist(
    df_edges_std_filter,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

# Snapshot for the final summary, then write the graph to disk.
list_graphs.append(G.copy())
gexf_path = "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std.gexf".format(dir, group[0])
nx.write_gexf(G, gexf_path)

Num. nodes: 5677
Num. edges: 732148

time: 16.1 s (started: 2023-05-29 23:56:47 -05:00)


In [74]:
# Export the STD-filtered edge list (source, target, weight) as CSV.
csv_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-std.csv".format(dir, group[0])
df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv(csv_path, index=False)
df_edges_filter

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
732143,978.3602,978.3779,0.682080
732144,978.4185,978.6600,0.754314
732145,979.8201,979.8546,0.664992
732146,979.8546,979.9144,0.614228


time: 8.88 s (started: 2023-05-29 23:57:03 -05:00)


In [75]:
# ANOVA
df_nodes_anova = get_nodes_anova(G, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)

time: 10.2 s (started: 2023-05-29 23:57:12 -05:00)


In [76]:
# STD-ANOVA result: the STD-filtered graph restricted to the ANOVA-kept nodes.
H = G.subgraph(nodes)
graph_detail(H)

# Number of nodes left without any edge after the restriction.
print(sum(1 for _, degree in H.degree() if degree == 0))

# Snapshot for the final summary, then write the graph to disk.
list_graphs.append(H.copy())
gexf_path = "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std-anova.gexf".format(dir, group[0])
nx.write_gexf(H, gexf_path)

# Export the matching edge list as CSV.
csv_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-std-anova.csv".format(dir, group[0])
df_edges_filter = nx.to_pandas_edgelist(H)
df_edges_filter.to_csv(csv_path, index=False)
df_edges_filter

Num. nodes: 4691
Num. edges: 473180

40


Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
473175,978.3602,978.3779,0.682080
473176,978.4185,978.6600,0.754314
473177,979.8201,979.8546,0.664992
473178,979.8546,979.9144,0.614228


time: 24.7 s (started: 2023-05-29 23:57:23 -05:00)


### ANOVA-STD

In [77]:
# ANOVA
df_nodes_anova = get_nodes_anova(SG, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)

time: 10.1 s (started: 2023-05-29 23:57:47 -05:00)


In [78]:
# Restrict the common subgraph to the nodes that passed the ANOVA filter.
# NOTE: Graph.subgraph returns a view onto SG, not an independent copy.
H = SG.subgraph(nodes)

time: 3.6 ms (started: 2023-05-29 23:57:58 -05:00)


In [79]:
# Export the edge list of the ANOVA-filtered subgraph as CSV.
csv_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova.csv".format(dir, group[0])
df_edges_filter = nx.to_pandas_edgelist(H)
df_edges_filter.to_csv(csv_path, index=False)
df_edges_filter

Unnamed: 0,source,target
0,520.9177,378.7450
1,520.9177,397.2518
2,520.9177,147.0655
3,520.9177,241.9185
4,520.9177,333.2457
...,...,...
524839,455.0738,455.1677
524840,711.1266,711.1610
524841,687.0800,687.0919
524842,760.3625,760.3770


time: 8.52 s (started: 2023-05-29 23:57:58 -05:00)


In [80]:
# Rebuild H from the exported edge list. A graph built from an edge list only
# contains nodes that appear in at least one edge.
H = nx.from_pandas_edgelist(df_edges_filter, source="source", target="target")
graph_detail(H)

# Snapshot for the final summary, then write the graph to disk.
list_graphs.append(H.copy())
gexf_path = "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova.gexf".format(dir, group[0])
nx.write_gexf(H, gexf_path)

Num. nodes: 4674
Num. edges: 524844

time: 9.99 s (started: 2023-05-29 23:58:06 -05:00)


In [81]:
# STD
df_edges_std = get_edges_std(H, dir, group, subgroups, ddof=0)
df_edges_std

# Filter by std (std < 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]
df_edges_std_filter

100%|██████████| 2/2 [00:08<00:00,  4.10s/it]


Unnamed: 0,source,target,weight,std
51879,59.0137,59.0291,0.974889,0.013730
165185,59.0137,59.0370,0.740804,0.087645
164684,59.0137,60.0171,0.801043,0.096319
36831,59.0137,61.9884,0.684453,0.095850
120370,59.0137,71.0139,0.681361,0.020293
...,...,...,...,...
181890,984.5048,984.7590,0.673136,0.046539
182021,984.7590,986.7752,0.651981,0.133005
182024,984.7590,987.7787,0.627558,0.088982
509966,985.7644,986.7752,0.581169,0.054479


time: 9.53 s (started: 2023-05-29 23:58:16 -05:00)


In [82]:
# ANOVA-STD result: graph made of the edges that survived both filters,
# keeping their weight.
G = nx.from_pandas_edgelist(
    df_edges_std_filter,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

# Snapshot for the final summary, then write the graph to disk.
list_graphs.append(G.copy())
gexf_path = "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova-std.gexf".format(dir, group[0])
nx.write_gexf(G, gexf_path)

Num. nodes: 4651
Num. edges: 473180

time: 9.55 s (started: 2023-05-29 23:58:26 -05:00)


In [83]:
# Export the ANOVA-STD edge list (source, target, weight) as CSV.
csv_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova-std.csv".format(dir, group[0])
df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv(csv_path, index=False)
df_edges_filter

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
473175,978.3602,978.3779,0.682080
473176,978.4185,978.6600,0.754314
473177,979.8201,979.8546,0.664992
473178,979.8546,979.9144,0.614228


time: 7.53 s (started: 2023-05-29 23:58:36 -05:00)


### Results

In [84]:
# Print the size of every stored snapshot in the order it was appended:
# common subgraph, STD, STD-ANOVA, ANOVA, ANOVA-STD.
for graph in list_graphs:
    graph_detail(graph)

Num. nodes: 5710
Num. edges: 842720

Num. nodes: 5677
Num. edges: 732148

Num. nodes: 4691
Num. edges: 473180

Num. nodes: 4674
Num. edges: 524844

Num. nodes: 4651
Num. edges: 473180

time: 24.9 ms (started: 2023-05-29 23:58:43 -05:00)
