### Imports

In [21]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 879 µs (started: 2023-05-28 15:19:13 -05:00)


### Parameters

In [22]:
import json

# Load the run configuration. The context manager closes the file handle as
# soon as the JSON has been parsed, even if parsing raises.
with open("parameters.json") as file:
    params = json.load(file)

# Project root = parent of the current working directory.
# NOTE(review): `dir` shadows the Python builtin; the name is kept because
# later cells read it when building input/output paths.
dir = os.path.dirname(os.getcwd())
print(dir)

# `method` and `group` are selected from their candidate lists by the index
# stored next to them in the parameters file.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# group[1] is used as the number of subgroups; labels are the strings "1".."N".
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
dimension = dimensions[0]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['WT', 5]
Subgroup:	 ['1', '2', '3', '4', '5']
Dimensions:	 [3]
time: 2.73 ms (started: 2023-05-28 15:19:13 -05:00)


In [23]:
# Collects a snapshot of the graph after each filtering stage so the stages
# can be compared side by side at the end of the notebook.
list_graphs = list()

time: 51 ms (started: 2023-05-28 15:19:13 -05:00)


### Get common subgraphs

In [24]:
# Build one weighted graph per subgroup from its preprocessed edge list and
# report each graph's size as it is loaded.
edges_path_template = "{}/output_preprocessing/edges/{}_edges_{}.csv"

graphs = []
for subgroup in tqdm(subgroups):
    edges_df = pd.read_csv(edges_path_template.format(dir, group[0], subgroup))
    G = nx.from_pandas_edgelist(
        edges_df,
        source="source",
        target="target",
        edge_attr="weight",
    )
    graph_detail(G)
    graphs.append(G)

 20%|██        | 1/5 [00:02<00:10,  2.59s/it]

Num. nodes: 6234
Num. edges: 1243057



 40%|████      | 2/5 [00:04<00:06,  2.23s/it]

Num. nodes: 6231
Num. edges: 939038



 60%|██████    | 3/5 [00:09<00:07,  3.69s/it]

Num. nodes: 6233
Num. edges: 2252847



 80%|████████  | 4/5 [00:19<00:05,  5.85s/it]

Num. nodes: 6226
Num. edges: 3728175



100%|██████████| 5/5 [00:28<00:00,  5.65s/it]

Num. nodes: 6226
Num. edges: 4071713

time: 28.2 s (started: 2023-05-28 15:19:14 -05:00)





In [25]:
# Derive the subgraph from the per-subgroup graphs with the project helper
# (presumably the part common to all of them, per the section title) and
# report its size.
SG = get_subgraphs(graphs)
graph_detail(SG)

# Record an independent copy so later cells cannot alter this snapshot.
common_snapshot = SG.copy()
list_graphs.append(common_snapshot)

100%|██████████| 4/4 [00:00<00:00, 376.78it/s]
100%|██████████| 4/4 [00:42<00:00, 10.51s/it]


Num. nodes: 4202
Num. edges: 135076

time: 48.2 s (started: 2023-05-28 15:19:42 -05:00)


In [26]:
# Tabulate the edges of SG as (source, target) pairs.
# The column names are supplied at construction time rather than assigned to
# `.columns` afterwards: with an edgeless SG the frame would otherwise have
# zero columns and the two-name assignment would raise a length-mismatch
# ValueError. This way an empty graph yields an empty, correctly labelled frame.
df_edges_subgraph = pd.DataFrame(list(SG.edges()), columns=["source", "target"])
df_edges_subgraph

Unnamed: 0,source,target
0,243.0404,267.1048
1,243.0404,401.2187
2,243.0404,661.3471
3,243.0404,267.0965
4,243.0404,416.1414
...,...,...
135071,579.4245,579.4980
135072,663.3295,663.3449
135073,226.0573,226.0718
135074,412.8122,412.8271


time: 90.1 ms (started: 2023-05-28 15:20:30 -05:00)


### Calculate ANOVA (nodes)

In [27]:
""" df_nodes_anova = get_nodes_anova(SG, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)
len(nodes) """

' df_nodes_anova = get_nodes_anova(SG, dir, group)\ndf_nodes_anova\n\n# Filter by ANOVA (p > 0.001)\ndf_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]\ndf_nodes_anova_filter\nnodes = list(df_nodes_anova_filter.index)\nlen(nodes) '

time: 2.32 ms (started: 2023-05-28 15:20:31 -05:00)


### Calculate STD (edges)

In [28]:
""" df_edges_std = get_edges_std(SG, dir, group, subgroups)
df_edges_std

# Filter by std (std <= 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] <= 0.3]
df_edges_std_filter """

' df_edges_std = get_edges_std(SG, dir, group, subgroups)\ndf_edges_std\n\n# Filter by std (std <= 0.3)\ndf_edges_std_filter = df_edges_std[df_edges_std["std"] <= 0.3]\ndf_edges_std_filter '

time: 2.67 ms (started: 2023-05-28 15:20:31 -05:00)


---

### STD-ANOVA

In [29]:
# STD
df_edges_std = get_edges_std(SG, dir, group, subgroups)
df_edges_std

# Filter by std (std < 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]
df_edges_std_filter

100%|██████████| 5/5 [00:16<00:00,  3.35s/it]


Unnamed: 0,source,target,weight,std
133290,59.0137,59.0291,0.859127,0.136282
131497,59.0137,349.2758,0.701044,0.118268
110528,59.0137,350.2793,0.706849,0.112406
132969,59.0137,350.3033,0.736142,0.128241
129629,59.0137,389.2739,0.773249,0.145148
...,...,...,...,...
130790,980.7146,981.7177,0.842014,0.070631
108711,981.5305,981.7177,0.673811,0.111292
131593,981.7177,996.7096,0.638545,0.099503
134700,985.5091,985.5490,0.724017,0.106148


time: 17.2 s (started: 2023-05-28 15:20:31 -05:00)


In [30]:
# Rebuild a graph from the edges that survived the std filter, carrying the
# "weight" column over as an edge attribute, and report its size.
G = nx.from_pandas_edgelist(
    df_edges_std_filter,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

# Record an independent copy of this stage for the final comparison.
std_snapshot = G.copy()
list_graphs.append(std_snapshot)

Num. nodes: 4181
Num. edges: 131540

time: 579 ms (started: 2023-05-28 15:20:48 -05:00)


In [31]:
# Export the std-filtered graph as an edge list CSV (no index column) and
# display the exported table.
std_edges_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-std.csv".format(dir, group[0])

df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv(std_edges_path, index=False)
df_edges_filter

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.859127
1,59.0137,349.2758,0.701044
2,59.0137,350.2793,0.706849
3,59.0137,350.3033,0.736142
4,59.0137,389.2739,0.773249
...,...,...,...
131535,978.6174,978.6600,0.628246
131536,979.6191,979.6627,0.679891
131537,980.0763,980.0986,0.736160
131538,985.5091,985.5490,0.724017


time: 2.21 s (started: 2023-05-28 15:20:49 -05:00)


In [32]:
# ANOVA
df_nodes_anova = get_nodes_anova(G, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)

time: 8.61 s (started: 2023-05-28 15:20:51 -05:00)


In [33]:
# Restrict the std-filtered graph to the nodes that passed the ANOVA filter.
H = G.subgraph(nodes)
graph_detail(H)

# Count the nodes left without any edge after the restriction.
isolated_count = sum(1 for n in H.nodes() if H.degree(n) == 0)
print(isolated_count)

# Record an independent copy of this stage for the final comparison.
list_graphs.append(H.copy())

# Export the remaining edges.
# NOTE(review): an edge list only contains nodes that take part in an edge, so
# the isolated nodes counted above will not appear in the CSV - confirm this
# is intended.
std_anova_edges_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-std-anova.csv".format(dir, group[0])
df_edges_filter = nx.to_pandas_edgelist(H)
df_edges_filter.to_csv(std_anova_edges_path, index=False)
df_edges_filter

Num. nodes: 1651
Num. edges: 11868

199


Unnamed: 0,source,target,weight
0,61.9884,293.1786,0.596832
1,122.9910,123.0007,0.989721
2,122.9910,123.0188,0.969324
3,122.9910,123.0360,0.900589
4,122.9910,123.9906,0.952875
...,...,...,...
11863,492.9771,493.0401,0.805087
11864,494.9997,495.0368,0.774382
11865,504.1338,504.0519,0.676079
11866,505.1609,505.1361,0.764508


time: 389 ms (started: 2023-05-28 15:21:00 -05:00)


### ANOVA-STD

In [34]:
# ANOVA
df_nodes_anova = get_nodes_anova(SG, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)

time: 8.75 s (started: 2023-05-28 15:21:00 -05:00)


In [35]:
# Restrict SG to the nodes that passed the ANOVA filter.
H = nx.subgraph(SG, nodes)

time: 1.72 ms (started: 2023-05-28 15:21:09 -05:00)


In [36]:
# Export the ANOVA-filtered graph as an edge list CSV (no index column) and
# display the exported table.
anova_edges_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova.csv".format(dir, group[0])

df_edges_filter = nx.to_pandas_edgelist(H)
df_edges_filter.to_csv(anova_edges_path, index=False)
df_edges_filter

Unnamed: 0,source,target
0,61.9884,293.1786
1,122.9910,123.0007
2,122.9910,123.9906
3,122.9910,123.0188
4,122.9910,124.9664
...,...,...
11965,475.1307,507.1555
11966,480.0275,480.0419
11967,492.9771,493.0401
11968,171.0467,171.0302


time: 176 ms (started: 2023-05-28 15:21:09 -05:00)


In [37]:
# Rebuild H from the exported edge table. Only nodes that appear in at least
# one edge are recreated, and no edge attributes are requested here.
H = nx.from_pandas_edgelist(df_edges_filter, source="source", target="target")
graph_detail(H)

# Record an independent copy of this stage for the final comparison.
anova_snapshot = H.copy()
list_graphs.append(anova_snapshot)

Num. nodes: 1452
Num. edges: 11970

time: 55.5 ms (started: 2023-05-28 15:21:09 -05:00)


In [38]:
# STD
df_edges_std = get_edges_std(H, dir, group, subgroups)
df_edges_std

# Filter by std (std < 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]
df_edges_std_filter

100%|██████████| 5/5 [00:15<00:00,  3.09s/it]


Unnamed: 0,source,target,weight,std
0,61.9884,293.1786,0.596832,0.076383
31,122.9910,123.0007,0.989721,0.018039
33,122.9910,123.0188,0.969324,0.049914
36,122.9910,123.0360,0.900589,0.167513
32,122.9910,123.9906,0.952875,0.066730
...,...,...,...,...
11753,978.3602,978.3779,0.789932,0.097692
11951,978.6174,978.6600,0.628246,0.071524
11750,979.6191,979.6627,0.679891,0.082784
11952,980.0763,980.0986,0.736160,0.111510


time: 15.6 s (started: 2023-05-28 15:21:10 -05:00)


In [39]:
# Rebuild a graph from the edges that survived the std filter, carrying the
# "weight" column over as an edge attribute, and report its size.
G = nx.from_pandas_edgelist(
    df_edges_std_filter,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

# Record an independent copy of this stage for the final comparison.
anova_std_snapshot = G.copy()
list_graphs.append(anova_std_snapshot)

Num. nodes: 1452
Num. edges: 11868

time: 71.7 ms (started: 2023-05-28 15:21:25 -05:00)


In [40]:
# Export the ANOVA-then-std filtered graph as an edge list CSV (no index
# column) and display the exported table.
anova_std_edges_path = "{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova-std.csv".format(dir, group[0])

df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv(anova_std_edges_path, index=False)
df_edges_filter

Unnamed: 0,source,target,weight
0,61.9884,293.1786,0.596832
1,293.1786,265.1480,0.944443
2,293.1786,266.1514,0.932479
3,293.1786,267.1457,0.767931
4,293.1786,294.1818,0.913458
...,...,...,...
11863,978.3602,978.3779,0.789932
11864,978.6174,978.6600,0.628246
11865,979.6191,979.6627,0.679891
11866,980.0763,980.0986,0.736160


time: 80.7 ms (started: 2023-05-28 15:21:25 -05:00)


### Results

In [41]:
# Print the size of every recorded snapshot, in recording order, with a blank
# line after each one.
for snapshot in list_graphs:
    graph_detail(snapshot)
    print()

Num. nodes: 4202
Num. edges: 135076


Num. nodes: 4181
Num. edges: 131540


Num. nodes: 1651
Num. edges: 11868


Num. nodes: 1452
Num. edges: 11970


Num. nodes: 1452
Num. edges: 11868


time: 14.7 ms (started: 2023-05-28 15:21:26 -05:00)
