### Imports

In [1]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


time: 156 µs (started: 2023-06-13 11:04:52 -05:00)


### Parameters

In [2]:
import json

# Load the experiment configuration. The context manager guarantees the file
# handle is closed once the JSON has been parsed, even if parsing raises.
with open("parameters.json") as params_file:
    params = json.load(params_file)

# Experiment identifier; later cells use it to build "output/<exp>/..." paths.
exp = params["exp"]
print("Exp:\t\t", exp)

# Set here in the notebook rather than read from parameters.json.
method = "greedy"
print("Method:\t\t", method)

# Group ids; later cells iterate over this list and index subgroups_id with it.
groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

# Mapping group id -> list of subgroup ids.
subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

# Passed through to std_global further down; empty in this run.
option = ""
print("Option:\t", option)

Exp:		 exp1
Method:		 greedy
Groups id:	 ['WT', 'zwf1^', 'pck1^']
Subgroups id:	 {'WT': ['1', '2', '3', '4', '5'], 'zwf1^': ['1', '2', '3'], 'pck1^': ['1', '2']}
Option:	 
time: 1.07 ms (started: 2023-06-13 11:04:52 -05:00)


### Get common subgraphs

In [3]:
# Load every subgroup's edge list and turn it into a weighted networkx graph.
# Result: dict_graphs maps each group id to the list of its subgroup graphs,
# in the order given by subgroups_id[group].
dict_graphs = {}

for group in tqdm(groups_id):
    graphs = [
        nx.from_pandas_edgelist(
            pd.read_csv(
                "output/{}/preprocessing/edges/edges_{}_{}.csv".format(exp, group, subgroup),
                # node ids are read with the pandas "string" dtype
                dtype={"source": "string", "target": "string"},
            ),
            "source",
            "target",
            edge_attr="weight",
        )
        for subgroup in tqdm(subgroups_id[group])
    ]
    dict_graphs[group] = graphs

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:30<00:00,  6.01s/it]
100%|██████████| 3/3 [00:42<00:00, 14.12s/it]
100%|██████████| 2/2 [00:14<00:00,  7.33s/it]
100%|██████████| 3/3 [01:27<00:00, 29.02s/it]

time: 1min 27s (started: 2023-06-13 11:04:52 -05:00)





In [4]:
# get subgraphs
# Produces dict_df_edges_filter, which later cells index by group id.
# NOTE(review): get_subgraphs_global is not defined in this notebook (presumably it
# comes from the `utils.utils` star import); what "filter" means here — e.g. keeping
# only edges shared by all subgroup graphs of a group — must be confirmed there.

dict_df_edges_filter = get_subgraphs_global(dict_graphs, groups_id)

100%|██████████| 4/4 [00:00<00:00, 921.57it/s]
100%|██████████| 4/4 [00:25<00:00,  6.36s/it]


Num. nodes: 4200
Num. edges: 135075



100%|██████████| 2/2 [00:00<00:00, 1262.39it/s]
100%|██████████| 2/2 [00:33<00:00, 16.60s/it]


Num. nodes: 5579
Num. edges: 1100817



100%|██████████| 1/1 [00:00<00:00, 1235.44it/s]
100%|██████████| 1/1 [00:09<00:00,  9.19s/it]


Num. nodes: 5710
Num. edges: 842738



100%|██████████| 3/3 [01:55<00:00, 38.49s/it]

time: 1min 55s (started: 2023-06-13 11:06:19 -05:00)





In [5]:
# Preview the filtered edge table of the first group listed in groups_id.
df_edges_filter = dict_df_edges_filter[groups_id[0]]
df_edges_filter.head()

Unnamed: 0,source,target
0,531.4693,746.4967
1,531.4693,883.5481
2,531.4693,717.5262
3,531.4693,694.6713
4,531.4693,411.3841


time: 6.11 ms (started: 2023-06-13 11:08:15 -05:00)


In [6]:
# Cast the node-id columns of every group's filtered edge table to the pandas
# "string" dtype. The tables stored in dict_df_edges_filter are updated in place.
node_columns = ["source", "target"]

for group in tqdm(groups_id):
    # Rebinds df_edges_filter on every pass; after the loop it refers to the
    # table of the last group in groups_id.
    df_edges_filter = dict_df_edges_filter[group]
    df_edges_filter[node_columns] = df_edges_filter[node_columns].astype("string")

 67%|██████▋   | 2/3 [00:00<00:00, 18.53it/s]

100%|██████████| 3/3 [00:00<00:00, 17.77it/s]

time: 186 ms (started: 2023-06-13 11:08:15 -05:00)





In [7]:
# get weight by subgroups
# Builds dict_df_edges_filter_weight (indexed by group id) and previews the first group's table.
# NOTE(review): get_weight_global is not defined in this notebook (presumably from the
# `utils.utils` star import). The captured preview shows one "subgroupN" column per
# subgroup next to source/target — confirm the exact schema in utils.

dict_df_edges_filter_weight = get_weight_global(dict_df_edges_filter, exp, groups_id, subgroups_id)
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[0]]
df_edges_filter_weight.head()

100%|██████████| 5/5 [00:21<00:00,  4.23s/it]
100%|██████████| 3/3 [00:33<00:00, 11.25s/it]
100%|██████████| 2/2 [00:12<00:00,  6.01s/it]
100%|██████████| 3/3 [01:09<00:00, 23.30s/it]


Unnamed: 0,source,target,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
129664,100.9337,141.0173,0.56999,0.597722,0.524561,0.553647,0.58286
112814,100.9679,176.975,0.68249,0.504283,0.548594,0.795642,0.864287
128880,100.9679,260.9136,0.602921,0.526767,0.654005,0.721963,0.852265
89898,100.9679,94.9806,0.576085,0.595119,0.555905,0.89232,0.891339
131874,102.0562,122.0247,0.594741,0.562568,0.617639,0.540012,0.740655


time: 1min 10s (started: 2023-06-13 11:08:15 -05:00)


### Filter by STD and average weight

In [8]:
# dispersion (std)
# Builds dict_df_common_edges (indexed by group id) and previews the first group's table.
# NOTE(review): std_global is not defined in this notebook (presumably from the
# `utils.utils` star import). `th=0.3` looks like the standard-deviation cutoff and
# `plot` / `save` presumably toggle figure drawing and writing results to disk —
# confirm against the helper before changing any of them.

dict_df_common_edges = std_global(dict_df_edges_filter_weight, exp, method, groups_id, option, th=0.3, plot=True, save=True)
dict_df_common_edges[groups_id[0]].head()

100%|██████████| 3/3 [00:44<00:00, 14.71s/it]


Unnamed: 0,source,target,weight
0,100.9337,141.0173,0.565756
1,100.9679,176.975,0.679059
2,100.9679,260.9136,0.671584
3,100.9679,94.9806,0.702154
4,102.0562,122.0247,0.611123


<Figure size 640x480 with 0 Axes>

time: 44.1 s (started: 2023-06-13 11:09:25 -05:00)


---

### STD-ANOVA

STD

In [9]:
# STD
# NOTE(review): disabled cell — the code below sits inside a bare string literal, so
# nothing runs and the notebook only echoes the text back as the cell output.
# If re-enabled it would call get_edges_std(..., ddof=0) and keep the rows whose
# "std" column is below 0.3. It references SG, dir and subgroups, which no visible
# cell assigns (unless utils.utils exports them) — confirm before re-enabling.
    
""" df_edges_std = get_edges_std(SG, dir, group, subgroups, ddof=0)
df_edges_std

# Filter by std (std < 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]
df_edges_std_filter """

' df_edges_std = get_edges_std(SG, dir, group, subgroups, ddof=0)\ndf_edges_std\n\n# Filter by std (std < 0.3)\ndf_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]\ndf_edges_std_filter '

time: 2.4 ms (started: 2023-06-13 11:10:09 -05:00)


In [10]:
""" x = df_edges_std["std"]
plt.hist(x, bins=100)
plt.axvline(x=0.3, color="red", lw=1)
l = len(df_edges_std) - len(df_edges_std_filter)
t = len(df_edges_std)
plt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))
plt.grid()
plt.show() """

' x = df_edges_std["std"]\nplt.hist(x, bins=100)\nplt.axvline(x=0.3, color="red", lw=1)\nl = len(df_edges_std) - len(df_edges_std_filter)\nt = len(df_edges_std)\nplt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))\nplt.grid()\nplt.show() '

time: 3.88 ms (started: 2023-06-13 11:10:09 -05:00)


In [11]:
""" G = nx.from_pandas_edgelist(df_edges_std_filter, "source", "target", edge_attr=["weight"])
graph_detail(G)

list_graphs.append(G.copy())
nx.write_gexf(G, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std.gexf".format(dir, group[0])) """

' G = nx.from_pandas_edgelist(df_edges_std_filter, "source", "target", edge_attr=["weight"])\ngraph_detail(G)\n\nlist_graphs.append(G.copy())\nnx.write_gexf(G, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std.gexf".format(dir, group[0])) '

time: 7.45 ms (started: 2023-06-13 11:10:09 -05:00)


In [12]:
""" df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-std.csv".format(dir, group[0]), index=False)
df_edges_filter """

' df_edges_filter = nx.to_pandas_edgelist(G)\ndf_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-std.csv".format(dir, group[0]), index=False)\ndf_edges_filter '

time: 2.85 ms (started: 2023-06-13 11:10:09 -05:00)


ANOVA

In [13]:
# ANOVA
# NOTE(review): disabled cell — the code below is wrapped in a string literal and never runs.
# If re-enabled it would call get_nodes_anova(G, dir, group), keep the rows whose
# "p-value" is above 0.001, and store the index of the kept rows in `nodes`.
# get_nodes_anova and dir are not defined in any visible cell — confirm their origin.
""" df_nodes_anova = get_nodes_anova(G, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)
df_nodes_anova_filter """

' df_nodes_anova = get_nodes_anova(G, dir, group)\ndf_nodes_anova\n\n# Filter by ANOVA (p > 0.001)\ndf_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]\ndf_nodes_anova_filter\nnodes = list(df_nodes_anova_filter.index)\ndf_nodes_anova_filter '

time: 3.61 ms (started: 2023-06-13 11:10:09 -05:00)


In [14]:
""" x = df_nodes_anova["p-value"]
plt.hist(x, bins=100)
plt.axvline(x=0.001, color="red", lw=1)
l = len(df_nodes_anova) - len(df_nodes_anova_filter)
t = len(df_nodes_anova)
plt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))
plt.grid()
plt.show() """

' x = df_nodes_anova["p-value"]\nplt.hist(x, bins=100)\nplt.axvline(x=0.001, color="red", lw=1)\nl = len(df_nodes_anova) - len(df_nodes_anova_filter)\nt = len(df_nodes_anova)\nplt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))\nplt.grid()\nplt.show() '

time: 3.57 ms (started: 2023-06-13 11:10:09 -05:00)


In [15]:
""" H = G.subgraph(nodes)
graph_detail(H)
print(len(list(n for n in H.nodes() if H.degree(n) == 0)))

list_graphs.append(H.copy())
nx.write_gexf(H, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std-anova.gexf".format(dir, group[0]))

df_edges_filter = nx.to_pandas_edgelist(H)

df_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-std-anova.csv".format(dir, group[0]), index=False)
df_edges_filter """

' H = G.subgraph(nodes)\ngraph_detail(H)\nprint(len(list(n for n in H.nodes() if H.degree(n) == 0)))\n\nlist_graphs.append(H.copy())\nnx.write_gexf(H, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-std-anova.gexf".format(dir, group[0]))\n\ndf_edges_filter = nx.to_pandas_edgelist(H)\n\ndf_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-std-anova.csv".format(dir, group[0]), index=False)\ndf_edges_filter '

time: 4.32 ms (started: 2023-06-13 11:10:09 -05:00)


### ANOVA-STD

ANOVA

In [16]:
# ANOVA
# NOTE(review): disabled cell — the code below is wrapped in a string literal and never runs.
# If re-enabled it would call get_nodes_anova(SG, dir, group), keep the rows whose
# "p-value" is above 0.001, and store the index of the kept rows in `nodes`.
# SG and dir are not assigned in any visible cell — confirm before re-enabling.
""" df_nodes_anova = get_nodes_anova(SG, dir, group)
df_nodes_anova

# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
nodes = list(df_nodes_anova_filter.index)
df_nodes_anova_filter """

' df_nodes_anova = get_nodes_anova(SG, dir, group)\ndf_nodes_anova\n\n# Filter by ANOVA (p > 0.001)\ndf_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]\ndf_nodes_anova_filter\nnodes = list(df_nodes_anova_filter.index)\ndf_nodes_anova_filter '

time: 3.5 ms (started: 2023-06-13 11:10:09 -05:00)


In [17]:
""" x = df_nodes_anova["p-value"]
plt.hist(x, bins=100)
plt.axvline(x=0.001, color="red", lw=1)
l = len(df_nodes_anova) - len(df_nodes_anova_filter)
t = len(df_nodes_anova)
plt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))
plt.grid()
plt.show() """

' x = df_nodes_anova["p-value"]\nplt.hist(x, bins=100)\nplt.axvline(x=0.001, color="red", lw=1)\nl = len(df_nodes_anova) - len(df_nodes_anova_filter)\nt = len(df_nodes_anova)\nplt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))\nplt.grid()\nplt.show() '

time: 3.6 ms (started: 2023-06-13 11:10:09 -05:00)


In [18]:
# H = SG.subgraph(nodes)

time: 338 µs (started: 2023-06-13 11:10:09 -05:00)


In [19]:
""" df_edges_filter = nx.to_pandas_edgelist(H)
df_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova.csv".format(dir, group[0]), index=False)
df_edges_filter """

' df_edges_filter = nx.to_pandas_edgelist(H)\ndf_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova.csv".format(dir, group[0]), index=False)\ndf_edges_filter '

time: 3.99 ms (started: 2023-06-13 11:10:09 -05:00)


In [20]:
""" H = nx.from_pandas_edgelist(df_edges_filter, "source", "target")
graph_detail(H)

list_graphs.append(H.copy())
nx.write_gexf(H, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova.gexf".format(dir, group[0])) """

' H = nx.from_pandas_edgelist(df_edges_filter, "source", "target")\ngraph_detail(H)\n\nlist_graphs.append(H.copy())\nnx.write_gexf(H, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova.gexf".format(dir, group[0])) '

time: 3.39 ms (started: 2023-06-13 11:10:09 -05:00)


STD

In [21]:
# STD
# NOTE(review): disabled cell — the code below is wrapped in a string literal and never runs.
# If re-enabled it would call get_edges_std(H, ..., ddof=0) and keep the rows whose
# "std" column is below 0.3. H, dir and subgroups are not assigned by any live cell —
# confirm before re-enabling.
""" df_edges_std = get_edges_std(H, dir, group, subgroups, ddof=0)
df_edges_std

# Filter by std (std < 0.3)
df_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]
df_edges_std_filter """

' df_edges_std = get_edges_std(H, dir, group, subgroups, ddof=0)\ndf_edges_std\n\n# Filter by std (std < 0.3)\ndf_edges_std_filter = df_edges_std[df_edges_std["std"] < 0.3]\ndf_edges_std_filter '

time: 3.95 ms (started: 2023-06-13 11:10:09 -05:00)


In [22]:
""" x = df_edges_std["std"]
plt.hist(x, bins=100)
plt.axvline(x=0.3, color="red", lw=1)
l = len(df_edges_std) - len(df_edges_std_filter)
t = len(df_edges_std)
plt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))
plt.grid()
plt.show() """

' x = df_edges_std["std"]\nplt.hist(x, bins=100)\nplt.axvline(x=0.3, color="red", lw=1)\nl = len(df_edges_std) - len(df_edges_std_filter)\nt = len(df_edges_std)\nplt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))\nplt.grid()\nplt.show() '

time: 80.6 ms (started: 2023-06-13 11:10:09 -05:00)


In [23]:
""" G = nx.from_pandas_edgelist(df_edges_std_filter, "source", "target", edge_attr=["weight"])
graph_detail(G)

list_graphs.append(G.copy())
nx.write_gexf(G, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova-std.gexf".format(dir, group[0])) """

' G = nx.from_pandas_edgelist(df_edges_std_filter, "source", "target", edge_attr=["weight"])\ngraph_detail(G)\n\nlist_graphs.append(G.copy())\nnx.write_gexf(G, "{}/output_greedy/graphs_filter/greedy_{}_graph-filter-anova-std.gexf".format(dir, group[0])) '

time: 54.8 ms (started: 2023-06-13 11:10:09 -05:00)


In [24]:
""" df_edges_filter = nx.to_pandas_edgelist(G)
df_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova-std.csv".format(dir, group[0]), index=False)
df_edges_filter """

' df_edges_filter = nx.to_pandas_edgelist(G)\ndf_edges_filter.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-anova-std.csv".format(dir, group[0]), index=False)\ndf_edges_filter '

time: 12.8 ms (started: 2023-06-13 11:10:09 -05:00)


### Results

In [25]:
""" for graph in list_graphs:
    graph_detail(graph) """

' for graph in list_graphs:\n    graph_detail(graph) '

time: 4.23 ms (started: 2023-06-13 11:10:09 -05:00)
