### Imports

In [1]:
from pyod.models.ecod import ECOD
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


time: 167 µs (started: 2023-06-07 10:29:38 -05:00)


### Parameters

In [4]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

exp = params["exp"]
print("Exp:\t\t", exp)

method = params["method"]
print("Method:\t\t", method)

dimension = params["dimension"]
print("Dimension:\t", dimension)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

subgroups_id_dyn = {"WT": ["dyn"], "zwf1^": ["dyn"], "pck1^": ["dyn"]} # params["subgroups_id"]
print("Subgroups id dyn:\t", subgroups_id_dyn)

Exp:		 exp3
Method:		 dgi
Dimension:	 3
Groups id:	 ['WT', 'zwf1^', 'pck1^']
Subgroups id:	 {'WT': ['1', '2', '3', '4', '5'], 'zwf1^': ['1', '2', '3'], 'pck1^': ['1', '2']}
Subgroups id dyn:	 {'WT': ['dyn'], 'zwf1^': ['dyn'], 'pck1^': ['dyn']}
time: 1.76 ms (started: 2023-06-07 10:30:23 -05:00)


### Edge embeddings

In [5]:
# get edges embeddings

edge_embeddings_global(exp, method, dimension, groups_id, subgroups_id_dyn)

100%|██████████| 1/1 [15:49<00:00, 949.99s/it]
100%|██████████| 1/1 [21:57<00:00, 1317.75s/it]
100%|██████████| 1/1 [07:43<00:00, 463.35s/it]]
100%|██████████| 3/3 [45:31<00:00, 910.36s/it] 

time: 45min 31s (started: 2023-06-07 10:30:58 -05:00)





In [7]:
df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}_{}_{}.csv".format(exp, groups_id[0], subgroups_id_dyn[groups_id[0]][0], method, dimension, "L2"), index_col=[0, 1])
df_edge_embeddings.head()

Unnamed: 0,Unnamed: 1,0,1,2
0,1,0.000663,0.0004146542,1.283739e-05
0,2,1e-06,8.706569e-07,2.771359e-08
0,3,0.001208,0.0007716962,2.296494e-05
0,4,0.000285,0.0001941901,5.141284e-06
0,5,0.00021,0.000128822,4.12943e-06


time: 4.69 s (started: 2023-06-07 12:38:06 -05:00)


### Concat edge embeddings

In [8]:
for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.DataFrame()
    k = 1 # dyn
    for subgroup in tqdm(subgroups_id_dyn[group]): # dyn
        df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}_{}_{}.csv".format(exp, group, subgroup, method, dimension, "L2"), index_col=[0, 1])
        df_edge_embeddings["subgroup"] = [k] * len(df_edge_embeddings)

        df_edge_embeddings_concat = pd.concat([df_edge_embeddings_concat, df_edge_embeddings])
        k += 1 # dyn
    df_edge_embeddings_concat.to_csv("output/{}/edge_embeddings/edge-embeddings_concat_dyn_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index=True)

100%|██████████| 1/1 [00:06<00:00,  6.68s/it]
100%|██████████| 1/1 [00:09<00:00,  9.49s/it]
100%|██████████| 1/1 [00:03<00:00,  3.12s/it]
100%|██████████| 3/3 [03:15<00:00, 65.19s/it]

time: 3min 15s (started: 2023-06-07 12:40:35 -05:00)





In [12]:
df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_dyn_{}_{}_{}_{}.csv".format(exp, groups_id[0], method, dimension, "L2"), index_col=[0, 1])
df_edge_embeddings_concat.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,0.000663,0.0004146542,1.283739e-05,1
0,2,1e-06,8.706569e-07,2.771359e-08,1
0,3,0.001208,0.0007716962,2.296494e-05,1
0,4,0.000285,0.0001941901,5.141284e-06,1
0,5,0.00021,0.000128822,4.12943e-06,1


time: 4.67 s (started: 2023-06-07 12:44:28 -05:00)


In [19]:
# plot edge embeddings concat

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_dyn_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index_col=[0, 1])

    x = df_edge_embeddings_concat.iloc[:, 0]
    y = df_edge_embeddings_concat.iloc[:, 1]
    z = df_edge_embeddings_concat.iloc[:, 2]

    # Creating figure
    fig = plt.figure(figsize=(10, 7))
    ax = plt.axes(projection="3d")

    # Creating plot
    ax.scatter3D(x, y, z, c=df_edge_embeddings_concat.iloc[:, -1], alpha=0.1)
    # plt.title("Dimension: {}".format(dimension))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_concat_dyn_{}_{}_{}_{}.png".format(exp, group, method, dimension, "L2"))
    # plt.show()
    plt.close()

100%|██████████| 3/3 [14:36<00:00, 292.16s/it]

time: 14min 36s (started: 2023-06-06 17:19:37 -05:00)





### Outliers detection

In [None]:
# Outlier detection (HDBSCAN)

""" df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])

X_train = df_edge_embeddings_concat.iloc[:, :-1]
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)

threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
print(len(outliers))
outliers

inliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)
print(len(inliers))
inliers """

' df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])\n\nX_train = df_edge_embeddings_concat.iloc[:, :-1]\nclusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)\n\nthreshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)\noutliers = np.where(clusterer.outlier_scores_ > threshold)[0]\nprint(len(outliers))\noutliers\n\ninliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)\nprint(len(inliers))\ninliers '

time: 4.64 ms (started: 2023-06-06 15:43:33 -05:00)


In [80]:
# outlier detection (ECOD)

dict_df_edge_embeddings_concat_outlier = {}
dict_df_edge_embeddings_concat_filter = {}

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_dyn_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index_col=[0, 1])

    X_train = df_edge_embeddings_concat.iloc[:, :-1]

    clf = ECOD()
    clf.fit(X_train)

    X_train["labels"] = clf.labels_ # binary labels (0: inliers, 1: outliers)

    df_edge_embeddings_concat_filter = df_edge_embeddings_concat.copy()
    df_edge_embeddings_concat_filter["labels"] = clf.labels_
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["labels"] == 0]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, :-1]

    dict_df_edge_embeddings_concat_outlier[group] = X_train
    dict_df_edge_embeddings_concat_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [00:56<00:00, 18.88s/it]

time: 56.7 s (started: 2023-06-07 18:06:40 -05:00)





In [81]:
df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[groups_id[0]]
df_edge_embeddings_concat_filter.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,0.000663,0.000415,1.3e-05,1
0,3,0.001208,0.000772,2.3e-05,1
0,4,0.000285,0.000194,5e-06,1
0,7,0.002253,0.001414,4.3e-05,1
0,8,0.005167,0.003348,9.7e-05,1


time: 13.8 ms (started: 2023-06-07 18:07:53 -05:00)


In [26]:
# plot outliers/inliers

for group in tqdm(groups_id):
    fig = plt.figure(figsize = (10, 7))
    ax = plt.axes(projection ="3d")

    df_aux = dict_df_edge_embeddings_concat_outlier[group]
    print("Total:", len(df_aux))
    
    temp = df_aux[df_aux["labels"] == 0]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="red", alpha=0.005)
    print("Num. of inliers:", len(temp))

    temp = df_aux[df_aux["labels"] == 1]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="gray", alpha=0.005)
    print("Num. of inliers:", len(temp))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_outlier_dyn_{}_{}_{}_{}.png".format(exp, group, method, dimension, "L2"))
    # plt.show()
    plt.close()

  0%|          | 0/3 [00:00<?, ?it/s]

Total: 12257101
Num. of inliers: 11031391
Num. of inliers: 1225710


 33%|███▎      | 1/3 [04:53<09:47, 293.73s/it]

Total: 17101443
Num. of inliers: 15391298
Num. of inliers: 1710145


 67%|██████▋   | 2/3 [11:56<06:09, 369.70s/it]

Total: 6050515
Num. of inliers: 5445463
Num. of inliers: 605052


100%|██████████| 3/3 [14:16<00:00, 285.66s/it]

time: 14min 16s (started: 2023-06-06 17:46:49 -05:00)





###  Filter common edges

In [82]:
# mapping idx with id

for group in tqdm(groups_id):
    dict_df_nodes = {}
    for subgroup in subgroups_id_dyn[group]: # dyn
        df_nodes = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup))
        dict_df_nodes[subgroup] = df_nodes
    
    # mapping
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]
    list_index = []

    for row in tqdm(df_edge_embeddings_concat_filter.itertuples()):
        df_nodes = dict_df_nodes[subgroups_id_dyn[group][0]] # dyn
        """ s = df_nodes.iloc[row[0][0], -1][1:] # dyn
        t = df_nodes.iloc[row[0][1], -1][1:] # dyn
        if s != t: # dyn
            list_index.append((s, t)) """
        list_index.append((df_nodes.iloc[row[0][0], -1][1:], df_nodes.iloc[row[0][1], -1][1:]))
    
    # set new index
    df_edge_embeddings_concat_filter.set_index([pd.Index(list_index)], inplace=True)
    df_edge_embeddings_concat_filter

11031391it [04:50, 37966.37it/s]it/s]
15391298it [06:43, 38164.94it/s]2, 301.42s/it]
5445463it [02:23, 37827.30it/s]09, 369.42s/it]
100%|██████████| 3/3 [14:28<00:00, 289.41s/it]

time: 14min 28s (started: 2023-06-07 18:08:05 -05:00)





In [83]:
df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[groups_id[0]]
print(df_edge_embeddings_concat_filter.shape)
df_edge_embeddings_concat_filter.head()

(11031391, 4)


Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,274.0123,0.000663,0.000415,1.3e-05,1
59.0049,369.1213,0.001208,0.000772,2.3e-05,1
59.0049,369.1541,0.000285,0.000194,5e-06,1
59.0049,558.4273,0.002253,0.001414,4.3e-05,1
59.0049,641.1223,0.005167,0.003348,9.7e-05,1


time: 14.9 ms (started: 2023-06-07 18:22:36 -05:00)


In [92]:
# filter diferente edges
dict_df_edge_embeddings_concat_filter_ = {}
for group in tqdm(groups_id):
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter.index.get_level_values(0) != df_edge_embeddings_concat_filter.index.get_level_values(1)]
    dict_df_edge_embeddings_concat_filter_[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [00:02<00:00,  1.10it/s]

time: 2.73 s (started: 2023-06-07 18:24:40 -05:00)





In [93]:
df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter_[groups_id[0]]
print(df_edge_embeddings_concat_filter.shape)
df_edge_embeddings_concat_filter.head()

(11008980, 4)


Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,274.0123,0.000663,0.000415,1.3e-05,1
59.0049,369.1213,0.001208,0.000772,2.3e-05,1
59.0049,369.1541,0.000285,0.000194,5e-06,1
59.0049,558.4273,0.002253,0.001414,4.3e-05,1
59.0049,641.1223,0.005167,0.003348,9.7e-05,1


time: 13.9 ms (started: 2023-06-07 18:24:44 -05:00)


In [94]:
# count edges and filter by count

dict_df_edges_filter = {}
for group in tqdm(groups_id):
    # count
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter_[group]

    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[["level_0", "level_1"]].value_counts().to_frame()
    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter.columns = ["source", "target", "count"]

    # filter
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["count"] == len(subgroups_id[group])]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, [0, 1]]
    dict_df_edges_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [00:13<00:00,  4.61s/it]

time: 13.8 s (started: 2023-06-07 18:24:53 -05:00)





In [95]:
df_edges_filter = dict_df_edges_filter[groups_id[1]]
print(df_edges_filter.shape)
df_edges_filter.head()

(27547, 2)


Unnamed: 0,source,target
0,302.1353,471.1553
1,591.4027,972.471
2,132.0303,216.0325
3,758.4964,969.6072
4,489.1457,586.2826


time: 10.8 ms (started: 2023-06-07 18:25:10 -05:00)


In [104]:
# change data type

for group in tqdm(groups_id):
    df_edges_filter = dict_df_edges_filter[group]
    df_edges_filter[["source", "target"]] = df_edges_filter[["source", "target"]].astype("float")
    dict_df_edges_filter[group] = df_edges_filter

100%|██████████| 3/3 [00:00<00:00, 11.52it/s]

time: 266 ms (started: 2023-06-07 18:28:05 -05:00)





In [108]:
# get weight by subgroups

dict_df_edges_filter_weight = get_weight_global(dict_df_edges_filter, exp, groups_id, subgroups_id)
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[0]]
df_edges_filter_weight.head()

100%|██████████| 5/5 [00:14<00:00,  2.91s/it]
100%|██████████| 3/3 [00:22<00:00,  7.50s/it]
100%|██████████| 2/2 [00:08<00:00,  4.12s/it]
100%|██████████| 3/3 [00:46<00:00, 15.64s/it]


Unnamed: 0,source,target,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
7983,59.0137,349.2758,0.796859,0.78831,0.560861,0.551982,0.807207
7984,59.0137,350.2793,0.807013,0.779306,0.576143,0.56356,0.808223
8023,59.0137,350.3033,0.854806,0.751778,0.539172,0.650983,0.883973
8009,59.0137,389.2739,0.823682,0.88626,0.723124,0.51523,0.917952
8015,59.0137,389.2907,0.887545,0.93075,0.839795,0.800743,0.914504


time: 47.1 s (started: 2023-06-07 18:29:51 -05:00)


In [111]:
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[2]]
df_edges_filter_weight.head()

Unnamed: 0,source,target,subgroup1,subgroup2
36962,59.0137,96.9602,0.828449,0.646446
36955,59.0137,96.9688,0.822159,0.68491
25426,59.0137,102.0562,0.618107,0.777138
25427,59.0137,109.0407,0.654224,0.796294
25435,59.0137,113.0355,0.726675,0.743045


time: 11.6 ms (started: 2023-06-07 18:31:19 -05:00)


### Filter by STD and average weight

In [112]:
dict_df_common_edges = std_global(dict_df_edges_filter_weight, exp, method, dimension, groups_id, th=0.3, plot=True, save=True)
dict_df_common_edges[groups_id[0]].head()

100%|██████████| 3/3 [00:15<00:00,  5.10s/it]


Unnamed: 0,source,target,weight
0,59.0137,349.2758,0.701044
1,59.0137,350.2793,0.706849
2,59.0137,350.3033,0.736142
3,59.0137,389.2739,0.773249
4,59.0137,389.2907,0.874667


time: 15.3 s (started: 2023-06-07 18:31:24 -05:00)


In [113]:
df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}_{}_{}.csv".format(exp, groups_id[0], method, dimension, "L2"))
df_common_edges.head()

Unnamed: 0,source,target,weight
0,59.0137,349.2758,0.701044
1,59.0137,350.2793,0.706849
2,59.0137,350.3033,0.736142
3,59.0137,389.2739,0.773249
4,59.0137,389.2907,0.874667


time: 27.2 ms (started: 2023-06-07 18:32:38 -05:00)


In [114]:
# show details

for group in tqdm(groups_id):
    df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"))
    
    G = nx.from_pandas_edgelist(df_common_edges, "source", "target", edge_attr=["weight"])
    print("Group: {}".format(group))
    graph_detail(G)

  0%|          | 0/3 [00:00<?, ?it/s]

Group: WT
Num. nodes: 2193
Num. edges: 27314



 67%|██████▋   | 2/3 [00:01<00:00,  1.83it/s]

Group: zwf1^
Num. nodes: 4664
Num. edges: 533359



100%|██████████| 3/3 [00:01<00:00,  1.52it/s]

Group: pck1^
Num. nodes: 5118
Num. edges: 468193

time: 1.98 s (started: 2023-06-07 18:32:44 -05:00)



