### Imports

In [3]:
from pyod.models.ecod import ECOD
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.15 ms (started: 2023-06-08 12:54:23 -05:00)


### Parameters

In [4]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

exp = params["exp"]
print("Exp:\t\t", exp)

method = params["method"]
print("Method:\t\t", method)

dimension = params["dimension"]
print("Dimension:\t", dimension)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

option = params["option"]
print("Option:\t\t", option)

if option:
    subgroups_id_op = {}
    for group in groups_id:
        subgroups_id_op[group] = [option]
else:
    subgroups_id_op = subgroups_id
print("Subgroups id op:", subgroups_id_op)

Exp:		 exp5
Method:		 dgi
Dimension:	 3
Groups id:	 ['WT', 'zwf1^', 'pck1^']
Subgroups id:	 {'WT': ['1', '2', '3', '4', '5'], 'zwf1^': ['1', '2', '3'], 'pck1^': ['1', '2']}
Option:		 dyn
Subgroups id op: {'WT': ['dyn'], 'zwf1^': ['dyn'], 'pck1^': ['dyn']}
time: 2.85 ms (started: 2023-06-08 12:54:23 -05:00)


### Edge embeddings

In [5]:
# get edges embeddings

edge_embeddings_global(exp, method, dimension, groups_id, subgroups_id_op)

100%|██████████| 1/1 [17:44<00:00, 1064.11s/it]
 33%|███▎      | 1/3 [17:44<35:28, 1064.11s/it]

In [None]:
df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}.csv".format(exp, method, groups_id[0], subgroups_id_op[groups_id[0]][0]), index_col=[0, 1])
df_edge_embeddings.head()

Unnamed: 0,Unnamed: 1,0,1,2
0,1,0.001141083,2.84729e-09,0.001935468
0,2,5.716872e-09,1.84041e-11,1.119999e-08
0,3,0.001711178,4.984949e-08,0.002932492
0,4,0.0001806792,1.005588e-07,0.0003249767
0,5,0.0003780948,5.812921e-10,0.0006364323


time: 488 ms (started: 2023-06-08 11:44:41 -05:00)


### Concat edge embeddings

In [None]:
for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.DataFrame()
    k = 0
    for subgroup in tqdm(subgroups_id_op[group]):
        k += 1
        df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}.csv".format(exp, method, group, subgroup), index_col=[0, 1])
        df_edge_embeddings["subgroup"] = [k] * len(df_edge_embeddings)

        df_edge_embeddings_concat = pd.concat([df_edge_embeddings_concat, df_edge_embeddings])
    df_edge_embeddings_concat.to_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}.csv".format(exp, method, group), index=True)

100%|██████████| 5/5 [00:07<00:00,  1.47s/it]
100%|██████████| 3/3 [00:09<00:00,  3.31s/it]
100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
100%|██████████| 3/3 [03:14<00:00, 64.92s/it]

time: 3min 14s (started: 2023-06-08 11:44:41 -05:00)





In [None]:
df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}.csv".format(exp, method, groups_id[0]), index_col=[0, 1])
df_edge_embeddings_concat.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,0.001141083,2.84729e-09,0.001935468,1
0,2,5.716872e-09,1.84041e-11,1.119999e-08,1
0,3,0.001711178,4.984949e-08,0.002932492,1
0,4,0.0001806792,1.005588e-07,0.0003249767,1
0,5,0.0003780948,5.812921e-10,0.0006364323,1


time: 4.93 s (started: 2023-06-08 11:48:38 -05:00)


In [None]:
# plot edge embeddings concat

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}.csv".format(exp, method, group), index_col=[0, 1])

    x = df_edge_embeddings_concat.iloc[:, 0]
    y = df_edge_embeddings_concat.iloc[:, 1]
    z = df_edge_embeddings_concat.iloc[:, 2]

    # Creating figure
    fig = plt.figure(figsize=(10, 7))
    ax = plt.axes(projection="3d")

    # Creating plot
    ax.scatter3D(x, y, z, c=df_edge_embeddings_concat.iloc[:, -1], alpha=0.1)
    # plt.title("Dimension: {}".format(dimension))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_concat_{}_{}.png".format(exp, method, group))
    # plt.show()
    plt.close()

100%|██████████| 3/3 [14:34<00:00, 291.49s/it]

time: 14min 34s (started: 2023-06-08 11:49:12 -05:00)





### Outliers detection

In [None]:
# Outlier detection (HDBSCAN)

""" df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])

X_train = df_edge_embeddings_concat.iloc[:, :-1]
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)

threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
print(len(outliers))
outliers

inliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)
print(len(inliers))
inliers """

' df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])\n\nX_train = df_edge_embeddings_concat.iloc[:, :-1]\nclusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)\n\nthreshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)\noutliers = np.where(clusterer.outlier_scores_ > threshold)[0]\nprint(len(outliers))\noutliers\n\ninliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)\nprint(len(inliers))\ninliers '

time: 4.64 ms (started: 2023-06-06 15:43:33 -05:00)


In [None]:
# outlier detection (ECOD)

dict_df_edge_embeddings_concat_outlier = {}
dict_df_edge_embeddings_concat_filter = {}

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}.csv".format(exp, method, group), index_col=[0, 1])

    X_train = df_edge_embeddings_concat.iloc[:, :-1]

    clf = ECOD()
    clf.fit(X_train)

    X_train["labels"] = clf.labels_ # binary labels (0: inliers, 1: outliers)

    df_edge_embeddings_concat_filter = df_edge_embeddings_concat.copy()
    df_edge_embeddings_concat_filter["labels"] = clf.labels_
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["labels"] == 0]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, :-1]

    dict_df_edge_embeddings_concat_outlier[group] = X_train
    dict_df_edge_embeddings_concat_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [01:06<00:00, 22.02s/it]

time: 1min 6s (started: 2023-06-08 12:04:38 -05:00)





In [None]:
# plot outliers/inliers

for group in tqdm(groups_id):
    fig = plt.figure(figsize = (10, 7))
    ax = plt.axes(projection ="3d")

    df_aux = dict_df_edge_embeddings_concat_outlier[group]
    print("Total:", len(df_aux))
    
    temp = df_aux[df_aux["labels"] == 0]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="red", alpha=0.005)
    print("Num. of inliers:", len(temp))

    temp = df_aux[df_aux["labels"] == 1]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="gray", alpha=0.005)
    print("Num. of inliers:", len(temp))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_outlier_{}_{}.png".format(exp, method, group))
    # plt.show()
    plt.close()

  0%|          | 0/3 [00:00<?, ?it/s]

Total: 12232238
Num. of inliers: 11009014
Num. of inliers: 1223224


 33%|███▎      | 1/3 [04:44<09:29, 284.65s/it]

Total: 17089033
Num. of inliers: 15380129
Num. of inliers: 1708904


 67%|██████▋   | 2/3 [11:36<05:59, 359.38s/it]

Total: 6044291
Num. of inliers: 5439862
Num. of inliers: 604429


100%|██████████| 3/3 [13:58<00:00, 279.46s/it]

time: 13min 58s (started: 2023-06-08 12:06:02 -05:00)





###  Filter common edges

In [None]:
# mapping idx with id

for group in tqdm(groups_id):
    dict_df_nodes = {}
    for subgroup in subgroups_id_op[group]:
        df_nodes = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup),
                               dtype={"id": "string"})
        dict_df_nodes[subgroup] = df_nodes
    
    # mapping
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]
    list_index = []

    for row in tqdm(df_edge_embeddings_concat_filter.itertuples()):
        df_nodes = dict_df_nodes[str(row[-1])]
        list_index.append((df_nodes.iloc[row[0][0], -1], df_nodes.iloc[row[0][1], -1]))
    
    # set new index
    df_edge_embeddings_concat_filter.set_index([pd.Index(list_index)], inplace=True)
df_edge_embeddings_concat_filter

11009014it [05:21, 34279.95it/s]it/s]
15380129it [07:32, 33960.64it/s]1, 330.50s/it]
5439862it [02:40, 33894.07it/s]49, 409.59s/it]
100%|██████████| 3/3 [16:01<00:00, 320.45s/it]


Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,120.931,0.000438,0.000017,0.000950,1
59.0049,172.0403,0.000681,0.000027,0.001479,1
59.0049,210.0762,0.000834,0.000032,0.001809,1
59.0049,292.9064,0.003755,0.000111,0.008033,1
59.0049,314.9808,0.008483,0.000183,0.017847,1
...,...,...,...,...,...
637.253,978.3602,0.173682,0.256060,0.216804,2
829.9994,836.17,0.039066,0.025224,0.001482,2
585.106,654.4398,0.083181,0.110177,0.103508,2
654.4398,655.3718,0.178803,0.319180,0.456600,2


time: 16min 1s (started: 2023-06-08 12:20:28 -05:00)


In [None]:
df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[groups_id[0]]
df_edge_embeddings_concat_filter.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,369.1213,0.001711,4.984949e-08,0.002932,1
59.0049,641.1223,0.006827,3.970386e-07,0.01177,1
59.0049,650.2145,0.005209,4.986619e-07,0.009032,1
59.0049,977.8237,0.018792,6.724997e-06,0.033386,1
274.0123,191.046,0.00513,3.046072e-06,0.009246,1


time: 7.46 ms (started: 2023-06-08 12:37:07 -05:00)


In [None]:
df_edge_embeddings_concat_filter.index[:5]

MultiIndex([( '59.0049', '369.1213'),
            ( '59.0049', '641.1223'),
            ( '59.0049', '650.2145'),
            ( '59.0049', '977.8237'),
            ('274.0123',  '191.046')],
           )

time: 2.87 ms (started: 2023-06-08 12:38:12 -05:00)


In [None]:
# count edges and filter by count

dict_df_edges_filter = {}
for group in tqdm(groups_id):
    # count
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]

    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[["level_0", "level_1"]].value_counts().to_frame()
    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter.columns = ["source", "target", "count"]

    # filter
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["count"] == len(subgroups_id_op[group])]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, [0, 1]]
    dict_df_edges_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [00:15<00:00,  5.06s/it]

time: 15.2 s (started: 2023-06-08 12:38:32 -05:00)





In [None]:
df_edges_filter = dict_df_edges_filter[groups_id[0]]
df_edges_filter.head()

Unnamed: 0,source,target
0,592.406,746.5603
1,868.4118,937.6266
2,868.4118,935.6255
3,473.1791,571.1777
4,324.1922,496.1782


time: 6.87 ms (started: 2023-06-08 12:44:05 -05:00)


In [None]:
df_edges_filter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23452 entries, 0 to 23451
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  23452 non-null  object
 1   target  23452 non-null  object
dtypes: object(2)
memory usage: 549.7+ KB
time: 17.3 ms (started: 2023-06-08 12:44:08 -05:00)


In [None]:
# change data type
for group in tqdm(groups_id):
    df_edges_filter = dict_df_edges_filter[group]
    df_edges_filter[["source", "target"]] = df_edges_filter[["source", "target"]].astype("string")

100%|██████████| 3/3 [00:00<00:00, 23.52it/s]

time: 131 ms (started: 2023-06-08 12:44:54 -05:00)





In [None]:
df_edges_filter = dict_df_edges_filter[groups_id[0]]
df_edges_filter.head()

Unnamed: 0,source,target
0,592.406,746.5603
1,868.4118,937.6266
2,868.4118,935.6255
3,473.1791,571.1777
4,324.1922,496.1782


time: 6.44 ms (started: 2023-06-08 12:47:37 -05:00)


In [None]:
df_edges_filter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23452 entries, 0 to 23451
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  23452 non-null  string
 1   target  23452 non-null  string
dtypes: string(2)
memory usage: 549.7 KB
time: 10 ms (started: 2023-06-08 12:47:43 -05:00)


In [None]:
subgroups_id

{'WT': ['1', '2', '3', '4', '5'],
 'zwf1^': ['1', '2', '3'],
 'pck1^': ['1', '2']}

time: 2.08 ms (started: 2023-06-08 12:45:38 -05:00)


time: 1.53 ms (started: 2023-06-08 12:49:43 -05:00)


In [None]:
# get weight by subgroups

dict_df_edges_filter_weight = get_weight_global(dict_df_edges_filter, exp, groups_id, subgroups_id)
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[0]]
df_edges_filter_weight.head()

100%|██████████| 5/5 [00:22<00:00,  4.53s/it]
100%|██████████| 3/3 [00:35<00:00, 11.92s/it]
100%|██████████| 2/2 [00:13<00:00,  6.53s/it]
100%|██████████| 3/3 [01:13<00:00, 24.36s/it]


Unnamed: 0,source,target,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
17430,109.0407,114.056,0.68468,0.65475,0.527725,0.859966,0.823646
17230,109.0407,127.0513,0.791174,0.792589,0.782065,0.91781,0.87042
17306,109.0407,131.0462,0.802158,0.72102,0.613603,0.872517,0.867231
17308,109.0407,131.0824,0.808212,0.663848,0.719276,0.857543,0.913011
17318,109.0407,132.086,0.764665,0.614573,0.60396,0.893537,0.878882


time: 1min 13s (started: 2023-06-08 12:49:47 -05:00)


In [None]:
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[0]]
df_edges_filter_weight.head()

Unnamed: 0,source,target,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
17430,109.0407,114.056,0.68468,0.65475,0.527725,0.859966,0.823646
17230,109.0407,127.0513,0.791174,0.792589,0.782065,0.91781,0.87042
17306,109.0407,131.0462,0.802158,0.72102,0.613603,0.872517,0.867231
17308,109.0407,131.0824,0.808212,0.663848,0.719276,0.857543,0.913011
17318,109.0407,132.086,0.764665,0.614573,0.60396,0.893537,0.878882


time: 13.1 ms (started: 2023-06-08 12:51:17 -05:00)


### Filter by STD and average weight

In [None]:
dict_df_common_edges = std_global(dict_df_edges_filter_weight, exp, method, groups_id, th=0.3, plot=True, save=True)
dict_df_common_edges[groups_id[0]].head()

100%|██████████| 3/3 [00:15<00:00,  5.08s/it]


Unnamed: 0,source,target,weight
0,109.0407,114.056,0.710154
1,109.0407,127.0513,0.830812
2,109.0407,131.0462,0.775306
3,109.0407,131.0824,0.792378
4,109.0407,132.086,0.751123


time: 15.2 s (started: 2023-06-08 12:52:20 -05:00)


In [None]:
df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}.csv".format(exp, method, groups_id[0]),
                              dtype={"source": "string", "target": "string"})
df_common_edges.head()

Unnamed: 0,source,target,weight
0,109.0407,114.056,0.710154
1,109.0407,127.0513,0.830812
2,109.0407,131.0462,0.775306
3,109.0407,131.0824,0.792378
4,109.0407,132.086,0.751123


time: 22.3 ms (started: 2023-06-08 12:52:46 -05:00)


In [None]:
# show details

for group in tqdm(groups_id):
    df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}.csv".format(exp, method, group))
    
    G = nx.from_pandas_edgelist(df_common_edges, "source", "target", edge_attr=["weight"])
    print("Group: {}".format(group))
    graph_detail(G)

 33%|███▎      | 1/3 [00:00<00:00,  2.14it/s]

Group: WT
Num. nodes: 1916
Num. edges: 23331



 67%|██████▋   | 2/3 [00:01<00:00,  1.29it/s]

Group: zwf1^
Num. nodes: 4367
Num. edges: 521944



100%|██████████| 3/3 [00:02<00:00,  1.30it/s]

Group: pck1^
Num. nodes: 5046
Num. edges: 454546

time: 2.31 s (started: 2023-06-06 16:14:23 -05:00)



