### Imports

In [1]:
from pyod.models.ecod import ECOD
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


time: 176 µs (started: 2023-06-05 13:05:27 -05:00)


### Parameters

In [2]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

exp = params["exp"]
print("Exp:\t\t", exp)

method = params["method"]
print("Method:\t\t", method)

dimension = params["dimension"]
print("Dimension:\t", dimension)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

Exp:		 exp2
Method:		 dgi
Dimension:	 3
Groups id:	 ['WT', 'zwf1^', 'pck1^']
Subgroups id:	 {'WT': ['1', '2', '3', '4', '5'], 'zwf1^': ['1', '2', '3'], 'pck1^': ['1', '2']}
time: 2.7 ms (started: 2023-06-05 13:05:27 -05:00)


### Edge embeddings

In [3]:
# get edges embeddings

edge_embeddings_global(exp, method, dimension, groups_id, subgroups_id)

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}_{}_{}.csv".format(exp, groups_id[0], subgroups_id[groups_id[0]][0], method, dimension, "L2"), index_col=[0, 1])
df_edge_embeddings.head()

Unnamed: 0,Unnamed: 1,0,1,2
0,1,6.461881,0.000695,0.480581
0,2,0.003822,0.842727,3.42433
0,3,0.783019,0.024875,0.354545
0,4,2.682043,2.122767,2.174131
0,5,0.445657,1.257031,0.000274


time: 514 ms (started: 2023-06-05 11:13:32 -05:00)


### Concat edge embeddings

In [None]:
for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.DataFrame()
    for subgroup in tqdm(subgroups_id[group]):
        df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}_{}_{}.csv".format(exp, group, subgroup, method, dimension, "L2"), index_col=[0, 1])
        df_edge_embeddings["subgroup"] = [subgroup] * len(df_edge_embeddings)

        df_edge_embeddings_concat = pd.concat([df_edge_embeddings_concat, df_edge_embeddings])
    df_edge_embeddings_concat.to_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index=True)

100%|██████████| 5/5 [00:06<00:00,  1.22s/it]
100%|██████████| 3/3 [00:08<00:00,  2.73s/it]
100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
100%|██████████| 3/3 [03:03<00:00, 61.07s/it]

time: 3min 3s (started: 2023-06-05 11:13:33 -05:00)





In [None]:
df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, groups_id[0], method, dimension, "L2"), index_col=[0, 1])
df_edge_embeddings_concat.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,6.461881,0.000695,0.480581,1
0,2,0.003822,0.842727,3.42433,1
0,3,0.783019,0.024875,0.354545,1
0,4,2.682043,2.122767,2.174131,1
0,5,0.445657,1.257031,0.000274,1


time: 5.04 s (started: 2023-06-05 11:34:16 -05:00)


In [None]:
# plot edge embeddings concat

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index_col=[0, 1])

    x = df_edge_embeddings_concat.iloc[:, 0]
    y = df_edge_embeddings_concat.iloc[:, 1]
    z = df_edge_embeddings_concat.iloc[:, 2]

    # Creating figure
    fig = plt.figure(figsize=(10, 7))
    ax = plt.axes(projection="3d")

    # Creating plot
    ax.scatter3D(x, y, z, c=df_edge_embeddings_concat.iloc[:, -1], alpha=0.1)
    # plt.title("Dimension: {}".format(dimension))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_concat_{}_{}_{}_{}.png".format(exp, group, method, dimension, "L2"))
    # plt.show()
    plt.close()

100%|██████████| 3/3 [15:11<00:00, 303.81s/it]

time: 15min 11s (started: 2023-06-05 11:34:21 -05:00)





### Outliers detection

In [None]:
# Outlier detection (HDBSCAN)
""" df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])

X_train = df_edge_embeddings_concat.iloc[:, :-1]
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)

threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
print(len(outliers))
outliers

inliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)
print(len(inliers))
inliers """

' df_edge_embeddings_concat = pd.read_csv("output/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(group, method, dimension, "L2"), index_col=[0, 1])\n\nX_train = df_edge_embeddings_concat.iloc[:, :-1]\nclusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1).fit(X_train)\n\nthreshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)\noutliers = np.where(clusterer.outlier_scores_ > threshold)[0]\nprint(len(outliers))\noutliers\n\ninliers = np.setdiff1d(np.arange(len(df_edge_embeddings_concat)), outliers)\nprint(len(inliers))\ninliers '

time: 3.47 ms (started: 2023-06-05 11:49:32 -05:00)


In [None]:
# outlier detection (ECOD)
dict_df_edge_embeddings_concat_outlier = {}
dict_df_edge_embeddings_concat_filter = {}

for group in tqdm(groups_id):
    df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, group, method, dimension, "L2"), index_col=[0, 1])

    X_train = df_edge_embeddings_concat.iloc[:, :-1]

    clf = ECOD()
    clf.fit(X_train)

    X_train["labels"] = clf.labels_ # binary labels (0: inliers, 1: outliers)

    df_edge_embeddings_concat_filter = df_edge_embeddings_concat.copy()
    df_edge_embeddings_concat_filter["labels"] = clf.labels_
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["labels"] == 0]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, :-1]

    dict_df_edge_embeddings_concat_outlier[group] = X_train
    dict_df_edge_embeddings_concat_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [01:06<00:00, 22.21s/it]

time: 1min 6s (started: 2023-06-05 12:41:00 -05:00)





In [None]:
# plot outliers/inliers
for group in tqdm(groups_id):
    fig = plt.figure(figsize = (10, 7))
    ax = plt.axes(projection ="3d")

    df_aux = dict_df_edge_embeddings_concat_outlier[group]
    print("Total:", len(df_aux))
    
    temp = df_aux[df_aux["labels"] == 0]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="red", alpha=0.005)
    print("Num. of inliers:", len(temp))

    temp = df_aux[df_aux["labels"] == 1]
    x = temp.iloc[:, 0]
    y = temp.iloc[:, 1]
    z = temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c="gray", alpha=0.005)
    print("Num. of inliers:", len(temp))

    # show plot
    plt.savefig("output/{}/plots/edge-embeddings_outlier_{}_{}_{}_{}.png".format(exp, group, method, dimension, "L2"))
    # plt.show()
    plt.close()

  0%|          | 0/3 [00:00<?, ?it/s]

Total: 12232238
Num. of inliers: 11009014
Num. of inliers: 1223224


 33%|███▎      | 1/3 [05:03<10:07, 303.80s/it]

Total: 17089033
Num. of inliers: 15380129
Num. of inliers: 1708904


 67%|██████▋   | 2/3 [12:12<06:17, 377.00s/it]

Total: 6044291
Num. of inliers: 5439862
Num. of inliers: 604429


100%|██████████| 3/3 [14:43<00:00, 294.61s/it]

time: 14min 43s (started: 2023-06-05 12:03:16 -05:00)





###  Filter common edges

In [None]:
# mapping idx with id

for group in tqdm(groups_id):
    dict_df_nodes = {}
    for subgroup in subgroups_id[group]:
        df_nodes = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup))
        dict_df_nodes[subgroup] = df_nodes
    
    # mapping
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]
    list_index = []

    for row in tqdm(df_edge_embeddings_concat_filter.itertuples()):
        df_nodes = dict_df_nodes[str(row[-1])]
        list_index.append((df_nodes.iloc[row[0][0], -1], df_nodes.iloc[row[0][1], -1]))
    
    # set new index
    df_edge_embeddings_concat_filter.set_index([pd.Index(list_index)], inplace=True)
    df_edge_embeddings_concat_filter

11009014it [05:01, 36488.92it/s]it/s]
15380129it [07:02, 36433.58it/s]2, 311.26s/it]
5439862it [02:30, 36186.74it/s]23, 383.86s/it]
100%|██████████| 3/3 [15:01<00:00, 300.61s/it]

time: 15min 1s (started: 2023-06-05 12:42:12 -05:00)





In [None]:
df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[groups_id[0]]
df_edge_embeddings_concat_filter.head()

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,369.1213,0.783019,0.024875,0.354545,1
59.0049,503.0807,0.000378,0.23943,0.066928,1
59.0049,558.4273,2.49601,0.103704,0.54803,1
59.0049,977.8237,0.492603,0.250157,0.1019,1
274.0123,191.046,2.037267,0.17617,0.041174,1


time: 9.86 ms (started: 2023-06-05 12:57:21 -05:00)


In [None]:
# count edges and filter by count
dict_df_edges_filter = {}
for group in tqdm(groups_id):
    # count
    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]

    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[["level_0", "level_1"]].value_counts().to_frame()
    df_edge_embeddings_concat_filter.reset_index(inplace=True)
    df_edge_embeddings_concat_filter.columns = ["source", "target", "count"]

    # filter
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["count"] == len(subgroups_id[group])]
    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, [0, 1]]
    dict_df_edges_filter[group] = df_edge_embeddings_concat_filter

100%|██████████| 3/3 [00:10<00:00,  3.65s/it]

time: 10.9 s (started: 2023-06-05 12:57:26 -05:00)





In [None]:
df_edges_filter = dict_df_edges_filter[groups_id[0]]
df_edges_filter.head()

Unnamed: 0,source,target
0,487.2134,920.1236
1,293.1786,341.1934
2,716.5216,727.6633
3,732.4815,759.5518
4,903.6068,913.5813


time: 8.97 ms (started: 2023-06-05 12:57:40 -05:00)


In [None]:
# get weight by subgroups
dict_df_edges_filter_weight = {}

for group in tqdm(groups_id):
    df_edges_filter_weight = dict_df_edges_filter[group].copy()

    s = []
    t = []
    for row in df_edges_filter_weight.itertuples():
        if row[1] > row[2]:
            s.append(row[2])
            t.append(row[1])
        else:
            s.append(row[1])
            t.append(row[2])
    df_edges_filter_weight["source"] = s
    df_edges_filter_weight["target"] = t

    df_edges_filter_weight.sort_values(["source", "target"], ascending=True, inplace=True)
    df_edges_filter_weight["idx"] = df_edges_filter_weight["source"].astype(str) + "-" + df_edges_filter_weight["target"].astype(str)
    list_aux = df_edges_filter_weight.iloc[:, -1].values

    for subgroup in tqdm(subgroups_id[group]):
        df_edges = pd.read_csv("output/{}/preprocessing/edges/edges_{}_{}.csv".format(exp, group, subgroup))
        df_edges.sort_values(["source", "target"], ascending=True, inplace=True)
        df_edges["idx"] = df_edges["source"].astype(str) + "-" + df_edges["target"].astype(str)
        
        filter = df_edges["idx"].isin(list_aux)
        temp = df_edges[filter]
        list_temp = temp.iloc[:, -2].values
        df_edges_filter_weight["subgroup{}".format(subgroup)] = list_temp
        
    df_edges_filter_weight.drop(["idx"], inplace=True, axis=1)
    
    dict_df_edges_filter_weight[group] = df_edges_filter_weight


100%|██████████| 5/5 [00:14<00:00,  2.97s/it]
100%|██████████| 3/3 [00:24<00:00,  8.16s/it]
100%|██████████| 2/2 [00:08<00:00,  4.45s/it]
100%|██████████| 3/3 [00:50<00:00, 16.70s/it]

time: 50.1 s (started: 2023-06-05 12:57:43 -05:00)





In [None]:
df_edges_filter_weight = dict_df_edges_filter_weight[groups_id[0]]
df_edges_filter_weight.head()

Unnamed: 0,source,target,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
21486,59.0137,389.2739,0.823682,0.88626,0.723124,0.51523,0.917952
21572,59.0137,389.3284,0.861323,0.889178,0.521535,0.795494,0.890928
21598,59.0137,418.3254,0.917996,0.948796,0.786776,0.798094,0.938164
23759,59.0291,417.322,0.855964,0.852473,0.571245,0.569312,0.899766
23787,59.0291,418.3254,0.857608,0.848869,0.619139,0.561154,0.903494


time: 14.6 ms (started: 2023-06-05 12:58:38 -05:00)


### Filter by STD and average weight

In [None]:
# dispersion (std)

# dict_df_common_edges = {}
th = 0.3
for group in tqdm(groups_id):
    df_edges_filter_weight = dict_df_edges_filter_weight[group]

    # calculate std
    df_edges_filter_weight["std"] = np.std(df_edges_filter_weight.iloc[:, 2:], axis=1)

    # filter std < 0.3
    df_edges_filter_weight_std = df_edges_filter_weight[df_edges_filter_weight["std"] < th]

    # average weight
    df_edges_filter_weight_std_avg = df_edges_filter_weight_std.iloc[:, :-1]
    df_edges_filter_weight_std_avg["weight"] = df_edges_filter_weight_std_avg.iloc[:, 2:].mean(axis=1)
    df_edges_filter_weight_std_avg = df_edges_filter_weight_std_avg.iloc[:, [0, 1, -1]]
    df_edges_filter_weight_std_avg.reset_index(drop=True, inplace=True)

    # save
    df_edges_filter_weight_std_avg.to_csv("output/{}/common_edges/common_edges_{}_{}.csv".format(exp, group, method, dimension, "L2"), index=False)

    # plot
    x = df_edges_filter_weight["std"]
    plt.hist(x, bins=100)
    plt.axvline(x=0.3, color="red", lw=1)
    l = len(df_edges_filter_weight) - len(df_edges_filter_weight_std)
    t = len(df_edges_filter_weight)
    plt.title("Loss: {} of {} ({}%)".format(l, t, round(l*100/t)))
    plt.savefig("output/{}/plots/edge-embeddings_std_{}_{}_{}_{}.png".format(exp, group, method, dimension, "L2"))
    # plt.show()
    plt.close()

    # dict_df_common_edges[group] = df_edges_filter_weight_std_avg

100%|██████████| 3/3 [00:02<00:00,  1.04it/s]

time: 2.9 s (started: 2023-06-05 13:04:15 -05:00)





In [None]:
df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}.csv".format(exp, groups_id[0], method, dimension, "L2"))
df_common_edges.head()

Unnamed: 0,source,target,weight
0,59.0137,389.2739,0.773249
1,59.0137,389.3284,0.791692
2,59.0137,418.3254,0.877965
3,59.0291,417.322,0.749752
4,59.0291,418.3254,0.758053


time: 24.1 ms (started: 2023-06-05 13:04:40 -05:00)


In [None]:
# show details
for group in tqdm(groups_id):
    df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}.csv".format(exp, groups_id[0], method, dimension, "L2"))
    
    G = nx.from_pandas_edgelist(df_common_edges, "source", "target", edge_attr=["weight"])
    print("Group: {}".format(group))
    graph_detail(G)

100%|██████████| 3/3 [00:00<00:00, 15.93it/s]

Group: WT
Num. nodes: 2305
Num. edges: 28617

Group: zwf1^
Num. nodes: 2305
Num. edges: 28617

Group: pck1^
Num. nodes: 2305
Num. edges: 28617

time: 193 ms (started: 2023-06-05 13:04:44 -05:00)



