### Imports

In [90]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.74 ms (started: 2023-05-19 20:55:13 -05:00)


### Parameters

In [91]:
import json

# Load the run configuration. The context manager guarantees the file handle
# is closed even if JSON parsing raises.
with open("parameters.json") as file:
    params = json.load(file)

# Project root is taken to be the parent of the current working directory.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because the other cells of this notebook build their paths from it.
dir = os.path.dirname(os.getcwd())
print(dir)

# "method" and "group" are lists in the parameter file; the matching "*_idx"
# entry selects which element is used for this run.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# group[1] is used as the number of subgroups; they are labelled "1".."n".
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['zwf1^', 3]
Subgroup:	 ['1', '2', '3']
Dimensions:	 [3]
time: 3.29 ms (started: 2023-05-19 20:55:13 -05:00)


### Edge embeddings

In [92]:
# Derive edge embeddings from the node embeddings of every subgroup

for dimension in tqdm(dimensions):
    for i in tqdm(subgroups):
        # Inputs: node embeddings for this (dimension, subgroup) and the edge list of the subgroup
        node_embeddings_path = "{}/output_{}/node_embeddings/{}_node-embeddings_{}_{}.csv".format(dir, method, group[0], dimension, i)
        edges_data_path = "{}/output_preprocessing/graph_data/{}_edges_data_{}.csv".format(dir, group[0], i)
        df_node_embeddings = pd.read_csv(node_embeddings_path, index_col=0)
        df_edges = pd.read_csv(edges_data_path)

        # edge2vecx comes from utils.utils; it returns a list of frames and a parallel list of legends.
        # NOTE(review): presumably one frame per edge operator (later cells read the "L2" file) - confirm in utils.
        list_df_edge_embeddings, list_edge_embeddings_legend = edge2vecx([df_node_embeddings], [df_edges], [""])

        # One CSV per returned frame, tagged with its legend
        for j, df_edge_embeddings_out in enumerate(list_df_edge_embeddings):
            legend = list_edge_embeddings_legend[j]
            df_edge_embeddings_out.to_csv("{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], dimension, i, legend), index=True)

100%|██████████| 3/3 [22:56<00:00, 458.67s/it]
100%|██████████| 1/1 [22:56<00:00, 1376.01s/it]

time: 22min 56s (started: 2023-05-19 20:55:13 -05:00)





In [93]:
# Preview the "L2" edge embeddings of dimension 3, subgroup 1 (values hard-coded for inspection)
edge_embeddings_path = "{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], 3, 1, "L2")
df_edge_embeddings = pd.read_csv(edge_embeddings_path, index_col=0)
df_edge_embeddings

Unnamed: 0,0,1,2
"(0, 1)",0.001165,4.049477e-07,0.002117
"(0, 2)",0.000295,3.266465e-07,0.000515
"(0, 3)",0.000146,9.602809e-08,0.000261
"(0, 4)",0.007963,3.566419e-07,0.014943
"(0, 5)",0.017515,1.922937e-08,0.033550
...,...,...,...
"(6181, 6207)",0.069031,9.482464e-04,0.095727
"(6185, 6186)",0.153108,1.940908e-03,0.214398
"(6190, 6211)",0.000194,1.007319e-04,0.000001
"(6191, 6192)",0.061875,7.545296e-03,0.035964


time: 2.16 s (started: 2023-05-19 21:18:09 -05:00)


### Join embeddings

In [94]:
# Stack the "L2" edge embeddings of all subgroups into one frame per dimension,
# tagging every row with the subgroup it came from.
for dimension in tqdm(dimensions):
    # Collect the per-subgroup frames and concatenate once at the end;
    # concatenating inside the loop re-copies the growing frame on every pass.
    list_df_edge_embeddings_subgroup = []
    for i in tqdm(subgroups):
        # Read dataset
        df_edge_embeddings = pd.read_csv("{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], dimension, i, "L2"), index_col=0)
        # Scalar assignment broadcasts the subgroup label to every row
        df_edge_embeddings["subgroup"] = i
        list_df_edge_embeddings_subgroup.append(df_edge_embeddings)

    # pd.concat rejects an empty list, so fall back to an empty frame when there are no subgroups
    if list_df_edge_embeddings_subgroup:
        df_edge_embeddings_join = pd.concat(list_df_edge_embeddings_subgroup)
    else:
        df_edge_embeddings_join = pd.DataFrame()
    df_edge_embeddings_join.to_csv("{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2"), index=True)

100%|██████████| 3/3 [00:15<00:00,  5.20s/it]
100%|██████████| 1/1 [01:11<00:00, 71.72s/it]

time: 1min 11s (started: 2023-05-19 21:18:11 -05:00)





In [95]:
# Preview the joined "L2" edge embeddings for dimension 3 (hard-coded for inspection)
edge_embeddings_join_path = "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join = pd.read_csv(edge_embeddings_join_path, index_col=0)
df_edge_embeddings_join

Unnamed: 0,0,1,2,subgroup
"(0, 1)",0.001165,4.049477e-07,0.002117,1
"(0, 2)",0.000295,3.266465e-07,0.000515,1
"(0, 3)",0.000146,9.602809e-08,0.000261,1
"(0, 4)",0.007963,3.566419e-07,0.014943,1
"(0, 5)",0.017515,1.922937e-08,0.033550,1
...,...,...,...,...
"(6167, 6209)",1.162970,7.340874e-01,0.204429,3
"(6168, 6169)",0.236175,3.633102e-02,0.003910,3
"(6186, 6187)",0.971371,9.020638e-01,0.285527,3
"(6188, 6189)",0.385933,6.946719e-02,0.090572,3


time: 14.8 s (started: 2023-05-19 21:19:23 -05:00)


In [96]:
# 3D scatter of the joined edge embeddings, coloured by subgroup

for dimension in tqdm(dimensions):
    edge_embeddings_join_path = "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
    df_edge_embeddings_join = pd.read_csv(edge_embeddings_join_path, index_col=0)

    # The first three columns are used as coordinates; the last column supplies the colour value
    x_values = df_edge_embeddings_join.iloc[:, 0]
    y_values = df_edge_embeddings_join.iloc[:, 1]
    z_values = df_edge_embeddings_join.iloc[:, 2]
    colour_values = df_edge_embeddings_join.iloc[:, -1]

    fig = plt.figure(figsize = (10, 7))
    ax = plt.axes(projection ="3d")
    ax.scatter3D(x_values, y_values, z_values, c=colour_values, alpha=0.1)

    # The figure is written to disk and closed without being shown
    plot_path = "{}/output_{}/plots/{}_plot_subgroups_{}_{}.png".format(dir, method, group[0], dimension, "L2")
    plt.savefig(plot_path)
    plt.close()

100%|██████████| 1/1 [07:14<00:00, 435.00s/it]

time: 7min 15s (started: 2023-05-19 21:19:38 -05:00)





### Clustering

In [97]:
# silhouette_score(X_train.iloc[:, :-1], X_train.iloc[:, -1])

time: 267 µs (started: 2023-05-19 21:26:53 -05:00)


In [98]:
# Fit HDBSCAN on the joined edge embeddings (its outlier scores are used by the next cells)

for dimension in tqdm(dimensions):
    edge_embeddings_join_path = "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
    df_edge_embeddings_join = pd.read_csv(edge_embeddings_join_path, index_col=0)

    # Every column except the last one is used as a feature
    X_train = df_edge_embeddings_join.iloc[:, :-1]

    # NOTE: `clusterer` is overwritten on each iteration, so only the last dimension's model survives the loop
    clusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1)
    clusterer.fit(X_train)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Rows whose outlier score exceeds the 0.9 quantile of all scores are treated as outliers
outlier_scores = pd.Series(clusterer.outlier_scores_)
threshold = outlier_scores.quantile(0.9)

# Row positions of the outliers
is_outlier = clusterer.outlier_scores_ > threshold
outliers = np.where(is_outlier)[0]

print(len(outliers))
outliers

1223483


array([       1,        3,        5, ..., 12234777, 12234811, 12234817])

time: 24.2 s (started: 2023-05-19 20:30:27 -05:00)


In [None]:
# Complement of `outliers`: every row position that was not flagged
all_positions = np.arange(len(df_edge_embeddings_join))
outliers_no = np.setdiff1d(all_positions, outliers)
print(len(outliers_no))
outliers_no

11011347


array([       0,        2,        4, ..., 12234827, 12234828, 12234829])

time: 342 ms (started: 2023-05-19 20:30:51 -05:00)


In [None]:
# 3D scatter of outliers (gray) against the remaining rows (red)
print("Total:", len(df_edge_embeddings_join))
fig = plt.figure(figsize = (10, 7))
ax = plt.axes(projection ="3d")

# (row positions, point colour, label printed with the row count), drawn in this order
layers = [
    (outliers, "gray", "Num. of outliers:"),
    (outliers_no, "red", "Num. of no-outliers:"),
]
for positions, colour, label in layers:
    subset = df_edge_embeddings_join.iloc[positions, :]
    ax.scatter3D(subset.iloc[:, 0], subset.iloc[:, 1], subset.iloc[:, 2], c=colour, alpha=0.005)
    print(label, len(subset))

# NOTE: `dimension` is whatever value the last loop over `dimensions` left behind
plot_path = "{}/output_{}/plots/{}_plot_cluster_{}_{}.png".format(dir, method, group[0], dimension, "L2")
plt.savefig(plot_path)
plt.close()

Total: 12234830
Num. of outliers: 1223483
Num. of no-outliers: 11011347
time: 4min 55s (started: 2023-05-19 20:30:51 -05:00)


### Filter common edges

In [None]:
# Keep only the rows that were not flagged as outliers and persist them (dimension 3 is hard-coded in the file name)
edges_filter_path = "{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join_filter = df_edge_embeddings_join.iloc[outliers_no]
df_edge_embeddings_join_filter.to_csv(edges_filter_path, index=True)
df_edge_embeddings_join_filter

Unnamed: 0,0,1,2,subgroup
"(0, 1)",0.001143,1.998984e-09,0.001940,1
"(0, 3)",0.001714,4.524980e-08,0.002940,1
"(0, 5)",0.000379,8.485569e-10,0.000638,1
"(0, 7)",0.003692,1.001801e-08,0.006271,1
"(0, 8)",0.006840,3.710272e-07,0.011798,1
...,...,...,...,...
"(6173, 6174)",0.004286,9.760723e-03,0.023302,5
"(6203, 6204)",0.020778,2.462606e-02,0.007199,5
"(6203, 6218)",0.060977,4.519730e-03,0.055055,5
"(6211, 6215)",0.029771,2.263333e-02,0.082080,5


time: 40.3 s (started: 2023-05-19 20:35:47 -05:00)


In [None]:
import ast

# Re-read the filtered edges, turn the string index (e.g. "(0, 1)") into real
# tuples so it becomes a two-level index, and write the file back in place.
# NOTE: this cell rewrites its own input; running it a second time fails
# because the index column no longer holds tuple strings.
edges_filter_path = "{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join_filter = pd.read_csv(edges_filter_path, index_col=0)

# ast.literal_eval only accepts Python literals, so text read from the CSV can
# never be executed as code (which eval() would allow).
edge_tuples = [ast.literal_eval(item) for item in df_edge_embeddings_join_filter.index]
df_edge_embeddings_join_filter.set_index([pd.Index(edge_tuples)], inplace=True)
df_edge_embeddings_join_filter.to_csv(edges_filter_path, index=True)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,0.001143,1.998984e-09,0.001940,1
0,3,0.001714,4.524980e-08,0.002940,1
0,5,0.000379,8.485569e-10,0.000638,1
0,7,0.003692,1.001801e-08,0.006271,1
0,8,0.006840,3.710272e-07,0.011798,1
...,...,...,...,...,...
6173,6174,0.004286,9.760723e-03,0.023302,5
6203,6204,0.020778,2.462606e-02,0.007199,5
6203,6218,0.060977,4.519730e-03,0.055055,5
6211,6215,0.029771,2.263333e-02,0.082080,5


time: 1min 54s (started: 2023-05-19 20:36:28 -05:00)


In [None]:
# Reload the filtered edges, using the first two columns as a two-level index
edges_filter_path = "{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join_filter = pd.read_csv(edges_filter_path, index_col=[0, 1])
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,1,0.001143,1.998984e-09,0.001940,1
0,3,0.001714,4.524980e-08,0.002940,1
0,5,0.000379,8.485569e-10,0.000638,1
0,7,0.003692,1.001801e-08,0.006271,1
0,8,0.006840,3.710272e-07,0.011798,1
...,...,...,...,...,...
6173,6174,0.004286,9.760723e-03,0.023302,5
6203,6204,0.020778,2.462606e-02,0.007199,5
6203,6218,0.060977,4.519730e-03,0.055055,5
6211,6215,0.029771,2.263333e-02,0.082080,5


time: 4.31 s (started: 2023-05-19 20:38:22 -05:00)


In [None]:
# Preview the node table of subgroup 1 (hard-coded for inspection)
nodes_data_path = "{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(dir, group[0], 1)
df_nodes = pd.read_csv(nodes_data_path)
df_nodes

Unnamed: 0,idx,degree,ionMz
0,0,10,59.0049
1,1,25,274.0123
2,2,64,277.0867
3,3,60,369.1213
4,4,69,369.1541
...,...,...,...
6229,6229,1,818.2612
6230,6230,1,807.9051
6231,6231,2,995.9381
6232,6232,2,939.1026


time: 8.92 ms (started: 2023-05-19 20:38:26 -05:00)


In [None]:
# Mapping idx with ionMz
# Builds `list_index`: for every filtered edge, the pair of values found in the
# last column of the node table of the edge's subgroup (ionMz, per the cell
# title) at the row positions given by the edge's two index levels.
dict_df_nodes = {}
list_index = []
for i in tqdm(subgroups):
    df_nodes = pd.read_csv("{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(dir, group[0], i))
    dict_df_nodes[i] = df_nodes

# Last column of each node table as a plain array: positional lookups on an
# array are far cheaper than a DataFrame.iloc scalar access per edge.
dict_last_column = {key: nodes.iloc[:, -1].to_numpy() for key, nodes in dict_df_nodes.items()}

for dimension in tqdm(dimensions):
    # NOTE(review): the file name hard-codes dimension 3 (as do the cells that
    # wrote it), so every iteration reads the same file.
    df_edge_embeddings_join_filter = pd.read_csv("{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2"), index_col=[0, 1])

    # Start from an empty list on every iteration so `list_index` always has
    # exactly one entry per row of the frame left in
    # `df_edge_embeddings_join_filter`; accumulating across iterations would
    # make the later set_index fail with a length mismatch.
    list_index = []
    subgroup_labels = df_edge_embeddings_join_filter.iloc[:, -1]
    for (source_pos, target_pos), label in zip(df_edge_embeddings_join_filter.index, subgroup_labels):
        # Subgroup labels are read back as numbers, while the dict keys are strings
        node_values = dict_last_column[str(label)]
        list_index.append((node_values[source_pos], node_values[target_pos]))

100%|██████████| 5/5 [00:00<00:00, 578.65it/s]
100%|██████████| 1/1 [04:53<00:00, 293.11s/it]

time: 4min 54s (started: 2023-05-19 20:38:27 -05:00)





In [None]:
# Replace the positional (source, target) index with the mapped value pairs in `list_index`
mapped_index = pd.Index(list_index)
df_edge_embeddings_join_filter = df_edge_embeddings_join_filter.set_index([mapped_index])
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,274.0123,0.001143,1.998984e-09,0.001940,1
59.0049,369.1213,0.001714,4.524980e-08,0.002940,1
59.0049,503.0516,0.000379,8.485569e-10,0.000638,1
59.0049,558.4273,0.003692,1.001801e-08,0.006271,1
59.0049,641.1223,0.006840,3.710272e-07,0.011798,1
...,...,...,...,...,...
979.9144,980.1544,0.004286,9.760723e-03,0.023302,5
734.2528,940.2058,0.020778,2.462606e-02,0.007199,5
734.2528,991.3236,0.060977,4.519730e-03,0.055055,5
735.2385,732.2616,0.029771,2.263333e-02,0.082080,5


time: 4.95 s (started: 2023-05-19 20:43:21 -05:00)


In [None]:
# Count how many rows share each index pair and keep the pairs whose count equals the number of subgroups
edge_counts = df_edge_embeddings_join_filter.index.value_counts().to_frame()
in_all_subgroups = edge_counts["count"] == len(subgroups)
df_edge_embeddings_join_filter_count = edge_counts[in_all_subgroups]

# NOTE: `dimension` is whatever value the last loop over `dimensions` left behind
edges_filter_count_path = "{}/output_{}/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count.to_csv(edges_filter_count_path, index=True)
df_edge_embeddings_join_filter_count

Unnamed: 0,Unnamed: 1,count
275.1445,488.2155,5
256.2362,371.3167,5
473.3996,744.5531,5
253.1307,459.1907,5
473.3996,743.5410,5
...,...,...
383.0970,554.1227,5
686.4762,684.2283,5
255.2639,715.4726,5
367.3578,383.3530,5


time: 2.82 s (started: 2023-05-19 20:43:26 -05:00)


In [None]:
# Reload the counts as a flat table and give its three columns explicit names
edges_filter_count_path = "{}/output_{}/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join_filter_count = pd.read_csv(edges_filter_count_path)
df_edge_embeddings_join_filter_count.columns = ["source", "target", "count"]
df_edge_embeddings_join_filter_count

Unnamed: 0,source,target,count
0,275.1445,488.2155,5
1,256.2362,371.3167,5
2,473.3996,744.5531,5
3,253.1307,459.1907,5
4,473.3996,743.5410,5
...,...,...,...
24116,383.0970,554.1227,5
24117,686.4762,684.2283,5
24118,255.2639,715.4726,5
24119,367.3578,383.3530,5


time: 11.6 ms (started: 2023-05-19 20:43:29 -05:00)


In [None]:
# Preview the preprocessed edge list of subgroup 2 (hard-coded for inspection)
edges_path = "{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], 2)
df_edges = pd.read_csv(edges_path)
df_edges

Unnamed: 0,source,target,weight
0,59.0049,59.0291,0.541623
1,59.0049,59.0370,0.553259
2,59.0049,164.0963,0.527691
3,59.0049,306.8897,0.538185
4,59.0049,405.2949,0.549859
...,...,...,...
939033,988.5664,996.7096,0.530655
939034,988.8558,998.4845,0.565444
939035,989.4484,989.5686,0.624192
939036,990.3213,990.4534,0.747188


time: 379 ms (started: 2023-05-19 20:43:29 -05:00)


In [None]:
# Get weight
# Attach to every common edge its weight in each subgroup, as one
# "subgroup<i>" column per subgroup.
df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count.copy()

# Order the endpoints of every edge so that source <= target. Both new columns
# are computed before either one is written back.
source_raw = df_edge_embeddings_join_filter_count_weight["source"].to_numpy()
target_raw = df_edge_embeddings_join_filter_count_weight["target"].to_numpy()
needs_swap = source_raw > target_raw
source_ordered = np.where(needs_swap, target_raw, source_raw)
target_ordered = np.where(needs_swap, source_raw, target_raw)
df_edge_embeddings_join_filter_count_weight["source"] = source_ordered
df_edge_embeddings_join_filter_count_weight["target"] = target_ordered

df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count_weight.sort_values(["source", "target"], ascending=True)

# "<source>-<target>" string key, built the same way on both sides of the lookup
df_edge_embeddings_join_filter_count_weight["idx"] = df_edge_embeddings_join_filter_count_weight["source"].astype(str) + "-" + df_edge_embeddings_join_filter_count_weight["target"].astype(str)
edge_keys = df_edge_embeddings_join_filter_count_weight["idx"]

for i in tqdm(subgroups):
    df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], i))
    df_edges["idx"] = df_edges["source"].astype(str) + "-" + df_edges["target"].astype(str)

    # Look the weights up by key rather than pasting the filtered column in by
    # position: positional assignment silently depends on both frames having
    # the same order and on every key occurring exactly once.
    df_edges_common = df_edges[df_edges["idx"].isin(edge_keys)]
    if not df_edges_common["idx"].is_unique:
        raise ValueError("Subgroup {} lists some of the filtered edges more than once".format(i))

    # With "idx" moved to the index, the last remaining column is the one the
    # positional -2 lookup used to select (the column just before "idx").
    weight_by_key = df_edges_common.set_index("idx").iloc[:, -1]

    is_missing = ~edge_keys.isin(weight_by_key.index)
    if is_missing.any():
        raise ValueError("{} filtered edges have no weight in subgroup {}".format(int(is_missing.sum()), i))

    # The mapped Series carries the frame's own row labels, so it aligns row by row
    df_edge_embeddings_join_filter_count_weight["subgroup{}".format(i)] = edge_keys.map(weight_by_key)
df_edge_embeddings_join_filter_count_weight


100%|██████████| 5/5 [00:18<00:00,  3.71s/it]


Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5
21472,59.0137,349.2758,5,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207
21465,59.0137,350.2793,5,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223
21459,59.0137,350.3033,5,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973
21913,59.0137,389.2739,5,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952
21934,59.0137,389.3284,5,59.0137-389.3284,0.861323,0.889178,0.521535,0.795494,0.890928
...,...,...,...,...,...,...,...,...,...
12975,970.6852,978.5535,5,970.6852-978.5535,0.786612,0.789379,0.837201,0.859124,0.834018
12960,970.6852,978.5819,5,970.6852-978.5819,0.674501,0.612071,0.652370,0.759533,0.756092
12962,970.6852,979.5558,5,970.6852-979.5558,0.706012,0.796137,0.834149,0.858540,0.899189
12966,970.6852,980.5270,5,970.6852-980.527,0.850064,0.792149,0.839408,0.851767,0.847750


time: 18.6 s (started: 2023-05-19 20:43:30 -05:00)


In [None]:
# df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], 2))
# df_edges[(df_edges["source"] == 986.7752) & (df_edges["target"] == 989.4484)].iloc[0, -1]

time: 561 µs (started: 2023-05-19 20:43:49 -05:00)


In [None]:
# Dispersion: per-edge standard deviation of the subgroup weights
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight.copy()

# The subgroup weights are the trailing len(subgroups) columns.
# ddof=0 (population standard deviation) is what np.std applies by default.
subgroup_weights = df_edge_embeddings_join_filter_count_weight_std.iloc[:, -len(subgroups):]
df_edge_embeddings_join_filter_count_weight_std["std"] = subgroup_weights.std(axis=1, ddof=0)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std
21472,59.0137,349.2758,5,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268
21465,59.0137,350.2793,5,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406
21459,59.0137,350.3033,5,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241
21913,59.0137,389.2739,5,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148
21934,59.0137,389.3284,5,59.0137-389.3284,0.861323,0.889178,0.521535,0.795494,0.890928,0.139427
...,...,...,...,...,...,...,...,...,...,...
12975,970.6852,978.5535,5,970.6852-978.5535,0.786612,0.789379,0.837201,0.859124,0.834018,0.028522
12960,970.6852,978.5819,5,970.6852-978.5819,0.674501,0.612071,0.652370,0.759533,0.756092,0.058186
12962,970.6852,979.5558,5,970.6852-979.5558,0.706012,0.796137,0.834149,0.858540,0.899189,0.065593
12966,970.6852,980.5270,5,970.6852-980.527,0.850064,0.792149,0.839408,0.851767,0.847750,0.022444


time: 29.1 ms (started: 2023-05-19 20:43:49 -05:00)


In [None]:
# Keep the edges whose weight dispersion across subgroups is at most 0.3, then persist them
has_low_dispersion = df_edge_embeddings_join_filter_count_weight_std["std"] <= 0.3
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight_std[has_low_dispersion]

# NOTE: `dimension` is whatever value the last loop over `dimensions` left behind
edges_std_path = "{}/output_{}/edges_filter_weight_std/{}_edge-filter-weight-std_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count_weight_std.to_csv(edges_std_path, index=False)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std
21472,59.0137,349.2758,5,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268
21465,59.0137,350.2793,5,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406
21459,59.0137,350.3033,5,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241
21913,59.0137,389.2739,5,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148
21934,59.0137,389.3284,5,59.0137-389.3284,0.861323,0.889178,0.521535,0.795494,0.890928,0.139427
...,...,...,...,...,...,...,...,...,...,...
12975,970.6852,978.5535,5,970.6852-978.5535,0.786612,0.789379,0.837201,0.859124,0.834018,0.028522
12960,970.6852,978.5819,5,970.6852-978.5819,0.674501,0.612071,0.652370,0.759533,0.756092,0.058186
12962,970.6852,979.5558,5,970.6852-979.5558,0.706012,0.796137,0.834149,0.858540,0.899189,0.065593
12966,970.6852,980.5270,5,970.6852-980.527,0.850064,0.792149,0.839408,0.851767,0.847750,0.022444


time: 232 ms (started: 2023-05-19 20:43:49 -05:00)


In [None]:
# Reload the std-filtered edges from disk
edges_std_path = "{}/output_{}/edges_filter_weight_std/{}_edge-filter-weight-std_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count_weight_std = pd.read_csv(edges_std_path)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std
0,59.0137,349.2758,5,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268
1,59.0137,350.2793,5,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406
2,59.0137,350.3033,5,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241
3,59.0137,389.2739,5,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148
4,59.0137,389.3284,5,59.0137-389.3284,0.861323,0.889178,0.521535,0.795494,0.890928,0.139427
...,...,...,...,...,...,...,...,...,...,...
23933,970.6852,978.5535,5,970.6852-978.5535,0.786612,0.789379,0.837201,0.859124,0.834018,0.028522
23934,970.6852,978.5819,5,970.6852-978.5819,0.674501,0.612071,0.652370,0.759533,0.756092,0.058186
23935,970.6852,979.5558,5,970.6852-979.5558,0.706012,0.796137,0.834149,0.858540,0.899189,0.065593
23936,970.6852,980.5270,5,970.6852-980.527,0.850064,0.792149,0.839408,0.851767,0.847750,0.022444


time: 51.5 ms (started: 2023-05-19 20:43:49 -05:00)


In [None]:
# Average weight: per-edge mean of the subgroup weights
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std.copy()

# The subgroup weights are the len(subgroups) columns just before the last column
subgroup_weights = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, -(len(subgroups) + 1):-1]
df_edge_embeddings_join_filter_count_weight_std_avg["weight"] = subgroup_weights.mean(axis=1)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std,weight
0,59.0137,349.2758,5,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268,0.701044
1,59.0137,350.2793,5,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406,0.706849
2,59.0137,350.3033,5,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241,0.736142
3,59.0137,389.2739,5,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148,0.773249
4,59.0137,389.3284,5,59.0137-389.3284,0.861323,0.889178,0.521535,0.795494,0.890928,0.139427,0.791692
...,...,...,...,...,...,...,...,...,...,...,...
23933,970.6852,978.5535,5,970.6852-978.5535,0.786612,0.789379,0.837201,0.859124,0.834018,0.028522,0.821267
23934,970.6852,978.5819,5,970.6852-978.5819,0.674501,0.612071,0.652370,0.759533,0.756092,0.058186,0.690913
23935,970.6852,979.5558,5,970.6852-979.5558,0.706012,0.796137,0.834149,0.858540,0.899189,0.065593,0.818805
23936,970.6852,980.5270,5,970.6852-980.527,0.850064,0.792149,0.839408,0.851767,0.847750,0.022444,0.836227


time: 27.3 ms (started: 2023-05-19 20:43:49 -05:00)


In [None]:
# Reduce the table to its first, second and last columns, then persist it
kept_positions = [0, 1, -1]
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, kept_positions]

edges_avg_path = "{}/output_{}/edges_filter_weight_std_avg/{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count_weight_std_avg.to_csv(edges_avg_path, index=False)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,349.2758,0.701044
1,59.0137,350.2793,0.706849
2,59.0137,350.3033,0.736142
3,59.0137,389.2739,0.773249
4,59.0137,389.3284,0.791692
...,...,...,...
23933,970.6852,978.5535,0.821267
23934,970.6852,978.5819,0.690913
23935,970.6852,979.5558,0.818805
23936,970.6852,980.5270,0.836227


time: 82 ms (started: 2023-05-19 20:43:50 -05:00)


In [None]:
# Reload the averaged edge list from disk
edges_avg_path = "{}/output_{}/edges_filter_weight_std_avg/{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count_weight_std_avg = pd.read_csv(edges_avg_path)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,349.2758,0.701044
1,59.0137,350.2793,0.706849
2,59.0137,350.3033,0.736142
3,59.0137,389.2739,0.773249
4,59.0137,389.3284,0.791692
...,...,...,...
23933,970.6852,978.5535,0.821267
23934,970.6852,978.5819,0.690913
23935,970.6852,979.5558,0.818805
23936,970.6852,980.5270,0.836227


time: 25.7 ms (started: 2023-05-19 20:43:50 -05:00)


In [None]:
# Build a graph from the averaged edge list, keeping "weight" as an edge attribute
G = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg,
    source="source",
    target="target",
    edge_attr=["weight"],
)
# graph_detail comes from utils.utils
graph_detail(G)

Num. nodes: 2082
Num. edges: 23938

time: 61.8 ms (started: 2023-05-19 20:43:50 -05:00)
