### Imports

In [14]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.32 ms (started: 2023-05-25 12:11:35 -05:00)


### Parameters

In [15]:
import json

# Load the run configuration. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the session.
with open("parameters.json") as file:
    params = json.load(file)

# Parent of the current working directory; later cells use it as the root of
# every input/output path.
# NOTE(review): this shadows the builtin `dir`; the name is kept because other
# cells reference it.
dir = os.path.dirname(os.getcwd())
print(dir)

# "method" and "group" are lists in the JSON file; the matching "*_idx" entry
# selects which element is used for this run.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# group[1] is used as the number of subgroups; they are labelled "1".."n".
subgroups = [str(k) for k in range(1, group[1] + 1)]
print("Subgroup:\t", subgroups)

# Only the first configured dimension is used as `dimension`.
dimensions = params["dimensions"]
dimension = dimensions[0]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['pck1^', 2]
Subgroup:	 ['1', '2']
Dimensions:	 [3]
time: 2.94 ms (started: 2023-05-25 12:11:40 -05:00)


### Get common edges in iterations: 1, 2, 3

In [3]:
# Read the filtered edge-count table written for iterations 1, 2 and 3 and
# collect the three frames in a list, each with normalised column names.
# NOTE(review): the literal 3 in the file name is presumably the embedding
# dimension - TODO confirm whether it should follow `dimension`.
edges_filter_count_template = "{}/output_{}_{}/edges_filter/{}_edges-filter-count_{}_{}.csv"

list_df_edge_embeddings_join_filter_count = []
for iter in (1, 2, 3):
    df_edge_embeddings_join_filter_count = pd.read_csv(
        edges_filter_count_template.format(dir, method, iter, group[0], 3, "L2")
    )
    df_edge_embeddings_join_filter_count.columns = ["source", "target", "count"]
    list_df_edge_embeddings_join_filter_count.append(df_edge_embeddings_join_filter_count)

time: 261 ms (started: 2023-05-24 10:47:54 -05:00)


In [4]:
# Stack the per-iteration edge tables into one long frame; ignore_index gives
# the result a fresh 0..n-1 row index.
df_temp = pd.concat(
    objs=list_df_edge_embeddings_join_filter_count,
    ignore_index=True,
)
df_temp

Unnamed: 0,source,target,count
0,479.3735,869.5574,2
1,483.0778,560.1392,2
2,334.0560,579.1397,2
3,227.2685,238.9223,2
4,483.0778,562.1602,2
...,...,...,...
1455946,173.0822,230.9876,2
1455947,59.0137,677.4234,2
1455948,424.1145,780.5817,2
1455949,807.4405,812.4817,2


time: 19 ms (started: 2023-05-24 10:47:54 -05:00)


In [5]:
# Count
# Number of rows in the stacked table for each (source, target) pair. The
# Series is named explicitly so the "count" column exists regardless of how the
# installed pandas version names the result of value_counts().
temp = df_temp[["source", "target"]].value_counts().rename("count").to_frame()

# Keep only the edges present in every loaded iteration. The threshold is
# derived from the number of loaded tables rather than a hard-coded 3, so it
# cannot drift out of sync with the loading cell.
n_iterations = len(list_df_edge_embeddings_join_filter_count)
df_edge_embeddings_join_filter_count = temp[temp["count"] == n_iterations]
# df_edge_embeddings_join_filter_count.to_csv("{}/output_{}/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], dimension, "L2"), index=True)
df_edge_embeddings_join_filter_count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
source,target,Unnamed: 2_level_1
59.0137,59.0370,3
483.1335,270.2095,3
483.1335,363.1165,3
483.1335,362.0509,3
483.1335,361.1515,3
...,...,...
286.9079,390.8431,3
286.9079,384.8403,3
286.9079,383.8429,3
286.9079,382.8642,3


time: 239 ms (started: 2023-05-24 10:47:54 -05:00)


In [6]:
# Move the index levels back into ordinary columns and rebind the name to the
# resulting frame (which gets a default integer index).
df_edge_embeddings_join_filter_count = df_edge_embeddings_join_filter_count.reset_index(drop=False)
df_edge_embeddings_join_filter_count

Unnamed: 0,source,target,count
0,59.0137,59.0370,3
1,483.1335,270.2095,3
2,483.1335,363.1165,3
3,483.1335,362.0509,3
4,483.1335,361.1515,3
...,...,...,...
485312,286.9079,390.8431,3
485313,286.9079,384.8403,3
485314,286.9079,383.8429,3
485315,286.9079,382.8642,3


time: 17.1 ms (started: 2023-05-24 10:47:55 -05:00)


In [7]:
# Get weight
# For every common edge, look up the value it carries in each subgroup's edge
# list and store it in a "subgroup<i>" column.
df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count.copy()

# Put every edge in canonical orientation (source <= target). np.where applies
# the same strict ">" swap rule as a row-by-row comparison, but vectorised.
source_values = df_edge_embeddings_join_filter_count_weight["source"].to_numpy()
target_values = df_edge_embeddings_join_filter_count_weight["target"].to_numpy()
needs_swap = source_values > target_values
canonical_source = np.where(needs_swap, target_values, source_values)
canonical_target = np.where(needs_swap, source_values, target_values)
df_edge_embeddings_join_filter_count_weight["source"] = canonical_source
df_edge_embeddings_join_filter_count_weight["target"] = canonical_target

df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count_weight.sort_values(["source", "target"], ascending=True)
# "idx" is a "<source>-<target>" string key used to match edges across tables.
df_edge_embeddings_join_filter_count_weight["idx"] = df_edge_embeddings_join_filter_count_weight["source"].astype(str) + "-" + df_edge_embeddings_join_filter_count_weight["target"].astype(str)
list_aux = df_edge_embeddings_join_filter_count_weight["idx"].values

for i in tqdm(subgroups):
    df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], i))
    df_edges = df_edges.sort_values(["source", "target"], ascending=True)
    df_edges["idx"] = df_edges["source"].astype(str) + "-" + df_edges["target"].astype(str)

    # Restrict the subgroup edge list to the common edges.
    matched_edges = df_edges[df_edges["idx"].isin(list_aux)]
    if matched_edges["idx"].duplicated().any():
        raise ValueError("Duplicate edges found in the edge list of subgroup {}".format(i))

    # The value is read from the column just before "idx", i.e. the last column
    # of the CSV as loaded (presumably the edge weight - TODO confirm schema).
    weight_by_idx = pd.Series(matched_edges.iloc[:, -2].to_numpy(), index=matched_edges["idx"].to_numpy())

    # Fail loudly instead of producing misaligned or missing weights.
    is_missing = ~df_edge_embeddings_join_filter_count_weight["idx"].isin(weight_by_idx.index)
    if is_missing.any():
        raise ValueError("{} common edge(s) not found in the edge list of subgroup {}".format(int(is_missing.sum()), i))

    # Align by key rather than by row position, so the result does not depend
    # on both tables having the same row count and sort order.
    df_edge_embeddings_join_filter_count_weight["subgroup{}".format(i)] = df_edge_embeddings_join_filter_count_weight["idx"].map(weight_by_idx)
df_edge_embeddings_join_filter_count_weight


100%|██████████| 2/2 [00:09<00:00,  4.84s/it]


Unnamed: 0,source,target,count,idx,subgroup1,subgroup2
0,59.0137,59.0370,3,59.0137-59.037,0.653159,0.828448
242659,59.0137,74.0249,3,59.0137-74.0249,0.756185,0.617955
323555,59.0137,102.0562,3,59.0137-102.0562,0.618107,0.777138
323554,59.0137,109.0407,3,59.0137-109.0407,0.654224,0.796294
323553,59.0137,113.0246,3,59.0137-113.0246,0.514337,0.841330
...,...,...,...,...,...,...
161792,986.7752,989.4484,3,986.7752-989.4484,0.527028,0.668604
161762,987.5639,990.5673,3,987.5639-990.5673,0.765752,0.633906
161783,988.4452,988.5664,3,988.4452-988.5664,0.671791,0.761600
161782,988.4452,990.5673,3,988.4452-990.5673,0.594618,0.756580


time: 10.5 s (started: 2023-05-24 10:47:55 -05:00)


In [8]:
# Dispersion (std)
# Row-wise standard deviation (np.std, default ddof) over the trailing
# len(subgroups) columns, stored in a new "std" column of a fresh frame.
subgroup_columns = df_edge_embeddings_join_filter_count_weight.iloc[:, -len(subgroups):]
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight.assign(
    std=np.std(subgroup_columns, axis=1)
)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std
0,59.0137,59.0370,3,59.0137-59.037,0.653159,0.828448,0.087645
242659,59.0137,74.0249,3,59.0137-74.0249,0.756185,0.617955,0.069115
323555,59.0137,102.0562,3,59.0137-102.0562,0.618107,0.777138,0.079515
323554,59.0137,109.0407,3,59.0137-109.0407,0.654224,0.796294,0.071035
323553,59.0137,113.0246,3,59.0137-113.0246,0.514337,0.841330,0.163496
...,...,...,...,...,...,...,...
161792,986.7752,989.4484,3,986.7752-989.4484,0.527028,0.668604,0.070788
161762,987.5639,990.5673,3,987.5639-990.5673,0.765752,0.633906,0.065923
161783,988.4452,988.5664,3,988.4452-988.5664,0.671791,0.761600,0.044904
161782,988.4452,990.5673,3,988.4452-990.5673,0.594618,0.756580,0.080981


time: 90.5 ms (started: 2023-05-24 10:48:05 -05:00)


In [9]:
# Filter by std
# Keep only the rows whose "std" value is at most 0.3.
is_low_dispersion = df_edge_embeddings_join_filter_count_weight_std["std"] <= 0.3
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight_std.loc[is_low_dispersion]
# df_edge_embeddings_join_filter_count_weight_std.to_csv("{}/output_{}/edges_filter_weight_std/{}_edge-filter-weight-std_{}_{}.csv".format(dir, method, group[0], dimension, "L2"), index=False)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std
0,59.0137,59.0370,3,59.0137-59.037,0.653159,0.828448,0.087645
242659,59.0137,74.0249,3,59.0137-74.0249,0.756185,0.617955,0.069115
323555,59.0137,102.0562,3,59.0137-102.0562,0.618107,0.777138,0.079515
323554,59.0137,109.0407,3,59.0137-109.0407,0.654224,0.796294,0.071035
323553,59.0137,113.0246,3,59.0137-113.0246,0.514337,0.841330,0.163496
...,...,...,...,...,...,...,...
161792,986.7752,989.4484,3,986.7752-989.4484,0.527028,0.668604,0.070788
161762,987.5639,990.5673,3,987.5639-990.5673,0.765752,0.633906,0.065923
161783,988.4452,988.5664,3,988.4452-988.5664,0.671791,0.761600,0.044904
161782,988.4452,990.5673,3,988.4452-990.5673,0.594618,0.756580,0.080981


time: 34.3 ms (started: 2023-05-24 10:48:05 -05:00)


In [10]:
# Average weight
# Row-wise mean of the len(subgroups) columns that sit immediately before the
# last column; the result is stored in a new "weight" column of a fresh frame.
n_subgroups = len(subgroups)
subgroup_weights = df_edge_embeddings_join_filter_count_weight_std.iloc[:, -(n_subgroups + 1):-1]
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std.assign(
    weight=subgroup_weights.mean(axis=1)
)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std,weight
0,59.0137,59.0370,3,59.0137-59.037,0.653159,0.828448,0.087645,0.740804
242659,59.0137,74.0249,3,59.0137-74.0249,0.756185,0.617955,0.069115,0.687070
323555,59.0137,102.0562,3,59.0137-102.0562,0.618107,0.777138,0.079515,0.697622
323554,59.0137,109.0407,3,59.0137-109.0407,0.654224,0.796294,0.071035,0.725259
323553,59.0137,113.0246,3,59.0137-113.0246,0.514337,0.841330,0.163496,0.677833
...,...,...,...,...,...,...,...,...
161792,986.7752,989.4484,3,986.7752-989.4484,0.527028,0.668604,0.070788,0.597816
161762,987.5639,990.5673,3,987.5639-990.5673,0.765752,0.633906,0.065923,0.699829
161783,988.4452,988.5664,3,988.4452-988.5664,0.671791,0.761600,0.044904,0.716696
161782,988.4452,990.5673,3,988.4452-990.5673,0.594618,0.756580,0.080981,0.675599


time: 61.5 ms (started: 2023-05-24 10:48:05 -05:00)


In [11]:
# Keep the first two columns and the last one, then write the table to CSV
# without the row index.
edges_weight_avg_path = "{}/output_edges/edges_filter_weight_std_avg/{}_{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
kept_positions = [0, 1, -1]
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, kept_positions]
df_edge_embeddings_join_filter_count_weight_std_avg.to_csv(edges_weight_avg_path, index=False)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0370,0.740804
242659,59.0137,74.0249,0.687070
323555,59.0137,102.0562,0.697622
323554,59.0137,109.0407,0.725259
323553,59.0137,113.0246,0.677833
...,...,...,...
161792,986.7752,989.4484,0.597816
161762,987.5639,990.5673,0.699829
161783,988.4452,988.5664,0.716696
161782,988.4452,990.5673,0.675599


time: 1.13 s (started: 2023-05-24 10:48:06 -05:00)


In [12]:
# Load the averaged edge-weight table from its CSV file.
edges_weight_avg_path = "{}/output_edges/edges_filter_weight_std_avg/{}_{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
df_edge_embeddings_join_filter_count_weight_std_avg = pd.read_csv(edges_weight_avg_path)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0370,0.740804
1,59.0137,74.0249,0.687070
2,59.0137,102.0562,0.697622
3,59.0137,109.0407,0.725259
4,59.0137,113.0246,0.677833
...,...,...,...
426349,986.7752,989.4484,0.597816
426350,987.5639,990.5673,0.699829
426351,988.4452,988.5664,0.716696
426352,988.4452,990.5673,0.675599


time: 123 ms (started: 2023-05-24 10:48:07 -05:00)


In [13]:
# Build a graph from the edge table, carrying "weight" as an edge attribute.
G = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg,
    source="source",
    target="target",
    edge_attr=["weight"],
)
# graph_detail is not defined in this notebook - presumably it comes from the
# `utils.utils` star import; verify.
graph_detail(G)

Num. nodes: 5261
Num. edges: 426354

time: 908 ms (started: 2023-05-24 10:48:07 -05:00)
