In [1]:
import json
import os
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def calc_cluster_sizes_and_edge_probs(G):
    """Compute cluster sizes and the cluster-to-cluster edge density matrix.

    Nodes are grouped by their ``'gt'`` node attribute. Clusters are ordered
    from largest to smallest; clusters of equal size keep the order in which
    their label was first encountered.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes carry a ``'gt'`` attribute. Nodes without that
        attribute are left out of the computation.

    Returns
    -------
    cluster_sizes : list of int
        Number of nodes in each cluster, in descending order.
    P : list of list of float
        ``P[i][j]`` is the number of edges between cluster ``i`` and cluster
        ``j`` divided by the number of node pairs that could be connected.
        Entries whose pair count is zero (the diagonal entry of a
        single-node cluster) are left at ``0.0``. Both lists are empty when
        no node has a ``'gt'`` attribute.
    """
    labels = nx.get_node_attributes(G, 'gt')
    if not labels:
        # Nothing to group; the zip(*...) unpacking below would fail on empty input.
        return [], []

    nodelist, partition_raw = zip(*labels.items())

    # Group positions in `nodelist` (i.e. row/column indices of A) by label.
    partition = defaultdict(list)
    for idx, class_name in enumerate(partition_raw):
        partition[class_name].append(idx)
    partition = sorted(partition.values(), key=len, reverse=True)

    # cluster sizes
    n_clusters = len(partition)
    cluster_sizes = [len(x) for x in partition]

    # edge probabilities
    P = np.zeros((n_clusters, n_clusters))
    # weight=None makes every edge count as 1; the default would sum the
    # 'weight' attribute and yield weight totals instead of edge counts.
    A = nx.adjacency_matrix(G, nodelist=nodelist, weight=None)
    for i, nodes_i in enumerate(partition):
        rows_i = A[nodes_i]  # row slice is the same for every j, so take it once
        for j, nodes_j in enumerate(partition):
            if i != j:
                n_potential_edges = len(nodes_i) * len(nodes_j)
            else:
                n_potential_edges = len(nodes_i) * (len(nodes_i) - 1) / 2
            if n_potential_edges > 0:
                adj = rows_i[:, nodes_j]  # IxJ matrix with all common edges
                if i != j:
                    n_edges = adj.sum()
                else:
                    # Inside a cluster every edge appears twice in the block.
                    # Self-loops sit on the diagonal and are not among the
                    # counted node pairs, so drop them before halving.
                    n_edges = (adj.sum() - adj.diagonal().sum()) / 2
                P[i, j] = n_edges / n_potential_edges
    P = P.tolist()
    return cluster_sizes, P

# Build info table

In [3]:
# Directory whose entries are listed when collecting .gml graph files.
ROOT = './gml_connected_subgraphs'  # alternative dataset root: './gml_graphs'
# Family label recorded for .gml files found directly at the top level
# rather than inside a family sub-directory.
EMPTY_FAMILY = ''

In [4]:
# Collect (family, path) pairs for every .gml file under ROOT.
# Files directly in ROOT get EMPTY_FAMILY; each sub-directory is a family
# named after the directory, and only its direct .gml children are taken.
paths = []
for fname in sorted(os.listdir(ROOT)):
    # Build the path from ROOT so the listed directory and the generated
    # paths always refer to the same tree when ROOT is changed.
    path = f'{ROOT}/{fname}'
    if fname.endswith('.gml'):
        paths.append((EMPTY_FAMILY, path))
    elif os.path.isdir(path):  # this is a family
        family = fname
        paths_family = [
            (family, f'{path}/{member}')
            for member in sorted(os.listdir(path))
            if member.endswith('.gml')
        ]
        paths.extend(paths_family)
paths

[('', './gml_connected_subgraphs/as.gml'),
 ('', './gml_connected_subgraphs/citeseer.gml'),
 ('cora', './gml_connected_subgraphs/cora/cora.gml'),
 ('cora', './gml_connected_subgraphs/cora/cora_full.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Artificial_Intelligence.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Artificial_Intelligence__Machine_Learning.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Data_Structures__Algorithms_and_Theory.gml'),
 ('cora_subset', './gml_connected_subgraphs/cora_subset/Databases.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Encryption_and_Compression.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Hardware_and_Architecture.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Human_Computer_Interaction.gml'),
 ('cora_subset',
  './gml_connected_subgraphs/cora_subset/Information_Retrieval.gml'),
 ('cora_subset', './gml_connected_subgraphs/cora_subset/Netw

In [5]:
# Read every collected graph and record one summary dict per file.
stat = []
for family, path in tqdm(paths):
    G = nx.read_gml(path)
    cluster_sizes, edge_probs = calc_cluster_sizes_and_edge_probs(G)

    # File name without its 4-character extension ('.gml').
    graph_name = os.path.basename(path)[:-len('.gml')]
    gt_labels = nx.get_node_attributes(G, 'gt')
    edge_weights = nx.get_edge_attributes(G, 'weight')

    record = dict(
        family=family,
        name=graph_name,
        n_nodes=G.number_of_nodes(),
        n_edges=G.number_of_edges(),
        n_classes=len(set(gt_labels.values())),
        directed=G.is_directed(),
        weighted=len(edge_weights) > 0,  # at least one edge has a 'weight' attribute
        cluster_sizes=cluster_sizes,
        edge_probs=edge_probs,
    )
    stat.append(record)

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




In [6]:
# One row per graph; the columns come from the keys of the dicts in `stat`.
df = pd.DataFrame(stat)
df

Unnamed: 0,cluster_sizes,directed,edge_probs,family,n_classes,n_edges,n_nodes,name,weighted
0,"[7115, 6624, 1149, 636, 629, 533, 439, 416, 40...",False,"[[0.00048616932435147623, 0.000248844891210988...",,176,58414,23748,as,False
1,"[419, 355, 301, 258, 244, 92]",False,"[[0.00736545203320734, 0.00020168745167904804,...",,6,3731,2120,citeseer,False
2,"[726, 406, 379, 344, 285, 214, 131]",False,"[[0.004213926094803838, 0.0001798097409382676,...",cora,7,5069,2485,cora,False
3,"[1089, 936, 921, 821, 751, 730, 672, 651, 621,...",False,"[[0.0031953897261383893, 0.0003090343999434908...",cora,70,89157,23166,cora_full,False
4,"[740, 658, 613, 564, 376, 373, 372, 372, 263, ...",False,"[[0.005833302856306916, 0.000359401955146636, ...",cora_subset,11,12958,4633,Artificial_Intelligence,False
5,"[977, 507, 480, 462, 389, 331, 177]",False,"[[0.003913787606758729, 0.0002099572212161772,...",cora_subset,7,10603,3323,Artificial_Intelligence__Machine_Learning,False
6,"[386, 360, 274, 221, 148, 148, 121, 62, 57]",False,"[[0.008895767444990243, 0.0002518710420264824,...",cora_subset,9,4323,1777,Data_Structures__Algorithms_and_Theory,False
7,"[282, 141, 133, 126, 124, 108, 92]",False,"[[0.01620352843189218, 0.0018107741059302852, ...",cora_subset,7,3155,1006,Databases,False
8,"[439, 147, 6]",False,"[[0.012658491174420902, 0.002417367858305053, ...",cora_subset,3,1579,592,Encryption_and_Compression,False
9,"[238, 172, 105, 66, 29, 15, 1]",False,"[[0.01595574938836294, 0.002051983584131327, 0...",cora_subset,7,1486,626,Hardware_and_Architecture,False


In [7]:
# Print the scalar columns as ' | '-separated lines, prefixed by the row number.
summary_columns = ['family', 'name', 'n_nodes', 'n_edges', 'n_classes', 'directed', 'weighted']
summary_rows = df[summary_columns].to_numpy().tolist()
for idx, cols in enumerate(summary_rows):
    fields = [idx, *cols]
    print(' | '.join(map(str, fields)))

0 |  | as | 23748 | 58414 | 176 | False | False
1 |  | citeseer | 2120 | 3731 | 6 | False | False
2 | cora | cora | 2485 | 5069 | 7 | False | False
3 | cora | cora_full | 23166 | 89157 | 70 | False | False
4 | cora_subset | Artificial_Intelligence | 4633 | 12958 | 11 | False | False
5 | cora_subset | Artificial_Intelligence__Machine_Learning | 3323 | 10603 | 7 | False | False
6 | cora_subset | Data_Structures__Algorithms_and_Theory | 1777 | 4323 | 9 | False | False
7 | cora_subset | Databases | 1006 | 3155 | 7 | False | False
8 | cora_subset | Encryption_and_Compression | 592 | 1579 | 3 | False | False
9 | cora_subset | Hardware_and_Architecture | 626 | 1486 | 7 | False | False
10 | cora_subset | Human_Computer_Interaction | 1053 | 2350 | 5 | False | False
11 | cora_subset | Information_Retrieval | 418 | 1110 | 4 | False | False
12 | cora_subset | Networking | 1167 | 3904 | 4 | False | False
13 | cora_subset | Operating_Systems | 2068 | 8654 | 4 | False | False
14 | cora_subset | Progr

In [None]:
# Save the table as JSON next to the graphs it describes. The file goes under
# ROOT (not a hard-coded directory) so that switching ROOT does not overwrite
# the statistics of a different dataset.
# NOTE(review): records are keyed by 'name' alone, so two graphs with the same
# file name in different families would silently overwrite each other —
# confirm names are unique across families.
with open(f'{ROOT}/stat.json', 'w') as f:
    json.dump({x['name']: x for x in df.to_dict('records')}, f, indent=4)