# Compute the statistics of datasets

In [1]:
import random
from dgl.data.utils import load_graphs
import os
import json
import numpy as np
import dgl
import networkx as nx

In [2]:
class Dataset:
    """A DGL graph loaded from disk plus helpers to pick a train/val/test split.

    Attributes:
        name: dataset name given at construction.
        graph: first graph of the graph list returned by ``load_graphs``.
    """

    def __init__(self, name='tfinance', prefix='datasets/'):
        """Load the graph stored at ``prefix + name``.

        Args:
            name: dataset file name; appended verbatim to ``prefix``.
            prefix: string placed in front of ``name``. No path separator is
                inserted, so a directory prefix must end with one.
        """
        # load_graphs(...)[0] is the graph list; only its first graph is kept.
        graph_list = load_graphs(prefix + name)[0]
        self.graph = graph_list[0]
        self.name = name

    def split(self, semi_supervised=True, trial_id=0):
        """Select one column of the stored mask matrices as the active split.

        Copies column ``trial_id`` (or ``trial_id + 10`` when
        ``semi_supervised`` is true) of ``train_masks`` / ``val_masks`` /
        ``test_masks`` into ``train_mask`` / ``val_mask`` / ``test_mask`` in
        ``graph.ndata``, then prints the sum of each selected mask.

        Args:
            semi_supervised: when true, the column index is shifted by 10.
            trial_id: index of the trial before the optional shift.
        """
        parts = ('train', 'val', 'test')
        column = trial_id + 10 if semi_supervised else trial_id
        for part in parts:
            self.graph.ndata[f'{part}_mask'] = self.graph.ndata[f'{part}_masks'][:, column]
        print(*(self.graph.ndata[f'{part}_mask'].sum() for part in parts))

In [7]:
def basic_info(graph):
    """Return basic size statistics of a graph.

    Args:
        graph: dgl.DGLGraph whose ``ndata["feature"]`` has at least two
            dimensions.

    Returns:
        Tuple ``(n_nodes, n_edges, n_features, avg_degree)``, where
        ``n_features`` is ``ndata["feature"].shape[1]`` and ``avg_degree`` is
        the mean in-degree over all nodes as a Python number.
    """
    node_count = graph.num_nodes()
    edge_count = graph.num_edges()
    feature_dim = graph.ndata["feature"].shape[1]
    # In-degrees are cast to float before averaging.
    mean_in_degree = graph.in_degrees().float().mean().item()
    return node_count, edge_count, feature_dim, mean_in_degree
def adv_info(graph):
    """Compute structural statistics on the largest connected component.

    Self-loops are removed, the graph is passed through ``dgl.to_simple``,
    converted to NetworkX and wrapped in an undirected ``nx.Graph``. Only the
    largest connected component is kept; it is printed as a side effect.

    Args:
        graph: dgl.DGLGraph

    Returns:
        Tuple ``(n_nodes, n_edges, diameter, n_triangles,
        global_clustering_coefficient, average_local_clustering_coefficient)``,
        all measured on the largest connected component. ``n_triangles`` is
        the mean of the per-node triangle counts, not a total.
    """
    # A single local name is rebound at every step so that each intermediate
    # graph can be released as soon as the next one exists.
    g = dgl.to_simple(graph.remove_self_loop())
    g = nx.Graph(dgl.to_networkx(g))
    g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
    print(g)

    return (
        g.number_of_nodes(),
        g.number_of_edges(),
        nx.diameter(g),
        np.mean(list(nx.triangles(g).values())),
        nx.transitivity(g),
        nx.average_clustering(g),
    )

In [3]:
# Prefix handed to Dataset(...); it is concatenated directly with the dataset
# name, hence the trailing slash.
dataroot = '../datasets/'

In [4]:
# Dataset names processed, in this order, by every loop below.
datasets = [
    'reddit',
    'weibo',
    'amazon',
    'yelp',
    'tfinance',
    'elliptic',
    'tolokers',
    'questions',
    'dgraphfin',
    'tsocial',
]

In [None]:
# Per-dataset results, appended in the order of `datasets`.
bis = []  # tuples returned by basic_info
ais = []  # tuples returned by adv_info

for dataset in datasets:
    print(f"{dataset}")
    
    data = Dataset(dataset, dataroot)
    graph = data.graph
    # NOTE(review): `labels` is not used anywhere else in this cell.
    labels = graph.ndata['label']

    # print(graph.edata)
    # print(graph.out_edges(0))

    # Size statistics of the graph exactly as loaded.
    bi = basic_info(graph)
    print(bi)
    bis.append(bi)

    # Structural statistics of the largest connected component; adv_info
    # prints that component itself before the tuple is printed here.
    ai = adv_info(graph)
    print(ai)
    ais.append(ai)


reddit
(10984, 168016, 64, 15.296431541442871)
Graph with 10980 nodes and 78514 edges
(10980, 78514, 8, 0.0, 0, 0.0)
weibo
(8405, 416368, 400, 49.53813171386719)
Graph with 8403 nodes and 377270 edges
(8403, 377270, 6, 1708.2088539807212, 0.10970416669554976, 0.4333108922611914)
amazon
(11944, 8847096, 25, 740.7146606445312)
Graph with 11944 nodes and 4417576 edges
(11944, 4417576, 4, 274298.2932016075, 0.4084815109209683, 0.5091394903623276)
yelp
(45954, 7739912, 32, 168.42738342285156)
Graph with 45900 nodes and 3846910 edges
(45900, 3846910, 12, 15595.906078431373, 0.8439590175168633, 0.7742964904622189)
tfinance
(39357, 42484443, 10, 1079.4635009765625)
Graph with 39351 nodes and 21222540 edges


## Transform the datasets into Edgelists

In [5]:

# Export every dataset as "<name>.weighted.edgelist" in the working directory.
for dataset in datasets:
    print(f"{dataset}")

    data = Dataset(dataset, dataroot)

    # No self-loop removal or dgl.to_simple is applied before the conversion;
    # the NetworkX export is wrapped directly in an undirected nx.Graph.
    graph = nx.Graph(dgl.to_networkx(data.graph))

    nx.write_weighted_edgelist(graph, f"{dataset}.weighted.edgelist")
    


reddit
weibo
amazon
yelp
tfinance
elliptic
tolokers
questions
dgraphfin
tsocial


In [5]:

# Produce "<name>.snap" for every dataset: a four-line "#" comment header
# followed by the unchanged contents of "<name>.weighted.edgelist".
for dataset in datasets:
    print(f"{dataset}")

    edgelist_path = f"{dataset}.weighted.edgelist"

    # The edge list is parsed once, only to obtain the counts for the header.
    graph = nx.read_weighted_edgelist(edgelist_path)
    print(f"{graph.number_of_nodes()}, {graph.number_of_edges()}")

    header = (
        f"# Undirected graph: {dataset}.weighted.edgelist \n"
        "# Save as space-separated list of edges\n"
        f"# Nodes: {graph.number_of_nodes()} Edges: {graph.number_of_edges()}\n"
        "# FromNodeId	ToNodeId\n"
    )

    # Stream the edge list instead of materialising it with readlines(): the
    # larger files hold tens of millions of lines and the parsed graph is
    # already in memory at this point.
    with open(edgelist_path, "r") as src, open(f"{dataset}.snap", "w") as dst:
        dst.write(header)
        dst.writelines(src)
    


reddit
10984, 89500
weibo
8405, 385676
amazon
11944, 4429520
yelp
45954, 3892933
tfinance
39357, 21261900
elliptic
203769, 438124
tolokers
11758, 530758
questions
48921, 202461
dgraphfin
3700550, 7697810
tsocial
5781065, 78886573
