In [65]:
import torch
from torch_geometric.data import Data
from torch_geometric.datasets import EllipticBitcoinDataset, EllipticBitcoinTemporalDataset

from ogb.nodeproppred import PygNodePropPredDataset

In [2]:
from pathlib import Path
from torch_geometric.data import Dataset
import networkx as nx
import numpy as np
import torch_geometric

def built_dblp():
    path = Path('../data/dblp-hard-citation-graph/dblp-hard/')
    y = np.load(path/"y.npy")
    x = np.load(path/"X.npy")
    node_year = np.load(path/"t.npy")
    nx_graph = nx.read_adjlist(path/"adjlist.txt", nodetype=int)
    data = torch_geometric.utils.from_networkx(nx_graph)
    data.x = torch.tensor(x)
    data.y = torch.unsqueeze(torch.tensor(y), 1)
    data.node_year = torch.unsqueeze(torch.tensor(node_year),1)
    num_classes = np.unique(y).shape[0]
    #ds = Dataset(data, num_classes)
    print(num_classes)
    return data

In [3]:
elliptic = EllipticBitcoinDataset(root='./data/Elliptic/') # labels 0: licit, 1: illicit, 2: unlabeled
ogbn_arxiv = PygNodePropPredDataset(name="ogbn-arxiv", root = './data/Ogbn/')
dblp = built_dblp()

73


In [4]:
from torch_geometric.utils import homophily
edge_index = elliptic.edge_index
y = elliptic.y
homophily(edge_index, y, method='edge')

0.7113396525382996

In [5]:
edge_index = ogbn_arxiv.edge_index
y = ogbn_arxiv.y
homophily(edge_index, y, method='edge')

0.6550830602645874

In [6]:
edge_index = dblp.edge_index
y = dblp.y
homophily(edge_index, y, method='edge')

0.1613084077835083

In [9]:
import networkx as nx
import numpy as np

def convert_labels_to_consecutive_integers(labels):
    unique_labels = np.unique(labels)
    labels_map = {label: i for i, label in enumerate(unique_labels)}
    new_labels = np.array([labels_map[label] for label in labels])

    return new_labels

def h_adj(graph, labels):
    """Compute adjusted homophily."""
    labels = convert_labels_to_consecutive_integers(labels)

    num_classes = len(np.unique(labels))

    degree_sums = np.zeros((num_classes,))
    for u in graph.nodes:
        label = labels[u]
        degree_sums[label] += graph.degree(u)

    adjust = (degree_sums ** 2 / (len(graph.edges) * 2) ** 2).sum()

    h_adj = (h_edge(graph, labels) - adjust) / (1 - adjust)

    return h_adj

In [25]:
torch.unique(ogbn_arxiv.data.node_year)

tensor([1971, 1986, 1987, 1988, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
        1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
        2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [68]:
running_sum = 0
for year in torch.unique(ogbn_arxiv.data.node_year):
    s= len(ogbn_arxiv.data.x[ogbn_arxiv.data.node_year.squeeze() == year])
    running_sum += s
    print(year, s, running_sum)

tensor(1971) 1 1
tensor(1986) 1 2
tensor(1987) 1 3
tensor(1988) 1 4
tensor(1990) 3 7
tensor(1991) 3 10
tensor(1992) 1 11
tensor(1993) 8 19
tensor(1994) 19 38
tensor(1995) 25 63
tensor(1996) 31 94
tensor(1997) 33 127
tensor(1998) 123 250
tensor(1999) 165 415
tensor(2000) 261 676
tensor(2001) 248 924
tensor(2002) 355 1279
tensor(2003) 387 1666
tensor(2004) 415 2081
tensor(2005) 629 2710
tensor(2006) 968 3678
tensor(2007) 1302 4980
tensor(2008) 1931 6911
tensor(2009) 2499 9410
tensor(2010) 3564 12974
tensor(2011) 4427 17401
tensor(2012) 6435 23836
tensor(2013) 8135 31971
tensor(2014) 9154 41125
tensor(2015) 12035 53160
tensor(2016) 16339 69499
tensor(2017) 21442 90941
tensor(2018) 29799 120740
tensor(2019) 39711 160451
tensor(2020) 8892 169343




In [83]:
for time in range(1, 50):
    elliptic = EllipticBitcoinTemporalDataset(root='./data/EllipticTemporal/', t=time)
    print(time, (elliptic.data.y == 0).sum())

1 tensor(2130)
2 tensor(1099)
3 tensor(1268)
4 tensor(1410)
5 tensor(1874)
6 tensor(480)
7 tensor(1101)
8 tensor(1098)
9 tensor(530)
10 tensor(954)
11 tensor(565)
12 tensor(490)
13 tensor(518)
14 tensor(374)
15 tensor(471)
16 tensor(402)
17 tensor(712)
18 tensor(337)
19 tensor(665)
20 tensor(640)
21 tensor(541)
22 tensor(1605)
23 tensor(1134)
24 tensor(989)
25 tensor(476)
26 tensor(421)
27 tensor(182)
28 tensor(199)
29 tensor(845)
30 tensor(441)
31 tensor(604)
32 tensor(981)
33 tensor(418)
34 tensor(478)
35 tensor(1159)
36 tensor(1675)
37 tensor(458)
38 tensor(645)
39 tensor(1102)
40 tensor(1099)
41 tensor(1016)
42 tensor(1915)
43 tensor(1346)
44 tensor(1567)
45 tensor(1216)
46 tensor(710)
47 tensor(824)
48 tensor(435)
49 tensor(420)


In [97]:
from collections import defaultdict
counter = defaultdict(int)
for t in dblp.node_year.squeeze(): # TODO: find out the number of distinct classes for each year so we know with wich year to start in T1
    counter[t.item()] += 1
sorted(counter.items())

[(1990, 1725),
 (1991, 1822),
 (1992, 2366),
 (1993, 2739),
 (1994, 3095),
 (1995, 3294),
 (1996, 3486),
 (1997, 3552),
 (1998, 3637),
 (1999, 4152),
 (2000, 4519),
 (2001, 5015),
 (2002, 5918),
 (2003, 6904),
 (2004, 8129),
 (2005, 9114),
 (2006, 10069),
 (2007, 11105),
 (2008, 11370),
 (2009, 13499),
 (2010, 12720),
 (2011, 13827),
 (2012, 14054),
 (2013, 14731),
 (2014, 14033),
 (2015, 13800)]