In [1]:
import networkx as nx
import scipy.sparse as sp
import numpy as np
import utils
import torch
import torch
from collections import defaultdict
import numpy as np
import time
import json

In [2]:
# Fix every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same results.
seed = 114514
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Use manual_seed_all, not seed_all: torch.cuda.seed_all() re-seeds every GPU
# with a *random* value, which would undo the fixed seed set just above.
# Both calls are silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)

# **Read data**

In [3]:
# Read the array stored under the 'edges' key of the citeseer.npz archive.
# NOTE(review): the NpzFile returned by np.load is never closed in this cell;
# if `loader` is not read again later, prefer `with np.load(...) as loader:`.
loader = np.load('citeseer.npz')
train_ones = loader['edges']

# Sanity check: number of rows, then the first five rows.
print(len(train_ones), train_ones[:5], sep='\n')

7358
[[1370  823]
 [  65   63]
 [ 262  247]
 [ 398  308]
 [ 859  383]]


In [4]:
# Build a symmetric 0/1 adjacency matrix from the raw edge list, restrict it to
# the node set returned by graph_utils.largest_connected_components, and derive
# the structures used downstream:
#   _N            - number of nodes kept
#   _Edges        - list of (i, j) pairs, both orientations of every edge
#   _num_of_edges - number of undirected edges
#   dic           - node -> set of neighbouring nodes
#   adj_origin    - dense integer adjacency matrix of the kept sub-graph
# NOTE(review): `graph_utils` is not imported in the visible imports cell (only
# `utils` is) - confirm the module is in scope on a fresh kernel.

# Symmetric dense adjacency over all node ids found in the edge list.
# Index assignment (rather than accumulation) keeps duplicate edges at 1.
num_raw_nodes = np.max(train_ones) + 1
adj_dense = np.zeros((num_raw_nodes, num_raw_nodes))
src, dst = train_ones[:, 0], train_ones[:, 1]
adj_dense[src, dst] = 1
adj_dense[dst, src] = 1

adj_sparse = sp.coo_matrix(adj_dense).tocsr()
del adj_dense  # the dense copy is no longer needed once the CSR matrix exists

# Keep only the rows/columns selected by the connected-component helper.
lcc = graph_utils.largest_connected_components(adj_sparse)
adj_sparse = adj_sparse[lcc, :][:, lcc]
_N = adj_sparse.shape[0]
print('n', _N)

# Collect every off-diagonal non-zero entry. Because the matrix is symmetric,
# each undirected edge shows up twice, hence the halving below.
rows, cols = adj_sparse.nonzero()
off_diagonal = rows != cols
rows, cols = rows[off_diagonal], cols[off_diagonal]
_Edges = list(zip(rows, cols))
_num_of_edges = len(_Edges) // 2
print('m', _num_of_edges)

# Neighbour lookup: node -> set of adjacent nodes.
dic = defaultdict(set)
for u, v in _Edges:
    dic[u].add(v)
    dic[v].add(u)

# Dense integer adjacency matrix of the retained sub-graph (no self-loops).
adj_origin = np.zeros((_N, _N), dtype=int)
adj_origin[rows, cols] = 1
adj_origin[cols, rows] = 1
assert np.array_equal(adj_origin, adj_origin.T), 'adj_origin must be symmetric'
assert np.sum(adj_origin) == _num_of_edges * 2, 'each undirected edge must be stored twice'

Selecting 1 largest connected components
n 2120
m 3679


# **1. Get link prediction model and embedding**

In [5]:
# Size passed to the model as `embedding_dim` (presumably the length of each
# node embedding vector - confirm against graph_utils.GraphSAGE).
embedding_dim = 128

# Construct the GraphSAGE wrapper from the node count, edge count, dense
# adjacency matrix and neighbour lookup prepared earlier in the notebook.
graphsagemodel = graph_utils.GraphSAGE(
    _N=_N,
    _M=_num_of_edges,
    adj_origin=adj_origin,
    adj_dic=dic,
    embedding_dim=embedding_dim,
)


# *1.1 Training step*

In [6]:
# Run training (including the boosting rounds reported in the log below), then
# write the model and its embeddings to disk.
# NOTE(review): the exact meaning of each hyper-parameter is defined inside
# graph_utils.GraphSAGE.graphsage_train and is not visible from this notebook.
graphsagemodel.graphsage_train(
    boost_times=20,
    add_edges=1000,
    training_epoch=2500,
    boost_epoch=4000,
    learning_rate=0.001,
    save_number=0,
)
graphsagemodel.save_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

2246140
3679
-7358
Training GraphSAGE model
Epoch:0,Loss:0.693546,estimated time:681.26
 acc:0.5
Epoch:1000,Loss:0.010743,estimated time:122.32
 acc:0.9976895895623811
Epoch:2000,Loss:0.001174,estimated time:35.165
 acc:0.9979614025550422
2119/2120[(1.0, 0, 1), (1.0, 0, 2), (1.0, 0, 199), (1.0, 0, 38), (1.0, 0, 10), (1.0, 0, 217), (1.0, 0, 471), (1.0, 0, 129), (1.0, 0, 156), (1.0, 0, 162)]
1.0
(2120, 2120)
7358
7358
True Positve:434, 0.06
False Positve:6924, 0.94
True Negative:4480118, 1.00
False Negative:6924, 0.00
Positive:0.00
Negative:1.00
Start boosting
boost iter:0
train_add:0
train added: 1000
current training set length: 8358
current save path: graphsage_model/graphsage0_0.pth
Epoch:0,Loss:1.171569,estimated time:293.89
 acc:0.9167264895908112
Epoch:1000,Loss:0.021134,estimated time:224.63
 acc:0.9956927494615937
Epoch:2000,Loss:0.013122,estimated time:197.37
 acc:0.994256999282125
Epoch:3000,Loss:0.000916,estimated time:74.615
 acc:0.998205312275664
2119/2120[(1.0, 0, 1), (1.0

In [7]:
# Optional: reload the saved model and embeddings before running another
# round of boosting in the next cell.
graphsagemodel.load_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [8]:

# Second training pass: same settings as the first run except for fewer
# boosting rounds (boost_times=10) and a different save_number (1), then
# write the model and embeddings back to the same files.
graphsagemodel.graphsage_train(
    boost_times=10,
    add_edges=1000,
    training_epoch=2500,
    boost_epoch=4000,
    learning_rate=0.001,
    save_number=1,
)
graphsagemodel.save_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

2246140
3679
-7358
Training GraphSAGE model
Epoch:0,Loss:0.021567,estimated time:244.03
 acc:0.9982332155477032
Epoch:1000,Loss:0.000063,estimated time:111.32
 acc:0.9997281870073389
Epoch:2000,Loss:0.000566,estimated time:40.755
 acc:1.0
2119/2120[(1.0, 0, 3), (1.0, 0, 8), (1.0, 0, 6), (1.0, 0, 156), (1.0, 0, 680), (1.0, 0, 1544), (1.0, 13, 1683), (1.0, 0, 552), (1.0, 6, 490), (1.0, 8, 542)]
1.0
(2120, 2120)
7358
7358
True Positve:2530, 0.34
False Positve:4828, 0.66
True Negative:4482214, 1.00
False Negative:4828, 0.00
Positive:0.00
Negative:1.00
Start boosting
boost iter:0
train_add:965
train added: 1000
current training set length: 8358
current save path: graphsage_model/graphsage1_0.pth
Epoch:0,Loss:0.546653,estimated time:321.03
 acc:0.9277339076334051
Epoch:1000,Loss:0.001672,estimated time:243.18
 acc:0.9979660205790859
Epoch:2000,Loss:0.000241,estimated time:154.46
 acc:0.9996410624551328
Epoch:3000,Loss:0.000089,estimated time:87.057
 acc:0.9997607083034219
2119/2120[(0.999991

# *1.2 Load pretrained model*

In [12]:
# Restore the previously saved model and embeddings instead of retraining.
graphsagemodel.load_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [13]:
# Pull the embedding matrix and the pairwise link-prediction callable off the
# model wrapper.
embedding_matrix_numpy = graphsagemodel.embedding_matrix_numpy
link_prediction_model = graphsagemodel.graphsage_link_prediction_from_embedding_one_to_other

# Compare the model's predictions with the original adjacency matrix; the
# helper prints the confusion counts shown below and returns `predict_adj`
# (presumably the predicted adjacency matrix - confirm in graph_utils).
predict_adj = graph_utils.evaluate_overlap_torch(
    _N=_N,
    _num_of_edges=_num_of_edges,
    adj_origin=adj_origin,
    embedding_matrix_numpy=embedding_matrix_numpy,
    link_prediction_from_embedding_one_to_other=link_prediction_model,
)

2119/2120[(0.8820486, 197, 767), (0.8822426, 401, 1177), (0.88220215, 1161, 1210), (0.8867349, 257, 415), (0.8824132, 372, 694), (0.88623327, 469, 476), (0.89217174, 862, 1159), (0.88820344, 681, 682), (0.8890421, 719, 2016), (0.8982855, 1159, 1195)]
1.0
(2120, 2120)
7358
7358
True Positve:6960, 0.95
False Positve:398, 0.05
True Negative:4486644, 1.00
False Negative:398, 0.00
Positive:0.00
Negative:1.00


In [14]:
# Compute the same set of graph statistics for the predicted graph and for the
# original graph so they can be compared side by side.
metric_embedding = graph_utils.compute_graph_statistics(predict_adj)
metric_origin = graph_utils.compute_graph_statistics(adj_origin)

Values less than or equal to 0 in data. Throwing out 0 or negative values
  (Theoretical_CDF * (1 - Theoretical_CDF))


In [15]:
# Print every statistic of the original graph next to the value obtained for
# the link-prediction graph, one aligned row per statistic.
for stat_name in metric_origin:
    origin_value = metric_origin[stat_name]
    predicted_value = metric_embedding[stat_name]
    print('%-25s origin:%17.8f, link_pred:%17.8f' % (stat_name, origin_value, predicted_value))

d_max                     origin:      99.00000000, link_pred:      75.00000000
d_min                     origin:       1.00000000, link_pred:       0.00000000
d                         origin:       3.47075472, link_pred:       3.47075472
LCC                       origin:    2120.00000000, link_pred:    2096.00000000
wedge_count               origin:   25974.00000000, link_pred:   22890.00000000
claw_count                origin:  250382.00000000, link_pred:  145966.00000000
triangle_count            origin:    1084.00000000, link_pred:     868.00000000
square_count              origin:     249.00000000, link_pred:     120.00000000
power_law_exp             origin:       2.07078423, link_pred:       2.04773937
gini                      origin:       0.42760979, link_pred:       0.42354336
rel_edge_distr_entropy    origin:       0.95385514, link_pred:       0.95643662
assortativity             origin:       0.00750617, link_pred:      -0.01848155
clustering_coefficient    origin:       