In [1]:
import networkx as nx
import scipy.sparse as sp
import numpy as np
import utils
import torch
import torch
from collections import defaultdict
import numpy as np
import time
import json

In [2]:
# Fix every random number generator this notebook relies on (NumPy, PyTorch
# CPU, PyTorch CUDA) so that repeated runs start from the same state.
seed = 114514
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# torch.cuda.seed_all() would re-seed every GPU with a *random* number and
# silently undo the fixed seed above; manual_seed_all applies `seed` to all
# GPUs instead (and is a no-op when CUDA is unavailable).
torch.cuda.manual_seed_all(seed)

# **Read data**

In [3]:
# Load the pre-computed edge split. The file is unpacked with .item() and
# indexed by string keys, i.e. it stores a pickled Python mapping, so
# allow_pickle=True is required (np.load rejects pickled objects by default
# on current NumPy versions).
# NOTE(security): unpickling can execute arbitrary code - only load a
# split.npy that comes from a trusted source.
loader = np.load('split.npy', allow_pickle=True).item()

# Positive ("ones") and negative ("zeros") edge samples for each split.
train_ones = loader['train_ones']
val_ones = loader['val_ones']
val_zeros = loader['val_zeros']
test_ones = loader['test_ones']
test_zeros = loader['test_zeros']

# Aliases for the test split.
test_edges = test_ones
test_edges_false = test_zeros

# Quick sanity check: number of training entries and the largest value
# appearing in the train / test positive edges.
print(len(train_ones))
print(np.max(train_ones))
print(np.max(test_ones))

13566
2809
2796


In [4]:
# Build a symmetric 0/1 adjacency matrix from the training edges. The values
# in train_ones are used directly as row/column indices, so the matrix side
# is (largest value + 1).
n_train_nodes = np.max(train_ones) + 1
adj_sparse = np.zeros((n_train_nodes, n_train_nodes))
for edge in train_ones:
    src, dst = edge[0], edge[1]
    adj_sparse[src, dst] = 1
    adj_sparse[dst, src] = 1

adj_sparse = sp.coo_matrix(adj_sparse).tocsr()

# Keep only the rows/columns selected by utils.largest_connected_components
# (presumably the node indices of the largest component - confirm in utils).
lcc = utils.largest_connected_components(adj_sparse)
adj_sparse = adj_sparse[lcc, :][:, lcc]
_N = adj_sparse.shape[0]
print('n', _N)

# Every non-zero, off-diagonal entry becomes a directed (row, col) pair, so
# each undirected edge of the symmetric matrix is listed twice.
_Edges = [
    (row, col)
    for row, col in np.column_stack(adj_sparse.nonzero())
    if row != col
]
_num_of_edges = len(_Edges) // 2
print('m', _num_of_edges)

# Neighbour sets: dic[node] is the set of nodes adjacent to `node`.
dic = defaultdict(set)
for u, v in _Edges:
    dic[u].add(v)
    dic[v].add(u)

# Dense integer adjacency matrix rebuilt from the edge list.
adj_origin = np.zeros((_N, _N), dtype=int)
for u, v in _Edges:
    adj_origin[u, v] = 1
    adj_origin[v, u] = 1

# Sanity checks: the matrix is symmetric and holds exactly two entries per
# undirected edge.
assert (adj_origin == adj_origin.T).all()
assert adj_origin.sum() == 2 * _num_of_edges

Selecting 1 largest connected components
n 2810
m 6783


# **1. Get link prediction model and embedding**

In [5]:
# Size of the node embedding handed to the GraphSAGE wrapper.
embedding_dim = 128

# Construct the GraphSAGE model wrapper from the graph built above: node
# count, undirected edge count, dense adjacency matrix and neighbour sets.
graphsagemodel = utils.GraphSAGE(
    _N=_N,
    _M=_num_of_edges,
    adj_origin=adj_origin,
    adj_dic=dic,
    embedding_dim=embedding_dim,
)


# *1.1 Training step*

In [6]:
# Initial training run followed by boosting rounds. In the recorded output
# each boosting round reports "train added: 1000" (add_edges) and saves
# checkpoints named graphsage<save_number>_<iter>.pth.
graphsagemodel.graphsage_train(
    boost_times=20,
    add_edges=1000,
    training_epoch=5000,
    boost_epoch=5000,
    learning_rate=0.001,
    save_number=0,
)

# Persist the trained model weights and the embedding matrix.
graphsagemodel.save_model(
    path='graphsage_model/graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

3946645
6783
-13566
Training GraphSAGE model
Epoch:0,Loss:0.693416,estimated time:1499.39
 acc:0.5
Epoch:1000,Loss:0.033757,estimated time:527.80
 acc:0.9844464101430046
Epoch:2000,Loss:0.013573,estimated time:384.58
 acc:0.9918177797434763
Epoch:3000,Loss:0.029529,estimated time:243.99
 acc:0.9917440660474717
Epoch:4000,Loss:0.002887,estimated time:132.59
 acc:0.9946926138876603
2809/2810[(1.0, 0, 39), (1.0, 0, 83), (1.0, 0, 1550), (1.0, 0, 1245), (1.0, 0, 1297), (1.0, 0, 1556), (1.0, 1, 834), (1.0, 0, 2158), (1.0, 0, 1250), (1.0, 0, 1298)]
1.0
(2810, 2810)
13566
13566
True Positve:1016, 0.07
False Positve:12550, 0.93
True Negative:7869984, 1.00
False Negative:12550, 0.00
Positive:0.00
Negative:1.00
Start boosting
boost iter:0
train_add:0
train added: 1000
current training set length: 14566
current save path: graphsage_model/graphsage0_0.pth
Epoch:0,Loss:0.378265,estimated time:665.21
 acc:0.9395166826857064
Epoch:1000,Loss:0.011428,estimated time:473.83
 acc:0.990182617053412
Epoch:2

In [7]:
# Optional: reload the saved model and embeddings before running another
# round of boosting ("reboost") in the next cell.
graphsagemodel.load_model(
    path='graphsage_model/graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [8]:

# Second ("reboost") training run: shorter epochs, 2000 edges added per
# boosting round, and save_number=1 so checkpoints are written as
# graphsage1_<iter>.pth (see the recorded output).
graphsagemodel.graphsage_train(
    boost_times=20,
    add_edges=2000,
    training_epoch=1500,
    boost_epoch=2500,
    learning_rate=0.001,
    save_number=1,
)

# Overwrite the saved model and embeddings with the reboosted version.
graphsagemodel.save_model(
    path='graphsage_model/graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

3946645
6783
-13566
Training GraphSAGE model
Epoch:0,Loss:0.011323,estimated time:214.51
 acc:0.9965354562877783
Epoch:1000,Loss:0.000810,estimated time:64.95
 acc:0.9988942945599293
2809/2810[(0.99999833, 215, 240), (0.99999833, 242, 265), (0.99999833, 268, 1704), (0.99999833, 525, 611), (0.99999833, 274, 1803), (0.99999833, 322, 401), (0.99999833, 338, 2297), (0.99999833, 614, 1973), (0.99999833, 591, 602), (0.99999833, 695, 1127)]
1.0
(2810, 2810)
13566
13566
True Positve:8268, 0.61
False Positve:5298, 0.39
True Negative:7877236, 1.00
False Negative:5298, 0.00
Positive:0.00
Negative:1.00
Start boosting
boost iter:0
train_add:1910
train added: 2000
current training set length: 15566
current save path: graphsage_model/graphsage1_0.pth
Epoch:0,Loss:0.596455,estimated time:332.45
 acc:0.9456507773352177
Epoch:1000,Loss:0.022503,estimated time:199.94
 acc:0.9958884748811512
Epoch:2000,Loss:0.009244,estimated time:60.912
 acc:0.9968521135808814
2809/2810[(0.9999162, 1, 2036), (0.9999162, 

# *1.2 Load pretrained model*

In [9]:
# Load the previously saved model weights and embedding matrix from disk.
graphsagemodel.load_model(
    path='graphsage_model/graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [10]:
# Pull the learned embeddings and the link-prediction callable off the model.
embedding_matrix_numpy = graphsagemodel.embedding_matrix_numpy
link_prediction_model = (
    graphsagemodel.graphsage_link_prediction_from_embedding_one_to_other
)

# Compare the predicted links with the original graph; the result is used as
# an adjacency matrix by the statistics cell that follows.
predict_adj = utils.evaluate_overlap_torch(
    _N=_N,
    _num_of_edges=_num_of_edges,
    adj_origin=adj_origin,
    embedding_matrix_numpy=embedding_matrix_numpy,
    link_prediction_from_embedding_one_to_other=link_prediction_model,
)

2809/2810[(0.9160829, 70, 2259), (0.91639525, 328, 1435), (0.9161416, 335, 1435), (0.9166448, 533, 558), (0.91692716, 1383, 2055), (0.9164858, 4, 444), (0.9167271, 1801, 2275), (0.9216437, 33, 59), (0.9177841, 1276, 1315), (0.91878307, 118, 1810)]
1.0
(2810, 2810)
13566
13566
True Positve:12828, 0.95
False Positve:738, 0.05
True Negative:7881796, 1.00
False Negative:738, 0.00
Positive:0.00
Negative:1.00


In [11]:
# Graph statistics for the predicted graph and for the original graph; the
# two results are compared key by key in the next cell.
metric_embedding = utils.compute_graph_statistics(predict_adj)
metric_origin = utils.compute_graph_statistics(adj_origin)

Values less than or equal to 0 in data. Throwing out 0 or negative values
  (Theoretical_CDF * (1 - Theoretical_CDF))


In [12]:
# Print each statistic side by side: original graph vs. link-prediction graph.
for stat_name in metric_origin:
    origin_value = metric_origin[stat_name]
    predicted_value = metric_embedding[stat_name]
    print('%-25s origin:%17.8f, link_pred:%17.8f' % (stat_name, origin_value, predicted_value))

d_max                     origin:     240.00000000, link_pred:     223.00000000
d_min                     origin:       1.00000000, link_pred:       0.00000000
d                         origin:       4.82775801, link_pred:       4.82775801
LCC                       origin:    2810.00000000, link_pred:    2795.00000000
wedge_count               origin:  101872.00000000, link_pred:   97846.00000000
claw_count                origin: 3094240.00000000, link_pred: 2650529.00000000
triangle_count            origin:    2814.00000000, link_pred:    2690.00000000
square_count              origin:     517.00000000, link_pred:     500.00000000
power_law_exp             origin:       1.85595899, link_pred:       1.84427874
gini                      origin:       0.48239581, link_pred:       0.47711428
rel_edge_distr_entropy    origin:       0.94070677, link_pred:       0.94198854
assortativity             origin:      -0.07543032, link_pred:      -0.07591085
clustering_coefficient    origin:       