In [1]:
import networkx as nx
import scipy.sparse as sp
import numpy as np
import graph_utils as utils
import torch
import os
from collections import defaultdict
import numpy as np
import time
import json

In [2]:
# Fix every random number generator used in this notebook so runs are repeatable.
seed=114514
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# torch.cuda.seed_all() would assign a fresh *random* seed to every GPU and
# thereby undo the fixed seed set above; manual_seed_all applies `seed` to
# all CUDA devices instead (and is silently ignored when CUDA is unavailable).
torch.cuda.manual_seed_all(seed)

# **Read data**

In [3]:
# Open the stored archive and pull out its 'edges' entry
# (the preview below shows one pair of node ids per row).
loader = np.load('gene_with_label.npz')
train_ones = loader['edges']
# Report the number of rows followed by a preview of the first five.
print(len(train_ones), train_ones[:5], sep='\n')

2872
[[  0  60]
 [  0 622]
 [  1 345]
 [  2  39]
 [  2  56]]


In [4]:
# Dense symmetric 0/1 adjacency covering every node id found in the edge list,
# converted to CSR afterwards.
num_ids = np.max(train_ones) + 1
adj_sparse = np.zeros((num_ids, num_ids))
adj_sparse[train_ones[:, 0], train_ones[:, 1]] = 1
adj_sparse[train_ones[:, 1], train_ones[:, 0]] = 1
adj_sparse = sp.coo_matrix(adj_sparse).tocsr()

# Keep only the rows/columns selected by the largest-connected-component helper.
lcc = utils.largest_connected_components(adj_sparse)
adj_sparse = adj_sparse[lcc, :][:, lcc]
_N = adj_sparse.shape[0]
print('n', _N)

# Collect every off-diagonal non-zero entry. The matrix is symmetric, so each
# undirected edge shows up twice (once per direction).
_Edges = [(row, col)
          for row, col in np.column_stack(adj_sparse.nonzero())
          if row != col]
_num_of_edges = len(_Edges) // 2
print('m', _num_of_edges)

# Neighbour sets per node.
dic = defaultdict(set)
for src, dst in _Edges:
    dic[src].add(dst)
    dic[dst].add(src)

# Dense integer adjacency matrix of the retained sub-graph.
adj_origin = np.zeros((_N, _N), dtype=int)
for src, dst in _Edges:
    adj_origin[src, dst] = 1
    adj_origin[dst, src] = 1

# Sanity checks: the matrix is symmetric and holds two entries per undirected edge.
assert (adj_origin == adj_origin.T).all()
assert adj_origin.sum() == 2 * _num_of_edges

Selecting 1 largest connected components
n 814
m 1436


# **1. Get link prediction model and embedding**

In [5]:
# Size of the node embedding handed to the model.
embedding_dim = 128

# Build the GraphSAGE wrapper from the graph prepared above: node count,
# undirected edge count, dense adjacency matrix and neighbour sets.
graphsagemodel = utils.GraphSAGE(
    _N=_N,
    _M=_num_of_edges,
    adj_origin=adj_origin,
    adj_dic=dic,
    embedding_dim=embedding_dim,
)


# *1.1 Training step*

In [6]:
# Make sure the checkpoint folder exists before the long-running training
# starts: checkpoints are written into 'graphsage_model/' and a missing
# folder would otherwise only surface as an error when a save is attempted.
os.makedirs('graphsage_model', exist_ok=True)

# Train the model, then run the boosting rounds.
# NOTE(review): the exact meaning of boost_times / add_edges / save_number is
# defined in graph_utils and is not visible here — confirm before tuning.
graphsagemodel.graphsage_train(boost_times=20,add_edges=1000,training_epoch=2500,
                               boost_epoch=4000,learning_rate=0.001,save_number=0)
# Persist the trained model and its embeddings for the cells that reload them.
graphsagemodel.save_model(path='graphsage_model/graph_graphsage.pth',embedding_path='graphsage_model/embeddings.npy')

330891
1436
-2872
Training GraphSAGE model
Epoch:0,Loss:0.693375,estimated time:90.94
 acc:0.5
Epoch:1000,Loss:0.001911,estimated time:46.95
 acc:0.9986072423398329
Epoch:2000,Loss:0.003433,estimated time:15.61
 acc:0.9996518105849582
813/814[(1.0, 0, 85), (1.0, 0, 143), (1.0, 0, 101), (1.0, 0, 154), (1.0, 0, 167), (1.0, 0, 193), (1.0, 1, 116), (1.0, 0, 260), (1.0, 0, 304), (1.0, 0, 170)]
1.0
(814, 814)
2872
2872
True Positve:290, 0.10
False Positve:2582, 0.90
True Negative:657142, 1.00
False Negative:2582, 0.00
Positive:0.00
Negative:1.00
Start boosting
boost iter:0
train_add:0
train added: 1000
current training set length: 3872
current save path: graphsage_model/graphsage0_0.pth
Epoch:0,Loss:1.756825,estimated time:170.28
 acc:0.8075929752066116
Epoch:1000,Loss:0.067748,estimated time:100.98
 acc:0.987345041322314
Epoch:2000,Loss:0.010994,estimated time:59.956
 acc:0.9940599173553719
Epoch:3000,Loss:0.013563,estimated time:32.44
 acc:0.9963842975206612
813/814[(0.9999999, 426, 482), 

In [7]:
# Optional: restore the saved model and embeddings before running another boosting round.
graphsagemodel.load_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [None]:

# Run a further training/boosting pass (save_number=1 here, versus 0 in the first run).
graphsagemodel.graphsage_train(
    boost_times=10,
    add_edges=1000,
    training_epoch=2500,
    boost_epoch=4000,
    learning_rate=0.001,
    save_number=1,
)
# Store the result under separate file names so the first checkpoint is not overwritten.
graphsagemodel.save_model(
    path='graphsage_model/graph_graphsageAA.pth',
    embedding_path='graphsage_model/embeddingsAA.npy',
)

# *1.2 Load pretrained model*

In [8]:
# Restore the previously saved model and its embeddings from disk.
graphsagemodel.load_model(
    path='graphsage_model/graph_graphsage.pth',
    embedding_path='graphsage_model/embeddings.npy',
)

In [9]:
# Take the embeddings and the pairwise link predictor from the loaded model ...
embedding_matrix_numpy = graphsagemodel.embedding_matrix_numpy
link_prediction_model = graphsagemodel.graphsage_link_prediction_from_embedding_one_to_other

# ... and pass them, together with the original graph, to the overlap evaluation.
# Its return value is kept as `predict_adj` for the statistics computed afterwards.
predict_adj = utils.evaluate_overlap_torch(
    _N=_N,
    _num_of_edges=_num_of_edges,
    adj_origin=adj_origin,
    embedding_matrix_numpy=embedding_matrix_numpy,
    link_prediction_from_embedding_one_to_other=link_prediction_model,
)

813/814[(0.6239317, 114, 297), (0.6242199, 511, 797), (0.63082474, 120, 297), (0.6310142, 386, 802), (0.62630093, 477, 731), (0.6699211, 159, 305), (0.6359365, 408, 668), (0.6691241, 120, 166), (0.6322952, 120, 150), (0.63624847, 183, 769)]
1.0
(814, 814)
2872
2872
True Positve:2778, 0.97
False Positve:94, 0.03
True Negative:659630, 1.00
False Negative:94, 0.00
Positive:0.00
Negative:1.00


In [10]:
# Compute the same graph statistics for the predicted graph and for the
# original one (evaluated in that order) so they can be compared side by side.
metric_embedding, metric_origin = (
    utils.compute_graph_statistics(adjacency)
    for adjacency in (predict_adj, adj_origin)
)

Values less than or equal to 0 in data. Throwing out 0 or negative values
  (Theoretical_CDF * (1 - Theoretical_CDF))


In [11]:
# One row per statistic: name, value on the original graph, value on the predicted graph.
row_format = '%-25s origin:%17.8f, link_pred:%17.8f'
for stat_name in metric_origin:
    print(row_format % (stat_name, metric_origin[stat_name], metric_embedding[stat_name]))

d_max                     origin:      29.00000000, link_pred:      29.00000000
d_min                     origin:       1.00000000, link_pred:       0.00000000
d                         origin:       3.52825553, link_pred:       3.52825553
LCC                       origin:     814.00000000, link_pred:     806.00000000
wedge_count               origin:    7786.00000000, link_pred:    7526.00000000
claw_count                origin:   23322.00000000, link_pred:   21661.00000000
triangle_count            origin:     809.00000000, link_pred:     681.00000000
square_count              origin:     968.00000000, link_pred:     537.00000000
power_law_exp             origin:       2.05426571, link_pred:       2.03385673
gini                      origin:       0.42572701, link_pred:       0.41770239
rel_edge_distr_entropy    origin:       0.95303593, link_pred:       0.95496390
assortativity             origin:       0.13561540, link_pred:       0.03567548
clustering_coefficient    origin:       