In [25]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from gae.model import *
from gae.optimizer import *
from gae.utils import *

import torch
from torch import optim
import torch.nn.functional as F

from datetime import datetime, timedelta
import time
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [26]:
# Read the three node-link JSON dumps ('original', 'train', 'test' + '.graph')
# from the working directory and rebuild one networkx graph from each.
graphs = ['original', 'train', 'test']


def _read_json(path):
    """Return the parsed JSON content of the file at ``path``."""
    with open(path, 'r') as f:
        return json.load(f)


obj = [_read_json(graph + '.graph') for graph in graphs]
orig_g, train_g, test_g = (json_graph.node_link_graph(payload) for payload in obj)

In [27]:
# Both matrices use the training graph's node ordering for rows/columns, so
# an index refers to the same node in either matrix.
node_order = train_g.nodes()
train_adj = nx.adjacency_matrix(train_g, nodelist=node_order)
orig_adj = nx.adjacency_matrix(orig_g, nodelist=node_order)

In [28]:
# Two-way lookup between a node label and its position in train_g.nodes().
idx2nodes = dict(enumerate(train_g.nodes()))
nodes2idx = {node: idx for idx, node in idx2nodes.items()}

In [29]:
# Positive test edges: edges of the test graph whose two endpoints both exist
# in the training graph and that the training graph does not contain
# (checked in both orientations).
test_edges_name = [
    (i, j)
    for i, j in test_g.edges()
    if i in train_g.nodes()
    and j in train_g.nodes()
    and (i, j) not in train_g.edges()
    and (j, i) not in train_g.edges()
]

In [30]:
# Sanity check: how many positive test edges were kept, and the first five.
print(len(test_edges_name), test_edges_name[:5], sep='\n')

177
[('a61b', 'a61h'), ('a61p', 'c07k'), ('a62b', 'd01d'), ('a62b', 'd01f'), ('a62b', 'd03d')]


In [31]:
# Translate the label pairs into integer index pairs via nodes2idx.
test_edges = [(nodes2idx[i], nodes2idx[j]) for i, j in test_edges_name]

In [32]:
def ismember(idx_i, idx_j, edgelist):
    """Tell whether the pair is in ``edgelist`` in either orientation.

    Args:
        idx_i: first endpoint.
        idx_j: second endpoint.
        edgelist: any container supporting ``in`` for 2-tuples.

    Returns:
        True if ``(idx_i, idx_j)`` or ``(idx_j, idx_i)`` is in ``edgelist``,
        otherwise False.
    """
    forward = (idx_i, idx_j)
    backward = (idx_j, idx_i)
    return forward in edgelist or backward in edgelist

In [33]:
# Draw as many negative (non-edge) node pairs as there are positive test edges.
# The sampled values are node labels (they are mapped to indices later through
# nodes2idx), despite the idx_* variable names.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# pairs differ from run to run.

# random.sample requires a real sequence. Passing the NodeView directly is
# treated as sampling from a set: deprecated in Python 3.9/3.10 and a
# TypeError from Python 3.11 on. Materialise the nodes once instead.
candidate_nodes = list(train_g.nodes())

test_edges_false_name = []
while len(test_edges_false_name) < len(test_edges):
    idx_i, idx_j = random.sample(candidate_nodes, 2)
    # Reject pairs that are real edges of the original graph.
    if ismember(idx_i, idx_j,  orig_g.edges()):
        continue
    # Reject pairs that were already drawn (in either orientation).
    if ismember(idx_i, idx_j, test_edges_false_name):
        continue
    test_edges_false_name.append((idx_i, idx_j))
len(test_edges_false_name)

177

In [34]:
# Sanity check: number of sampled negative pairs and the first five of them.
print(len(test_edges_false_name), test_edges_false_name[:5], sep='\n')

177
[('b65h', 'h05h'), ('f41g', 'a63f'), ('b23c', 'c02f'), ('c09k', 'd06c'), ('c03c', 'a62b')]


In [35]:
# Translate the negative label pairs into integer index pairs via nodes2idx.
test_edges_false = [(nodes2idx[i], nodes2idx[j]) for i, j in test_edges_false_name]

In [36]:
# Display the first five negative pairs after conversion to integer indices.
test_edges_false[:5]

[(96, 320), (219, 18), (42, 113), (133, 156), (115, 14)]

### Model

In [37]:
# preprocess_graph comes from one of the wildcard `gae` imports. Presumably it
# applies the usual GCN normalization and returns a torch tensor (the result
# is later moved with `.to(device)` and fed to the model) — TODO confirm.
train_adj_norm = preprocess_graph(train_adj)

In [38]:
# Class-balance terms handed to the loss functions in the training cell.
# n_cells is the number of entries of the square training adjacency matrix,
# n_pos is the sum of its entries.
n_cells = train_adj.shape[0] * train_adj.shape[0]
n_pos = train_adj.sum()

# (entries not accounted for by the sum) / (sum of entries)
pos_weight = torch.Tensor([float(n_cells - n_pos) / n_pos])
# total entries / (2 * entries not accounted for by the sum)
norm = n_cells / float((n_cells - n_pos) * 2)

In [39]:
# The tensors and the model are moved to this device; everything runs on CPU.
device = torch.device('cpu')

In [40]:
# Reconstruction target: training adjacency plus the identity matrix
# (ones added on the diagonal), converted to a dense float tensor.
adj_with_identity = train_adj + sp.eye(train_adj.shape[0])
adj_label = torch.FloatTensor(adj_with_identity.toarray())

In [41]:
# Node features are an identity matrix with one row per node of the training
# adjacency, so the feature dimension equals the number of nodes.
identity_feats = sp.identity(train_adj.shape[0])
features = torch.FloatTensor(np.array(identity_feats.todense()))
n_nodes, feat_dim = features.shape

In [42]:
# Move every tensor used during training onto the selected device.
train_adj_norm = train_adj_norm.to(device)
features = features.to(device)
adj_label = adj_label.to(device)
pos_weight = pos_weight.to(device)

In [150]:
# Hyperparameters consumed by the training cell.
# hidden1 / hidden2 / dropout are passed to the model constructor together
# with feat_dim; their exact meaning is defined in gae.model — presumably the
# two layer widths and the dropout rate (TODO confirm).
hidden1 = 16
hidden2 = 8
# Learning rate given to optim.Adam.
lr = 0.0001
dropout = 0.
# Number of optimisation steps per experiment (one full-graph step per epoch).
epochs = 200

In [151]:
# Selects the architecture in the training cell: 'GAE' builds GCN_AE,
# 'VGAE' builds GCN_VAE.
model_name = 'GAE'

In [152]:
# Number of independent train/evaluate repetitions; the training cell reports
# the mean and standard deviation of the scores over these runs.
n_iter = 10

In [153]:
# Train `n_iter` independent models from scratch and score each of them on the
# held-out positive / sampled negative test edges.
roc_results = []
ap_results = []
for i in range(n_iter):
    # Fresh model and optimizer for every repetition.
    if model_name == 'GAE':
        model = GCN_AE(feat_dim, hidden1, hidden2, dropout)
    elif model_name == 'VGAE':
        model = GCN_VAE(feat_dim, hidden1, hidden2, dropout)
    else:
        # Without this guard an unknown name would either hit a NameError or
        # silently reuse a stale `model` and train it on the VGAE path.
        raise ValueError(
            "Unknown model_name {!r}; expected 'GAE' or 'VGAE'".format(model_name)
        )
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    hidden_emb = None
    recovered = None  # reset so a zero-epoch run cannot reuse a previous run's output
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # Full-graph forward pass; the VGAE variant additionally returns the
        # mu / logvar tensors that its loss function takes.
        if model_name == 'GAE':
            recovered = model(features, train_adj_norm)
            loss = loss_function_gae(preds=recovered, labels=adj_label, norm=norm, pos_weight=pos_weight)
        else:
            recovered, mu, logvar = model(features, train_adj_norm)
            loss = loss_function_vgae(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight)

        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

    # Only the output of the last epoch is scored, so convert it to numpy once
    # here instead of on every epoch.
    # NOTE(review): this is the train-mode forward output computed before the
    # final optimizer step; with dropout > 0 an eval-mode forward pass after
    # training would be the cleaner thing to score — confirm intent.
    if recovered is not None:
        hidden_emb = recovered.detach().cpu().numpy()

    roc_score, ap_score, recon_adj = roc_ap_score(hidden_emb, orig_adj, test_edges, test_edges_false)
    print('Expriments {} result: Test_ROC: {:.4f}, Test_AP: {:.4f}'.format(i+1, roc_score*100, ap_score*100), end='\n')
    roc_results.append(roc_score)
    ap_results.append(ap_score)

# Mean and (population) standard deviation over the repetitions, in percent.
print('\nAverage ROC: {} ± {} Average AP: {} ± {}'.format(
    np.round(np.mean(roc_results)*100, 2),
    np.round(np.std(roc_results)*100, 2),
    np.round(np.mean(ap_results)*100, 2),
    np.round(np.std(ap_results)*100, 2),
))

Expriments 1 result: Test_ROC: 83.7818, Test_AP: 82.6851
Expriments 2 result: Test_ROC: 81.4421, Test_AP: 83.6912
Expriments 3 result: Test_ROC: 82.0326, Test_AP: 82.8310
Expriments 4 result: Test_ROC: 80.1207, Test_AP: 81.3043
Expriments 5 result: Test_ROC: 82.8019, Test_AP: 83.5342
Expriments 6 result: Test_ROC: 78.2885, Test_AP: 79.9596
Expriments 7 result: Test_ROC: 80.9123, Test_AP: 81.5603
Expriments 8 result: Test_ROC: 80.5165, Test_AP: 81.1677
Expriments 9 result: Test_ROC: 81.6943, Test_AP: 83.1263
Expriments 10 result: Test_ROC: 83.5201, Test_AP: 83.7212

Average ROC: 81.51 ± 1.58 Average AP: 82.36 ± 1.22
