In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from gae.model import *
from gae.optimizer import *
from gae.utils import *

import torch
from torch import optim
import torch.nn.functional as F

from datetime import datetime
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [2]:
# Target device for the model and every tensor that is later moved with `.to(device)`.
device = torch.device('cpu')

In [6]:
# Names (without extension) of the node-link JSON graph files under ../data/.
graphs = ['reduced', 'reduced_train', 'reduced_val']


def _read_graph_json(stem):
    """Parse ``../data/<stem>.graph`` as JSON and return the decoded payload."""
    with open('../data/'+stem +'.graph', 'r') as handle:
        return json.load(handle)


# Raw JSON payloads, in the same order as `graphs`.
obj = [_read_graph_json(stem) for stem in graphs]

# Rebuild one networkx graph per payload: full graph, training split, validation split.
reduced_g, train_g, val_g = [json_graph.node_link_graph(payload) for payload in obj]

# Summarise node/edge counts of the three graphs; the frame is the cell's displayed output.
size_rows = [[len(g.nodes()), len(g.edges())] for g in (reduced_g, train_g, val_g)]
pd.DataFrame(
    size_rows,
    index=['Reduced graph', 'Reduced graph - train(1979-2020)', 'Reduced graph - val(2019-2020)'],
    columns=['nodes', 'edges'],
)

Unnamed: 0,nodes,edges
Reduced graph,322,1551
Reduced graph - train(1979-2020),322,1374
Reduced graph - val(2019-2020),322,177


In [7]:
# Pickle files (without extension) under ../data/ holding the validation edge sets.
val_set = ['val_edges_name', 'val_non_edges_name', 'val_edges', 'val_non_edges']


def _load_pickle(stem):
    """Unpickle ``../data/<stem>.pkl`` and return the stored object."""
    with open('../data/'+ stem +'.pkl', 'rb') as handle:
        return pkl.load(handle)


# Loaded objects, in the same order as `val_set`.
vals = [_load_pickle(stem) for stem in val_set]

# Only the last two entries are used below; the two `*_name` entries
# (vals[0], vals[1]) are loaded but not referenced.
val_edges, val_non_edges = vals[2], vals[3]

In [8]:
# Both adjacency matrices use the node ordering of `train_g`, so row/column i
# refers to the same node in each of them.
reduced_adj, train_adj = (
    nx.adjacency_matrix(g, nodelist=train_g.nodes()) for g in (reduced_g, train_g)
)


def _half_nonzero(adj):
    """Return half the number of non-zero entries of ``adj``, truncated to int.

    Presumably each undirected edge contributes two symmetric entries, so this
    equals the edge count -- it matches the edge totals in the table above.
    """
    dense = adj.todense()
    return int(np.count_nonzero(dense)/2)


total_edge_num = _half_nonzero(reduced_adj)
train_edge_num = _half_nonzero(train_adj)
print('total edge num:', total_edge_num)
print('train edge num:', train_edge_num)
print('test edge num:', len(val_edges))

total edge num: 1551
train edge num: 1374
test edge num: 177


### Model

In [9]:
# Chooses the model/loss branch in the training loop ('GAE' or 'VGAE') and is
# recorded under the 'model' key of the results log.
model_name = 'GAE'

In [10]:
# Graph preprocessing is delegated to the project helper from gae.utils.
train_adj_norm = preprocess_graph(train_adj)

# Quantities shared by the two loss-weighting terms below.
total_entries = train_adj.shape[0] * train_adj.shape[0]   # N * N cells of the adjacency matrix
adj_sum = train_adj.sum()                                 # sum of all adjacency entries
complement = total_entries - adj_sum                      # N*N minus that sum

# pos_weight = (N*N - sum) / sum; for a 0/1 adjacency this is presumably the
# ratio of absent to present entries -- TODO confirm the matrix is binary.
pos_weight = torch.Tensor([float(complement) / adj_sum])

# norm = N*N / (2 * (N*N - sum)); passed to the loss functions as `norm`.
norm = total_entries / float(complement * 2)

In [11]:
# Reconstruction target: the training adjacency with ones added on the diagonal.
adj_plus_identity = train_adj + sp.eye(train_adj.shape[0])
adj_label = torch.FloatTensor(adj_plus_identity.toarray())

# Node features are an identity matrix, i.e. one one-hot row per node.
identity_features = sp.identity(train_adj.shape[0])
features = torch.FloatTensor(np.array(identity_features.todense()))
n_nodes, feat_dim = features.shape

# Move every tensor consumed by the model and the loss onto the target device.
train_adj_norm = train_adj_norm.to(device)
features = features.to(device)
adj_label = adj_label.to(device)
pos_weight = pos_weight.to(device)

In [41]:
# Hyperparameters for the training cell below.
hidden1 = 16   # passed as the 2nd argument to GCN_AE / GCN_VAE
hidden2 = 8    # passed as the 3rd argument to GCN_AE / GCN_VAE
lr = 0.00025   # learning rate handed to optim.Adam
dropout = 0.   # passed as the 4th argument to GCN_AE / GCN_VAE
epochs = 200   # optimisation steps per run (inner training loop)
n_iter = 10    # number of independent runs (outer loop) that are averaged

In [44]:
# Train `n_iter` freshly initialised models and score each one with
# `roc_ap_score` on the validation edges / non-edges, collecting the ROC and
# AP score of every run.
# NOTE(review): no RNG seed is set in the visible cells, so individual scores
# are expected to differ between executions.
roc_results = []
ap_results = []
for i in range(n_iter):
    # A new model and optimizer per run keeps the repetitions independent.
    if model_name == 'GAE':
        model = GCN_AE(feat_dim, hidden1, hidden2, dropout)
    elif model_name == 'VGAE':
        model = GCN_VAE(feat_dim, hidden1, hidden2, dropout)
    else:
        # Without this branch an unknown name would silently reuse a stale
        # `model` left in the kernel (or fail with NameError on a fresh one).
        raise ValueError("Unknown model_name: {!r} (expected 'GAE' or 'VGAE')".format(model_name))
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    recovered = None
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        if model_name == 'GAE':
            recovered = model(features, train_adj_norm)
            loss = loss_function_gae(preds=recovered, labels=adj_label, norm=norm, pos_weight=pos_weight)
        elif model_name == 'VGAE':
            recovered, mu, logvar = model(features, train_adj_norm)
            loss = loss_function_vgae(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight)

        loss.backward()
        optimizer.step()

    # Only the output of the final epoch's forward pass is scored, so the
    # tensor -> numpy conversion happens once here instead of on every epoch.
    # It stays None when `epochs` is 0, exactly as before.
    hidden_emb = None if recovered is None else recovered.detach().cpu().numpy()

    roc_score, ap_score, recon_adj = roc_ap_score(hidden_emb, reduced_adj, val_edges, val_non_edges)
    print('Expriments {} result: Test_ROC: {:.4f}, Test_AP: {:.4f}'.format(i+1, roc_score*100, ap_score*100), end='\n')
    roc_results.append(roc_score)
    ap_results.append(ap_score)

# Mean ± standard deviation over all runs, reported in percent.
print('\nAverage ROC: {} ± {} Average AP: {} ± {}'.format(
    np.round(np.mean(roc_results)*100, 2),
    np.round(np.std(roc_results)*100, 2),
    np.round(np.mean(ap_results)*100, 2),
    np.round(np.std(ap_results)*100, 2),
))

Expriments 1 result: Test_ROC: 86.6992, Test_AP: 87.0716
Expriments 2 result: Test_ROC: 85.3458, Test_AP: 85.9094
Expriments 3 result: Test_ROC: 85.4608, Test_AP: 86.3193
Expriments 4 result: Test_ROC: 85.6363, Test_AP: 86.6433
Expriments 5 result: Test_ROC: 84.5926, Test_AP: 86.0610
Expriments 6 result: Test_ROC: 85.9140, Test_AP: 86.5969
Expriments 7 result: Test_ROC: 85.7193, Test_AP: 86.5742
Expriments 8 result: Test_ROC: 85.4416, Test_AP: 86.3951
Expriments 9 result: Test_ROC: 84.2734, Test_AP: 85.7434
Expriments 10 result: Test_ROC: 84.8639, Test_AP: 86.1968

Average ROC: 85.39 ± 0.66 Average AP: 86.35 ± 0.37


In [43]:
# Sanity check: both result lists should hold one score per run.
n_roc, n_ap = len(roc_results), len(ap_results)
print(n_roc, n_ap)

10 10


In [15]:
# Collect the settings and scores of the runs above into one log record that
# is later appended to the JSON results file.
log_dict = OrderedDict()
log_dict['model'] = model_name
log_dict['datetime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 'niter' is taken from `n_iter` (it used to be a hard-coded 10), so the log
# stays correct when the number of runs is changed in the settings cell.
log_dict['settings'] = {'epochs':epochs, 'hidden1':hidden1, 'hidden2':hidden2, 'lr':lr, 'dropout': dropout, 'niter':n_iter}
# Scores are stored as plain Python floats so JSON serialisation does not
# depend on which numpy scalar type the scoring helper returned.
log_dict['roc_all'] = [float(score) for score in roc_results]
log_dict['roc_mean'] = float(np.mean(roc_results))
log_dict['roc_std'] = float(np.std(roc_results))
log_dict['ap_all'] = [float(score) for score in ap_results]
log_dict['ap_mean'] = float(np.mean(ap_results))
log_dict['ap_std'] = float(np.std(ap_results))

In [16]:
# Show the log record as a two-column key/value table.
log_rows = list(log_dict.items())
pd.DataFrame(log_rows, columns=['key', 'value'])

Unnamed: 0,key,value
0,model,GAE
1,datetime,2022-03-05 17:42:45
2,settings,"{'epochs': 200, 'hidden1': 16, 'hidden2': 8, '..."
3,roc_all,"[0.8559481630438251, 0.8637524338472342, 0.851..."
4,roc_mean,0.856839
5,roc_std,0.00584736
6,ap_all,"[0.8654513689146641, 0.8667838084525464, 0.857..."
7,ap_mean,0.863402
8,ap_std,0.00451909


In [35]:
# Name of the JSON file under ../results/ that the cells below read and write.
save_file_name = 'GAE_results_ADD_patent.json'

In [37]:
# Append the current log record to the JSON list stored in the results file.
results_path = '../results/'+save_file_name

# Read the existing entries through a context manager so the handle is closed
# (the bare `json.load(open(...))` never closed it). A missing file simply
# means nothing has been saved yet, so start from an empty list.
try:
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    data = []

data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-03-05 18:00:18
Total data num: 1


In [36]:
# Create the results file with an empty JSON list, but only if it does not
# exist yet. Mode 'x' (exclusive creation) refuses to overwrite, so running
# the notebook top to bottom no longer erases the entries saved above.
# To reset the log on purpose, delete the file and re-run this cell.
empty = []
try:
    with open('../results/'+save_file_name, 'x') as f:
        json.dump(empty, f)
except FileExistsError:
    pass  # keep the existing results untouched