In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from sc_dw.utils import *
from datetime import datetime, timedelta
import time
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [2]:
# Base names of the node-link JSON graph files stored under ../data/.
graphs = ['reduced', 'reduced_train', 'reduced_val']


def _read_graph(name):
    """Read ``../data/<name>.graph`` (node-link JSON) and return it as a networkx graph."""
    with open('../data/'+name +'.graph', 'r') as fh:
        return json_graph.node_link_graph(json.load(fh))


# Order follows `graphs`: full graph, training split, validation split.
reduced_g, train_g, val_g = [_read_graph(name) for name in graphs]

# Node/edge counts per graph, shown as the cell output.
# NOTE(review): the train label reads "1979-2020" while the val label is "2019-2020" — confirm the intended train range.
size_rows = [[len(g.nodes()), len(g.edges())] for g in (reduced_g, train_g, val_g)]
pd.DataFrame(
    size_rows,
    index=['Reduced graph', 'Reduced graph - train(1979-2020)', 'Reduced graph - val(2019-2020)'],
    columns=['nodes', 'edges'],
)

Unnamed: 0,nodes,edges
Reduced graph,322,1551
Reduced graph - train(1979-2020),322,1374
Reduced graph - val(2019-2020),322,177


In [3]:
# Base names of the pickled validation artifacts stored under ../data/.
val_set = ['val_edges_name', 'val_non_edges_name', 'val_edges', 'val_non_edges']


def _read_pickle(name):
    """Unpickle ``../data/<name>.pkl`` and return the stored object."""
    # pickle.load can execute arbitrary code: only load files from a trusted source.
    with open('../data/'+ name +'.pkl', 'rb') as fh:
        return pkl.load(fh)


# Loaded in the same order as `val_set`, then bound to their matching names.
vals = [_read_pickle(name) for name in val_set]
val_edges_name, val_non_edges_name, val_edges, val_non_edges = vals

In [4]:
# Build both adjacency matrices with the SAME node ordering (that of train_g)
# so that row/column i refers to the same node in each matrix.
node_order = list(train_g.nodes())
reduced_adj = nx.adjacency_matrix(reduced_g, nodelist=node_order)
train_adj = nx.adjacency_matrix(train_g, nodelist=node_order)

# Count non-zero entries directly on the sparse matrices (no dense copy needed).
# Each undirected edge appears twice in a symmetric adjacency matrix, hence // 2.
print('total edge num:', int(reduced_adj.count_nonzero() // 2))
print('train edge num:', int(train_adj.count_nonzero() // 2))
print('test edge num:', len(val_edges))

total edge num: 1551
train edge num: 1374
test edge num: 177


### Model

In [5]:
# Start an empty feature table with one row per node of the training graph;
# feature columns are added in later cells.
node_labels = list(train_g.nodes())
emb = pd.DataFrame(index=node_labels)
emb

a01k
a01m
a01n
a41d
a42b
...
h04w
h05b
h05f
h05h
h05k


In [6]:
# Column name -> networkx centrality function. Each function returns a
# {node: score} dict computed on the training graph.
centrality_measures = [
    ('degcen', nx.degree_centrality),
    ('betcen', nx.betweenness_centrality),
    ('clscen', nx.closeness_centrality),
]

for column, centrality_fn in centrality_measures:
    scores = pd.Series(centrality_fn(train_g), dtype=float)
    # Align scores to rows by node label rather than by position, so the result
    # does not depend on the iteration order of the returned dict.
    emb[column] = scores.reindex(emb.index).round(6)

In [7]:
# Display the node feature table (degcen, betcen, clscen columns) as the cell output.
emb

Unnamed: 0,degcen,betcen,clscen
a01k,0.009346,0.000048,0.276504
a01m,0.000000,0.000000,0.000000
a01n,0.009346,0.000199,0.240471
a41d,0.006231,0.000000,0.300450
a42b,0.006231,0.000000,0.301495
...,...,...,...
h04w,0.059190,0.002874,0.360374
h05b,0.043614,0.006553,0.325645
h05f,0.000000,0.000000,0.000000
h05h,0.034268,0.008745,0.348313


In [23]:
# Pairwise score matrix: entry (i, j) is the dot product of the feature rows
# of node i and node j, so the result is square with one row/column per node.
emb_transposed = emb.T
recon = np.dot(emb, emb_transposed)

In [25]:
# Show the score matrix with node labels on both axes for readability.
recon_labels = emb.index
pd.DataFrame(recon, index=recon_labels, columns=recon_labels)

Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
a01k,0.076542,0.0,0.066579,0.083134,0.083423,0.075107,0.098470,0.0,0.087379,0.078975,...,0.084094,0.109685,0.088341,0.086261,0.064036,0.100198,0.090450,0.0,0.096631,0.097224
a01m,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
a01n,0.066579,0.0,0.057914,0.072308,0.072559,0.065330,0.085681,0.0,0.076016,0.068719,...,0.073154,0.095504,0.076851,0.075047,0.055695,0.087213,0.078717,0.0,0.084081,0.084626
a41d,0.083134,0.0,0.072308,0.090309,0.090623,0.081575,0.106863,0.0,0.094873,0.085704,...,0.091315,0.118829,0.095918,0.093646,0.069569,0.108643,0.098112,0.0,0.104864,0.105422
a42b,0.083423,0.0,0.072559,0.090623,0.090938,0.081858,0.107234,0.0,0.095203,0.086002,...,0.091633,0.119241,0.096251,0.093971,0.069811,0.109020,0.098452,0.0,0.105228,0.105788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h04w,0.100198,0.0,0.087213,0.108643,0.109020,0.098328,0.129967,0.0,0.114772,0.104267,...,0.110335,0.147235,0.116018,0.113468,0.083606,0.133381,0.119954,0.0,0.127576,0.129419
h05b,0.090450,0.0,0.078717,0.098112,0.098452,0.088760,0.117130,0.0,0.103540,0.093968,...,0.099549,0.132201,0.104656,0.102342,0.075518,0.119954,0.107990,0.0,0.114978,0.116491
h05f,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
h05h,0.096631,0.0,0.084081,0.104864,0.105228,0.094822,0.124870,0.0,0.110522,0.100173,...,0.106286,0.140305,0.111711,0.109206,0.080736,0.127576,0.114978,0.0,0.122573,0.123948


In [26]:
# Score the validation edges / non-edges against the reconstructed matrix.
# NOTE(review): get_roc_score is presumably provided by the wildcard import from
# sc_dw.utils; its handling of apply_sigmoid is not visible here — confirm.
val_roc, val_ap = get_roc_score(val_edges, val_non_edges, recon, apply_sigmoid=True)

# Report both metrics rounded to 4 decimals.
roc_rounded = np.round(val_roc, 4)
ap_rounded = np.round(val_ap, 4)
result_template = 'Experiment result - ROC(AUC) score: {}, AP score: {}'
print(result_template.format(roc_rounded, ap_rounded))

Experiment result - ROC(AUC) score: 0.8385, AP score: 0.8502


In [27]:
# Identifier stored under the 'model' key of the experiment log entry.
model_name = 'node_emb_centrality'

In [28]:
# Experiment settings recorded alongside the scores.
run_settings = {'centralities':'degree cen, betweenness cen, closeness cen', 'methods':'dot_product_sigmoid', 'niter':1}

# One log entry for this run; key order is the order shown in the results table.
log_dict = OrderedDict([
    ('model', model_name),
    ('datetime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),  # time the entry was built
    ('settings', run_settings),
    ('roc', val_roc),
    ('ap', val_ap),
])

In [29]:
# Render the log entry as a two-column (key, value) table for a quick visual check.
log_rows = list(log_dict.items())
pd.DataFrame(log_rows, columns=['key', 'value'])

Unnamed: 0,key,value
0,model,node_emb_centrality
1,datetime,2022-02-26 13:14:49
2,settings,"{'centralities': 'degree cen, betweenness cen,..."
3,roc,0.83852
4,ap,0.850229


In [32]:
# File name (under ../results/) of the JSON file that the log entries are appended to.
save_file_name = 'node_emb_results_ADD_patent.json'

In [33]:
# Append this run's log entry to the JSON results file (a list of entries).
results_path = '../results/'+save_file_name

# Read the existing entries with a context manager so the handle is always closed.
# If the results file does not exist yet, start from an empty list instead of failing.
try:
    with open(results_path, 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    data = []

data.append(log_dict)

# Rewrite the whole file with the new entry included.
with open(results_path, 'w') as f:
    json.dump(data, f)

print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-02-26 13:15:16
Total data num: 1


In [31]:
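# One-time setup: uncomment and run to create (or reset) the results file as an empty JSON list.
# WARNING: this overwrites any results already saved in that file.
# empty = []
# with open('../results/'+save_file_name, 'w') as f:
#    json.dump(empty, f)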