In [66]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from sc_dw.utils import *

from datetime import datetime, timedelta
import time
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [2]:
# Load the three node-link JSON graph files (full, train split, validation split)
# and show their node/edge counts side by side.
graphs = ['reduced', 'reduced_train', 'reduced_val']
obj = []
for graph_name in graphs:
    graph_path = '../data/' + graph_name + '.graph'
    with open(graph_path, 'r') as fh:
        obj.append(json.load(fh))

# Rebuild NetworkX graphs from the node-link payloads, in the same order as `graphs`.
reduced_g, train_g, val_g = (json_graph.node_link_graph(payload) for payload in obj)

size_rows = [[len(g.nodes()), len(g.edges())] for g in (reduced_g, train_g, val_g)]
pd.DataFrame(
    size_rows,
    index=['Reduced graph', 'Reduced graph - train(1979-2020)', 'Reduced graph - val(2019-2020)'],
    columns=['nodes', 'edges'],
)

Unnamed: 0,nodes,edges
Reduced graph,322,1551
Reduced graph - train(1979-2020),322,1374
Reduced graph - val(2019-2020),322,177


In [3]:
# Load the pickled validation edge / non-edge lists (name-based and index-based).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
val_set = ['val_edges_name', 'val_non_edges_name', 'val_edges', 'val_non_edges']
vals = []
for pkl_name in val_set:
    with open('../data/' + pkl_name + '.pkl', 'rb') as fh:
        vals.append(pkl.load(fh))

# Unpack in the same order as `val_set`.
val_edges_name, val_non_edges_name, val_edges, val_non_edges = vals

In [4]:
# Build both adjacency matrices with train_g's node order so that row/column i
# refers to the same node in each matrix.
node_order = train_g.nodes()
reduced_adj = nx.adjacency_matrix(reduced_g, nodelist=node_order)
train_adj = nx.adjacency_matrix(train_g, nodelist=node_order)

# Non-zero entries are halved: presumably the matrices are symmetric, so each
# undirected edge is stored twice (assumes no self-loops -- TODO confirm).
for label, adj in (('total edge num:', reduced_adj), ('train edge num:', train_adj)):
    print(label, int(np.count_nonzero(adj.todense()) / 2))
print('test edge num:', len(val_edges))

total edge num: 1551
train edge num: 1374
test edge num: 177


#### Set save file

In [68]:
# File name (under ../results/) of the JSON log that each scoring section appends to.
save_file_name = 'topo_edge_score_results_ADD_patent.json'

In [69]:
# Create the results log as an empty JSON list, but only if it does not exist yet.
# Mode 'x' (exclusive creation) raises FileExistsError when the file is already
# there, so re-running this cell never wipes previously logged results.
try:
    with open('../results/' + save_file_name, 'x') as f:
        json.dump([], f)
except FileExistsError:
    pass

In [5]:
# Sizes of the validation edge list and the validation non-edge list, in that order.
print(*(len(pairs) for pairs in (val_edges, val_non_edges)))

177 177


In [70]:
# Rebuild the training graph from its sparse adjacency matrix; nodes become the
# integer row/column indices of `train_adj`.
# `from_scipy_sparse_matrix` was deprecated in NetworkX 2.7 and removed in 3.0;
# prefer its replacement `from_scipy_sparse_array` and fall back to the old name
# only on older NetworkX releases.
_sparse_to_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
G_train = _sparse_to_graph(train_adj)
len(G_train.edges())

1374

#### Preferential attachment

In [105]:
# Label written to the 'model' field of this section's results-log entry.
model_name = 'Preferential attachment'

In [106]:
# Score each validation pair with the preferential-attachment heuristic on the
# training graph; NetworkX yields (u, v, score) triples.
# The link-prediction functions return one-shot iterators, so materialize them:
# otherwise re-running the cell that fills `emb` would silently see empty input.
preds = list(nx.preferential_attachment(G_train, val_edges))
preds_false = list(nx.preferential_attachment(G_train, val_non_edges))

In [107]:
# Start from a fresh zero matrix so this section does not depend on an `emb`
# left over from some other cell (on a fresh kernel `emb` would be undefined here).
emb = np.zeros(train_adj.shape)

# Write each pair's score symmetrically into the dense score matrix.
for scored_pairs in (preds, preds_false):
    for u, v, p in scored_pairs:
        emb[u, v] = p
        emb[v, u] = p

In [108]:
# Evaluate the score matrix on the validation edges vs. non-edges.
# get_roc_score comes from the star import of sc_dw.utils; presumably it returns
# (ROC AUC, average precision) as fractions in [0, 1] -- verify against the helper.
# NOTE(review): preferential-attachment scores are unbounded raw values; if
# apply_sigmoid squashes them with a logistic function, large scores may saturate
# to 1.0 and tie, which would distort the ranking metrics -- TODO confirm.
val_roc, val_ap = get_roc_score(val_edges, val_non_edges, emb, apply_sigmoid=True)
roc_pct = np.round(val_roc * 100, 4)
ap_pct = np.round(val_ap * 100, 4)
print('Experiment result - ROC(AUC) score: {}, AP score: {}'.format(roc_pct, ap_pct))

Experiment result - ROC(AUC) score: 78.5183, AP score: 72.0174


In [109]:
# Assemble this run's log record (model label, timestamp, metrics) and preview it.
log_dict = OrderedDict([
    ('model', model_name),
    ('datetime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('roc', val_roc),
    ('ap', val_ap),
])
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,Preferential attachment
1,datetime,2022-03-05 19:35:12
2,roc,0.785183
3,ap,0.720174


In [110]:
# Append this run's record to the JSON results log and rewrite the file.
results_path = '../results/' + save_file_name
try:
    # Read through a context manager so the file handle is closed promptly.
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    # No log yet (first run): start a fresh list instead of crashing.
    data = []
data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-03-05 19:35:13
Total data num: 1


#### Jaccard coefficient

In [111]:
# Label written to the 'model' field of this section's results-log entry.
model_name = 'Jaccard coefficient'

In [112]:
# Fresh all-zero score matrix with the same shape as the training adjacency matrix.
emb = np.zeros(train_adj.shape, dtype=np.float64)

In [113]:
# Score each validation pair with the Jaccard coefficient on the training graph;
# NetworkX yields (u, v, score) triples.
# The link-prediction functions return one-shot iterators, so materialize them:
# otherwise re-running the cell that fills `emb` would silently see empty input.
preds = list(nx.jaccard_coefficient(G_train, val_edges))
preds_false = list(nx.jaccard_coefficient(G_train, val_non_edges))

In [114]:
# Write each pair's score into the score matrix symmetrically: edges first,
# then non-edges.
for scored_pairs in (preds, preds_false):
    for u, v, p in scored_pairs:
        emb[u, v] = emb[v, u] = p

In [115]:
# Evaluate the score matrix on the validation edges vs. non-edges.
# get_roc_score comes from the star import of sc_dw.utils; presumably it returns
# (ROC AUC, average precision) as fractions in [0, 1] -- verify against the helper.
val_roc, val_ap = get_roc_score(val_edges, val_non_edges, emb, apply_sigmoid=True)
roc_pct = np.round(val_roc * 100, 4)
ap_pct = np.round(val_ap * 100, 4)
print('Experiment result - ROC(AUC) score: {}, AP score: {}'.format(roc_pct, ap_pct))

Experiment result - ROC(AUC) score: 82.2385, AP score: 81.1269


In [116]:
# Assemble this run's log record (model label, timestamp, metrics) and preview it.
log_dict = OrderedDict([
    ('model', model_name),
    ('datetime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('roc', val_roc),
    ('ap', val_ap),
])
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,Jaccard coefficient
1,datetime,2022-03-05 19:35:15
2,roc,0.822385
3,ap,0.811269


In [117]:
# Append this run's record to the JSON results log and rewrite the file.
results_path = '../results/' + save_file_name
try:
    # Read through a context manager so the file handle is closed promptly.
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    # No log yet (first run): start a fresh list instead of crashing.
    data = []
data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-03-05 19:35:17
Total data num: 2


#### Adamic-Adar index

In [118]:
# Label written to the 'model' field of this section's results-log entry.
model_name = 'Adamic_adar_index'

In [119]:
# Fresh all-zero score matrix with the same shape as the training adjacency matrix.
emb = np.zeros(train_adj.shape, dtype=np.float64)

In [120]:
# Score each validation pair with the Adamic-Adar index on the training graph;
# NetworkX yields (u, v, score) triples.
# The link-prediction functions return one-shot iterators, so materialize them:
# otherwise re-running the cell that fills `emb` would silently see empty input.
preds = list(nx.adamic_adar_index(G_train, val_edges))
preds_false = list(nx.adamic_adar_index(G_train, val_non_edges))

In [121]:
# Write each pair's score into the score matrix symmetrically: edges first,
# then non-edges.
for scored_pairs in (preds, preds_false):
    for u, v, p in scored_pairs:
        emb[u, v] = emb[v, u] = p

In [122]:
# Evaluate the score matrix on the validation edges vs. non-edges.
# get_roc_score comes from the star import of sc_dw.utils; presumably it returns
# (ROC AUC, average precision) as fractions in [0, 1] -- verify against the helper.
val_roc, val_ap = get_roc_score(val_edges, val_non_edges, emb, apply_sigmoid=True)
roc_pct = np.round(val_roc * 100, 4)
ap_pct = np.round(val_ap * 100, 4)
print('Experiment result - ROC(AUC) score: {}, AP score: {}'.format(roc_pct, ap_pct))

Experiment result - ROC(AUC) score: 85.0474, AP score: 85.5214


In [123]:
# Assemble this run's log record (model label, timestamp, metrics) and preview it.
log_dict = OrderedDict([
    ('model', model_name),
    ('datetime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('roc', val_roc),
    ('ap', val_ap),
])
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,Adamic_adar_index
1,datetime,2022-03-05 19:35:20
2,roc,0.850474
3,ap,0.855214


In [124]:
# Append this run's record to the JSON results log and rewrite the file.
results_path = '../results/' + save_file_name
try:
    # Read through a context manager so the file handle is closed promptly.
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    # No log yet (first run): start a fresh list instead of crashing.
    data = []
data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-03-05 19:35:21
Total data num: 3


#### Resource allocation index

In [125]:
# Label written to the 'model' field of this section's results-log entry.
model_name = 'Resource_allocation_index'

In [126]:
# Fresh all-zero score matrix with the same shape as the training adjacency matrix.
emb = np.zeros(train_adj.shape, dtype=np.float64)

In [127]:
# Score each validation pair with the resource-allocation index on the training
# graph; NetworkX yields (u, v, score) triples.
# The link-prediction functions return one-shot iterators, so materialize them:
# otherwise re-running the cell that fills `emb` would silently see empty input.
preds = list(nx.resource_allocation_index(G_train, val_edges))
preds_false = list(nx.resource_allocation_index(G_train, val_non_edges))

In [128]:
# Write each pair's score into the score matrix symmetrically: edges first,
# then non-edges.
for scored_pairs in (preds, preds_false):
    for u, v, p in scored_pairs:
        emb[u, v] = emb[v, u] = p

In [129]:
# Evaluate the score matrix on the validation edges vs. non-edges.
# get_roc_score comes from the star import of sc_dw.utils; presumably it returns
# (ROC AUC, average precision) as fractions in [0, 1] -- verify against the helper.
val_roc, val_ap = get_roc_score(val_edges, val_non_edges, emb, apply_sigmoid=True)
roc_pct = np.round(val_roc * 100, 4)
ap_pct = np.round(val_ap * 100, 4)
print('Experiment result - ROC(AUC) score: {}, AP score: {}'.format(roc_pct, ap_pct))

Experiment result - ROC(AUC) score: 85.2868, AP score: 86.1158


In [130]:
# Assemble this run's log record (model label, timestamp, metrics) and preview it.
log_dict = OrderedDict([
    ('model', model_name),
    ('datetime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('roc', val_roc),
    ('ap', val_ap),
])
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,Resource_allocation_index
1,datetime,2022-03-05 19:35:24
2,roc,0.852868
3,ap,0.861158


In [131]:
# Append this run's record to the JSON results log and rewrite the file.
results_path = '../results/' + save_file_name
try:
    # Read through a context manager so the file handle is closed promptly.
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    # No log yet (first run): start a fresh list instead of crashing.
    data = []
data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-03-05 19:35:24
Total data num: 4
