In [16]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from sc_dw.utils import *
from sklearn.manifold import spectral_embedding
from node2vec import Node2Vec

from datetime import datetime, timedelta
import time
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [17]:
graphs = ['reduced', 'reduced_train', 'reduced_val']
obj = []
for graph in graphs:
    with open('../data/'+graph +'.graph', 'r') as f:
        data = json.load(f)
    obj.append(data)
reduced_g = json_graph.node_link_graph(obj[0])
train_g = json_graph.node_link_graph(obj[1])
val_g = json_graph.node_link_graph(obj[2])
pd.DataFrame([[len(reduced_g.nodes()),len(reduced_g.edges())],[len(train_g.nodes()),len(train_g.edges())],[len(val_g.nodes()),len(val_g.edges())]],
             index=['Reduced graph', 'Reduced graph - train(1979-2020)', 'Reduced graph - val(2019-2020)'], columns=['nodes', 'edges'])

Unnamed: 0,nodes,edges
Reduced graph,322,1551
Reduced graph - train(1979-2020),322,1374
Reduced graph - val(2019-2020),322,177


In [18]:
val_set = ['val_edges_name', 'val_non_edges_name', 'val_edges', 'val_non_edges']
vals = []
for val in val_set:
    with open('../data/'+ val +'.pkl', 'rb') as f:
        data = pkl.load(f)
    vals.append(data)
val_edges_name = vals[0]
val_non_edges_name = vals[1]
val_edges = vals[2]
val_non_edges = vals[3]

In [19]:
reduced_adj = nx.adjacency_matrix(reduced_g, nodelist=train_g.nodes())
train_adj = nx.adjacency_matrix(train_g, nodelist=train_g.nodes())
print('total edge num:', int(np.count_nonzero(reduced_adj.todense())/2))
print('train edge num:', int(np.count_nonzero(train_adj.todense())/2))
print('test edge num:', len(val_edges))

total edge num: 1551
train edge num: 1374
test edge num: 177


### Model

In [20]:
n_iter = 10
dim = 16

In [21]:
sc_roc_results = []
sc_ap_results = []
for i in range(n_iter):
    
    random_state = random.sample(range(0, 50), 1)[0]
    spectral_emb = spectral_embedding(train_adj, n_components=dim, random_state=random_state)
    sc_score_matrix = np.dot(spectral_emb, spectral_emb.T)
    sc_val_roc, sc_val_ap = get_roc_score(val_edges, val_non_edges, sc_score_matrix, apply_sigmoid=True)
    
    sc_roc_results.append(sc_val_roc)
    sc_ap_results.append(sc_val_ap)
    
    print('Experiment {} result - ROC(AUC) score: {}, AP score: {}'.format(i+1, np.round(sc_val_roc, 4), np.round(sc_val_ap, 4)), end='\n')

Experiment 1 result - ROC(AUC) score: 0.7128, AP score: 0.6643
Experiment 2 result - ROC(AUC) score: 0.7267, AP score: 0.6842
Experiment 3 result - ROC(AUC) score: 0.7196, AP score: 0.6713
Experiment 4 result - ROC(AUC) score: 0.7286, AP score: 0.6849
Experiment 5 result - ROC(AUC) score: 0.7148, AP score: 0.673
Experiment 6 result - ROC(AUC) score: 0.7286, AP score: 0.6849
Experiment 7 result - ROC(AUC) score: 0.7195, AP score: 0.677
Experiment 8 result - ROC(AUC) score: 0.7155, AP score: 0.6721
Experiment 9 result - ROC(AUC) score: 0.7221, AP score: 0.68
Experiment 10 result - ROC(AUC) score: 0.7143, AP score: 0.664


In [22]:
print('SC_AUC mean:', '{:.2f} ± {:.2f}'.format( np.round(np.mean(sc_roc_results)*100, 2), np.round(np.std(sc_roc_results)*100, 2) ))
print('SC_AP mean:', '{:.2f} ± {:.2f}'.format( np.round(np.mean(sc_ap_results)*100, 2), np.round(np.std(sc_ap_results)*100, 2) ))

SC_AUC mean: 72.03 ± 0.58
SC_AP mean: 67.56 ± 0.75


In [24]:
print(len(sc_roc_results), len(sc_ap_results))

10 10


In [25]:
model_name = 'spectral_clustering'

In [30]:
log_dict= OrderedDict()
log_dict['model'] = model_name
log_dict['datetime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_dict['settings'] = {'dim':dim, 'niter':n_iter}
log_dict['roc_all'] = sc_roc_results
log_dict['roc_mean'] = np.mean(sc_roc_results)
log_dict['roc_std'] = np.std(sc_roc_results)
log_dict['ap_all'] = sc_ap_results
log_dict['ap_mean'] = np.mean(sc_ap_results)
log_dict['ap_std'] = np.std(sc_ap_results)

In [31]:
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,spectral_clustering
1,datetime,2022-02-26 12:56:48
2,settings,"{'dim': 16, 'niter': 10}"
3,roc_all,"[0.7127581474033644, 0.7267388043027228, 0.719..."
4,roc_mean,0.720253
5,roc_std,0.00575293
6,ap_all,"[0.6643389186155679, 0.6841739009996797, 0.671..."
7,ap_mean,0.675562
8,ap_std,0.00753004


In [32]:
save_file_name = 'SC_results_ADD_patent.json'

In [34]:
data = json.load(open('../results/'+save_file_name))
data.append(log_dict)
with open('../results/'+save_file_name, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-02-26 12:57:10
Total data num: 1


In [33]:
# empty = []
# with open('../results/'+save_file_name, 'w') as f:
#     json.dump(empty, f)

In [38]:
n_iter = 10
dim = 16
walk_len = 8
num_walk = 8
window = 4

In [35]:
dw_roc_results = []
dw_ap_results = []
for i in range(n_iter):
    
    G_train = nx.from_scipy_sparse_matrix(train_adj)
    
    model_train = Node2Vec(G_train, dimensions=dim, walk_length=walk_len, num_walks=num_walk)
    n2v_train = model_train.fit(window=window, min_count=3)
    edge_emb = n2v_train.wv
    
    emb_list = []
    for node_index in range(0, train_adj.shape[0]):
        node_emb = edge_emb[str(node_index)]
        emb_list.append(node_emb)
    emb_matrix = np.vstack(emb_list)

    n2v_score_matrix = np.dot(emb_matrix, emb_matrix.T)
    n2v_val_roc, n2v_val_ap = get_roc_score(val_edges, val_non_edges, n2v_score_matrix, apply_sigmoid=True)
    
    dw_roc_results.append(n2v_val_roc)
    dw_ap_results.append(n2v_val_ap)
    
    print('Experiment {} result - ROC(AUC) score: {}, AP score: {}'.format(i+1, np.round(n2v_val_roc, 4), np.round(n2v_val_ap, 4)), end='\n')

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…

Generating walks (CPU: 1):  25%|██████████████▎                                          | 2/8 [00:00<00:00, 15.87it/s]




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.54it/s]


Experiment 1 result - ROC(AUC) score: 0.6565, AP score: 0.6177


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.22it/s]


Experiment 2 result - ROC(AUC) score: 0.6728, AP score: 0.6228


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.47it/s]


Experiment 3 result - ROC(AUC) score: 0.6815, AP score: 0.6236


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.33it/s]


Experiment 4 result - ROC(AUC) score: 0.6576, AP score: 0.6006


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…

Generating walks (CPU: 1):  25%|██████████████▎                                          | 2/8 [00:00<00:00, 15.87it/s]




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  7.91it/s]


Experiment 5 result - ROC(AUC) score: 0.6554, AP score: 0.6175


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…

Generating walks (CPU: 1):  25%|██████████████▎                                          | 2/8 [00:00<00:00, 15.87it/s]




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.32it/s]


Experiment 6 result - ROC(AUC) score: 0.6647, AP score: 0.6146


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.50it/s]


Experiment 7 result - ROC(AUC) score: 0.6849, AP score: 0.6415


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.35it/s]


Experiment 8 result - ROC(AUC) score: 0.6771, AP score: 0.6317


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.38it/s]


Experiment 9 result - ROC(AUC) score: 0.6599, AP score: 0.6064


HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=322.0, style=Pro…

Generating walks (CPU: 1):   0%|                                                                 | 0/8 [00:00<?, ?it/s]




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  7.35it/s]


Experiment 10 result - ROC(AUC) score: 0.6535, AP score: 0.5994


In [39]:
print('DW_AUC mean:', '{:.2f} ± {:.2f}'.format( np.round(np.mean(dw_roc_results)*100, 2), np.round(np.std(dw_roc_results)*100, 2) ))
print('DW_AP mean:', '{:.2f} ± {:.2f}'.format( np.round(np.mean(dw_ap_results)*100, 2), np.round(np.std(dw_ap_results)*100, 2) ))

DW_AUC mean: 66.64 ± 1.11
DW_AP mean: 61.76 ± 1.26


#### SAVE DATA

In [40]:
print(len(dw_roc_results), len(dw_ap_results))

10 10


In [41]:
model_name = 'deepwalk'

In [42]:
log_dict= OrderedDict()
log_dict['model'] = model_name
log_dict['datetime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_dict['settings'] = {'dim':dim, 'walk_len':walk_len, 'num_walk':num_walk, 'window_size':window, 'niter':n_iter}
log_dict['roc_all'] = dw_roc_results
log_dict['roc_mean'] = np.mean(dw_roc_results)
log_dict['roc_std'] = np.std(dw_roc_results)
log_dict['ap_all'] = dw_ap_results
log_dict['ap_mean'] = np.mean(dw_ap_results)
log_dict['ap_std'] = np.std(dw_ap_results)

In [43]:
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,deepwalk
1,datetime,2022-02-26 13:02:10
2,settings,"{'dim': 16, 'walk_len': 8, 'num_walk': 8, 'win..."
3,roc_all,"[0.6565482460340261, 0.6727632544926425, 0.681..."
4,roc_mean,0.666379
5,roc_std,0.0110935
6,ap_all,"[0.6177477992728284, 0.6227583521818516, 0.623..."
7,ap_mean,0.61757
8,ap_std,0.0125918


In [44]:
save_file_name = 'DW_results_ADD_patent.json'

In [46]:
data = json.load(open('../results/'+save_file_name))
data.append(log_dict)
with open('../results/'+save_file_name, 'w') as f:
    json.dump(data, f)
print("Last data saved at: {}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2022-02-26 13:02:44
Total data num: 1


In [45]:
# empty = []
# with open('../results/'+save_file_name, 'w') as f:
#     json.dump(empty, f)