In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
import json
import scipy.sparse as sp

import networkx as nx
from networkx.readwrite import json_graph

from gae.model import *
from gae.optimizer import *
from gae.utils import *

import torch
from torch import optim
import torch.nn.functional as F

from datetime import datetime, timedelta
import time
import random
from collections import OrderedDict
import warnings; warnings.filterwarnings('ignore')

In [2]:
# Load each node-link JSON graph listed in `graphs` from ../data/<name>.graph.
graphs = ['original']
obj = []
for graph in graphs:
    # JSON text is UTF-8 by specification; pin the encoding so the read does
    # not depend on the platform's default locale encoding.
    with open('../data/'+graph +'.graph', 'r', encoding='utf-8') as f:
        data = json.load(f)
    obj.append(data)
# Rebuild the networkx graph from the first (and only) loaded payload.
original_g = json_graph.node_link_graph(obj[0])

In [3]:
# One-row summary of the loaded graph's size.
pd.DataFrame(
    {
        'nodes': [original_g.number_of_nodes()],
        'edges': [original_g.number_of_edges()],
    },
    index=['Original graph'],
)

Unnamed: 0,nodes,edges
Original graph,330,1564


In [4]:
# Upper-cased node names, in the graph's node iteration order.
idx = [node_name.upper() for node_name in original_g.nodes()]

In [5]:
# Sparse adjacency matrix with rows/columns in the graph's own node order.
adj = nx.adjacency_matrix(original_g, nodelist=list(original_g))

In [6]:
# Two-way lookup between integer positions and node names.
# Built without a loop variable named `idx`, so the `idx` label list defined
# in an earlier cell is no longer overwritten with an integer.
idx2nodes = dict(enumerate(original_g.nodes()))
nodes2idx = {node: position for position, node in idx2nodes.items()}

### Model

In [106]:
# Hyperparameters consumed by the training cell below.
hidden1 = 16      # 2nd constructor argument of GCN_AE / GCN_VAE (presumably first hidden width — confirm in gae.model)
hidden2 = 8       # 3rd constructor argument of GCN_AE / GCN_VAE (presumably embedding width — confirm in gae.model)
lr = 0.00025      # learning rate handed to optim.Adam
dropout = 0.      # 4th constructor argument of GCN_AE / GCN_VAE
epochs = 200      # optimisation steps per training run
val_ratio = 0.    # NOTE(review): not referenced anywhere else in the visible notebook

In [107]:
# Class-imbalance terms derived from the adjacency matrix:
#   pos_weight = (#entries - sum(adj)) / sum(adj)
#   norm       = #entries / (2 * (#entries - sum(adj)))
_n_entries = adj.shape[0] * adj.shape[0]
_adj_total = adj.sum()
pos_weight = torch.Tensor([float(_n_entries - _adj_total) / _adj_total])
norm = _n_entries / float((_n_entries - _adj_total) * 2)

In [108]:
# All tensors and models in the training cell are moved to this device (CPU only).
device = torch.device('cpu')

In [109]:
# Featureless setup: every node gets a one-hot (identity) feature row.
features = torch.FloatTensor(sp.identity(adj.shape[0]).toarray())
n_nodes, feat_dim = features.shape

In [394]:
# Selects the model built in the training cell: 'GAE' -> GCN_AE, 'VGAE' -> GCN_VAE.
model_name = 'GAE'

In [395]:
# Number of independent training runs whose reconstructions are accumulated and averaged.
n_iter = 10

In [396]:
def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + e^(-x)) of `x`.

    `x` may be a scalar or a NumPy array; the result has the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [398]:
# Train `n_iter` independently initialised models and accumulate each run's
# reconstructed edge probabilities in `recon_adjs` (averaged in a later cell).
if model_name not in ('GAE', 'VGAE'):
    # Fail loudly instead of silently reusing a stale `model` from the kernel.
    raise ValueError(f"Unknown model_name {model_name!r}; expected 'GAE' or 'VGAE'")

# Inputs that are identical for every run are prepared once, outside the loop.
# NOTE(review): assumes preprocess_graph() is a pure function of `adj` — confirm in gae.utils.
train_adj_norm = preprocess_graph(adj)
# Training target: the adjacency matrix with self-loops added, as a dense float tensor.
adj_label = torch.FloatTensor((adj + sp.eye(adj.shape[0])).toarray())

train_adj_norm, features, adj_label, pos_weight = (
    train_adj_norm.to(device),
    features.to(device),
    adj_label.to(device),
    pos_weight.to(device),
)

recon_adjs = np.zeros((adj.shape[0], adj.shape[0]))
for i in range(n_iter):
    # A fresh model and optimizer per run, so the runs are independent.
    if model_name == 'GAE':
        model = GCN_AE(feat_dim, hidden1, hidden2, dropout)
    else:
        model = GCN_VAE(feat_dim, hidden1, hidden2, dropout)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        if model_name == 'GAE':
            recovered = model(features, train_adj_norm)
            loss = loss_function_gae(preds=recovered, labels=adj_label, norm=norm, pos_weight=pos_weight)
        else:
            recovered, mu, logvar = model(features, train_adj_norm)
            loss = loss_function_vgae(preds=recovered, labels=adj_label, mu=mu, logvar=logvar, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight)

        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

    # Only the last epoch's forward output is used, so it is converted once
    # here rather than on every epoch.
    # NOTE(review): this treats the model output `recovered` as node embeddings
    # and decodes them with an inner product — confirm against gae.model.
    hidden_emb = recovered.data.cpu().numpy()
    recon_adj = sigmoid(np.dot(hidden_emb, hidden_emb.T))
    # `recon_adj` is already a probability; it must not be passed through
    # sigmoid a second time (that squashed every score into (0.5, 0.73)).
    recon_adjs += recon_adj
    print(f'Prediction {i+1} complete')

Prediction 1 complete
Prediction 2 complete
Prediction 3 complete
Prediction 4 complete
Prediction 5 complete
Prediction 6 complete
Prediction 7 complete
Prediction 8 complete
Prediction 9 complete
Prediction 10 complete


In [421]:
# Mean reconstruction over all runs, with 1 subtracted on the diagonal so
# self-loops fall below any positive threshold.
_identity = sp.eye(adj.shape[0])
avg_recon_adjs = recon_adjs / n_iter - _identity
# Flatten to a plain 1-D array of all scores for the quantile computation.
flat_recon = np.asarray(avg_recon_adjs).ravel()

In [422]:
# Score cut-off at the 98th percentile of all entries: only the top 2% of
# scores survive the thresholding step below.
threshold = np.quantile(flat_recon, 0.98)

In [423]:
# Zero out every score below the cut-off. This mutates `avg_recon_adjs` in
# place, so earlier displays of it are stale after this cell runs.
avg_recon_adjs[avg_recon_adjs < threshold] = 0.

In [424]:
# Union of surviving predicted scores and the original adjacency, then
# binarised: any non-zero entry becomes an edge (1).
pred_np = avg_recon_adjs + adj
pred_np[pred_np != 0] = 1.

In [507]:
# Non-zero entries are halved because each undirected edge appears twice
# in the full matrix.
_n_edges_original = int(np.count_nonzero(adj.todense())/2)
_n_edges_predicted = int(np.count_nonzero(pred_np)/2)
print(f'Num. of edges in original graph: {_n_edges_original}')
print(f'Num. of edges in predicted graph: {_n_edges_predicted}')
print(f'Num. of newly generated edges: {_n_edges_predicted - _n_edges_original}')

Num. of edges in original graph: 1564
Num. of edges in predicted graph: 2199
Num. of newly generated edges: 635


### 예측 결과 시각화를 위한 전처리

In [426]:
# Keep only the upper triangle (diagonal included) so each undirected edge
# is represented once.
pred_up = np.triu(np.asarray(pred_np))
adj_up = np.triu(adj.toarray())

In [439]:
# Label rows and columns of the upper-triangular prediction with the
# upper-cased node names.
idx = [node_name.upper() for node_name in original_g.nodes()]
pred_up_df = pd.DataFrame(data=pred_up, index=idx, columns=idx)
pred_up_df

Unnamed: 0,A01K,A01M,A01N,A41D,A42B,A47B,A61B,A61F,A61H,A61K,...,H04M,H04N,H04Q,H04R,H04S,H04W,H05B,H05F,H05H,H05K
A01K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A41D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A42B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H04W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Gephi 그래프 노드 정보 입력에 필요한 노드 리스트 csv 파일 구축

In [440]:
# Node table for Gephi: one row per node label, with the row position as id.
nodes = pd.DataFrame({'label': pred_up_df.columns.tolist()})
nodes['id'] = nodes.index

In [441]:
# Put the id column first.
nodes = nodes.loc[:, ['id', 'label']]

In [442]:
# Display the node id/label table.
nodes

Unnamed: 0,id,label
0,0,A01K
1,1,A01M
2,2,A01N
3,3,A41D
4,4,A42B
...,...,...
325,325,H04W
326,326,H05B
327,327,H05F
328,328,H05H


In [461]:
# nodes.to_csv('../data/nodeid.csv', index=False)

#### 예측 그래프 edgelist에 기존에 존재하던 edge와 새로 생성된 edge를 구별하여 레이블 부여 
- int 형태의 레이블(1, 2)로 설정할 경우

In [491]:
# Graph built from the binarised upper-triangular prediction.
# NOTE(review): `pred_G` is reassigned further down from the `pred` frame, and
# nothing in between reads this value in the visible notebook.
pred_G = nx.from_pandas_adjacency(pred_up_df)

In [492]:
# Convert the adjacency frame into a long edge list (source, target, label),
# keeping only non-zero entries and renumbering the rows from 0.
pred_elist = (
    pred_up_df
    .stack()
    .reset_index()
    .rename(columns={'level_0':'source', 'level_1':'target', 0: 'label'})
    .loc[lambda frame: frame['label'] != 0]
    .assign(label=lambda frame: frame['label'].astype(int))
    .reset_index(drop=True)
)
pred_elist

Unnamed: 0,source,target,label
0,A01K,G05F,1
1,A01K,G06Q,1
2,A01K,H04Q,1
3,A01N,C07D,1
4,A01N,C07K,1
...,...,...,...
2194,H04L,H04M,1
2195,H04L,H04N,1
2196,H04L,H04W,1
2197,H04M,H04N,1


In [493]:
# Entries present in the prediction but not in the original adjacency.
new_edge = np.subtract(pred_up, adj_up)

In [494]:
# Labelled frame of the newly predicted edges.
new_edge_df = pd.DataFrame(data=new_edge, index=idx, columns=idx)
new_edge_df

Unnamed: 0,A01K,A01M,A01N,A41D,A42B,A47B,A61B,A61F,A61H,A61K,...,H04M,H04N,H04Q,H04R,H04S,H04W,H05B,H05F,H05H,H05K
A01K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A41D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A42B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H04W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [495]:
# Graph whose edges are the non-zero entries of `new_edge_df`; used below to
# tell newly predicted edges apart from pre-existing ones.
newedge_G = nx.from_pandas_adjacency(new_edge_df)

In [496]:
# Mark edges that also appear in `newedge_G` with label 2; all others keep
# their existing label.
# Edges are undirected, so endpoints are compared as unordered pairs. A set
# lookup replaces the nested scan over every edge, and `.loc` replaces the
# chained assignment (`df['label'][i] = ...`), which is unreliable in pandas.
new_edge_pairs = {frozenset(edge) for edge in newedge_G.edges()}
is_new_edge = [
    frozenset(pair) in new_edge_pairs
    for pair in zip(pred_elist['source'], pred_elist['target'])
]
pred_elist.loc[is_new_edge, 'label'] = 2

In [497]:
# Sanity check: number of edges per label value.
pred_elist['label'].value_counts()

1    1564
2     635
Name: label, dtype: int64

In [498]:
# Replace the node labels in source/target with their integer node ids.
# The edge-list labels are upper-cased node names while `nodes2idx` is keyed
# by the raw names, so the lookup table is rebuilt with upper-cased keys.
label_to_id = {str(node).upper(): node_id for node, node_id in nodes2idx.items()}
source_ids = pred_elist['source'].map(label_to_id)
target_ids = pred_elist['target'].map(label_to_id)
if source_ids.isna().any() or target_ids.isna().any():
    # Also triggers when this cell is re-run after the labels were already replaced by ids.
    raise KeyError('pred_elist contains source/target values with no matching node id')
# Vectorised column replacement instead of per-row chained assignment.
pred_elist = pred_elist.assign(
    source=source_ids.astype(int),
    target=target_ids.astype(int),
)
pred_elist

Unnamed: 0,source,target,label
0,0,260,1
1,0,266,1
2,0,322,1
3,2,121,1
4,2,123,1
...,...,...,...
2194,319,320,1
2195,319,321,1
2196,319,325,1
2197,320,321,1


In [499]:
# pred_elist.to_csv('../data/pred_int_label.csv', index=False)

#### Float 레이블이 필요할 경우

In [500]:
# Independent copy of the edge list whose label column is float-typed.
pred_elist2 = pred_elist.astype({'label': float})

In [501]:
# Convert the labels to float weights: label 1 becomes 0.95, any other label
# becomes 1.0. Done in one vectorised step instead of per-row chained
# assignment, which pandas does not guarantee will write through.
pred_elist2['label'] = np.where(pred_elist2['label'] == 1, 0.95, 1.)

In [503]:
# The float column is an edge weight, so name it accordingly.
pred_elist2 = pred_elist2.rename({'label': 'weight'}, axis='columns')
pred_elist2

Unnamed: 0,source,target,weight
0,0,260,0.95
1,0,266,0.95
2,0,322,0.95
3,2,121,0.95
4,2,123,0.95
...,...,...,...
2194,319,320,0.95
2195,319,321,0.95
2196,319,325,0.95
2197,320,321,0.95


In [504]:
# Write the weighted edge list (source, target, weight) to disk without the
# row index. Unlike the other export cells, this one is active and overwrites
# the file on every run.
pred_elist2.to_csv('../data/pred_float_label.csv', index=False)

### 예측 그래프 저장

In [517]:
# Labelled frame of the thresholded average scores plus the original adjacency.
# Unlike `pred_np`, this sum is not binarised, so entries act as edge weights.
pred = pd.DataFrame(avg_recon_adjs + adj, columns=idx, index=idx)

In [518]:
# Display the labelled prediction matrix.
pred

Unnamed: 0,A01K,A01M,A01N,A41D,A42B,A47B,A61B,A61F,A61H,A61K,...,H04M,H04N,H04Q,H04R,H04S,H04W,H05B,H05F,H05H,H05K
A01K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A41D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A42B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H04W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H05H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [519]:
# Build the predicted graph from the `pred` frame (this replaces the earlier
# `pred_G`) and convert it to node-link data for JSON serialisation.
pred_G = nx.from_pandas_adjacency(pred)
pred_G_json = json_graph.node_link_data(pred_G)

In [520]:
# Display the predicted graph object.
pred_G

<networkx.classes.graph.Graph at 0x1e53b22c880>

In [522]:
# with open('../data/pred.graph', 'w') as f:
#     json.dump(pred_G_json, f)

In [523]:
# with open('../data/pred.graph', 'r') as f:
#     data1 = json.load(f)
# pred_g = json_graph.node_link_graph(data1)
# pred_g

<networkx.classes.graph.Graph at 0x1e53b5dc970>

In [525]:
# Edge count of the predicted graph. This reads the in-memory `pred_G`,
# because `pred_g` is only created by code that is commented out and would
# raise NameError on a fresh kernel.
pred_G.number_of_edges()

2199