In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio===0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
from gae.model import GCN_AE
from gae.optimizer import loss_function
from gae.utils import *
import numpy as np
import scipy.sparse as sp
import torch
from torch import optim
import torch.nn.functional as F
from datetime import datetime, timedelta
import pandas as pd
import time
import json
from collections import OrderedDict

In [None]:
# Show the CUDA version reported by the installed PyTorch build.
print(torch.version.cuda)

11.1


### Settings

In [None]:
# Experiment-level settings. model_name and dataset are written into every log entry;
# dataset, task and feat_norm are passed to load_data.
model_name = 'gae'
dataset = 'cora' # 'citeseer', 'cora', 'pubmed', 'arx'
task = 'link_prediction' # 'classification', 'link_prediction'
feature_use = False # option only for link prediction
# Forwarded to load_data as its third argument.
feat_norm = False
# Number of repeated runs gae_iter performs per hyper-parameter combination.
n_iter = 10

In [None]:
# Model / training hyper-parameters, packed into the `hparams` tuple handed to gae_iter.
hidden1 = 32   # passed to GCN_AE together with hidden2
hidden2 = 16
lr = 0.01      # Adam learning rate
dropout = 0.   # passed to GCN_AE
epochs = 200   # optimisation steps per run
test_val_ratio = [0.1, 0.05] # len_test = len_total * test_ratio, len_val = len_total * val_ratio
# Grid searched by the experiment loop; each (n_diff, alpha) pair is forwarded to preprocess_graph_diff.
n_diff_range = [1,2,3,4,5]
alpha_range = [0.1,0.3,0.5,0.7,0.9]

In [None]:
# n_diff = 3
# alpha = 0.5

In [None]:
# hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

### Import dataset

In [None]:
# Load the graph for the configured task. The classification variant additionally returns labels.
if task == 'link_prediction':
    adj, features = load_data(dataset, task, feat_norm)
elif task == 'classification':
    adj, features, labels = load_data(dataset, task, feat_norm)
else:
    # Fail here instead of with a NameError on `adj` several cells later.
    raise ValueError("Unknown task: {!r} (expected 'link_prediction' or 'classification')".format(task))

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the notebook still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
# Count the entries already stored in the results log (the file must already exist).
with open('results_link_prediction_add.json') as f:
    data = json.load(f)
len(data)

50

### GAE(iterative operation)

In [None]:
# Grid search over the diffusion settings: one gae_iter call (n_iter repeated runs) per
# (n_diff, alpha) pair. Each summary is appended to the JSON log as soon as it is available.
# NOTE: gae_iter is defined in a later cell of this notebook; that cell has to be run first.
RESULTS_PATH = 'results_link_prediction_add.json'
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

for n_diff in n_diff_range:
    for alpha in alpha_range:

        hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

        results = gae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

        # Timestamps are the runtime's local clock shifted by +9 hours.
        date = (datetime.now() + timedelta(hours=9)).strftime(TIME_FORMAT)
        # n_diff and alpha are recorded in the mode string, so the settings tuple carries
        # the two feature flags in their place (matching 'setting_order' below).
        hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
        if n_diff == 1 and alpha == 0.5:
            experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
        else:
            experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

        log_dict = OrderedDict()
        log_dict['model'] = model_name
        log_dict['dataset'] = dataset
        log_dict['datetime'] = date
        log_dict['experiment mode'] = experiment_mode
        log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
        log_dict['setting_value'] = hparams_add
        log_dict['iteration'] = n_iter
        log_dict['roc'] = results[0]
        log_dict['roc_mean'] = np.mean(results[0])
        log_dict['roc_std'] = np.std(results[0])
        log_dict['ap'] = results[1]
        log_dict['ap_mean'] = np.mean(results[1])
        log_dict['ap_std'] = np.std(results[1])
        log_dict['elapsed time(s)'] = np.round(results[2], 4)
        print(pd.DataFrame(list(log_dict.items()), columns=['key', 'value']))

        # Re-read, append and rewrite the log; context managers close the handles deterministically.
        with open(RESULTS_PATH) as f:
            data = json.load(f)
        data.append(log_dict)
        with open(RESULTS_PATH, 'w') as f:
            json.dump(data, f)
        print("Last data saved at: {}".format((datetime.now() + timedelta(hours=9)).strftime(TIME_FORMAT)))
        print("Total data num: {}".format(len(data)))

Dataset: cora
Simdiff_params: n_diff =  1, alpha = 0.1
Feature use: True
Start time: 2021-12-23 10:59:10
Experiment 1 - 1/200 val_roc: 0.6321 val_ap: 0.6230
Experiment 1 - 51/200 val_roc: 0.8240 val_ap: 0.8461
Experiment 1 - 101/200 val_roc: 0.8370 val_ap: 0.8524
Experiment 1 - 151/200 val_roc: 0.8468 val_ap: 0.8608
Experiment 1 result - ROC(AUC) score: 0.83716, AP score: 0.86517
Experiment 2 - 1/200 val_roc: 0.5674 val_ap: 0.5513
Experiment 2 - 51/200 val_roc: 0.8456 val_ap: 0.8534
Experiment 2 - 101/200 val_roc: 0.8535 val_ap: 0.8607
Experiment 2 - 151/200 val_roc: 0.8429 val_ap: 0.8558
Experiment 2 result - ROC(AUC) score: 0.83476, AP score: 0.85721
Experiment 3 - 1/200 val_roc: 0.6034 val_ap: 0.5958
Experiment 3 - 51/200 val_roc: 0.7982 val_ap: 0.8022
Experiment 3 - 101/200 val_roc: 0.8175 val_ap: 0.8339
Experiment 3 - 151/200 val_roc: 0.8304 val_ap: 0.8500
Experiment 3 result - ROC(AUC) score: 0.85185, AP score: 0.85915
Experiment 4 - 1/200 val_roc: 0.5866 val_ap: 0.6003
Experimen

  return 1 / (1 + np.exp(-x))


Experiment 4 - 51/200 val_roc: 0.6327 val_ap: 0.6219
Experiment 4 - 101/200 val_roc: 0.6285 val_ap: 0.6247
Experiment 4 - 151/200 val_roc: 0.6427 val_ap: 0.6406
Experiment 4 result - ROC(AUC) score: 0.61662, AP score: 0.62637
Experiment 5 - 1/200 val_roc: 0.5939 val_ap: 0.5555
Experiment 5 - 51/200 val_roc: 0.5535 val_ap: 0.5756
Experiment 5 - 101/200 val_roc: 0.5513 val_ap: 0.5798
Experiment 5 - 151/200 val_roc: 0.5476 val_ap: 0.5755
Experiment 5 result - ROC(AUC) score: 0.5666, AP score: 0.5865
Experiment 6 - 1/200 val_roc: 0.6205 val_ap: 0.5836
Experiment 6 - 51/200 val_roc: 0.5730 val_ap: 0.5710
Experiment 6 - 101/200 val_roc: 0.5735 val_ap: 0.5717
Experiment 6 - 151/200 val_roc: 0.5766 val_ap: 0.5739
Experiment 6 result - ROC(AUC) score: 0.56128, AP score: 0.56575
Experiment 7 - 1/200 val_roc: 0.5627 val_ap: 0.5356


  return 1 / (1 + np.exp(-x))


Experiment 7 - 51/200 val_roc: 0.5354 val_ap: 0.5420
Experiment 7 - 101/200 val_roc: 0.5384 val_ap: 0.5496
Experiment 7 - 151/200 val_roc: 0.5425 val_ap: 0.5535
Experiment 7 result - ROC(AUC) score: 0.56478, AP score: 0.58404
Experiment 8 - 1/200 val_roc: 0.5773 val_ap: 0.5431
Experiment 8 - 51/200 val_roc: 0.5409 val_ap: 0.5488
Experiment 8 - 101/200 val_roc: 0.5520 val_ap: 0.5565
Experiment 8 - 151/200 val_roc: 0.5559 val_ap: 0.5588
Experiment 8 result - ROC(AUC) score: 0.55105, AP score: 0.54365
Experiment 9 - 1/200 val_roc: 0.6750 val_ap: 0.6224
Experiment 9 - 51/200 val_roc: 0.5998 val_ap: 0.6006
Experiment 9 - 101/200 val_roc: 0.6035 val_ap: 0.6059
Experiment 9 - 151/200 val_roc: 0.6124 val_ap: 0.6206
Experiment 9 result - ROC(AUC) score: 0.59343, AP score: 0.59747
Experiment 10 - 1/200 val_roc: 0.6152 val_ap: 0.5709
Experiment 10 - 51/200 val_roc: 0.5908 val_ap: 0.5746
Experiment 10 - 101/200 val_roc: 0.6082 val_ap: 0.5971
Experiment 10 - 151/200 val_roc: 0.6243 val_ap: 0.6146
E

  return 1 / (1 + np.exp(-x))


Experiment 1 - 51/200 val_roc: 0.5556 val_ap: 0.5664
Experiment 1 - 101/200 val_roc: 0.5477 val_ap: 0.5665
Experiment 1 - 151/200 val_roc: 0.5371 val_ap: 0.5605
Experiment 1 result - ROC(AUC) score: 0.59342, AP score: 0.62074
Experiment 2 - 1/200 val_roc: 0.6474 val_ap: 0.5979
Experiment 2 - 51/200 val_roc: 0.6337 val_ap: 0.6377
Experiment 2 - 101/200 val_roc: 0.6398 val_ap: 0.6517
Experiment 2 - 151/200 val_roc: 0.6445 val_ap: 0.6528
Experiment 2 result - ROC(AUC) score: 0.6356, AP score: 0.63625
Experiment 3 - 1/200 val_roc: 0.6573 val_ap: 0.6100
Experiment 3 - 51/200 val_roc: 0.5000 val_ap: 0.5000
Experiment 3 - 101/200 val_roc: 0.5000 val_ap: 0.5000
Experiment 3 - 151/200 val_roc: 0.5000 val_ap: 0.5000
Experiment 3 result - ROC(AUC) score: 0.5, AP score: 0.5
Experiment 4 - 1/200 val_roc: 0.7349 val_ap: 0.7116
Experiment 4 - 51/200 val_roc: 0.5775 val_ap: 0.5567
Experiment 4 - 101/200 val_roc: 0.6052 val_ap: 0.5737
Experiment 4 - 151/200 val_roc: 0.6318 val_ap: 0.5992
Experiment 4 r

  return 1 / (1 + np.exp(-x))


Experiment 6 - 51/200 val_roc: 0.5285 val_ap: 0.5345
Experiment 6 - 101/200 val_roc: 0.5335 val_ap: 0.5451
Experiment 6 - 151/200 val_roc: 0.5404 val_ap: 0.5538
Experiment 6 result - ROC(AUC) score: 0.59392, AP score: 0.59183
Experiment 7 - 1/200 val_roc: 0.7036 val_ap: 0.6514
Experiment 7 - 51/200 val_roc: 0.5172 val_ap: 0.5196
Experiment 7 - 101/200 val_roc: 0.5191 val_ap: 0.5211
Experiment 7 - 151/200 val_roc: 0.5210 val_ap: 0.5230
Experiment 7 result - ROC(AUC) score: 0.52941, AP score: 0.52533
Experiment 8 - 1/200 val_roc: 0.6423 val_ap: 0.5946
Experiment 8 - 51/200 val_roc: 0.5019 val_ap: 0.5019
Experiment 8 - 101/200 val_roc: 0.5019 val_ap: 0.5019
Experiment 8 - 151/200 val_roc: 0.5019 val_ap: 0.5019
Experiment 8 result - ROC(AUC) score: 0.50095, AP score: 0.50095
Experiment 9 - 1/200 val_roc: 0.6753 val_ap: 0.6299
Experiment 9 - 51/200 val_roc: 0.5040 val_ap: 0.5133
Experiment 9 - 101/200 val_roc: 0.5133 val_ap: 0.5161
Experiment 9 - 151/200 val_roc: 0.5135 val_ap: 0.5188
Exper

### GAE(single operation)

In [None]:
# results = gae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

In [None]:
# Mean test ROC over the runs stored in `results`. With the single-run cell commented out,
# `results` is only bound by the grid-search loop, so this is its last (n_diff, alpha) pair.
np.mean(results[0])

0.6873318954808464

### Save log

In [None]:
# date = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")
# hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
# if n_diff == 1 and alpha == 0.5:
#     experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
# else:
#     experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

In [None]:
# log_dict= OrderedDict()

In [None]:
# log_dict['model'] = model_name
# log_dict['dataset'] = dataset
# log_dict['datetime'] = date
# log_dict['experiment mode'] =  experiment_mode
# log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
# log_dict['setting_value'] = hparams_add
# log_dict['iteration'] = n_iter
# log_dict['roc'] = results[0]
# log_dict['roc_mean'] = np.mean(results[0])
# log_dict['roc_std'] = np.std(results[0])
# log_dict['ap'] = results[1]
# log_dict['ap_mean'] = np.mean(results[1])
# log_dict['ap_std'] = np.std(results[1])
# log_dict['elapsed time(s)'] = np.round(results[2], 4)
# log_dict
# pd.DataFrame(log_dict.items(), columns=['key', 'value'])

In [None]:
# data = json.load(open('results_link_prediction.json'))
# data.append(log_dict)
# with open('results_link_prediction.json', 'w') as f:
#     json.dump(data, f)

In [None]:
# # if wanna clear results(be careful!)
# empty = []
# with open('results_link_prediction_add.json', 'w') as f:
#     json.dump(empty, f)

In [None]:
def _reconstruct_eval(model, features, adj_norm):
    """Run `model` in eval mode without gradient tracking and return its output as a NumPy array."""
    model.eval()
    with torch.no_grad():
        return model(features, adj_norm).cpu().numpy()


def gae_iter(adj, features, hparams, feature_use=False, n_iter=1):
    """Train and score the GCN auto-encoder `n_iter` times, each on a fresh edge split.

    Relies on the notebook-level globals `dataset` (only printed) and `device`.

    Args:
        adj: Adjacency matrix handed to `remove_diag` / `mask_test_edges`
            (presumably a scipy sparse matrix - confirm against `load_data`).
        features: Node features, used only when `feature_use` is true; must then be a
            2-D torch tensor. Otherwise an identity matrix of size `adj.shape[0]` is used.
        hparams: Tuple `(epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha)`.
        feature_use: Whether to train on `features` instead of identity features.
        n_iter: Number of independent runs.

    Returns:
        `(roc_results, ap_results, elapsed_time)`: per-run test ROC scores, per-run test
        AP scores, and the total wall-clock time in seconds.

    Raises:
        ValueError: If `epochs` is smaller than 1 (there would be nothing to evaluate).
        RuntimeError: If `mask_test_edges` keeps failing its assertions.
    """
    epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha = hparams
    if epochs < 1:
        raise ValueError("epochs must be at least 1, got {}".format(epochs))

    # mask_test_edges is given the reciprocals of the ratios (e.g. 0.1 -> 10.0).
    test = 1/test_val_ratio[0]
    val = 1/test_val_ratio[1]
    # Upper bound on split retries, so a persistently failing split cannot hang the notebook.
    max_split_attempts = 100

    roc_results = []
    ap_results = []
    start = time.time()
    print("Dataset: {}".format(dataset))
    print("Simdiff_params: n_diff =  {}, alpha = {}".format(n_diff, alpha))
    print("Feature use: {}".format(feature_use))
    print("Start time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))

    # The input features do not depend on the edge split, so build and move them once.
    if not feature_use:
        features = torch.eye(adj.shape[0])
    features = features.to(device)
    feat_dim = features.shape[1]

    for i in range(n_iter):
        adj_orig = remove_diag(adj)

        # mask_test_edges validates its sampled split with assertions; resample on failure.
        for _ in range(max_split_attempts):
            try:
                split = mask_test_edges(adj, test, val)
            except AssertionError:
                continue
            break
        else:
            raise RuntimeError(
                "mask_test_edges failed {} times in a row".format(max_split_attempts))
        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = split

        adj_norm = preprocess_graph_diff(adj_train, n_diff, alpha).to(device)

        # NOTE(review): both weights are derived from the full `adj`, not from `adj_train`,
        # so they include the held-out edges in the count - confirm this is intended.
        pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()]).to(device)
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Reconstruction target: training edges plus self-loops.
        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = torch.FloatTensor(adj_label.toarray()).to(device)

        model = GCN_AE(feat_dim, hidden1, hidden2, dropout).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()
            reconstructed = model(features, adj_norm)
            loss = loss_function(preds=reconstructed, labels=adj_label, norm=norm, pos_weight=pos_weight)
            loss.backward()
            optimizer.step()

            # Validation metrics are only reported every 50 epochs, so only compute them then.
            # Scoring uses a separate eval-mode pass so dropout does not perturb the metrics.
            if epoch % 50 == 0:
                val_output = _reconstruct_eval(model, features, adj_norm)
                roc_curr, ap_curr = get_roc_score_gae(val_output, adj_orig, val_edges, val_edges_false)
                print('Experiment {} - {}/{} val_roc: {:.4f} val_ap: {:.4f}'.format(i+1, epoch+1, epochs, roc_curr, ap_curr), end='\n', flush=True)

        # Final test score from the fully trained weights (after the last optimizer step).
        test_output = _reconstruct_eval(model, features, adj_norm)
        roc_score, ap_score = get_roc_score_gae(test_output, adj_orig, test_edges, test_edges_false)
        roc_results.append(roc_score)
        ap_results.append(ap_score)
        print('Experiment {} result - ROC(AUC) score: {}, AP score: {}'.format(i+1, np.round(roc_score, 5), np.round(ap_score, 5)), end='\n')

    elapsed_time = time.time()-start
    print('All experiments finished!', '\nElapsed time: {:.2f}s'.format(elapsed_time), end='\n')
    print("End time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
    return roc_results, ap_results, elapsed_time