In [2]:
# Mount Google Drive at /content/drive (Colab-only API). The next cell changes
# into MyDrive, so later cells resolve relative paths from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# import torch_xla
# import torch_xla.core.xla_model as xm

In [None]:
from vgae.model import GCN_VAE
from vgae.optimizer import loss_function
from vgae.utils import * 
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from torch import optim
import torch.nn.functional as F
from datetime import datetime, timedelta
import time
import json
from collections import OrderedDict

### Settings

In [None]:
# --- What is being run -------------------------------------------------
model_name = 'vgae'          # label written into the results log
task = 'link_prediction'     # one of: 'classification', 'link_prediction'
dataset = 'pubmed'           # one of: 'citeseer', 'cora', 'pubmed', 'arx'

# --- Node features -----------------------------------------------------
feat_norm = False            # forwarded to load_data
feature_use = True           # option only for link prediction

# --- Repetitions -------------------------------------------------------
n_iter = 10                  # number of runs per hyper-parameter setting

In [None]:
# --- Model (forwarded to GCN_VAE) --------------------------------------
hidden1 = 32
hidden2 = 16
dropout = 0.0

# --- Optimisation ------------------------------------------------------
epochs = 200
lr = 0.01

# --- Edge split --------------------------------------------------------
# [test_ratio, val_ratio]:
#   len_test = len_total * test_ratio, len_val = len_total * val_ratio
test_val_ratio = [0.1, 0.05]

# --- Diffusion sweep (every n_diff is combined with every alpha) -------
n_diff_range = [2]
alpha_range = [0.1, 0.3, 0.5, 0.7, 0.9]

In [None]:
# n_diff = 5
# alpha = 0.5
# hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

### Import dataset

In [None]:
# Load the graph for the selected task. load_data returns an extra `labels`
# value for classification, so the two tasks unpack differently.
if task == 'link_prediction':
    adj, features = load_data(dataset, task, feat_norm)
elif task == 'classification':
    adj, features, labels = load_data(dataset, task, feat_norm)
else:
    # Fail here rather than with a NameError on `adj` several cells later.
    raise ValueError(
        "Unknown task {!r}: expected 'link_prediction' or 'classification'".format(task)
    )

In [None]:
# if dataset == 'pubmed':
#     device = torch.device('cpu')
# else: 
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = xm.xla_device()
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### VGAE(iterative operation)

In [None]:
# Sweep every (n_diff, alpha) combination: run vgae_iter, summarise the scores,
# and append one record per combination to the JSON results log.
results_path = 'results_link_prediction.json'

for n_diff in n_diff_range:
    for alpha in alpha_range:

        hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

        results = vgae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

        # NOTE(review): the +9h shift presumably converts a UTC runtime clock
        # to local time -- confirm the runtime's timezone.
        date = (datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
        # n_diff and alpha are dropped here; they are recorded in `experiment_mode`.
        hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
        if n_diff == 1 and alpha == 0.5:
            experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
        else:
            experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

        log_dict= OrderedDict()
        log_dict['model'] = model_name
        log_dict['dataset'] = dataset
        log_dict['datetime'] = date
        log_dict['experiment mode'] =  experiment_mode
        log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
        log_dict['setting_value'] = hparams_add
        log_dict['iteration'] = n_iter
        log_dict['roc'] = results[0]
        log_dict['roc_mean'] = np.mean(results[0])
        log_dict['roc_std'] = np.std(results[0])
        log_dict['ap'] = results[1]
        log_dict['ap_mean'] = np.mean(results[1])
        log_dict['ap_std'] = np.std(results[1])
        log_dict['elapsed time(s)'] = np.round(results[2], 4)
        print(pd.DataFrame(log_dict.items(), columns=['key', 'value']))

        # Read the existing log with a context manager so the handle is closed,
        # and start a fresh log if none exists yet, so a finished training run
        # is never lost to a FileNotFoundError.
        try:
            with open(results_path) as f:
                data = json.load(f)
        except FileNotFoundError:
            data = []
        data.append(log_dict)
        with open(results_path, 'w') as f:
            json.dump(data, f)
        print("Last data saved at: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
        print("Total data num: {}".format(len(data)))

Dataset: pubmed
Simdiff_params: n_diff =  2, alpha = 0.1
Start time: 2021-09-08 23:31:17
Experiment 1 - 1/200 val_roc: 0.6807 val_ap: 0.6599
Experiment 1 - 51/200 val_roc: 0.8102 val_ap: 0.8152
Experiment 1 - 101/200 val_roc: 0.8296 val_ap: 0.8352
Experiment 1 - 151/200 val_roc: 0.8927 val_ap: 0.8928
Experiment 1 result - ROC(AUC) score: 0.92514, AP score: 0.92331
Experiment 2 - 1/200 val_roc: 0.7216 val_ap: 0.7117
Experiment 2 - 51/200 val_roc: 0.8108 val_ap: 0.8121
Experiment 2 - 101/200 val_roc: 0.8825 val_ap: 0.8804
Experiment 2 - 151/200 val_roc: 0.9161 val_ap: 0.9146
Experiment 2 result - ROC(AUC) score: 0.92653, AP score: 0.92411
Experiment 3 - 1/200 val_roc: 0.7378 val_ap: 0.7185
Experiment 3 - 51/200 val_roc: 0.8258 val_ap: 0.8220
Experiment 3 - 101/200 val_roc: 0.8724 val_ap: 0.8731
Experiment 3 - 151/200 val_roc: 0.8871 val_ap: 0.8921
Experiment 3 result - ROC(AUC) score: 0.89644, AP score: 0.89895
Experiment 4 - 1/200 val_roc: 0.6796 val_ap: 0.6681
Experiment 4 - 51/200 val

### VGAE(single operation)

In [None]:
# results = vgae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

### Save log

In [None]:
# date = (datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
# hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
# if n_diff == 1 and alpha == 0.5:
#     experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
# else:
#     experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

In [None]:
# log_dict= OrderedDict()

In [None]:
# log_dict['model'] = model_name
# log_dict['dataset'] = dataset
# log_dict['datetime'] = date
# log_dict['experiment mode'] =  experiment_mode
# log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
# log_dict['setting_value'] = hparams_add
# log_dict['iteration'] = n_iter
# log_dict['roc'] = results[0]
# log_dict['roc_mean'] = np.mean(results[0])
# log_dict['roc_std'] = np.std(results[0])
# log_dict['ap'] = results[1]
# log_dict['ap_mean'] = np.mean(results[1])
# log_dict['ap_std'] = np.std(results[1])
# log_dict['elapsed time(s)'] = np.round(results[2], 4)
# pd.DataFrame(log_dict.items(), columns=['key', 'value'])

In [None]:
# data = json.load(open('results_link_prediction.json'))
# data.append(log_dict)
# with open('results_link_prediction.json', 'w') as f:
#     json.dump(data, f)
# print("Last data saved at: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
# print("Total data num: {}".format(len(data)))

Last data saved at: 2021-09-03 11:07:55
Total data num: 12


In [None]:
# To clear all saved results (be careful!), uncomment and run the lines below:
# empty = []
# with open('results_link_prediction.json', 'w') as f:
#     json.dump(empty, f)

### Define VGAE (run this cell before the experiment cells above, which call `vgae_iter`)

In [None]:
def _call_with_retry(func, exc_type, max_retries):
    """Return ``func()``, retrying when it raises ``exc_type``.

    At most ``max_retries`` retries are made after the first attempt. Once the
    budget is spent the last exception propagates, so a persistent failure
    surfaces instead of looping forever.
    """
    failures = 0
    while True:
        try:
            return func()
        except exc_type:
            failures += 1
            if failures > max_retries:
                raise


def vgae_iter(adj, features, hparams, feature_use=False, n_iter=1, max_retries=100):
    """Train and evaluate a GCN_VAE ``n_iter`` times and collect the test scores.

    Each run draws a fresh train/val/test edge split with ``mask_test_edges``,
    trains a new model for ``epochs`` steps with Adam, and scores the final
    embeddings on the held-out test edges.

    Relies on the notebook globals ``dataset`` (only for the log header) and
    ``device`` (where tensors and the model are placed).

    Args:
        adj: Adjacency matrix of the full graph. It must support ``.shape`` and
            ``.sum()``; presumably a scipy sparse matrix -- confirm against
            ``load_data``.
        features: 2-D feature tensor supporting ``.to(device)``. Ignored and
            replaced by an identity matrix when ``feature_use`` is false.
        hparams: 8-tuple ``(epochs, hidden1, hidden2, lr, dropout,
            test_val_ratio, n_diff, alpha)``.
        feature_use: Use ``features`` as given (True) or identity features (False).
        n_iter: Number of independent runs.
        max_retries: Maximum number of retries for a failing edge split
            (``AssertionError``) or loss evaluation (``RuntimeError``) before
            the error is re-raised.

    Returns:
        ``(roc_results, ap_results, elapsed_time)``: the per-run test ROC
        scores, the per-run test AP scores, and the total wall-clock seconds.
    """
    epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha = hparams
    # mask_test_edges is handed the reciprocals of the ratios (e.g. 0.1 -> 10.0).
    test = 1/test_val_ratio[0]
    val = 1/test_val_ratio[1]
    roc_results = []
    ap_results = []
    start = time.time()
    print("Dataset: {}".format(dataset))
    print("Simdiff_params: n_diff =  {}, alpha = {}".format(n_diff, alpha))
    print("Start time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))

    # --- Run-invariant inputs: built once instead of once per run -----------
    if not feature_use:
        # Featureless setting: identity features, one row per node.
        features = sp.identity(adj.shape[0])
        features = torch.FloatTensor(np.array(features.todense()))
    n_nodes, feat_dim = features.shape
    features = features.to(device)

    # NOTE(review): pos_weight and norm are derived from the full `adj`, while
    # the reconstruction labels below come from `adj_train` -- confirm this is
    # intended.
    n_pairs = adj.shape[0] * adj.shape[0]
    n_links = adj.sum()
    pos_weight = torch.Tensor([float(n_pairs - n_links) / n_links]).to(device)
    norm = n_pairs / float((n_pairs - n_links) * 2)

    for i in range(n_iter):
        adj_orig = remove_diag(adj)
        # The split can fail its internal assertions; redraw a bounded number
        # of times.
        split = _call_with_retry(
            lambda: mask_test_edges(adj, test, val), AssertionError, max_retries
        )
        adj_train, _train_edges, val_edges, val_edges_false, test_edges, test_edges_false = split

        adj_norm = preprocess_graph_diff(adj_train, n_diff, alpha).to(device)

        # Reconstruction target: training adjacency plus self-loops.
        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = torch.FloatTensor(adj_label.toarray()).to(device)

        model = GCN_VAE(feat_dim, hidden1, hidden2, dropout)
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        hidden_emb = None

        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()
            recovered, mu, logvar = model(features, adj_norm)
            # Retry is kept as best-effort but bounded: the inputs do not
            # change between attempts, so a persistent RuntimeError must surface.
            loss = _call_with_retry(
                lambda: loss_function(preds=recovered, labels=adj_label, mu=mu, logvar=logvar,
                                      n_nodes=n_nodes, norm=norm, pos_weight=pos_weight),
                RuntimeError,
                max_retries,
            )
            loss.backward()
            optimizer.step()

            # Embeddings from this step's forward pass (before the update).
            hidden_emb = mu.detach().cpu().numpy()
            if epoch % 50 == 0:
                # Validation scores are only reported, so compute them only
                # when they are printed.
                roc_curr, ap_curr = get_roc_score_vgae(hidden_emb, adj_orig, val_edges, val_edges_false)
                print('Experiment {} - {}/{} val_roc: {:.4f} val_ap: {:.4f}'.format(i+1, epoch+1, epochs, roc_curr, ap_curr), flush=True)

        roc_score, ap_score = get_roc_score_vgae(hidden_emb, adj_orig, test_edges, test_edges_false)
        roc_results.append(roc_score)
        ap_results.append(ap_score)
        print('Experiment {} result - ROC(AUC) score: {}, AP score: {}'.format(i+1, np.round(roc_score, 5), np.round(ap_score, 5)))
    elapsed_time = time.time()-start
    print('All experiments finished!', '\nElapsed time: {:.2f}s'.format(elapsed_time))
    print("End time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
    return roc_results, ap_results, elapsed_time