In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio===0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
from gae.model import GCN_AE
from gae.optimizer import loss_function
from gae.utils import *
import numpy as np
import scipy.sparse as sp
import torch
from torch import optim
import torch.nn.functional as F
from datetime import datetime, timedelta
import pandas as pd
import time
import json
from collections import OrderedDict

In [None]:
# Show the CUDA version reported by the installed PyTorch build.
print(torch.version.cuda)

11.1


### Settings

In [None]:
# Experiment-level settings consumed by the data-loading, training and logging cells.
model_name = 'gae'  # stored under the 'model' key of each result record
dataset = 'pubmed' # 'citeseer', 'cora', 'pubmed', 'arx'
task = 'link_prediction' # 'classification', 'link_prediction'
feature_use = True # option only for link prediction
feat_norm = False  # forwarded to load_data and recorded in the result log
n_iter = 10  # number of repeated runs performed by gae_iter per setting

In [None]:
# Hyperparameters; packed into the `hparams` tuple that gae_iter unpacks.
hidden1 = 32  # passed to the GCN_AE constructor
hidden2 = 16  # passed to the GCN_AE constructor
lr = 0.01  # learning rate of the Adam optimizer
dropout = 0.  # passed to the GCN_AE constructor
epochs = 200  # training epochs per run
test_val_ratio = [0.1, 0.05] # len_test = len_total * test_ratio, len_val = len_total * val_ratio
n_diff_range = [5]  # n_diff values swept by the experiment loop (forwarded to preprocess_graph_diff)
alpha_range = [0.5]  # alpha values swept by the experiment loop (forwarded to preprocess_graph_diff)

In [None]:
# n_diff = 1
# alpha = 0.5

In [None]:
# hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

### Import dataset

In [None]:
# Load the graph for the configured task. load_data returns an extra `labels`
# value only for the classification task.
if task == 'link_prediction':
    adj, features = load_data(dataset, task, feat_norm)
elif task == 'classification':
    adj, features, labels = load_data(dataset, task, feat_norm)
else:
    # Fail here rather than with a NameError on `adj` in a later cell.
    raise ValueError(
        "Unsupported task {!r}; expected 'link_prediction' or 'classification'".format(task)
    )

In [None]:
# Device on which gae_iter places the model and tensors; currently pinned to the CPU.
device = torch.device('cpu')
device

device(type='cpu')

In [26]:
# Read the previously saved result records; the `with` block closes the file
# handle as soon as parsing is done.
with open('results_link_prediction.json') as f:
    data = json.load(f)
len(data)

400

In [27]:
# Peek at a few of the stored result records (indices 350-354).
data[350:355]

[{'ap': [0.9179010442650157,
   0.904843431612509,
   0.8937775710943865,
   0.8876549357578043,
   0.9147400331010008,
   0.9045643381497407,
   0.9135599283106204,
   0.9119238566121137,
   0.9163552305094279,
   0.914469768802574],
  'ap_mean': 0.9079790138215194,
  'ap_std': 0.009692256587751651,
  'dataset': 'pubmed',
  'datetime': '2021-10-05 15:48:47',
  'elapsed time(s)': 47.0691,
  'experiment mode': 'simplified diffusion mode(n_diff=1, alpha=0.1)',
  'iteration': 10,
  'model': 'gae',
  'roc': [0.9043511736814662,
   0.8960328250710251,
   0.8813648524187535,
   0.8924462377381109,
   0.9002870244482455,
   0.899288042922094,
   0.9024360529996248,
   0.9022654951780867,
   0.9057253824150014,
   0.910574097627297],
  'roc_mean': 0.8994771184499706,
  'roc_std': 0.007697485438875076,
  'setting_order': ['epochs',
   'hidden1',
   'hidden2',
   'lr',
   'dropout',
   'test_val_ratio',
   'feat_norm',
   'feat_use'],
  'setting_value': [200, 32, 16, 0.01, 0.0, [0.1, 0.05], Fals

### GAE(iterative operation)

In [None]:
# Sweep every (n_diff, alpha) combination. Each finished setting is appended to
# the JSON log immediately, so an interrupted sweep keeps the completed records.
# NOTE: gae_iter is defined in a cell further down this notebook; run that cell
# before this one on a fresh kernel.
results_path = 'results_link_prediction_2.json'

for n_diff in n_diff_range:
    for alpha in alpha_range:

        hparams = epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha

        results = gae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

        # Timestamps are shifted by +9h relative to the runtime's local clock.
        date = (datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
        # The log stores the shared settings only; n_diff/alpha go into 'experiment mode'.
        hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
        if n_diff == 1 and alpha == 0.5:
            experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
        else:
            experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

        log_dict= OrderedDict()
        log_dict['model'] = model_name
        log_dict['dataset'] = dataset
        log_dict['datetime'] = date
        log_dict['experiment mode'] =  experiment_mode
        log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
        log_dict['setting_value'] = hparams_add
        log_dict['iteration'] = n_iter
        log_dict['roc'] = results[0]
        log_dict['roc_mean'] = np.mean(results[0])
        log_dict['roc_std'] = np.std(results[0])
        log_dict['ap'] = results[1]
        log_dict['ap_mean'] = np.mean(results[1])
        log_dict['ap_std'] = np.std(results[1])
        log_dict['elapsed time(s)'] = np.round(results[2], 4)
        print(pd.DataFrame(log_dict.items(), columns=['key', 'value']))

        # Read-modify-write of the log file. A missing file means this is the
        # first record, so start from an empty list instead of discarding the
        # finished run with a FileNotFoundError.
        try:
            with open(results_path) as f:
                data = json.load(f)
        except FileNotFoundError:
            data = []
        data.append(log_dict)
        with open(results_path, 'w') as f:
            json.dump(data, f)
        print("Last data saved at: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
        print("Total data num: {}".format(len(data)))

Dataset: pubmed
Simdiff_params: n_diff =  5, alpha = 0.5
Feature use: True
Start time: 2021-10-23 11:02:54
Experiment 1 - 1/200 val_roc: 0.8764 val_ap: 0.8596


KeyboardInterrupt: ignored

### GAE(single operation)

In [None]:
# results = gae_iter(adj, features, hparams, feature_use=feature_use, n_iter=n_iter)

### Save log

In [None]:
# date = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")
# hparams_add = hparams[:-2] + (feat_norm,) + (feature_use,)
# if n_diff == 1 and alpha == 0.5:
#     experiment_mode = 'normal mode(n_diff={}, alpha={})'.format(n_diff, alpha)
# else:
#     experiment_mode = 'simplified diffusion mode(n_diff={}, alpha={})'.format(n_diff, alpha)

In [None]:
# log_dict= OrderedDict()

In [None]:
# log_dict['model'] = model_name
# log_dict['dataset'] = dataset
# log_dict['datetime'] = date
# log_dict['experiment mode'] =  experiment_mode
# log_dict['setting_order'] = ['epochs', 'hidden1', 'hidden2', 'lr', 'dropout', 'test_val_ratio', 'feat_norm', 'feat_use']
# log_dict['setting_value'] = hparams_add
# log_dict['iteration'] = n_iter
# log_dict['roc'] = results[0]
# log_dict['roc_mean'] = np.mean(results[0])
# log_dict['roc_std'] = np.std(results[0])
# log_dict['ap'] = results[1]
# log_dict['ap_mean'] = np.mean(results[1])
# log_dict['ap_std'] = np.std(results[1])
# log_dict['elapsed time(s)'] = np.round(results[2], 4)
# log_dict
# pd.DataFrame(log_dict.items(), columns=['key', 'value'])

In [None]:
# data = json.load(open('results_link_prediction.json'))
# data.append(log_dict)
# with open('results_link_prediction.json', 'w') as f:
#     json.dump(data, f)

In [None]:
# # if wanna clear results(be careful!)
# empty = []
# with open('results_link_prediction.json', 'w') as f:
#     json.dump(empty, f)

In [None]:
def gae_iter(adj, features, hparams, feature_use=False, n_iter=1):
    """Train and score the GCN auto-encoder ``n_iter`` times, each on a new edge split.

    Reads two notebook-level globals: ``dataset`` (only for the log header) and
    ``device`` (where the model and all tensors are placed).

    Args:
        adj: Adjacency matrix of the full graph. It is passed to ``remove_diag``
            and ``mask_test_edges`` and queried through ``.shape`` / ``.sum()``
            (presumably a scipy sparse matrix -- confirm against ``load_data``).
        features: 2-D node-feature tensor; replaced by an identity matrix when
            ``feature_use`` is falsy.
        hparams: 8-tuple ``(epochs, hidden1, hidden2, lr, dropout,
            test_val_ratio, n_diff, alpha)``.
        feature_use: Whether to train on ``features`` (True) or on one-hot
            identity features (False).
        n_iter: Number of independent runs.

    Returns:
        Tuple ``(roc_results, ap_results, elapsed_time)``: the per-run test ROC
        scores, the per-run test AP scores, and the total wall-clock seconds.
    """
    epochs, hidden1, hidden2, lr, dropout, test_val_ratio, n_diff, alpha = hparams
    # mask_test_edges is called with the reciprocals of the requested ratios.
    test = 1/test_val_ratio[0]
    val = 1/test_val_ratio[1]
    roc_results = []
    ap_results = []
    start = time.time()
    print("Dataset: {}".format(dataset))
    print("Simdiff_params: n_diff =  {}, alpha = {}".format(n_diff, alpha))
    print("Feature use: {}".format(feature_use))
    print("Start time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))

    # Loss re-weighting terms depend on the full graph only, so they are
    # computed once instead of once per run.
    n_pairs = adj.shape[0] * adj.shape[0]
    n_pos = adj.sum()
    pos_weight = torch.Tensor([float(n_pairs - n_pos) / n_pos]).to(device)
    norm = n_pairs / float((n_pairs - n_pos) * 2)

    # Featureless mode: one-hot node identities. torch.eye builds the float32
    # matrix directly, without a dense float64 scipy/numpy intermediate.
    if not feature_use:
        features = torch.eye(adj.shape[0], dtype=torch.float32)
    features = features.to(device)
    _, feat_dim = features.shape

    for i in range(n_iter):
        adj_orig = remove_diag(adj)
        # An AssertionError from mask_test_edges is treated as a rejected split
        # and the split is simply drawn again until one is accepted.
        while True:
            try:
                (adj_train, train_edges, val_edges, val_edges_false,
                 test_edges, test_edges_false) = mask_test_edges(adj, test, val)
            except AssertionError:
                continue
            break
        adj_norm = preprocess_graph_diff(adj_train, n_diff, alpha).to(device)

        # Reconstruction target: training adjacency plus self-loops.
        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = torch.FloatTensor(adj_label.toarray()).to(device)

        model = GCN_AE(feat_dim, hidden1, hidden2, dropout).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        hidden_emb = None

        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()
            reconstructed = model(features, adj_norm)
            loss = loss_function(preds=reconstructed, labels=adj_label, norm=norm, pos_weight=pos_weight)
            loss.backward()
            optimizer.step()

            # Snapshot the model output only when it is needed: on logged
            # epochs and on the final epoch (used for the test score below).
            # Validation scoring is skipped on silent epochs; this assumes
            # get_roc_score_gae is a pure scoring helper -- TODO confirm.
            log_epoch = epoch % 50 == 0
            if log_epoch or epoch == epochs - 1:
                hidden_emb = reconstructed.detach().cpu().numpy()
            if log_epoch:
                roc_curr, ap_curr = get_roc_score_gae(hidden_emb, adj_orig, val_edges, val_edges_false)
                print('Experiment {} - {}/{} val_roc: {:.4f} val_ap: {:.4f}'.format(i+1, epoch+1, epochs, roc_curr, ap_curr), end='\n', flush=True)

        roc_score, ap_score = get_roc_score_gae(hidden_emb, adj_orig, test_edges, test_edges_false)
        roc_results.append(roc_score)
        ap_results.append(ap_score)
        print('Experiment {} result - ROC(AUC) score: {}, AP score: {}'.format(i+1, np.round(roc_score, 5), np.round(ap_score, 5)), end='\n')
    elapsed_time = time.time()-start
    print('All experiments finished!', '\nElapsed time: {:.2f}s'.format(elapsed_time), end='\n')
    print("End time: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
    return roc_results, ap_results, elapsed_time