In [28]:
from __future__ import division
from __future__ import print_function

import argparse
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from torch import optim
import networkx as nx

from gae.model import GCNModelVAE
from gae.optimizer import loss_function
from gae.utils import mask_test_edges, preprocess_graph, get_roc_score

In [29]:
# Hyper-parameters are declared through argparse so the same cell also works
# when exported as a script. Each row is (flag, type, default, help text).
_OPTION_SPECS = (
    ('--model', str, 'gcn_vae', "models used"),
    ('--seed', int, 42, 'Random seed.'),
    ('--epochs', int, 200, 'Number of epochs to train.'),
    ('--hidden1', int, 32, 'Number of units in hidden layer 1.'),
    ('--hidden2', int, 16, 'Number of units in hidden layer 2.'),
    ('--lr', float, 0.01, 'Initial learning rate.'),
    ('--dropout', float, 0., 'Dropout rate (1 - keep probability).'),
    ('--dataset-str', str, 'email', 'type of dataset.'),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _default, _help in _OPTION_SPECS:
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)

# parse_known_args (rather than parse_args) tolerates the extra command-line
# arguments a Jupyter kernel is launched with; the unrecognised ones are discarded.
args, _ = parser.parse_known_args()

In [37]:
def load_data(adj_name):
    """Load a graph by name as a symmetric, self-loop-free adjacency matrix.

    Parameters
    ----------
    adj_name : str
        Dataset identifier; one of ``'Cora'``, ``'Citeseer'``, ``'wiki'`` or
        ``'email'``.

    Returns
    -------
    adj : scipy.sparse.csr_matrix
        ``(n, n)`` adjacency matrix with 1.0 at both ``(u, v)`` and ``(v, u)``
        for every edge read and zeros on the diagonal.
    features : torch.Tensor
        ``(n, n)`` identity matrix used as node features.

    Raises
    ------
    ValueError
        If ``adj_name`` is not one of the supported datasets.
    """
    if adj_name in ('Cora', 'Citeseer'):
        nodes_numbers = 2708 if adj_name == 'Cora' else 3327
        # NOTE(review): `Planetoid` is not imported in the visible cells;
        # presumably torch_geometric.datasets.Planetoid -- confirm and add it
        # to the import cell, otherwise this branch raises NameError.
        datasets = Planetoid('./datasets', adj_name)
        edges = datasets[0].edge_index
        # edge_index is indexed as [endpoint, edge]; transpose to one row per edge.
        raw_edges = pd.DataFrame(edges.t().cpu().numpy())
    elif adj_name == 'wiki':
        nodes_numbers = 2405
        raw_edges = pd.read_csv('datasets/graph.txt', header=None, sep='\t')
    elif adj_name == 'email':
        nodes_numbers = 1133
        # Subtracting 1 shifts the node ids read from the file, presumably
        # from 1-based to 0-based.
        raw_edges = pd.read_csv("datasets/ia-email-univ.mtx", header=None, sep=' ') - 1
    else:
        # Fail loudly here instead of printing and then crashing further down
        # on unbound local variables.
        raise ValueError(
            "Unknown dataset {!r}; expected one of 'Cora', 'Citeseer', 'wiki', 'email'.".format(adj_name)
        )

    # Self loops are dropped so the diagonal of the adjacency stays zero.
    edges_no_loops = raw_edges[raw_edges[0] != raw_edges[1]]
    src = edges_no_loops.iloc[:, 0].to_numpy()
    dst = edges_no_loops.iloc[:, 1].to_numpy()

    # Scatter both directions at once; assignment (not accumulation) means
    # duplicate edges in the input still yield a 0/1 matrix.
    graph_np = np.zeros((nodes_numbers, nodes_numbers))
    graph_np[src, dst] = 1
    graph_np[dst, src] = 1

    # Convert directly with SciPy: the NetworkX round-trip relied on
    # nx.from_numpy_matrix, which no longer exists in NetworkX >= 3.0.
    adj = sp.csr_matrix(graph_np)

    features = torch.eye(nodes_numbers)

    return adj, features

In [38]:
def gae_for(args):
    """Train a graph variational auto-encoder for link prediction and report scores.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``dataset_str``, ``hidden1``, ``hidden2``, ``dropout``,
        ``lr`` and ``epochs``. If ``seed`` is present and not ``None`` it is
        used to seed NumPy and PyTorch before any data is split.

    Returns
    -------
    tuple
        ``(roc_score, ap_score)`` as returned by ``get_roc_score`` on the
        held-out test edges. The same values are also printed.
    """
    # The seed was parsed but never applied, so runs were not repeatable.
    # Seed both libraries up front (the edge split presumably draws from
    # NumPy's global RNG -- confirm in gae.utils.mask_test_edges).
    seed = getattr(args, 'seed', None)
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    # Reconstruction target: training adjacency with ones added on the diagonal.
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    # Loss weighting terms derived from the training graph: pos_weight is the
    # ratio of zero entries to the entry sum, norm rescales the loss accordingly.
    n_entries = adj.shape[0] * adj.shape[0]
    edge_total = adj.sum()
    pos_weight = torch.tensor(float(n_entries - edge_total) / edge_total)
    norm = n_entries / float((n_entries - edge_total) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=recovered, labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # detach() is the supported way to drop the autograd graph before
        # handing the embeddings to NumPy (replaces the legacy `.data`).
        hidden_emb = mu.detach().cpu().numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t)
              )

    print("Optimization Finished!")

    # Final evaluation uses the embeddings from the last training step.
    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    return roc_score, ap_score

In [39]:
# Entry point: run training with the parsed `args`. In a notebook kernel
# __name__ is '__main__', so this cell launches the run when executed.
if __name__ == '__main__':
    gae_for(args)

Using soc dataset
Epoch: 0001 train_loss= 1.72463 val_ap= 0.57052 time= 0.05286
Epoch: 0002 train_loss= 1.68511 val_ap= 0.66478 time= 0.06113
Epoch: 0003 train_loss= 1.68646 val_ap= 0.73518 time= 0.05784
Epoch: 0004 train_loss= 1.71408 val_ap= 0.76006 time= 0.05585
Epoch: 0005 train_loss= 1.67026 val_ap= 0.76865 time= 0.05086
Epoch: 0006 train_loss= 1.62372 val_ap= 0.76952 time= 0.05884
Epoch: 0007 train_loss= 1.56628 val_ap= 0.77000 time= 0.10572
Epoch: 0008 train_loss= 1.53263 val_ap= 0.76914 time= 0.09375
Epoch: 0009 train_loss= 1.47744 val_ap= 0.76835 time= 0.08777
Epoch: 0010 train_loss= 1.42757 val_ap= 0.76736 time= 0.10172
Epoch: 0011 train_loss= 1.39451 val_ap= 0.76580 time= 0.06084
Epoch: 0012 train_loss= 1.30147 val_ap= 0.76415 time= 0.06283
Epoch: 0013 train_loss= 1.26027 val_ap= 0.76227 time= 0.05633
Epoch: 0014 train_loss= 1.18787 val_ap= 0.76096 time= 0.05606
Epoch: 0015 train_loss= 1.12298 val_ap= 0.75992 time= 0.05503
Epoch: 0016 train_loss= 1.06759 val_ap= 0.76043 time

Epoch: 0133 train_loss= 0.51031 val_ap= 0.86342 time= 0.06765
Epoch: 0134 train_loss= 0.50972 val_ap= 0.86425 time= 0.05965
Epoch: 0135 train_loss= 0.50842 val_ap= 0.86489 time= 0.06004
Epoch: 0136 train_loss= 0.50812 val_ap= 0.86511 time= 0.05969
Epoch: 0137 train_loss= 0.50715 val_ap= 0.86585 time= 0.06164
Epoch: 0138 train_loss= 0.50630 val_ap= 0.86585 time= 0.05606
Epoch: 0139 train_loss= 0.50630 val_ap= 0.86661 time= 0.05596
Epoch: 0140 train_loss= 0.50570 val_ap= 0.86702 time= 0.06371
Epoch: 0141 train_loss= 0.50408 val_ap= 0.86748 time= 0.05758
Epoch: 0142 train_loss= 0.50349 val_ap= 0.86752 time= 0.05102
Epoch: 0143 train_loss= 0.50372 val_ap= 0.86765 time= 0.05801
Epoch: 0144 train_loss= 0.50395 val_ap= 0.86848 time= 0.08178
Epoch: 0145 train_loss= 0.50239 val_ap= 0.86926 time= 0.10083
Epoch: 0146 train_loss= 0.50101 val_ap= 0.87013 time= 0.09133
Epoch: 0147 train_loss= 0.50166 val_ap= 0.87122 time= 0.09110
Epoch: 0148 train_loss= 0.50151 val_ap= 0.87200 time= 0.06846
Epoch: 0