In [24]:
from __future__ import division
from __future__ import print_function

import argparse
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from torch import optim
import networkx as nx

from gae.model import GCNModelVAE
from gae.optimizer import loss_function
from gae.utils import mask_test_edges, preprocess_graph, get_roc_score

In [20]:
# Command-line style hyper-parameters for the notebook run.
# Each row is (flag, value type, default, help text); rows are registered in order.
parser = argparse.ArgumentParser()
for _flag, _kind, _default, _help in (
    ('--model', str, 'gcn_vae', "models used"),
    ('--seed', int, 42, 'Random seed.'),
    ('--epochs', int, 200, 'Number of epochs to train.'),
    ('--hidden1', int, 32, 'Number of units in hidden layer 1.'),
    ('--hidden2', int, 16, 'Number of units in hidden layer 2.'),
    ('--lr', float, 0.01, 'Initial learning rate.'),
    ('--dropout', float, 0., 'Dropout rate (1 - keep probability).'),
    ('--dataset-str', str, 'wiki', 'type of dataset.'),
):
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)
# Do not leak the loop temporaries into the notebook namespace.
del _flag, _kind, _default, _help

# parse_known_args (rather than parse_args) tolerates arguments this parser
# does not declare; the unrecognised remainder is discarded.
args, _ = parser.parse_known_args()

In [21]:
def load_data(adj_name):
    """Load a graph's edge list and return its adjacency matrix and features.

    The edge list is made undirected (every edge is stored in both
    directions), self-loops are dropped and repeated edges collapse to a
    single entry of weight 1.

    Parameters
    ----------
    adj_name : str
        Dataset identifier: one of ``'Cora'``, ``'wiki'``, ``'Citeseer'`` or
        ``'soc'``.

    Returns
    -------
    adj : scipy.sparse.csr_matrix
        ``(n, n)`` float adjacency matrix with entries in {0, 1}, symmetric,
        zero diagonal. ``n`` is the hard-coded node count of the dataset.
    features : torch.Tensor
        ``(n, n)`` identity matrix (one-hot node features).

    Raises
    ------
    ValueError
        If ``adj_name`` is not one of the supported datasets.
    """
    # Node counts are fixed per dataset rather than inferred from the edges.
    node_counts = {'Cora': 2708, 'wiki': 2405, 'Citeseer': 3327, 'soc': 2426}
    if adj_name not in node_counts:
        # Previously this case only printed a message and then crashed on an
        # unbound local further down; fail early with a clear error instead.
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(
                adj_name, sorted(node_counts)))
    nodes_numbers = node_counts[adj_name]

    if adj_name in ('Cora', 'Citeseer'):
        # NOTE(review): `Planetoid` is not imported in the visible cells --
        # presumably torch_geometric.datasets.Planetoid; confirm and import it
        # in the notebook's import cell.
        datasets = Planetoid('./datasets', adj_name)
        edges = datasets[0].edge_index
        # edge_index is indexed as [0, i] / [1, i]; transpose to one
        # (source, target) pair per row.
        raw_edges = pd.DataFrame(edges.t().tolist())
    elif adj_name == 'wiki':
        raw_edges = pd.read_csv('datasets/graph.txt', header=None, sep='\t')
    else:  # 'soc'
        # Every id read from this file is shifted down by one.
        raw_edges = pd.read_csv("datasets/soc-hamsterster.edges", header=None, sep=' ') - 1

    drop_self_loop = raw_edges[raw_edges[0] != raw_edges[1]]

    # Build the symmetric adjacency directly in sparse form instead of filling
    # a dense n x n array row by row and round-tripping through networkx
    # (nx.from_numpy_matrix no longer exists in networkx >= 3.0).
    src = drop_self_loop.iloc[:, 0].to_numpy(dtype=np.int64)
    dst = drop_self_loop.iloc[:, 1].to_numpy(dtype=np.int64)
    rows = np.concatenate([src, dst])
    cols = np.concatenate([dst, src])
    adj = sp.coo_matrix(
        (np.ones(rows.shape[0], dtype=np.float64), (rows, cols)),
        shape=(nodes_numbers, nodes_numbers),
    ).tocsr()
    # COO -> CSR sums duplicate coordinates; reset to a binary adjacency.
    adj.data[:] = 1.0

    features = torch.eye(nodes_numbers)

    return adj, features

In [22]:
def gae_for(args):
    """Train a variational graph auto-encoder and report link-prediction scores.

    Parameters
    ----------
    args : argparse.Namespace (or any object with the same attributes)
        Must provide ``dataset_str``, ``hidden1``, ``hidden2``, ``dropout``,
        ``lr`` and ``epochs``. If a ``seed`` attribute is present and not
        ``None`` it is used to seed the NumPy and torch global RNGs.

    Returns
    -------
    tuple
        ``(roc_score, ap_score)`` as returned by ``get_roc_score`` for the
        held-out test edges. The same values are also printed.
    """
    # Seed the global RNGs so repeated runs are comparable.
    # NOTE(review): assumes gae.utils.mask_test_edges draws from NumPy's
    # global RNG and the model initialises from torch's -- confirm.
    seed = getattr(args, 'seed', None)
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Split edges into train / validation / test sets; from here on `adj`
    # refers to the training graph only.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    # Reconstruction target: training adjacency with self-loops added.
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    # Class re-weighting: ratio of absent to present entries in the training
    # adjacency, and the matching normalisation constant for the loss.
    n_entries = adj.shape[0] * adj.shape[0]
    n_present = adj.sum()
    pos_weight = torch.tensor(float(n_entries - n_present) / n_present)
    norm = n_entries / float((n_entries - n_present) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=recovered, labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # Embedding from this epoch's forward pass (computed before the
        # optimizer step above); detach() replaces the legacy `.data`.
        hidden_emb = mu.detach().numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t)
              )

    print("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    # Hand the metrics back so callers can log or compare them programmatically.
    return roc_score, ap_score

In [23]:
# Entry point: run the full train / evaluate pipeline with the parsed arguments.
# In a notebook kernel __name__ is '__main__', so this executes when the cell runs.
if __name__ == '__main__':
    gae_for(args)

Using wiki dataset
Epoch: 0001 train_loss= 1.73937 val_ap= 0.67886 time= 0.21542
Epoch: 0002 train_loss= 1.70765 val_ap= 0.77238 time= 0.22512
Epoch: 0003 train_loss= 1.65570 val_ap= 0.81869 time= 0.29617
Epoch: 0004 train_loss= 1.67139 val_ap= 0.82768 time= 0.20675
Epoch: 0005 train_loss= 1.57507 val_ap= 0.82745 time= 0.21794
Epoch: 0006 train_loss= 1.54388 val_ap= 0.82617 time= 0.21340
Epoch: 0007 train_loss= 1.49327 val_ap= 0.82448 time= 0.18530
Epoch: 0008 train_loss= 1.42958 val_ap= 0.82306 time= 0.24180
Epoch: 0009 train_loss= 1.39314 val_ap= 0.82190 time= 0.32424
Epoch: 0010 train_loss= 1.32801 val_ap= 0.82074 time= 0.20645
Epoch: 0011 train_loss= 1.25310 val_ap= 0.81999 time= 0.20528
Epoch: 0012 train_loss= 1.19001 val_ap= 0.81963 time= 0.21099
Epoch: 0013 train_loss= 1.13988 val_ap= 0.81915 time= 0.20105
Epoch: 0014 train_loss= 1.08011 val_ap= 0.81869 time= 0.24740
Epoch: 0015 train_loss= 1.03262 val_ap= 0.81859 time= 0.28449
Epoch: 0016 train_loss= 0.98916 val_ap= 0.81881 tim

Epoch: 0133 train_loss= 0.46314 val_ap= 0.91928 time= 0.33494
Epoch: 0134 train_loss= 0.46287 val_ap= 0.91953 time= 0.21094
Epoch: 0135 train_loss= 0.46202 val_ap= 0.91963 time= 0.21642
Epoch: 0136 train_loss= 0.46141 val_ap= 0.91992 time= 0.19720
Epoch: 0137 train_loss= 0.46061 val_ap= 0.92026 time= 0.19893
Epoch: 0138 train_loss= 0.45989 val_ap= 0.92071 time= 0.27084
Epoch: 0139 train_loss= 0.45947 val_ap= 0.92107 time= 0.26871
Epoch: 0140 train_loss= 0.45908 val_ap= 0.92141 time= 0.21672
Epoch: 0141 train_loss= 0.45825 val_ap= 0.92159 time= 0.20146
Epoch: 0142 train_loss= 0.45729 val_ap= 0.92146 time= 0.21443
Epoch: 0143 train_loss= 0.45680 val_ap= 0.92139 time= 0.18475
Epoch: 0144 train_loss= 0.45664 val_ap= 0.92170 time= 0.27262
Epoch: 0145 train_loss= 0.45584 val_ap= 0.92199 time= 0.27652
Epoch: 0146 train_loss= 0.45529 val_ap= 0.92216 time= 0.21943
Epoch: 0147 train_loss= 0.45471 val_ap= 0.92239 time= 0.18170
Epoch: 0148 train_loss= 0.45410 val_ap= 0.92259 time= 0.18950
Epoch: 0