In [33]:
from __future__ import division
from __future__ import print_function

import argparse
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from torch import optim
import networkx as nx
from torch_geometric.datasets import Planetoid

from gae.model import GCNModelVAE
from gae.optimizer import loss_function
from gae.utils import mask_test_edges, preprocess_graph, get_roc_score

In [34]:
def _build_arg_parser():
    """Create the command-line parser holding the experiment hyper-parameters.

    Every option is described once in a table of
    ``(flag, type, default, help)`` rows and registered in that order.
    """
    option_table = (
        ('--model', str, 'gcn_vae', "models used"),
        ('--seed', int, 42, 'Random seed.'),
        ('--epochs', int, 200, 'Number of epochs to train.'),
        ('--hidden1', int, 32, 'Number of units in hidden layer 1.'),
        ('--hidden2', int, 16, 'Number of units in hidden layer 2.'),
        ('--lr', float, 0.01, 'Initial learning rate.'),
        ('--dropout', float, 0., 'Dropout rate (1 - keep probability).'),
        ('--dataset-str', str, 'Citeseer', 'type of dataset.'),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, default_value, help_text in option_table:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
    return cli


parser = _build_arg_parser()

# parse_known_args() returns (namespace, leftover_argv); unrecognised arguments
# are collected in the second item instead of aborting the parse.
args, _ = parser.parse_known_args()

In [35]:
def load_data(adj_name):
    """Build the undirected, self-loop-free adjacency matrix of a named dataset.

    Parameters
    ----------
    adj_name : str
        ``'Cora'`` or ``'Citeseer'`` (loaded through ``Planetoid`` rooted at
        ``./datasets``), or ``'wiki'`` (tab-separated edge list read from
        ``datasets/graph.txt``; the first two columns are the edge endpoints).

    Returns
    -------
    adj : scipy.sparse.csr_matrix
        Symmetric ``(n, n)`` matrix with 1.0 for every edge and no diagonal
        entries.
    features : torch.Tensor
        ``(n, n)`` identity matrix used as node features.

    Raises
    ------
    ValueError
        If ``adj_name`` is not one of the supported dataset names.
    """
    # Hard-coded node count of each supported graph.
    node_counts = {'Cora': 2708, 'Citeseer': 3327, 'wiki': 2405}
    if adj_name not in node_counts:
        # Fail loudly here; otherwise the code below would crash on an
        # undefined edge list with a far less helpful error.
        raise ValueError(
            "Dataset {!r} does not exist; expected one of {}".format(
                adj_name, sorted(node_counts)))
    nodes_numbers = node_counts[adj_name]

    if adj_name == 'wiki':
        raw_edges = pd.read_csv('datasets/graph.txt', header=None, sep='\t')
        src = raw_edges[0].to_numpy()
        dst = raw_edges[1].to_numpy()
    else:
        # edge_index is a (2, num_edges) tensor: row 0 = sources, row 1 = targets.
        edge_index = Planetoid('./datasets', adj_name)[0].edge_index
        src, dst = edge_index.cpu().numpy()

    # Drop self-loops.
    keep = src != dst
    src, dst = src[keep], dst[keep]

    # Write each edge in both directions so the matrix is symmetric; duplicate
    # edges simply overwrite the same cell with 1.
    graph_np = np.zeros((nodes_numbers, nodes_numbers))
    graph_np[src, dst] = 1
    graph_np[dst, src] = 1

    # Convert straight to CSR instead of round-tripping through networkx
    # (nx.from_numpy_matrix was removed in networkx 3.0).
    adj = sp.csr_matrix(graph_np)

    features = torch.eye(nodes_numbers)

    return adj, features

In [36]:
def gae_for(args):
    """Train a GCN variational auto-encoder for link prediction and evaluate it.

    Parameters
    ----------
    args : argparse.Namespace (or any object with the same attributes)
        Reads ``dataset_str``, ``hidden1``, ``hidden2``, ``dropout``, ``lr``
        and ``epochs``. If a ``seed`` attribute is present and not ``None``
        it is used to seed NumPy and torch before any data is split or any
        weight is created.

    Returns
    -------
    tuple
        ``(roc_score, ap_score)`` as returned by ``get_roc_score`` for the
        test edges. The same values are also printed.
    """
    # Apply the configured seed so that repeated runs are reproducible.
    # NOTE(review): this assumes mask_test_edges draws from NumPy's global RNG
    # and the model from torch's global RNG -- confirm in gae.utils / gae.model.
    seed = getattr(args, 'seed', None)
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Hold out validation/test edges; from here on `adj` is the training graph.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    # Reconstruction target: training adjacency with self-connections added.
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    # Ratio of non-edges to edges in the training graph, used to re-weight
    # the positive entries in the loss.
    n_entries = adj.shape[0] * adj.shape[0]
    n_positive = adj.sum()
    pos_weight = torch.tensor(float(n_entries - n_positive) / n_positive)
    norm = n_entries / float((n_entries - n_positive) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=recovered, labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # Detach from the autograd graph before handing the embedding to the
        # NumPy-based scorer.
        hidden_emb = mu.detach().cpu().numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t)
              )

    print("Optimization Finished!")

    # Final evaluation uses the embedding from the last training epoch.
    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    return roc_score, ap_score

In [37]:
# Entry point: run the full train/evaluate pipeline with the parsed `args`,
# but only when this code is executed directly rather than imported.
if __name__ == '__main__':
    gae_for(args)

Using Citeseer dataset
Epoch: 0001 train_loss= 1.73496 val_ap= 0.54567 time= 0.30102
Epoch: 0002 train_loss= 1.72790 val_ap= 0.57563 time= 0.31863
Epoch: 0003 train_loss= 1.70738 val_ap= 0.61210 time= 0.44780
Epoch: 0004 train_loss= 1.66278 val_ap= 0.65622 time= 0.59904
Epoch: 0005 train_loss= 1.65923 val_ap= 0.67667 time= 0.45955
Epoch: 0006 train_loss= 1.61362 val_ap= 0.68720 time= 0.31438
Epoch: 0007 train_loss= 1.62774 val_ap= 0.69397 time= 0.33111
Epoch: 0008 train_loss= 1.58066 val_ap= 0.69798 time= 0.43384
Epoch: 0009 train_loss= 1.55686 val_ap= 0.69915 time= 0.57991
Epoch: 0010 train_loss= 1.50854 val_ap= 0.70172 time= 0.46565
Epoch: 0011 train_loss= 1.48676 val_ap= 0.70141 time= 0.32072
Epoch: 0012 train_loss= 1.39649 val_ap= 0.69957 time= 0.33610
Epoch: 0013 train_loss= 1.35461 val_ap= 0.69835 time= 0.35940
Epoch: 0014 train_loss= 1.30349 val_ap= 0.69756 time= 0.46204
Epoch: 0015 train_loss= 1.25709 val_ap= 0.69671 time= 0.34834
Epoch: 0016 train_loss= 1.17117 val_ap= 0.69642

Epoch: 0133 train_loss= 0.43401 val_ap= 0.81264 time= 0.49036
Epoch: 0134 train_loss= 0.43377 val_ap= 0.81231 time= 0.37223
Epoch: 0135 train_loss= 0.43309 val_ap= 0.81235 time= 0.31754
Epoch: 0136 train_loss= 0.43312 val_ap= 0.81154 time= 0.41118
Epoch: 0137 train_loss= 0.43255 val_ap= 0.81101 time= 0.55997
Epoch: 0138 train_loss= 0.43244 val_ap= 0.81122 time= 0.50244
Epoch: 0139 train_loss= 0.43239 val_ap= 0.81194 time= 0.37400
Epoch: 0140 train_loss= 0.43227 val_ap= 0.81204 time= 0.31434
Epoch: 0141 train_loss= 0.43207 val_ap= 0.81217 time= 0.36655
Epoch: 0142 train_loss= 0.43191 val_ap= 0.81158 time= 0.54404
Epoch: 0143 train_loss= 0.43167 val_ap= 0.81124 time= 0.53211
Epoch: 0144 train_loss= 0.43141 val_ap= 0.81090 time= 0.39095
Epoch: 0145 train_loss= 0.43121 val_ap= 0.81087 time= 0.31335
Epoch: 0146 train_loss= 0.43129 val_ap= 0.81120 time= 0.33757
Epoch: 0147 train_loss= 0.43094 val_ap= 0.81128 time= 0.46771
Epoch: 0148 train_loss= 0.43070 val_ap= 0.81119 time= 0.37819
Epoch: 0