## Explainable Conditional LSTM random walk graph GAN training


In [1]:
from eggen import utils
from eggen.eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle
# import seaborn as sns
# import matplotlib.pyplot as plt
%matplotlib inline

  return f(*args, **kwds)


## Load data

In [2]:
def load_dataset(file_name):
    """Load an attributed graph from a NumPy ``.npz`` archive.

    Parameters
    ----------
    file_name : str
        Name of the file to load. ``'.npz'`` is appended when it is missing.

    Returns
    -------
    graph : dict
        Dictionary that contains:
            * 'A' : The adjacency matrix in sparse (CSR) matrix format
            * 'X' : The attribute matrix in sparse (CSR) matrix format
            * 'z' : The ground truth class labels (``None`` when the archive
              has no 'labels' entry)
            * 'idx_to_node', 'idx_to_attr', 'idx_to_class' : optional ID
              mappings, converted with ``tolist()``; each key is present only
              when the archive stores it
    """
    if not file_name.endswith('.npz'):
        file_name += '.npz'
    with np.load(file_name) as loader:
        # Read every stored array while the archive is still open.
        arrays = dict(loader)

    A = sp.csr_matrix((arrays['adj_data'], arrays['adj_indices'],
                       arrays['adj_indptr']), shape=arrays['adj_shape'])

    X = sp.csr_matrix((arrays['attr_data'], arrays['attr_indices'],
                       arrays['attr_indptr']), shape=arrays['attr_shape'])

    graph = {
        'A': A,
        'X': X,
        'z': arrays.get('labels'),
    }

    # Test for presence with `is not None`: truth-testing an ndarray with more
    # than one element raises ValueError, so `if mapping:` is not safe here.
    for key in ('idx_to_node', 'idx_to_attr', 'idx_to_class'):
        mapping = arrays.get(key)
        if mapping is not None:
            graph[key] = mapping.tolist()

    return graph

In [3]:
# Dataset names that have been used here: cora, cora_ml, citeseer, dblp, pubmed
g = load_dataset('data/cora_ml.npz')
A, X, z = g['A'], g['X'], g['z']

# Report the node count, the number of positive adjacency entries and the number of
# distinct labels. The recorded run on cora_ml printed N=2995, E=8416, K=7.
print("**** N nodes {:}, E edges: {:}, K classes: {:} ****".format(A.shape[0],
                                                                   np.count_nonzero(A.todense()>0),
                                                                   len(np.unique(z))
                                                                  ))

**** N nodes 2995, E edges: 8416, K classes: 7 ****


#### Adjacency matrix and class data (conditions)

In [4]:
# Pair every node index (column 0) with its entry of z (column 1).
cond_list = np.stack((np.arange(len(z)), z), axis=-1) #(N nodes, 2)

In [None]:
# Display the adjacency matrix as the cell output.
A

In [None]:
# Positions of the nodes labelled 0, 4 and 5 -- the classes to be dropped.
ind_zero, ind_four, ind_five = (np.argwhere(z == label) for label in (0, 4, 5))
for class_indices in (ind_zero, ind_four, ind_five):
    print(len(class_indices))

# One flat index vector covering all three classes, and its total size.
ind_remove = np.concatenate((ind_zero, ind_four, ind_five), axis=0).flatten()
print(len(ind_remove))

In [None]:
# Drop the (node, label) rows that belong to the removed classes.
condlist_new3 = np.delete(cond_list, ind_remove, 0)
# Optional check of the remaining labels:
# np.unique(condlist_new3[:,1], return_counts=True)

In [None]:
# Delete the rows (axis 0) and then the columns (axis 1) that are not relevant
# to the selected group, working on a dense copy of the adjacency matrix.
A_new3 = np.delete(np.delete(A.todense(), ind_remove, 0), ind_remove, 1)

# Back to a sparse (COO) representation; the last line displays it.
A_new3 = sp.coo_matrix(A_new3)
A_new3

In [None]:
# From here on, work with the reduced graph and its matching condition list.
A, cond_list = A_new3, condlist_new3

## Network Analysis

## Preparing data

#### Convert from Pandas edgelist dataframe to Adjacency matrix for undirected graphs

#### Adjacency matrix of the largest connected components

In [5]:
# Symmetrise the adjacency matrix and clip entries that became 2 (edges that
# were stored in both directions) back to 1.
_A_obs = A + A.T
_A_obs[_A_obs > 1] = 1

# Restrict the graph to the node indices returned by
# utils.largest_connected_components and record the resulting node count.
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

Selecting 1 largest connected components


In [6]:
# Boolean membership mask over all rows of cond_list: True for nodes in `lcc`.
in_lcc = np.zeros(len(cond_list), dtype=bool)
in_lcc[lcc] = True

# Indices outside the component, and the condition rows inside it.
ind_nonlcc = np.flatnonzero(~in_lcc)
lcc_condlist = cond_list[in_lcc]
len(lcc_condlist)

2810

In [7]:
# Summary of the retained graph. The non-zero count is halved because _A_obs
# was built as A + A.T, so each undirected edge is stored twice.
print("**** N nodes {:}, E edges: {:}, K classes: {:} ****".format(_N,
                                                                  np.count_nonzero(_A_obs.todense())/2,
                                                                  len(np.unique(lcc_condlist[:,1]))
                                                                  ))

**** N nodes 2810, E edges: 7981.0, K classes: 7 ****


In [None]:
# Number of non-zero entries of the symmetric matrix (not halved here).
print(np.count_nonzero(_A_obs.todense()))

# Display the matrix as the cell output.
_A_obs

In [None]:
# unique_conds, conds_counts = np.unique(subset_condlist[:,1], return_counts=True)
# Distinct labels among the retained nodes and how often each occurs.
# unique_conds, conds_counts = np.unique(subset_condlist[:,1], return_counts=True)
unique_conds, conds_counts = np.unique(lcc_condlist[:,1], return_counts=True)
print(unique_conds)
print(conds_counts)

def bins_labels(bins, **kwargs):
    """Centre the x tick labels of the current histogram on its bins.

    Parameters
    ----------
    bins : sequence of numbers
        The bin edges passed to ``plt.hist``; at least two edges are needed
        (a single edge raises ``ZeroDivisionError``). Bins are assumed to be
        equally wide.
    **kwargs
        Forwarded unchanged to ``plt.xticks`` (e.g. ``fontsize``).
    """
    lo, hi = min(bins), max(bins)
    n_bins = len(bins) - 1
    bin_w = (hi - lo) / n_bins
    # One tick per bin, at its centre. Deriving the centres from an integer
    # range keeps their count exact; a float-stepped arange can miscount.
    centers = lo + bin_w * (np.arange(n_bins) + 0.5)
    # n_bins ticks need n_bins labels: drop the last edge, which labels no bin.
    plt.xticks(centers, bins[:-1], **kwargs)
    plt.xlim(bins[0], bins[-1])
    
# pyplot is re-imported here under the same alias as at the top of the notebook.
import matplotlib.pyplot as plt
# One unit-wide bin per distinct label value.
bins = range(len(unique_conds)+1)
# plt.hist(subset_condlist[:,1], bins)  
# NOTE(review): the histogram is drawn from cond_list, while unique_conds above
# comes from lcc_condlist -- confirm which set of nodes should be plotted.
plt.hist(cond_list[:,1], bins)  
bins_labels(bins, fontsize=15)
# plt.title("CORA-ML")
# The figure is written to disk before it is shown.
plt.savefig('./image/cora-ml.png')
plt.show()

In [None]:
# Hard-coded counts that add up to 2810, the LCC size reported above;
# presumably the per-class counts from an earlier run -- TODO confirm.
np.sum([348, 393, 440, 407, 781, 150, 291])

#### Separate the edges into train, test, validation

In [None]:
# Arguments handed to utils.train_val_test_split_adjacency in the next cell:
# the validation share, the test share and the random seed.
val_share = 0.1
test_share = 0.05
seed = 2020 

In [None]:
"""
Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. 
"""
# Returns five arrays: train edges, validation edges/non-edges and test edges/non-edges.
# The meaning of undirected/connected/asserts is defined in eggen.utils (not visible here).
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False) 

In [None]:
# Report the array shape of every split.
print('train ones',train_ones.shape)
print('val_ones',val_ones.shape)
print('val_zeros',val_zeros.shape)
print('test_ones',test_ones.shape)
print('test_zeros',test_zeros.shape)

## EGGen

In [None]:
# Build the training graph from the training edge list. The shape is pinned to
# (_N, _N) so the matrix always spans every node; without it SciPy infers the
# size from the largest index that happens to occur in train_ones.
train_graph = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1])),
    shape=(_N, _N),
).tocsr()

# The training graph must be symmetric. Comparing the sparse matrices directly
# avoids materialising two dense N x N copies.
assert (train_graph != train_graph.T).nnz == 0

In [None]:
# Shape of the training graph (taken from a dense copy).
# NOTE(review): train_graph.shape yields the same tuple without densifying.
train_graph.todense().shape

#### Parameters

In [None]:
""" Adjustable parameters for training. """ 
# setting GPU id 
gpu_id = 1
# setting the number of nodes
_N = _A_obs.shape[0]
# setting the length of random walks
rw_len = 16 #5 
# setting the training data batch size
batch_size = 128
# getting the number of distinct class labels (conditions) in the full label vector z
# NOTE(review): z covers every loaded node, not only those kept in lcc_condlist -- confirm this is the intended count.
n_conds=len(np.unique(z))

In [None]:
# Random-walk sampler over the training graph, given the condition list.
# NOTE(review): train_graph is indexed by positions within the largest connected
# component, while cond_list has one row per node of the graph before that
# restriction -- confirm whether lcc_condlist (re-indexed) should be passed instead.
walker = utils.RandomWalker(train_graph, cond_list, rw_len, p=1, q=1, batch_size=batch_size)

#### An example random walk

In [None]:
# Pull one item from the conditional walk generator and show its first walk
# together with the first entry of its second component.
example_rw = next(walker.cond_walk())
print("Example random walk: ", example_rw[0][0])
print("Example conditions: ", example_rw[1][0])

#### Create our generative model

In [None]:
# L2 penalties for generator and discriminator.
l2_gen = 1e-7
l2_disc = 5e-5  # 1e-4

# Layer sizes handed to the generator and the discriminator.
gen_lay = [40]   # [50]
disc_lay = [30]  # [35]

# Learning rates.
lr_gen = 0.0002
lr_disc = 0.0002  # 0.0001

# Update steps per training iteration.
gen_iters = 1
disc_iters = 3

# Sizes of the down-projection matrices.
discWdown_size = 128  # 128
genWdown_size = 128   # 128


eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker,
    n_conds=n_conds,
    condition_dim=n_conds,
    gpu_id=gpu_id,
    use_gumbel=True,
    gen_iters=gen_iters,
    disc_iters=disc_iters,
    W_down_discriminator_size=discWdown_size,
    W_down_generator_size=genWdown_size,
    l2_penalty_generator=l2_gen,
    l2_penalty_discriminator=l2_disc,
    generator_layers=gen_lay,
    discriminator_layers=disc_lay,
    temp_start=5,
    lr_gen=lr_gen,
    lr_disc=lr_disc,
)  # , plot_show=False

# eggen = EGGen(_N, rw_len, walk_generator=walker, n_conds=n_conds, condition_dim=n_conds,
#               gpu_id=gpu_id, use_gumbel=True, gen_iters=gen_iters, disc_iters=disc_iters,
#               W_down_discriminator_size=128, W_down_generator_size=128, l2_penalty_generator=l2_gen,
#               l2_penalty_discriminator=l2_disc,
#               generator_layers=gen_lay, discriminator_layers=disc_lay, temp_start=5,
#               lr_gen=lr_gen, lr_disc=lr_disc
#              ) #, plot_show=False

#### Define the stopping criterion

In [None]:
# Early-stopping mode: "val" uses the validation criterion, "eo" stops at a
# target edge overlap.
stopping_criterion = "val"

assert stopping_criterion in ("val", "eo"), "Please set the desired stopping criterion."

# None selects the val criterion; for "eo" the value is the target edge overlap.
stopping = None if stopping_criterion == "val" else 0.5

#### Train the model

In [None]:
# Evaluation and plotting share one interval (in iterations).
eval_every = plot_every = 200 
# presumably the cap on training iterations -- not consumed in the visible cells; TODO confirm
max_iters = 20000
# presumably the early-stopping patience -- not consumed in the visible cells; TODO confirm
patience= 20 

#### Load saved log file

In [None]:
# Load saved log file
save_directory = "./snapshots_gencond" 
model_name = "gencond"
# These two values only select which log file is opened (they are part of its name).
max_iters_load = 20000 
eval_every_load = 200 

save_log = "{}/log8_{}_maxiter{}_evalevery{}.pkl".format(save_directory, model_name, max_iters_load, eval_every_load)

# NOTE(security): unpickling can execute arbitrary code -- only open log files you produced yourself.
with open(save_log, 'rb') as f:
    log_dict = pickle.load(f)

In [None]:
# List the entries stored in the loaded log.
log_dict.keys()

In [None]:
# Convert the logged validation scores once; column 0 is plotted as ROC-AUC and
# column 1 as average precision.
val_perf = np.asarray(log_dict['val_performances'])

# The x-axis uses eval_every_load, the interval encoded in the name of the log
# file that was loaded, rather than the eval_every of the training cell.
eval_iters = np.arange(len(val_perf)) * eval_every_load

plt.plot(eval_iters, val_perf[:, 0], label="ROC-AUC")
plt.plot(eval_iters, val_perf[:, 1], label="Avg. Prec.")

plt.title("Validation Performance during Training")
plt.legend()
# Save before show().
plt.savefig('./image/top5_cond_val_perf.png')
plt.show()

In [None]:
# Logged edge overlaps divided by the sum of all entries of _A_obs.
# NOTE(review): _A_obs is symmetric, so its sum counts each undirected edge twice,
# and a later cell normalises by A_select.sum() instead -- confirm the intended denominator.
plt.plot(np.array(log_dict['edge_overlaps'])/_A_obs.sum())
plt.title("Edge Overlap during Training")
# plt.savefig('./image/top5_cond_EO.png')
plt.show()

#### Load pretrained model

In [None]:
# Restore the checkpoint into the model's session.
# NOTE: tf.train.Saver is the TensorFlow 1.x API; the checkpoint path is hard-coded.
saver = tf.train.Saver()
saver.restore(eggen.session, "snapshots_gencond/model_best_19.ckpt") 

#### Generate random walks on the trained model

In [None]:
# Build the sampling outputs once; they are evaluated repeatedly in the cells below.
# presumably 10000 is the number of walks per evaluation -- TODO confirm against EGGen.generate_discrete
# sample_many = eggen.generate_discrete(10000, conds, rw_len=rw_len, reuse=True)
sample_many, explain_conds = eggen.generate_discrete(10000, rw_len=rw_len, reuse=True)

In [None]:
# Single trial evaluation of both outputs with eggen.tau fed as 0.5; the result is only displayed.
tf.get_default_session().run([sample_many, explain_conds], feed_dict={eggen.tau: 0.5})

In [None]:
# Accumulators filled by the sampling loop in the next cell.
# Re-running this cell discards everything sampled so far.
samples = []
sampleconds = []

In [None]:
# Look the session up once instead of on every iteration.
session = tf.get_default_session()

# Evaluate the sampling outputs 5000 times, printing progress every 500 rounds.
# The counter has a real name: `_` is reserved for throwaway values and, in
# IPython, holds the last cell output.
for step in range(1, 5001):
    if step % 500 == 0:
        print(step)
#     samples.append(sample_many.eval({eggen.tau: 0.5}))
    sample, samplecond = session.run([sample_many, explain_conds], feed_dict={eggen.tau: 0.5})
    samples.append(sample)
    sampleconds.append(samplecond)

print("Shape of one fake walk: ", samples[0].shape)
print("Shape of one explain cond: ", sampleconds[0].shape)

#### Assemble score matrix from the random walks

In [None]:
# Flatten all sampled batches into one 2-D array with rw_len columns (one walk per row).
rws = np.array(samples).reshape([-1, rw_len])
print(rws.shape)
# Score matrix computed by eggen.utils from the walks, converted to CSR.
scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

#### Explainable transition counts

In [None]:
# One row per sampled walk, rw_len condition entries per row.
explain_walkconds = np.array(sampleconds).reshape([-1, rw_len])
print(explain_walkconds.shape)

# explain transition counts from RW samples
# Pair every entry with its successor in the same walk. Stacking the two
# shifted views gives (n_walks, rw_len - 1, 2) directly, without building a
# Python list of per-walk tuples first.
bigram_conds = np.stack((explain_walkconds[:, :-1], explain_walkconds[:, 1:]), axis=-1)
bigram_conds = bigram_conds.reshape([-1, 2])  # (n_walks * (rw_len - 1), 2) all transitions

# Distinct (from, to) pairs, how often each occurs, and its share of all transitions.
unique_trans, trans_counts = np.unique(bigram_conds, axis=0, return_counts=True)
trans_percent = np.divide(trans_counts, np.sum(trans_counts))

# Each record is (pair, count, fraction). The pair is itself a length-2 array,
# so the rows are ragged; NumPy >= 1.24 raises ValueError for this unless
# dtype=object is given explicitly. Result shape: (n_unique_transitions, 3).
explainconds = np.array(list(zip(unique_trans, trans_counts, trans_percent)), dtype=object)

explainconds

In [None]:
# Sanity check: the fractions are counts divided by their total, so this should be 1 (up to rounding).
np.sum(trans_percent)

#### Compute graph statistics

In [None]:
# Reference graph built from the training edge list.
# NOTE: the following cell rebinds A_select to train_graph, so this value is superseded when both cells run.
A_select = sp.csr_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])))

In [None]:
# Use the training graph as the reference and display the sum of its entries.
A_select = train_graph
A_select.sum()

In [None]:
# Derive a graph from the score matrix; the reference graph's entry sum is passed as the second argument.
sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())
# Sparsity pattern of the sampled graph.
plt.spy(sampled_graph, markersize=.2)
# plt.savefig('./image/top5_cond_spy.png')
plt.show()

In [None]:
# Sparsity pattern of the reference graph, for visual comparison with the sampled one.
plt.spy(A_select, markersize=.2)
# plt.savefig('./image/top5_original_spy.png')
plt.show()

In [None]:
# Edge overlap between reference and sampled graph, divided by the reference graph's entry sum.
utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()

In [None]:
# Statistics of the sampled graph. The former second argument `encoded` is not
# defined anywhere in this notebook (NameError); the call now uses the same
# one-argument form as the reference-graph cell.
utils.compute_graph_statistics(sampled_graph)

In [None]:
# Statistics of the reference (training) graph, computed on a dense copy.
utils.compute_graph_statistics(A_select.toarray()) #, encoded)