## Explainable Conditional LSTM random walk graph GAN training
Cora-ML

In [1]:
from eggen import utils
from eggen.eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

  return f(*args, **kwds)


In [2]:
# ## Load data
def load_dataset(file_name):
    """Load a graph from a Numpy binary (.npz) file.

    Parameters
    ----------
    file_name : str
        Name of the file to load. '.npz' is appended when it is missing.

    Returns
    -------
    graph : dict
        Dictionary that contains:
            * 'A' : The adjacency matrix in sparse (CSR) matrix format
            * 'X' : The attribute matrix in sparse (CSR) matrix format
            * 'z' : The ground truth class labels (None if the file has no 'labels')
            * 'idx_to_node', 'idx_to_attr', 'idx_to_class' : optional ID
              mappings, included only when present in the file and non-empty

    Notes
    -----
    ``np.load`` is called with its default pickling policy. Files that store
    the ID mappings as pickled objects need ``allow_pickle=True`` on recent
    NumPy versions; only enable that for files from a trusted source.
    """
    if not file_name.endswith('.npz'):
        file_name += '.npz'

    # Materialise every stored array, then release the file handle right away.
    with np.load(file_name) as loader:
        arrays = dict(loader)

    A = sp.csr_matrix((arrays['adj_data'], arrays['adj_indices'],
                       arrays['adj_indptr']), shape=tuple(arrays['adj_shape']))

    X = sp.csr_matrix((arrays['attr_data'], arrays['attr_indices'],
                       arrays['attr_indptr']), shape=tuple(arrays['attr_shape']))

    graph = {
        'A': A,
        'X': X,
        'z': arrays.get('labels')
    }

    for key in ('idx_to_node', 'idx_to_attr', 'idx_to_class'):
        mapping = arrays.get(key)
        if mapping is None:
            continue
        # Convert before testing truthiness: bool() of a multi-element ndarray
        # raises ValueError, whereas the converted list/dict has well-defined
        # truthiness. Empty mappings are still left out of the result.
        mapping = mapping.tolist()
        if mapping:
            graph[key] = mapping

    return graph



# Datasets: pubmed, dblp, cora | citeseer, cora_ml
data_name  = "cora_ml" #"cora_ml"
dataset = "data/{}.npz".format(data_name)
g = load_dataset(dataset)
A, X, z = g['A'], g['X'], g['z']

# #### Adj matrix and class data (conditions)
# One (node index, class label) row per node of the full graph.
cond_list = np.stack((np.arange(len(z)), z), axis=-1)

# Symmetrise and binarise the adjacency matrix, then keep only the nodes
# returned by utils.largest_connected_components.
_A_obs = A + A.T
_A_obs[_A_obs > 1] = 1
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

# Take the subset of cond_list for the kept nodes *before* re-indexing from 0.
# NOTE(review): np.delete keeps the surviving rows in ascending node order, so
# this assumes `lcc` is sorted ascending (matching the row order of `_A_obs`)
# -- confirm against utils.largest_connected_components.
all_nodes = np.arange(A.shape[0]) # (number of nodes in the full graph,)
train_nodes = np.array(lcc)
nontrain_nodes = np.setdiff1d(all_nodes, train_nodes)
nontrain = nontrain_nodes.tolist()
subset_condlist = np.delete(cond_list, nontrain, 0)
# Re-index the kept nodes as 0 .. _N-1.
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])

# count_nonzero() works on the sparse matrix directly, so the N x N matrix is
# not densified just to count edges; halved because each edge is stored twice.
print("**** N nodes {:}, E edges: {:}, K classes: {:} ****".format(_N,
                                                                  _A_obs.count_nonzero()/2,
                                                                  len(np.unique(subset_condlist[:,1]))
                                                                  ))

#### Separate the edges into train, test, validation
val_share = 0.1
test_share = 0.05
seed = 2020 #481516234
# Split the edges of the adjacency matrix into train, validation and test edges
# and randomly sample an equal amount of validation and test non-edges.
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False)

## EGGen
# Pass the shape explicitly: without it coo_matrix infers the size from the
# largest index in train_ones, which would silently yield a matrix smaller
# than _N x _N if the highest-numbered nodes had no training edge.
train_graph = sp.coo_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])),
                            shape=(_N, _N)).tocsr()
# Symmetry check done on the sparse matrices (no dense N x N copies).
assert (train_graph != train_graph.T).nnz == 0


Selecting 1 largest connected components
**** N nodes 2810, E edges: 7981.0, K classes: 7 ****


In [3]:
#### Parameters
""" Adjustable parameters for training. """ 
# GPU id handed to the model constructor
gpu_id = 0
# number of nodes in the observed (largest-connected-component) graph
_N = _A_obs.shape[0]
# length of each random walk; trailing values are previously tried settings
rw_len = 16 #8 #32 #
# batch size of the training data
batch_size = 128 #512 #
# number of distinct class labels (conditions). Computed from `z`, i.e. the
# labels of the full graph rather than of `subset_condlist`.
# NOTE(review): confirm every class still occurs among the kept nodes.
n_conds=len(np.unique(z))
print("n_conds: ", n_conds)
# passed to the walker and the model as `sample_batch`
sample_batch = 1000 #128 #2048 #256 #512 #1024 #
# log_num = 21 #99 #

# Random-walk sampler over the training graph together with the per-node conditions.
# NOTE(review): p and q look like node2vec-style walk bias parameters -- confirm in utils.RandomWalker.
walker = utils.RandomWalker(train_graph, subset_condlist, rw_len, p=1, q=1, batch_size=batch_size, sample_batch=sample_batch)

n_conds:  7


## Create our generative model 

In [4]:
# --- Hyper-parameters (one per line; trailing values are earlier settings) ---
# L2 penalties
l2_gen = 1e-7
l2_disc = 5e-5 #1e-4

# Layer sizes
gencond_lay = [10]
gen_lay = [50]
disc_lay = [40]

# Learning rates
lr_gencond = 0.01
lr_gen = 0.0002
lr_disc = 0.0002

# Iteration counts passed to the model for each sub-network
gencond_iters = 1
gen_iters = 1
disc_iters = 3

# Sizes of the W_down matrices
discWdown_size = 128
genWdown_size = 128

# --- Model construction ---
eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker,
    n_conds=n_conds,
    condgenerator_layers=gencond_lay,
    generator_layers=gen_lay,
    discriminator_layers=disc_lay,
    W_down_discriminator_size=discWdown_size,
    W_down_generator_size=genWdown_size,
    batch_size=batch_size,
    sample_batch=sample_batch,
    condition_dim=n_conds,
    gencond_iters=gencond_iters,
    gen_iters=gen_iters,
    disc_iters=disc_iters,
    wasserstein_penalty=10,  # 20
    l2_penalty_generator=l2_gen,
    l2_penalty_discriminator=l2_disc,
    lr_gencond=lr_gencond,
    lr_gen=lr_gen,
    lr_disc=lr_disc,
    noise_dim=16,  # 32
    noise_type="Uniform",  # "Gaussian"
    temp_start=10.0,  # 5.0, 30.0
    min_temperature=0.5,
    temperature_decay=1-5e-5,
    seed=15,  # seed
    use_gumbel=True,
    legacy_generator=False,
    gpu_id=gpu_id,
    plot_show=False,
)

intermediate:  Tensor("Generator/Generator.int_1/Tanh:0", shape=(128, 50), dtype=float32)
h:  Tensor("Generator/Generator.h_1/Tanh:0", shape=(128, 50), dtype=float32)
c:  Tensor("Generator/Generator.c_1/Tanh:0", shape=(128, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator/unstack:0", shape=(128, 7), dtype=float32)


In [5]:
""" ===================================================================================== """

# Pick the snapshot to evaluate and load its saved variables into the model's session.
model_name = "model_best_32" #24, "model_best_23"
print("Model: ", model_name)

checkpoint_path = "snapshots_shadow/" + model_name + ".ckpt"
saver = tf.train.Saver()
saver.restore(eggen.session, checkpoint_path)

Model:  model_best_32
INFO:tensorflow:Restoring parameters from snapshots_shadow/model_best_32.ckpt


## Generate graphs to evaluate performance 

In [6]:
# Build the sampling outputs once; `sample_many` is evaluated repeatedly in the
# cells below (with eggen.tau fed) to draw batches of random walks.
# NOTE(review): the literal 1000 equals `sample_batch` set earlier -- presumably
# the number of walks per evaluation; confirm in EGGen.generate_discrete.
sample_many, explain_conds = eggen.generate_discrete(1000, conds=True, rw_len=rw_len, reuse=True) 

intermediate:  Tensor("Generator_1/Generator.int_1/Tanh:0", shape=(1000, 50), dtype=float32)
h:  Tensor("Generator_1/Generator.h_1/Tanh:0", shape=(1000, 50), dtype=float32)
c:  Tensor("Generator_1/Generator.c_1/Tanh:0", shape=(1000, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator_1/unstack:0", shape=(1000, 7), dtype=float32)


In [7]:
# Draw `num_paths` batches of generated random walks and time the sampling.
# (`time` is imported in the notebook's import cell.)
t0 = time.time()

num_paths = 100 #500
# Report progress about three times. Clamped to at least 1 because
# round(num_paths/3) is 0 for num_paths < 2, which would make the modulo
# below raise ZeroDivisionError.
progress_every = max(1, round(num_paths/3))

samples = []
# A named loop index is used instead of `_`, which IPython reserves for the
# last cell output.
for path_idx in range(num_paths):
    if (path_idx+1) % progress_every == 0:
        print(path_idx+1)
    samples.append(sample_many.eval({eggen.tau: 0.5}))

t1 = time.time()
print("total time: ", t1-t0)

33
66
99
total time:  19.470201015472412


In [8]:
### Assemble score matrix from the random walks
# Flatten all sampled batches into one (num_walks, rw_len) array of node ids.
rws = np.array(samples).reshape([-1, rw_len])
scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

# The training graph is the reference for the target edge count. The previous
# rebuild of the same matrix from `train_ones` was overwritten immediately,
# so that dead construction has been dropped.
A_select = train_graph

In [9]:
### Better graph from scores ###
def graph_from_scores(scores, n_edges, threshold_scale=1.2):
    """Sample a symmetric binary graph from a matrix of edge scores.

    Phase 1 visits the nodes in random order and gives each one edge, drawn
    with probability proportional to its row of scores (uniformly among the
    *other* nodes when the row is all zero). Phase 2 adds further edges,
    sampled without replacement from the still-unassigned upper-triangle
    scores, until the requested total is reached.

    Parameters
    ----------
    scores : scipy sparse matrix, shape (N, N)
        Edge scores; only ``.shape`` and ``.toarray()`` are used. The
        diagonal is ignored.
    n_edges : number
        Target sum of the returned matrix. Every undirected edge contributes
        two entries, so this is twice the edge count (callers pass the sum of
        a symmetric adjacency matrix).
    threshold_scale : float, optional
        In phase 2, candidates scoring below ``threshold_scale`` times the
        mean non-zero candidate score are discarded. Defaults to 1.2, the
        value that used to be hard-coded.

    Returns
    -------
    target_g : the assembled (N, N) 0/1 array after ``utils.symmetric``
        (project helper; presumably enforces symmetry -- confirm in utils).

    Notes
    -----
    * Uses the global ``np.random`` state; seed it for reproducible output.
    * If thresholding leaves fewer candidates than edges still needed, the
      unthresholded candidates are used instead. If even those are too few,
      all of them are added and the result has fewer edges than requested.
      Previously both situations raised inside ``np.random.choice``.
    """
    target_g = np.zeros(scores.shape)  # graph under construction
    scores_int = scores.toarray().copy()  # dense working copy of the scores
    scores_int[np.diag_indices_from(scores_int)] = 0  # never sample self-loops
    N = scores.shape[0]

    # Phase 1: one edge per node, nodes visited in random order.
    for n in np.random.choice(N, replace=False, size=N):
        row = scores_int[n, :]
        row_total = row.sum()
        if row_total == 0:
            if N < 2:
                continue  # no other node to connect to
            # Uniform choice among the other N-1 nodes: draw from 0..N-2 and
            # skip over n so the fallback cannot create a self-loop.
            target = np.random.choice(N - 1)
            if target >= n:
                target += 1
        else:
            target = np.random.choice(N, p=row / row_total)
        target_g[n, target] = 1
        target_g[target, n] = 1

    # Phase 2: top up to the requested number of edges.
    n_missing = int((n_edges - target_g.sum()) / 2)
    if n_missing > 0:
        candidates = np.triu(scores_int)  # each undirected pair once
        candidates[target_g > 0] = 0  # drop pairs that are already edges
        n_candidates = np.count_nonzero(candidates)

        if n_candidates > 0:
            avg_threshold = candidates.sum() / n_candidates
            avg_threshold = avg_threshold * threshold_scale
            strong = candidates.copy()
            strong[strong < avg_threshold] = 0

            # Prefer the thresholded pool, but only if it can supply every
            # missing edge; otherwise fall back to all candidates.
            pool = strong if np.count_nonzero(strong) >= n_missing else candidates
            n_extra = min(n_missing, np.count_nonzero(pool))

            pool = pool / pool.sum()  # normalise to a probability distribution
            triu_ixs = np.triu_indices_from(pool)
            extra_edges = np.random.choice(triu_ixs[0].shape[0], replace=False,
                                           p=pool[triu_ixs], size=n_extra)

            target_g[(triu_ixs[0][extra_edges], triu_ixs[1][extra_edges])] = 1
            target_g[(triu_ixs[1][extra_edges], triu_ixs[0][extra_edges])] = 1

    target_g = utils.symmetric(target_g)
    return target_g

In [10]:
### New graph statistics ###
# Build one graph from the score matrix and compute its statistics.
target_g = graph_from_scores(scores_matrix, A_select.sum())
newstats = utils.compute_graph_statistics(target_g)

# Print the headline statistics, one value per line, in reporting order.
for stat_key in ('assortativity', 'clustering_coefficient', 'cpl',
                 'gini', 'd_max', 'power_law_exp'):
    print(newstats[stat_key])

  (Theoretical_CDF * (1 - Theoretical_CDF))


-0.08121550638631311
0.0020952921658643053
5.2268463467071395
0.45947556771350606
224.0
1.8303458975578026


### Graph stats over 5 runs

In [11]:
# One accumulator list per statistic; the trial loop in this cell appends one
# value per trial to most of them.
# NOTE: `lcc` here rebinds the name that earlier held the node indices of the
# largest connected component, and `intra` / `inter` are created but never
# appended to in this cell.
dmax,dmin,deg,lcc,wc,cc,tc,sc,law,gini,rel,assrt,coe,ncomp,intra,inter,cpl,eo = ([] for i in range(18))
# number of independent generate-and-evaluate repetitions
num_trials = 5 #1 #
# number of sample_many evaluations drawn per trial
num_paths = 100

def compute_stats(samples):
    """Turn sampled random walks into a graph and evaluate it.

    Reads the notebook globals ``rw_len``, ``_N`` and ``train_graph``.

    Parameters
    ----------
    samples : sequence of arrays
        Batches of generated random walks; reshaped to rows of ``rw_len``
        node ids.

    Returns
    -------
    stats : the value returned by ``utils.compute_graph_statistics`` for the
        sampled graph (indexed by statistic name by the caller).
    EO : ``utils.edge_overlap`` between the training graph and the sampled
        graph, divided by the sum of the training adjacency matrix.
    """
    rws = np.array(samples).reshape([-1, rw_len])
    print("rws: ", rws.shape)
    scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

    # The training graph is the reference. It used to be rebuilt from
    # `train_ones` and then overwritten on the next line; that dead
    # construction (and the dependency on `train_ones`) is gone.
    A_select = train_graph
    sampled_graph = graph_from_scores(scores_matrix, A_select.sum())
    EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()

    stats = utils.compute_graph_statistics(sampled_graph)
    return stats, EO

import os
# Imported before the loop, and the per-trial result is called `trial_stats`,
# so the name `stats` refers to scipy.stats throughout this cell instead of
# switching from a dict to a module halfway through.
from scipy import stats

# Report progress about three times per trial. Clamped to at least 1 because
# round(num_paths/3) is 0 for num_paths < 2, which would make the modulo
# below raise ZeroDivisionError.
progress_every = max(1, round(num_paths/3))

for trials in range(num_trials):
    print("trial: ", trials)
    start_time = time.time()

    # Draw a fresh set of generated random walks for this trial. A named loop
    # index is used instead of `_`, which IPython reserves for the last output.
    samples = []
    for path_idx in range(num_paths): #6000):
        if (path_idx+1) % progress_every == 0:
            print(path_idx+1)
        samples.append(sample_many.eval({eggen.tau: 0.5}))

    trial_stats, EO = compute_stats(samples)
    print("Trial %i: --- %s seconds ---" % (trials, time.time() - start_time))

    dmax.append(trial_stats['d_max'])
    dmin.append(trial_stats['d_min'])
    deg.append(trial_stats['d'])
    lcc.append(trial_stats['LCC'])
    wc.append(trial_stats['wedge_count'])
    cc.append(trial_stats['claw_count'])
    tc.append(trial_stats['triangle_count'])
    sc.append(trial_stats['square_count'])
    law.append(trial_stats['power_law_exp'])
    gini.append(trial_stats['gini'])
    rel.append(trial_stats['rel_edge_distr_entropy'])
    assrt.append(trial_stats['assortativity'])
    coe.append(trial_stats['clustering_coefficient'])
    ncomp.append(trial_stats['n_components'])
    cpl.append(trial_stats['cpl'])
    eo.append(EO)

    print("stats: ", trial_stats)


# Reporting order of the rows written to disk:
# ====== ASST CLUST CPL GINI MD PLE EO ====== followed by the remaining statistics.
all_stats = [assrt, coe, cpl, gini, dmax, law, eo, dmin, deg, lcc, wc, cc, tc, sc, rel, ncomp]

# Mean and standard error over trials, derived from all_stats so the three
# outputs cannot drift out of order relative to each other.
avg_stats = [np.mean(values) for values in all_stats]
print("avg_stats: ", avg_stats)

stderror_stats = [stats.sem(values) for values in all_stats]

save_directory = "./generate_stats" #"./snapshots_gencond"  #"./snapshots_gencond2"
data_name  = "coraml" #"EUcore" #

save_stats = "{}/{}_{}_numpaths{}.txt".format(save_directory, data_name, model_name, num_paths)

# np.savetxt does not create missing directories; make sure the target exists.
os.makedirs(save_directory, exist_ok=True)
# One row per statistic: mean, standard error, then one column per trial.
np.savetxt(save_stats, np.c_[avg_stats, stderror_stats, all_stats])

trial:  0
33
66
99
rws:  (100000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 0: --- 22.356707334518433 seconds ---
stats:  {'d_max': 229.0, 'd_min': 1.0, 'd': 4.8277580071174375, 'LCC': 2810, 'wedge_count': 94343.0, 'claw_count': 2734118.0, 'triangle_count': 1659, 'square_count': 167, 'power_law_exp': 1.8290776033111555, 'gini': 0.4571380828038276, 'rel_edge_distr_entropy': 0.9465809567771357, 'assortativity': -0.0810516318643556, 'clustering_coefficient': 0.001820331090318706, 'n_components': 1, 'cpl': 5.198218233461586}
trial:  1
33
66
99
rws:  (100000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 1: --- 22.34714365005493 seconds ---
stats:  {'d_max': 250.0, 'd_min': 1.0, 'd': 4.8277580071174375, 'LCC': 2810, 'wedge_count': 97351.0, 'claw_count': 3236739.0, 'triangle_count': 1743, 'square_count': 176, 'power_law_exp': 1.8295333940545764, 'gini': 0.45808555300749254, 'rel_edge_distr_entropy': 0.9462880696081627, 'assortativity': -0.07571898430923082, 'clustering_coefficient': 0.0016155148746933256, 'n_components': 1, 'cpl': 5.175427483343447}
trial:  2
33
66
99
rws:  (100000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 2: --- 21.671723127365112 seconds ---
stats:  {'d_max': 203.0, 'd_min': 1.0, 'd': 4.8277580071174375, 'LCC': 2810, 'wedge_count': 90065.0, 'claw_count': 2125151.0, 'triangle_count': 1767, 'square_count': 192, 'power_law_exp': 1.8349102154504622, 'gini': 0.46396412844965673, 'rel_edge_distr_entropy': 0.9459418806344562, 'assortativity': -0.09080909741956121, 'clustering_coefficient': 0.0024944109853840975, 'n_components': 1, 'cpl': 5.219538620777901}
trial:  3
33
66
99
rws:  (100000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 3: --- 22.37575101852417 seconds ---
stats:  {'d_max': 237.0, 'd_min': 1.0, 'd': 4.8277580071174375, 'LCC': 2810, 'wedge_count': 95389.0, 'claw_count': 2908653.0, 'triangle_count': 1758, 'square_count': 188, 'power_law_exp': 1.830838067954034, 'gini': 0.45724395770670134, 'rel_edge_distr_entropy': 0.9465557806636608, 'assortativity': -0.07404191146286149, 'clustering_coefficient': 0.001813210444834774, 'n_components': 1, 'cpl': 5.157752724149246}
trial:  4
33
66
99
rws:  (100000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 4: --- 21.991105556488037 seconds ---
stats:  {'d_max': 229.0, 'd_min': 1.0, 'd': 4.8277580071174375, 'LCC': 2810, 'wedge_count': 94617.0, 'claw_count': 2747587.0, 'triangle_count': 1641, 'square_count': 159, 'power_law_exp': 1.8310659190796217, 'gini': 0.46012514014783656, 'rel_edge_distr_entropy': 0.9462562903031259, 'assortativity': -0.08555426576616852, 'clustering_coefficient': 0.0017917540008742217, 'n_components': 1, 'cpl': 5.186496125189876}
avg_stats:  [-0.08143517816443553, 0.0019070442792210249, 5.187486637384412, 0.45931137242310294, 229.6, 1.8310850399699699, 0.27645584549609314, 1.0, 4.8277580071174375, 2810.0, 94353.0, 2750449.6, 1713.6, 176.4, 0.9463245955973083, 1.0]
