## Explainable Conditional LSTM random walk graph GAN training
Enron

In [1]:
from eggen import utils
from eggen.eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

  return f(*args, **kwds)


In [2]:
# ## Load data
# """ Entire set of email data """
data = pd.read_csv('./data/Enron_messages.csv')
print("Data chunk size: " ,len(data))

#### Dept data (conditions)
cond_list = pd.read_csv('./data/Enron_DeptList.csv') #, sep=" ", header=None)
cond_list.columns = ['ID', 'DEPT']
cond_list = cond_list.values


import networkx as nx
import nxviz as nv

G = nx.from_pandas_edgelist(data, 'Sender ID', 'Receiver ID') #, edge_attr=['SENDER DE


## Preparing data
Adjtraining = nx.adjacency_matrix(G)
Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')
_A_obs = Adjtraining
_A_obs = _A_obs + _A_obs.T # 
_A_obs[_A_obs > 1] = 1 # Max value of 1 

""" Reduce input graph to a subgraph where only the nodes in largest n_components are kept. """ 
lcc = utils.largest_connected_components(_A_obs) # 
_A_obs = _A_obs[lcc,:][:,lcc] # 
_N = _A_obs.shape[0] # 

""" Get the subset of cond_list before reindexing from 0 """
ind_nonlcc = np.delete(np.arange(len(cond_list)), lcc)
lcc_condlist = np.delete(cond_list, ind_nonlcc, 0)
print("Length of cond_list", len(cond_list))

print("**** N nodes {:}, E edges: {:}, K classes: {:} ****".format(_N,
                                                                  np.count_nonzero(_A_obs.todense())/2,
                                                                  len(np.unique(lcc_condlist[:,1]))
                                                                  ))

#### Separate the edges into train, test, validation
val_share = 0.1
test_share = 0.05
seed = 2020 #  
"""
Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. 
"""
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False) 

## EGGen
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()


Data chunk size:  38388
Selecting 1 largest connected components
Length of cond_list 156
**** N nodes 154, E edges: 1843.0, K classes: 3 ****


In [3]:
#### Parameters
""" Adjustable parameters for training. """ 
# setting GPU id 
gpu_id = 1
# setting the number of nodes
_N = _A_obs.shape[0]
# setting the length of random walks
rw_len = 16 #
# setting the training data batch size
batch_size = 128 #
# getting the number of departments
n_conds=len(np.unique(lcc_condlist[:,1]))
print("n_conds: ", n_conds)
sample_batch = 128 #
log_num = 31 #

walker = utils.RandomWalker(train_graph, lcc_condlist, rw_len, p=1, q=1, batch_size=batch_size, sample_batch=sample_batch)

n_conds:  3


## Create our generative model 

In [4]:
l2_gen=1e-7; l2_disc=5e-5 #1e-4 
gencond_lay=[10]; gen_lay=[50]; disc_lay=[40] 
lr_gencond=0.01; lr_gen=0.0002; lr_disc=0.0002
gencond_iters=1; gen_iters=1; disc_iters=3
discWdown_size=128; genWdown_size=128 

eggen = EGGen(_N,
rw_len,
walk_generator=walker,
n_conds=n_conds,
condgenerator_layers=gencond_lay,
generator_layers=gen_lay,
discriminator_layers=disc_lay,
W_down_discriminator_size=discWdown_size,
W_down_generator_size=genWdown_size,
batch_size=batch_size,
sample_batch=sample_batch,
condition_dim=n_conds,
gencond_iters=gencond_iters,
gen_iters=gen_iters,
disc_iters=disc_iters,
wasserstein_penalty=10, 
l2_penalty_generator=l2_gen,
l2_penalty_discriminator=l2_disc,
lr_gencond=lr_gencond,
lr_gen=lr_gen,
lr_disc=lr_disc,
noise_dim=16, #
noise_type="Gaussian", #
temp_start=5.0, #
min_temperature=0.5,
temperature_decay=1-5e-5,
seed=15, #seed, #
use_gumbel=True,
legacy_generator=False,
gpu_id=gpu_id,
plot_show=False
)

intermediate:  Tensor("Generator/Generator.int_1/Tanh:0", shape=(128, 50), dtype=float32)
h:  Tensor("Generator/Generator.h_1/Tanh:0", shape=(128, 50), dtype=float32)
c:  Tensor("Generator/Generator.c_1/Tanh:0", shape=(128, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator/unstack:0", shape=(128, 3), dtype=float32)
self.fake_conds:  Tensor("Gen_shadow/concat:0", shape=(128, 16, 3), dtype=float32)
self.real_conds:  Tensor("one_hot_1:0", shape=(128, 16, 3), dtype=float32)
self.diff_conds:  Tensor("sub_2:0", shape=(128, 16, 3), dtype=float32)


In [None]:
# #### Define the stopping criterion
stopping_criterion = "val"
assert stopping_criterion in ["val", "eo"], "Please set the desired stopping criterion."
if stopping_criterion == "val": # use val criterion for early stopping
    stopping = None
elif stopping_criterion == "eo":  #use eo criterion for early stopping
    stopping = 0.5 # set the target edge overlap here


# #### Train the model
eval_every = plot_every = 1000 #200
max_iters = 20000 #10000 #
patience= 20 

## Training the model 

In [None]:
# train and save model to ./snapshots/
log_dict = eggen.train(A_orig=_A_obs, val_ones=val_ones, val_zeros=val_zeros, stopping=stopping,
                        eval_every=eval_every, plot_every=plot_every, max_patience=patience, max_iters=max_iters)

In [None]:
# #### Save the training log
## when changing the directory, remember to change directory in eggen.train() too
# save_directory = "./testing"
save_directory = "./snapshots_shadow" #"./snapshots_gencond"  #"./snapshots_gencond2" 
model_name = "shadowgen"
# log_num = 1

save_log = "{}/log{}_{}_maxiter{}_evalevery{}.pkl".format(save_directory, log_num, model_name, max_iters, eval_every)
f = open(save_log,"wb")
pickle.dump(log_dict,f)
f.close()

## Load trained model 

In [12]:
""" ===================================================================================== """
model_name = "model_best_31"
print("Model: ", model_name)
saver = tf.train.Saver()
saver.restore(eggen.session, "snapshots_shadow/" + model_name + ".ckpt") # 

Model:  model_best_31
INFO:tensorflow:Restoring parameters from snapshots_shadow/model_best_31.ckpt


## Generate graphs to evaluate performance

In [13]:
sample_many, explain_conds = eggen.generate_discrete(1000, conds=True, rw_len=rw_len, reuse=True) 

intermediate:  Tensor("Generator_2/Generator.int_1/Tanh:0", shape=(1000, 50), dtype=float32)
h:  Tensor("Generator_2/Generator.h_1/Tanh:0", shape=(1000, 50), dtype=float32)
c:  Tensor("Generator_2/Generator.c_1/Tanh:0", shape=(1000, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator_2/unstack:0", shape=(1000, 3), dtype=float32)


In [14]:
import time
t0 = time.time()

num_paths = 150 #50 #200 #
samples = []
for _ in range(num_paths): 
    if (_+1) % round(num_paths/3) == 0:
        print(_+1)
    samples.append(sample_many.eval({eggen.tau: 0.5}))
    
t1 = time.time()
print("total time: ", t1-t0)

50
100
150
total time:  51.60448503494263


In [15]:
### Assemble score matrix from the random walks
rws = np.array(samples).reshape([-1, rw_len])
scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

A_select = sp.csr_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])))
A_select = train_graph

In [42]:
def graph_from_scores(scores, n_edges):
#     scores=scores_matrix; n_edges=A_select.sum()

    target_g = np.zeros(scores.shape) # initialize target graph
    scores_int = scores.toarray().copy() # internal copy of the scores matrix
    scores_int[np.diag_indices_from(scores_int)] = 0  # set diagonal to zero
    N = scores.shape[0]
#     print("N: ", N)

    for n in np.random.choice(N, replace=False, size=N): # Iterate the nodes in random order
        row = scores_int[n,:].copy()
        if row.sum() == 0:
            target = np.random.choice(N)
    #         continue
        else:
            probs = row / row.sum()
            target = np.random.choice(N, p=probs)
    #         target = np.argmax(probs) # argmax probs
        target_g[n, target] = 1
        target_g[target, n] = 1

    # print(target_g.sum()/2)
    diff = (n_edges - target_g.sum())/2
#     print("diff: ", diff)
#     print("n_edges - N: ", n_edges/2 - N)
    if diff > 0:   
        triu = np.triu(scores_int) # upper triangle
        triu[target_g > 0] = 0 # set previously assigned edge to zero
        # print("triu nonzeros: ",np.count_nonzero(triu))

        num_elements = np.count_nonzero(triu) #len(triu[triu>0]) 
        avg_threshold = triu.sum()/num_elements
        tau = 1.2
        avg_threshold = avg_threshold*tau #1.485# tune
#         print("avg_threshold: ", avg_threshold)
        triu[triu < avg_threshold] = 0 # 
#         print("triu nonzeros: ",np.count_nonzero(triu))

        triu = triu / triu.sum() # every count divided by total sum
        triu_ixs = np.triu_indices_from(triu) # indices
        extra_edges = np.random.choice(triu_ixs[0].shape[0], replace=False, p=triu[triu_ixs], size=int(diff))
    #     extra_edges = triu[triu_ixs].argsort()[-int(diff):][::-1] # choose top k based on prob

        target_g[(triu_ixs[0][extra_edges], triu_ixs[1][extra_edges])] = 1
        target_g[(triu_ixs[1][extra_edges], triu_ixs[0][extra_edges])] = 1

    target_g = utils.symmetric(target_g)
    return target_g

In [43]:
### New graph statistics ###
target_g = graph_from_scores(scores_matrix, A_select.sum())
newstats = utils.compute_graph_statistics(target_g)
EO = utils.edge_overlap(A_select.toarray(), target_g)/A_select.sum()

print(newstats['assortativity'])
print(newstats['clustering_coefficient'])
print(newstats['cpl'])
print(newstats['gini'])
print(newstats['d_max'])
print(newstats['power_law_exp'])
print(EO)

-0.016128997714528234
0.03561309298936802
2.1959935489347253
0.27900101175963243
74.0
1.3516519169623282
0.6028097062579821


  (Theoretical_CDF * (1 - Theoretical_CDF))


### Graph stats over 5 runs

In [44]:
dmax,dmin,deg,lcc,wc,cc,tc,sc,law,gini,rel,assrt,coe,ncomp,intra,inter,cpl,eo = ([] for i in range(18))
num_trials = 5 #1 #
num_paths = 150 #100 #

def compute_stats(samples):
    rws = np.array(samples).reshape([-1, rw_len])
    print("rws: ", rws.shape)
    scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()
    
    A_select = sp.csr_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])))
    A_select = train_graph
#     sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())
    sampled_graph = graph_from_scores(scores_matrix, A_select.sum())
    EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()
    
    stats = utils.compute_graph_statistics(sampled_graph) #, encoded)
    return stats, EO

for trials in range(num_trials):
    print("trial: ", trials)
    start_time = time.time()
  
    samples = []
    for _ in range(num_paths): #6000):
        if (_+1) % round(num_paths/3) == 0:
            print(_+1)
        samples.append(sample_many.eval({eggen.tau: 0.5}))

    stats, EO = compute_stats(samples)
    print("Trial %i: --- %s seconds ---" % (trials, time.time() - start_time))

    dmax.append(stats['d_max'])
    dmin.append(stats['d_min'])
    deg.append(stats['d'])
    lcc.append(stats['LCC'])
    wc.append(stats['wedge_count'])
    cc.append(stats['claw_count'])
    tc.append(stats['triangle_count'])
    sc.append(stats['square_count'])
    law.append(stats['power_law_exp'])
    gini.append(stats['gini'])
    rel.append(stats['rel_edge_distr_entropy'])
    assrt.append(stats['assortativity'])
    coe.append(stats['clustering_coefficient'])
    ncomp.append(stats['n_components'])
    cpl.append(stats['cpl'])
    eo.append(EO)

    print("stats: ", stats)


# all_stats = [dmax, dmin, deg, lcc, wc, cc, tc, sc, law, gini, rel, assrt, coe, ncomp, cpl, eo]
# ====== ASST CLUST CPL GINI MD PLE EO ====== 
all_stats = [assrt, coe, cpl, gini, dmax, law, eo, dmin, deg, lcc, wc, cc, tc, sc, rel, ncomp]

# avg_stats = [np.mean(dmax), np.mean(dmin), np.mean(deg), np.mean(lcc), np.mean(wc), np.mean(cc), np.mean(tc), np.mean(sc), np.mean(law), np.mean(gini), np.mean(rel), np.mean(assrt), np.mean(coe), np.mean(ncomp), np.mean(cpl), np.mean(eo)]
avg_stats = [np.mean(assrt), np.mean(coe), np.mean(cpl), np.mean(gini), np.mean(dmax), np.mean(law), np.mean(eo), np.mean(dmin), np.mean(deg), np.mean(lcc), np.mean(wc), np.mean(cc), np.mean(tc), np.mean(sc), np.mean(rel), np.mean(ncomp)]
print("avg_stats: ", avg_stats)


from scipy import stats
# stderror_stats = [stats.sem(dmax), stats.sem(dmin), stats.sem(deg), stats.sem(lcc), stats.sem(wc), stats.sem(cc), stats.sem(tc), stats.sem(sc), stats.sem(law), stats.sem(gini), stats.sem(rel), stats.sem(assrt), stats.sem(coe), stats.sem(ncomp), stats.sem(cpl), stats.sem(eo)]
stderror_stats = [stats.sem(assrt), stats.sem(coe), stats.sem(cpl), stats.sem(gini), stats.sem(dmax), stats.sem(law), stats.sem(eo), stats.sem(dmin), stats.sem(deg), stats.sem(lcc), stats.sem(wc), stats.sem(cc), stats.sem(tc), stats.sem(sc), stats.sem(rel), stats.sem(ncomp)]
    
save_directory = "./generate_stats" #"./snapshots_gencond"  #"./snapshots_gencond2" 
# log_num = 8
data_name  = "enron" #"EUcore" #

# save_stats = "{}/{}_stats{}.txt".format(save_directory, data_name, log_num)
save_stats = "{}/{}_{}_numpaths{}.txt".format(save_directory, data_name, model_name, num_paths)

np.savetxt(save_stats, np.c_[avg_stats ,stderror_stats , all_stats])

trial:  0
83
166
249
rws:  (250000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 0: --- 80.33452153205872 seconds ---
stats:  {'d_max': 74.0, 'd_min': 1.0, 'd': 20.337662337662337, 'LCC': 154, 'wedge_count': 40321.0, 'claw_count': 440071.0, 'triangle_count': 5245, 'square_count': 7992, 'power_law_exp': 1.3507800733583044, 'gini': 0.27799339868305384, 'rel_edge_distr_entropy': 0.9714924245650257, 'assortativity': 0.002445623435882296, 'clustering_coefficient': 0.035755593983698084, 'n_components': 1, 'cpl': 2.2087259146082676}
trial:  1
83
166
249
rws:  (250000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 1: --- 78.94564843177795 seconds ---
stats:  {'d_max': 78.0, 'd_min': 1.0, 'd': 20.337662337662337, 'LCC': 154, 'wedge_count': 40375.0, 'claw_count': 456093.0, 'triangle_count': 5158, 'square_count': 8205, 'power_law_exp': 1.3493711824225898, 'gini': 0.26886268265578606, 'rel_edge_distr_entropy': 0.9724966906441881, 'assortativity': -0.012796594894220007, 'clustering_coefficient': 0.0339272911445692, 'n_components': 1, 'cpl': 2.194295900178253}
trial:  2
83
166
249
rws:  (250000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 2: --- 79.27358531951904 seconds ---
stats:  {'d_max': 77.0, 'd_min': 1.0, 'd': 20.337662337662337, 'LCC': 154, 'wedge_count': 40924.0, 'claw_count': 462210.0, 'triangle_count': 5271, 'square_count': 8130, 'power_law_exp': 1.3523172231226654, 'gini': 0.2815262642848848, 'rel_edge_distr_entropy': 0.9699629113385164, 'assortativity': -0.007837674332687694, 'clustering_coefficient': 0.03421172194457065, 'n_components': 1, 'cpl': 2.1858925388337154}
trial:  3
83
166
249
rws:  (250000, 16)


  (Theoretical_CDF * (1 - Theoretical_CDF))


Trial 3: --- 78.25892901420593 seconds ---
stats:  {'d_max': 79.0, 'd_min': 2.0, 'd': 20.337662337662337, 'LCC': 154, 'wedge_count': 40647.0, 'claw_count': 461975.0, 'triangle_count': 5190, 'square_count': 8033, 'power_law_exp': 1.461231112257116, 'gini': 0.27315436798195414, 'rel_edge_distr_entropy': 0.9718000748748823, 'assortativity': -0.015050938753416112, 'clustering_coefficient': 0.03370312246333676, 'n_components': 1, 'cpl': 2.1838553603259485}
trial:  4
83
166
249
rws:  (250000, 16)
Trial 4: --- 78.28544306755066 seconds ---
stats:  {'d_max': 74.0, 'd_min': 1.0, 'd': 20.337662337662337, 'LCC': 154, 'wedge_count': 40344.0, 'claw_count': 444445.0, 'triangle_count': 5398, 'square_count': 8953, 'power_law_exp': 1.351079440742599, 'gini': 0.27322071287588523, 'rel_edge_distr_entropy': 0.9715763246583421, 'assortativity': 0.011915675048934852, 'clustering_coefficient': 0.03643645445443193, 'n_components': 1, 'cpl': 2.224089635854342}
avg_stats:  [-0.004264781899101333, 0.034806836798

  (Theoretical_CDF * (1 - Theoretical_CDF))
