## Explainable Conditional LSTM random walk graph GAN training


In [None]:
from eggen import utils
from eggen.eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

## Load data
# Full set of e-mail records (one row per message).
data = pd.read_csv('./data/EU-core-join.csv')
print("Data chunk size: " ,len(data))

#### Dept data (conditions)
# Space-separated, header-less label file; only the raw value array is kept.
dept_frame = pd.read_csv('./data/raw/email-Eu-core-department-labels.txt', sep=" ", header=None)
dept_frame.columns = ['ID', 'DEPT']
cond_list = dept_frame.values

############################ Top N departments ############################
# Number of e-mails per (receiver dept, sender dept) pair, largest first.
count_series = data.groupby(['RECEIVER DEPT', 'SENDER DEPT']).size()
df_count = (
    count_series
    .to_frame(name='size')
    .reset_index()
    .sort_values(by=['size'], ascending=False)
)

# Display-friendly column names. They are set on df_count itself because the
# department selection below reads the 'Receiver dept' column from df_count.
df_count.columns = ['Receiver dept', 'Sender dept', 'Email count']
df_countshow = df_count.reset_index(drop=True)
df_countshow.head(10)

############################ Top N departments ############################
# Select top N departments: the receiver departments of the N busiest pairs,
# in ascending order.
topN_grp = 5
dept_list = sorted(df_count['Receiver dept'].iloc[:topN_grp].tolist()) #RECEIVER DEPT'].tolist()
dept_list

def no_selfloop(df, x):
    """Return `df` without the self-addressed messages among its first `x` rows.

    A row counts as a self-loop when its 'SENDER' and 'RECEIVER' values are
    equal. Only the first `x` rows (by position) are inspected; rows after
    that are always kept. `x` is clamped to the range [0, len(df)].

    The number of self-loop rows found is printed. The returned frame keeps
    the original index labels, so callers usually want to reset the index
    afterwards.
    """
    n_checked = max(0, min(x, len(df)))
    head = df.iloc[:n_checked]

    # Vectorised comparison instead of one .iloc lookup per row.
    is_loop = (head['SENDER'] == head['RECEIVER']).values
    rows = np.flatnonzero(is_loop)
    print(len(rows))

    # Positional keep-mask over the whole frame; unchecked rows stay True.
    keep = np.ones(len(df), dtype=bool)
    keep[rows] = False
    return df.iloc[keep]

#### Clean data
# Drop self-addressed messages, then rebuild a 0..n-1 index (the old index is
# kept as an 'index' column by reset_index()).
x = len(data.index)
data = no_selfloop(data, x).reset_index()
n_dropped = x - len(data.index)
pct_dropped = np.round((n_dropped / x) * 100, decimals=2)
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(n_dropped, pct_dropped))

# Row count of the cleaned frame, used by the subsetting step further down.
x = len(data.index)

############################ Top N departments ############################
def subset_dept(df, x, dept_list):
    """Return the rows of `df` whose sender AND receiver departments are both in `dept_list`.

    Only the first `x` rows (by position) are considered; `x` is clamped to
    the range [0, len(df)]. A row is kept when both its 'SENDER DEPT' and its
    'RECEIVER DEPT' value appear in `dept_list`.

    The returned frame keeps the original index labels and row order.
    """
    n_checked = max(0, min(x, len(df)))
    # Work on the frame that was passed in, not on a module-level variable.
    head = df.iloc[:n_checked]

    # only depts in the dept list (both endpoints must belong to it)
    sender_ok = head['SENDER DEPT'].isin(dept_list)
    receiver_ok = head['RECEIVER DEPT'].isin(dept_list)
    keep = (sender_ok & receiver_ok).values

    return head.iloc[np.flatnonzero(keep)]

############################ Top N departments ############################
# Give each of the five selected departments its own name.
grpone, grptwo, grpthree, grpfour, grpfive = (
    dept_list[0], dept_list[1], dept_list[2], dept_list[3], dept_list[4]
)
# dept_list = [grpone, grptwo, grpthree]#, grpfour, grpfive]
dept_list.sort()
# One past the largest selected department id.
grpothers = dept_list[-1] + 1

# Keep only messages exchanged inside the selected departments.
data_small = subset_dept(data, x, dept_list)
# data = data.reset_index()
len(data_small)

In [None]:
import networkx as nx
import nxviz as nv

G = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'

# Get the subset of cond_list before reindexing from 0.
# Every row of cond_list is a candidate node (1005 rows for the EU-core label
# file); nodes that do not occur in G are removed from the condition list.
all_nodes = np.arange(cond_list.shape[0])
train_nodes = np.array(list(G.nodes))
nontrain_nodes = np.setdiff1d(all_nodes, train_nodes)
nontrain = nontrain_nodes.tolist()

# np.delete removes rows by position, so this relies on row i of cond_list
# describing node id i. The surviving rows are in ascending node-id order.
subset_condlist = np.delete(cond_list, nontrain, 0)

# reindexing the condlist subset
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])

# Relabel node indices in G to match the generated indices.
# ordering="sorted" gives the i-th smallest original id the new label i, which
# is the order of the rows in subset_condlist. The default ordering follows
# edge-insertion order and would pair nodes with the wrong condition rows.
G = nx.convert_node_labels_to_integers(G, ordering="sorted")

print("Number of nodes in G: " ,G.number_of_nodes())
print("Number of edges in G: " ,G.number_of_edges())
# Module-level function; the Graph method of the same name is gone in current networkx.
print("Number of selfloops in G: " ,nx.number_of_selfloops(G))

## Preparing data
# Pass an explicit nodelist so that row/column i of the matrix is node label i
# (relabelling does not reorder the graph's internal node sequence).
node_order = sorted(G.nodes())
Adjtraining = nx.adjacency_matrix(G, nodelist=node_order)
Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')
_A_obs = Adjtraining
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1 # Max value of 1

# Reduce input graph to a subgraph where only the nodes in largest n_components are kept.
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

# Apply the same node selection to the condition list so that row i still
# describes node i of the reduced graph, then renumber the id column.
# NOTE(review): assumes `lcc` holds the kept node indices (it is used as a
# row/column index just above) - confirm against utils.largest_connected_components.
subset_condlist = subset_condlist[lcc]
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])

#### Separate the edges into train, test, validation
val_share = 0.1
test_share = 0.05
seed = 2020 #481516234
# Split the edges of the adjacency matrix into train, validation and test
# edges and randomly sample an equal amount of validation and test non-edges.
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(
    _A_obs,
    val_share,
    test_share,
    seed,
    undirected=True,
    connected=True,
    asserts=False,
)

## EGGen
# Sparse training graph with a 1 at every (row, col) pair listed in train_ones.
train_graph = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1]))
).tocsr()
# The training graph must be symmetric.
train_dense = train_graph.toarray()
assert (train_dense == train_dense.T).all()
del train_dense

#### Parameters
# Adjustable parameters for training.
gpu_id = 1                  # GPU id
_N = _A_obs.shape[0]        # number of nodes
rw_len = 32 #16 #5          # length of random walks
batch_size = 128            # training data batch size
# number of departments, taken over every department id in `data`
n_conds=np.unique(data[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0] #42
# n_conds=np.unique(data_small[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0]
# n_conds=len(dept_list)
sample_batch=1000

walker = utils.RandomWalker(
    train_graph,
    subset_condlist,
    rw_len,
    p=1,
    q=1,
    batch_size=batch_size,
    sample_batch=sample_batch,
)

In [None]:
# Summary of the prepared graph: non-zero adjacency entries, number of
# condition rows, then a one-line recap.
n_nonzero = np.count_nonzero(_A_obs.todense())
print(n_nonzero)
print(len(subset_condlist))

n_classes = len(np.unique(subset_condlist[:,1]))
summary_fmt = "**** N nodes {:}, E edges: {:}, K classes: {:} ****"
print(summary_fmt.format(_N, n_nonzero, n_classes))

In [None]:
# Reindex condlist for plot
# NOTE(review): plt_condlist is the same array object as subset_condlist (no
# copy is made), so the relabelling below also rewrites subset_condlist, which
# was already handed to the random walker. Confirm that this is intended.
plt_condlist = subset_condlist

# Map the distinct department ids to consecutive codes 0..K-1 in ascending
# order, e.g. [ 1  4  7 14 21] -> [0 1 2 3 4]. dept_labels keeps the ids as
# they were before relabelling so the histogram ticks can show them.
dept_labels, dept_codes = np.unique(plt_condlist[:,1], return_inverse=True)
plt_condlist[:,1] = dept_codes

plt_condlist

unique_conds, conds_counts = np.unique(plt_condlist[:,1], return_counts=True)
print(unique_conds)
print(conds_counts)

def bins_labels(bins, labels=None, **kwargs):
    """Put one tick label at the centre of every histogram bin.

    bins   -- bin edges; must contain at least two values.
    labels -- one label per bin; defaults to [1, 4, 7, 14, 21].
    kwargs -- forwarded to plt.xticks (e.g. fontsize).

    Also sets the x-axis limits to the first and last bin edge.
    """
    if labels is None:
        labels = [1, 4, 7, 14, 21]
    bin_w = (max(bins) - min(bins)) / (len(bins) - 1)
    tick_positions = np.arange(min(bins)+bin_w/2, max(bins), bin_w)
    plt.xticks(tick_positions, labels, **kwargs)
    plt.xlim(bins[0], bins[-1])


import matplotlib.pyplot as plt
# One unit-wide bin per condition code.
bins = range(len(unique_conds)+1)
plt.hist(plt_condlist[:,1], bins=bins)
# plt.xticks(list(range(len(conds_counts))), [1, 4, 7, 14, 21])
bins_labels(bins, labels=dept_labels.tolist(), fontsize=15)

# plt.title("EUCORE-TOP")
# plt.savefig('./image/eucore-top.png')
plt.show()

## ====== Create our generative model ======

In [None]:
# L2 penalties for generator and discriminator
l2_gen = 1e-7
l2_disc = 5e-5 #1e-4

# Layer sizes of the condition generator, generator and discriminator
gencond_lay = [10]
gen_lay = [50]
disc_lay = [40] #35] #

# Learning rates
# lr_gencond=0.0002; lr_gen=0.0002; lr_disc=0.0002 #0.0002 #
lr_gencond = 0.01
lr_gen = 0.0003
lr_disc = 0.0003 #0.0002

# Update steps per training iteration
gencond_iters = 1
gen_iters = 1
disc_iters = 3

# Sizes passed as W_down_discriminator_size / W_down_generator_size
discWdown_size = 128
genWdown_size = 128 #128; 128

# (An otherwise identical configuration with plot_show=False was used before.)
eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker,
    n_conds=n_conds,
    condition_dim=n_conds,
    gpu_id=gpu_id,
    use_gumbel=True,
    gencond_iters=gencond_iters,
    gen_iters=gen_iters,
    disc_iters=disc_iters,
    W_down_discriminator_size=discWdown_size,
    W_down_generator_size=genWdown_size,
    l2_penalty_generator=l2_gen,
    l2_penalty_discriminator=l2_disc,
    condgenerator_layers=gencond_lay,
    generator_layers=gen_lay,
    discriminator_layers=disc_lay,
    temp_start=5,
    lr_gencond=lr_gencond,
    lr_gen=lr_gen,
    lr_disc=lr_disc,
    plot_show=True,
    sample_batch=sample_batch,
)

In [None]:
import os

# #### Define the stopping criterion
stopping_criterion = "val"
assert stopping_criterion in ["val", "eo"], "Please set the desired stopping criterion."
if stopping_criterion == "val": # use val criterion for early stopping
    stopping = None
elif stopping_criterion == "eo":  #use eo criterion for early stopping
    stopping = 0.5 # set the target edge overlap here


# #### Training schedule
eval_every = 2000 #1000
plot_every = 2000
max_iters = 20000
patience= 20


# #### Where the training log is saved
## when changing the directory, remember to change directory in eggen.train() too
# save_directory = "./testing"
save_directory = "./snapshots_shadow" #"./snapshots_gencond"  #"./snapshots_gencond2"
model_name = "shadowgen"
log_num = 0

save_log = "{}/log{}_{}_maxiter{}_evalevery{}.pkl".format(save_directory, log_num, model_name, max_iters, eval_every)
# Make sure the target directory exists before the long training run starts,
# so a missing directory cannot cost us the log afterwards.
os.makedirs(save_directory, exist_ok=True)


# #### Train the model
# train and save model to ./snapshots/
log_dict = eggen.train(A_orig=_A_obs, val_ones=val_ones, val_zeros=val_zeros, stopping=stopping,
                        eval_every=eval_every, plot_every=plot_every, max_patience=patience, max_iters=max_iters)


# #### Save the training log
# The context manager closes the file even if pickling fails.
with open(save_log, "wb") as f:
    pickle.dump(log_dict, f)


## ====== Generate graphs to evaluate performance ======

In [None]:
# Ask the trained model for discrete random-walk samples together with their
# conditions (conds=True), using the training walk length.
# sample_many is evaluated later with sample_many.eval({eggen.tau: 0.5}).
# NOTE(review): the meaning of the first argument (10000) and of reuse=True is
# defined by EGGen.generate_discrete - confirm there.
sample_many, explain_conds = eggen.generate_discrete(10000, conds=True, rw_len=rw_len, reuse=True) 

In [None]:
# One independent, initially empty list per tracked statistic (18 in total).
# Note that this rebinds `lcc` to an empty list.
(dmax, dmin, deg, lcc, wc, cc, tc, sc, law,
 gini, rel, assrt, coe, ncomp, intra, inter, cpl, eo) = [[] for _ in range(18)]

num_trials = 2 #5
num_paths = 10

def compute_stats(samples):
    """Turn sampled random walks into a graph and score it against the training graph.

    samples -- collection of sampled walks; it is reshaped to rows of length
               `rw_len`, so its total size must be a multiple of `rw_len`.

    Returns a tuple (stats, EO):
      stats -- result of utils.compute_graph_statistics on the sampled graph.
      EO    -- utils.edge_overlap(train graph, sampled graph) divided by the
               sum of the training graph's entries.

    Reads the module-level names `rw_len`, `_N` and `train_graph`.
    """
    rws = np.array(samples).reshape([-1, rw_len])
    print("rws: ", rws.shape)
    scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

    # The training graph is the reference; its entry sum is handed to
    # graph_from_scores and also normalises the edge overlap.
    A_select = train_graph
    n_reference = A_select.sum()
    sampled_graph = utils.graph_from_scores(scores_matrix, n_reference)
    EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/n_reference

    graph_stats = utils.compute_graph_statistics(sampled_graph) #, encoded)
    return graph_stats, EO

# Which key of the statistics dict feeds which accumulator list, in the order
# the statistics are reported.
stat_targets = [
    ('d_max', dmax),
    ('d_min', dmin),
    ('d', deg),
    ('LCC', lcc),
    ('wedge_count', wc),
    ('claw_count', cc),
    ('triangle_count', tc),
    ('square_count', sc),
    ('power_law_exp', law),
    ('gini', gini),
    ('rel_edge_distr_entropy', rel),
    ('assortativity', assrt),
    ('clustering_coefficient', coe),
    ('n_components', ncomp),
    ('cpl', cpl),
]

for trials in range(num_trials):
    print("trial: ", trials)
    start_time = time.time()

    # Draw num_paths batches of walks, reporting progress every 1000 batches.
    samples = []
    for path_no in range(1, num_paths + 1): #6000):
        if path_no % 1000 == 0:
            print(path_no)
        samples.append(sample_many.eval({eggen.tau: 0.5}))

    stats, EO = compute_stats(samples)
    print("Trial %i: --- %s seconds ---" % (trials, time.time() - start_time))

    for stat_key, bucket in stat_targets:
        bucket.append(stats[stat_key])
    eo.append(EO)


# Per-statistic value lists, edge overlap last.
all_stats = [bucket for _, bucket in stat_targets] + [eo]


# Mean over trials for every statistic.
avg_stats = [np.mean(values) for values in all_stats]
print("avg_stats: ", avg_stats)


# Standard error over trials. From here on `stats` names the scipy module,
# not the per-trial dict used in the loop above.
from scipy import stats
stderror_stats = [stats.sem(values) for values in all_stats]

save_directory = "./generate_stats" #"./snapshots_gencond"  #"./snapshots_gencond2"
# log_num = 0
data_name  = "EUcore-top" #"EUcore" #

save_stats = "{}/{}_stats{}.txt".format(save_directory, data_name, log_num)

# One row per statistic: mean, standard error, then the per-trial values.
np.savetxt(save_stats, np.c_[avg_stats,stderror_stats, all_stats])