In [1]:
# ## Explainable Conditional LSTM random walk graph GAN training
from eggen import utils
from eggen.eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

## Load data
# Full e-mail edge list. Later cells read its SENDER / RECEIVER columns and
# its 'SENDER DEPT' / 'RECEIVER DEPT' columns.
data = pd.read_csv('./data/EU-core-join.csv')
print("Data chunk size: ", len(data))

#### Dept data (conditions)
# Header-less, space-separated label file; its two columns are named
# ID and DEPT here before the table is turned into a plain array.
dept_labels = pd.read_csv(
    './data/raw/email-Eu-core-department-labels.txt',
    sep=" ",
    header=None,
)
dept_labels.columns = ['ID', 'DEPT']
cond_list = dept_labels.values


def no_selfloop(df, x):
    """Drop self-addressed messages found among the first ``x`` rows.

    A row is a self-loop when its ``SENDER`` value equals its ``RECEIVER``
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``SENDER`` and ``RECEIVER`` columns.
    x : int
        Number of leading rows (by position) to inspect. Values larger than
        ``len(df)`` are clamped to the frame length; values <= 0 inspect
        nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame without the self-loop rows. The surviving rows keep their
        original index labels, so reset the index afterwards if a contiguous
        index is needed.

    Side effects
    ------------
    Prints the number of self-loop rows that were found.
    """
    limit = max(0, min(x, len(df)))

    # One vectorised comparison instead of a Python loop of per-row
    # ``df.iloc[i][col]`` lookups; np.array(...) yields a private, writable copy.
    is_loop = np.array(df['SENDER'] == df['RECEIVER'], dtype=bool)
    is_loop[limit:] = False  # rows beyond the inspected prefix are always kept

    print(int(is_loop.sum()))
    # Rows are removed by index label, exactly as the loop-based version did.
    return df.drop(df.index[is_loop])

#### Clean data
# Remove self-addressed messages, then renumber the rows. reset_index() is
# called without drop=True, so the previous row labels are kept in a new
# 'index' column.
x = len(data.index)
data = no_selfloop(data, x).reset_index()

removed = x - len(data.index)
removed_pct = np.round((removed / x) * 100, decimals=2)
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(removed, removed_pct))

# x now holds the row count of the cleaned frame.
x = len(data.index)


####################################################################################


# ############################ All departments ############################
# No department filter is applied: data_small is the whole cleaned frame.
data_small = data

####################################################################################

import networkx as nx
import nxviz as nv

# Build the graph from the (SENDER, RECEIVER) pairs of the cleaned edge list.
G = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'

# Get the subset of cond_list before reindexing from 0.
# np.delete below removes rows by position, so this assumes row k of
# cond_list describes node ID k -- TODO confirm against the label file.
all_nodes = np.arange(1005)
train_nodes = np.array(G.nodes)  # node IDs that occur in the edge list
nontrain_nodes = np.setdiff1d(all_nodes, train_nodes)  # IDs in 0..1004 that are absent from G
nontrain = nontrain_nodes.tolist()

subset_condlist = np.delete(cond_list, nontrain, 0)

# Reindex the condlist subset: IDs become 0..n-1 and every department label
# is replaced by its position in the sorted list of all department labels.
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])
dept_list = np.unique(cond_list[:,1])
for i in np.arange(len(dept_list)):
    subset_condlist[:,1][subset_condlist[:,1]==dept_list[i]] = i

print("subset_condlist: ", np.unique(subset_condlist[:,1], return_counts=True))

# Relabel node indices in G to match the generated indices.
# NOTE(review): with its default ordering this numbers nodes in G.nodes
# order, whereas subset_condlist rows follow cond_list row order. Confirm
# that the two orders agree (or pass ordering="sorted"). Left unchanged here
# because the checkpoint restored later was presumably trained with this
# numbering.
G = nx.convert_node_labels_to_integers(G)

print("Number of nodes in G: " ,G.number_of_nodes())
print("Number of edges in G: " ,G.number_of_edges())
# Module-level function instead of the deprecated Graph.number_of_selfloops
# method, which newer NetworkX releases no longer provide.
print("Number of selfloops in G: " ,nx.number_of_selfloops(G))

## Preparing data
# Adjacency matrix of G as a float64 CSR matrix.
Adjtraining = nx.adjacency_matrix(G)
Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')
_A_obs = Adjtraining
# Add the transpose and clip every entry above 1 back to 1, giving a
# symmetric 0/1 matrix.
# NOTE(review): the shape annotations in this cell (597 / 584) appear to come
# from an earlier run on a smaller subset; the recorded output of this run
# reports 986 nodes in G.
_A_obs = _A_obs + _A_obs.T # (597, 597)
_A_obs[_A_obs > 1] = 1 # Max value of 1 (597, 597)

""" Reduce input graph to a subgraph where only the nodes in largest n_components are kept. """ 
# lcc is used to index both rows and columns, so nodes are renumbered to
# 0.._N-1 here. subset_condlist is not filtered by lcc in this cell --
# presumably lcc keeps every node in this run; verify.
lcc = utils.largest_connected_components(_A_obs) # len(lcc) = 584
_A_obs = _A_obs[lcc,:][:,lcc] # (584, 584)
_N = _A_obs.shape[0] # 584

#### Separate the edges into train, test, validation
val_share = 0.1   # passed to the split helper as the validation share
test_share = 0.05 # passed to the split helper as the test share
seed = 2020 # passed to the split helper (previously 481516234)
"""
Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. 
"""
# train_ones is indexed below as train_ones[:,0] / train_ones[:,1], i.e. one
# (row, col) pair per training edge.
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False) 

## EGGen
# Training adjacency: a 1 at every (row, col) pair listed in train_ones.
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
# Sanity check: the training adjacency must be symmetric.
assert (train_graph.toarray() == train_graph.toarray().T).all()


#### Parameters
""" Adjustable parameters for training. """ 
gpu_id = 0              # GPU id
_N = _A_obs.shape[0]    # number of nodes
rw_len = 16             # length of the random walks (32 was also tried)
batch_size = 128        # training data batch size
sample_batch = 1000     # other values tried: 1024, 128, 256, 512

# Number of departments: distinct values over the sender-department and
# receiver-department columns of the edge list.
dept_values = data_small[['SENDER DEPT', 'RECEIVER DEPT']].values
n_conds = np.unique(dept_values).shape[0]
print("n_conds: ", n_conds)

# Random-walk sampler over the training graph and the per-node condition table.
walker = utils.RandomWalker(
    train_graph,
    subset_condlist,
    rw_len,
    p=1,
    q=1,
    batch_size=batch_size,
    sample_batch=sample_batch,
)



#### ====== Create our generative model ======
# L2 penalties handed to the generator / discriminator.
l2_gen = 1e-7
l2_disc = 5e-5

# Layer-size lists for the condition generator, generator and discriminator.
gencond_lay = [10]
gen_lay = [50]
disc_lay = [40]

# Learning rates (0.0002 was used for all three previously).
lr_gencond = 0.01
lr_gen = 0.0003
lr_disc = 0.0003

# Iteration counts passed to the model.
gencond_iters = 1
gen_iters = 1
disc_iters = 3

# Sizes passed as W_down_discriminator_size / W_down_generator_size.
discWdown_size = 128
genWdown_size = 128

eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker,
    n_conds=n_conds,
    condition_dim=n_conds,
    gpu_id=gpu_id,
    use_gumbel=True,
    gencond_iters=gencond_iters,
    gen_iters=gen_iters,
    disc_iters=disc_iters,
    W_down_discriminator_size=discWdown_size,
    W_down_generator_size=genWdown_size,
    l2_penalty_generator=l2_gen,
    l2_penalty_discriminator=l2_disc,
    condgenerator_layers=gencond_lay,
    generator_layers=gen_lay,
    discriminator_layers=disc_lay,
    temp_start=5,
    lr_gencond=lr_gencond,
    lr_gen=lr_gen,
    lr_disc=lr_disc,
    plot_show=False,
    sample_batch=sample_batch,
)


  return f(*args, **kwds)


Data chunk size:  25571
642
Got rid of 642 useless emails! That's 2.51% of the total number of messages in this dataset.
subset_condlist:  (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41]), array([ 49,  62,  10,  12, 107,  18,  28,  49,  19,  31,  39,  29,   3,
        26,  91,  54,  24,  34,   1,  29,  13,  56,  25,  27,   6,   6,
         9,  10,   8,   5,   4,   8,   9,   1,  12,  13,  22,  15,  13,
         3,   4,   2]))
Number of nodes in G:  986
Number of edges in G:  16064
Number of selfloops in G:  0
Selecting 1 largest connected components
n_conds:  42
intermediate:  Tensor("Generator/Generator.int_1/Tanh:0", shape=(128, 50), dtype=float32)
h:  Tensor("Generator/Generator.h_1/Tanh:0", shape=(128, 50), dtype=float32)
c:  Tensor("Generator/Generator.c_1/Tanh:0", shape=(128, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Te

In [2]:
""" ===================================================================================== """
model_name = "model_best_16"
print("Model: ", model_name)

# Load the saved variables of the chosen snapshot into the model's session.
checkpoint_path = "snapshots_shadow/" + model_name + ".ckpt"
saver = tf.train.Saver()
saver.restore(eggen.session, checkpoint_path)

Model:  model_best_16
INFO:tensorflow:Restoring parameters from snapshots_shadow/model_best_16.ckpt


In [7]:
# ## ====== generate graphs to evaluate performance ======
# Returns two objects; sample_many is evaluated repeatedly in the next cell.
# The first argument (10000) matches the leading tensor dimension shown in
# the recorded output of this cell.
sample_many, explain_conds = eggen.generate_discrete(
    10000,
    conds=True,
    rw_len=rw_len,
    reuse=True,
)

intermediate:  Tensor("Generator_3/Generator.int_1/Tanh:0", shape=(10000, 50), dtype=float32)
h:  Tensor("Generator_3/Generator.h_1/Tanh:0", shape=(10000, 50), dtype=float32)
c:  Tensor("Generator_3/Generator.c_1/Tanh:0", shape=(10000, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator_3/unstack:0", shape=(10000, 42), dtype=float32)


In [None]:
num_paths = 6000
# Evaluate the sampling op num_paths times, feeding eggen.tau = 0.5 each
# time, and keep every result.
samples = [sample_many.eval({eggen.tau: 0.5}) for _ in range(num_paths)]

In [None]:
### Assemble score matrix from the random walks
# Flatten all sampled batches into one array with rw_len columns.
rws = np.array(samples).reshape([-1, rw_len])
scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

# Reference graph for the comparison is the training adjacency. (A matrix
# rebuilt from train_ones used to be assigned here first and was immediately
# overwritten by this line; that dead assignment has been removed.)
A_select = train_graph

# Build a graph from the scores, passing the sum of the reference adjacency
# as the requested edge total.
sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())
# EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()
# print("EO: ", EO)

stats = utils.compute_graph_statistics(sampled_graph) #, encoded)
for stat_name in ('assortativity', 'clustering_coefficient', 'cpl',
                  'gini', 'd_max', 'power_law_exp'):
    print(stats[stat_name])

In [None]:
# Same statistics as the previous cell, but the graph is built with the bare
# name graph_from_scores instead of utils.graph_from_scores.
# NOTE(review): that name resolves to whatever is in the kernel namespace --
# presumably the definition in the last cell of this notebook, which must
# then be run first; confirm.
resampled_graph = graph_from_scores(scores_matrix, A_select.sum())
newstats = utils.compute_graph_statistics(resampled_graph)
for stat_name in ('assortativity', 'clustering_coefficient', 'cpl',
                  'gini', 'd_max', 'power_law_exp'):
    print(newstats[stat_name])

In [None]:
Original	GVAE
-0.0242	 -0.2931
0.0067	 0.0126
2.6542	 2.6219
0.5385	 0.5481
285	     159.8000
1.3791	 1.3475

In [None]:
# Triangle count of the reference graph.
# nx.from_numpy_array replaces nx.from_numpy_matrix, which NetworkX 3.0
# removed; .toarray() hands it a plain ndarray instead of a numpy matrix.
A_graph = nx.from_numpy_array(A_select.toarray())
triangles = nx.triangles(A_graph)
# nx.triangles reports a per-node count, so every triangle is counted once
# for each of its three nodes.
t = np.sum(list(triangles.values())) / 3
t

In [None]:
# Triangle count of the sampled graph; see the reference-graph cell for why
# the per-node total is divided by 3. nx.from_numpy_array replaces
# nx.from_numpy_matrix, which NetworkX 3.0 removed.
tri = nx.triangles(nx.from_numpy_array(sampled_graph))
np.sum(list(tri.values())) / 3

In [None]:
def graph_from_scores(scores, n_edges):
    """Sample a symmetric 0/1 adjacency matrix from a matrix of edge scores.

    Procedure:
      1. If ``scores`` has fewer non-zero entries than ``n_edges``, return
         ``symmetric(scores) > 0`` straight away.
      2. Otherwise visit the nodes in random order and give each node with a
         non-zero score row one edge, drawn with probability proportional to
         that row (diagonal excluded).
      3. If the matrix built so far sums to less than ``n_edges``, draw the
         missing edges without replacement from the upper triangle of the
         scores, skipping pairs that are already connected.

    Parameters
    ----------
    scores : sparse matrix
        Square score matrix; it must provide ``.nonzero()``, ``.toarray()``
        and ``.shape``. Scores are presumably non-negative counts, since
        they are used as sampling weights -- confirm with the caller.
    n_edges : number
        Target sum of the returned matrix. Every undirected edge contributes
        2 to that sum (one entry per direction).

    Returns
    -------
    numpy.ndarray
        Symmetric matrix with entries in {0, 1}. In the early-exit case of
        step 1 the result is whatever ``symmetric(scores) > 0`` yields for the
        given input type.

    Notes
    -----
    Uses the global ``np.random`` state. The extra-edge step is skipped when
    no candidate pair has a positive score, and is capped at the number of
    candidates, instead of failing inside ``np.random.choice``.
    """
    if len(scores.nonzero()[0]) < n_edges:
        return symmetric(scores) > 0

    target_g = np.zeros(scores.shape)  # graph under construction
    scores_int = scores.toarray()      # dense working copy of the scores
    scores_int[np.diag_indices_from(scores_int)] = 0  # no self-loops

    N = scores.shape[0]

    # Step 2: one edge per node, nodes visited in random order.
    for n in np.random.choice(N, replace=False, size=N):
        row = scores_int[n, :]
        row_total = row.sum()
        if row_total == 0:
            continue  # node without any score: nothing to draw from

        target = np.random.choice(N, p=row / row_total)
        target_g[n, target] = 1
        target_g[target, n] = 1

    # Step 3: number of undirected edges still missing.
    diff = np.round((n_edges - target_g.sum()) / 2)
    if diff > 0:
        triu = np.triu(scores_int)  # upper triangle: each pair appears once
        triu[target_g > 0] = 0      # pairs that are already connected get weight 0

        # Sampling without replacement cannot return more items than there
        # are entries with positive probability.
        n_candidates = int((triu > 0).sum())
        n_extra = min(int(diff), n_candidates)
        if n_extra > 0:
            triu = triu / triu.sum()  # normalise weights into probabilities
            triu_ixs = np.triu_indices_from(scores_int)
            extra_edges = np.random.choice(triu_ixs[0].shape[0], replace=False,
                                           p=triu[triu_ixs], size=n_extra)

            target_g[(triu_ixs[0][extra_edges], triu_ixs[1][extra_edges])] = 1
            target_g[(triu_ixs[1][extra_edges], triu_ixs[0][extra_edges])] = 1

    target_g = symmetric(target_g)
    return target_g


def symmetric(directed_adjacency, clip_to_one=True):
    """Return ``directed_adjacency`` plus its transpose.

    Parameters
    ----------
    directed_adjacency : matrix-like
        Square matrix supporting ``+``, ``.T`` and boolean-mask assignment.
    clip_to_one : bool, default True
        When true, every entry of the sum greater than 1 is set to 1.

    Returns
    -------
    The summed (and optionally clipped) matrix; the input is not modified.
    """
    mirrored = directed_adjacency + directed_adjacency.T
    if not clip_to_one:
        return mirrored

    mirrored[mirrored > 1] = 1
    return mirrored
