In [1]:
# ## Explainable Conditional LSTM random walk graph GAN training
from eggen import utils
# from eggen.eggen_shadow import *
from eggen.edit_eggen_shadow import *

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

## Load data
# Full e-mail table; later cells read its SENDER / RECEIVER and
# SENDER DEPT / RECEIVER DEPT columns.
data = pd.read_csv('./data/EU-core-join.csv')
print("Data chunk size: ", len(data))

#### Dept data (conditions)
# Header-less, space-separated label file, loaded straight into a
# two-column (ID, DEPT) NumPy array.
cond_list = pd.read_csv(
    './data/raw/email-Eu-core-department-labels.txt',
    sep=" ",
    header=None,
    names=['ID', 'DEPT'],
).values

  return f(*args, **kwds)


Data chunk size:  25571


In [2]:
############################ Top N departments ############################
# Number of e-mails per (receiver dept, sender dept) pair, busiest pairs first.
count_series = data.groupby(['RECEIVER DEPT', 'SENDER DEPT']).size()
df_count = (
    count_series
    .to_frame(name='size')
    .reset_index()
    .sort_values(by=['size'], ascending=False)
)

# Display-friendly headers. df_count itself carries the renamed columns,
# because the selection further down reads its 'Receiver dept' column.
df_count.columns = ['Receiver dept', 'Sender dept', 'Email count']
df_countshow = df_count.reset_index(drop=True)
df_countshow.iloc[:10]

############################ Top N departments ############################
# Select top N departments: receiver department of each of the N busiest pairs.
# NOTE(review): if two of the top pairs share a receiver department, the list
# will contain that department twice -- confirm this is acceptable.
topN_grp = 5
dept_list = sorted(df_count['Receiver dept'].iloc[:topN_grp].tolist())
dept_list


[1, 4, 7, 14, 21]

In [3]:
def no_selfloop(df, x):
    """Remove self-addressed messages (SENDER == RECEIVER) from a message table.

    Only the first ``x`` rows (by position) are inspected; rows at position
    ``x`` or later are always kept. The number of removed rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'SENDER' and 'RECEIVER' columns.
    x : int
        Number of leading rows to inspect. Values larger than ``len(df)`` are
        clamped to the table length; negative values inspect nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame without the self-loop rows. Surviving rows keep their
        original index labels, so reindex afterwards if a contiguous index
        is needed.
    """
    limit = max(x, 0)
    # Column-wise comparison instead of one df.iloc[...] lookup per row.
    senders = df['SENDER'].values[:limit]
    receivers = df['RECEIVER'].values[:limit]
    rows = np.flatnonzero(senders == receivers)

    print(len(rows))

    # Select by position rather than dropping by label, so repeated index
    # labels cannot cause unrelated rows to be removed.
    keep = np.ones(len(df), dtype=bool)
    keep[rows] = False
    return df.iloc[keep]

#### Clean data
x = len(data.index)  # row count before cleaning
# Drop self-addressed messages, then renumber the rows; reset_index() keeps
# the previous labels as an extra column.
data = no_selfloop(data, x).reset_index()
n_dropped = x - len(data.index)
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(n_dropped, np.round((n_dropped / x) * 100, decimals=2)))

# From here on x holds the post-cleaning row count (read by a later cell).
x = len(data.index)

642
Got rid of 642 useless emails! That's 2.51% of the total number of messages in this dataset.


In [4]:
############################ Top N departments ############################
def subset_dept(df, x, dept_list):
    """Keep only messages exchanged between the selected departments.

    Among the first ``x`` rows of ``df`` (by position), a row is kept only when
    BOTH its 'SENDER DEPT' and its 'RECEIVER DEPT' are members of
    ``dept_list``. Rows at position ``x`` or later are never returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'SENDER DEPT' and 'RECEIVER DEPT' columns.
    x : int
        Number of leading rows to consider. Values larger than ``len(df)`` are
        clamped to the table length; negative values select nothing.
    dept_list : list-like
        Department identifiers to keep.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index labels; reindex afterwards
        if a contiguous index is needed.
    """
    limit = max(x, 0)
    # Read from the `df` argument (the previous version silently used the
    # notebook-global `data`), and test membership column-wise.
    sender_ok = df['SENDER DEPT'].iloc[:limit].isin(dept_list).values
    receiver_ok = df['RECEIVER DEPT'].iloc[:limit].isin(dept_list).values
    rows = np.flatnonzero(sender_ok & receiver_ok)
    return df.iloc[rows]

############################ Top N departments ############################
# dept_list comes from the top-N selection cell; keep it in ascending order.
dept_list.sort()
# One past the largest selected department id.
# NOTE(review): presumably reserved as a label for "other" departments -- it is
# not read anywhere in the visible cells; confirm.
grpothers = dept_list[-1] + 1

# Messages whose sender AND receiver departments are both selected, with the
# rows renumbered (reset_index() keeps the previous labels as a column).
data_small = subset_dept(data, x, dept_list).reset_index()
len(data_small)

5206

In [5]:
import networkx as nx
import nxviz as nv

# Build a graph from the filtered e-mail table: one edge per (SENDER, RECEIVER) row.
G = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'

""" Get the subset of cond_list before reindexing from 0 """
# NOTE(review): 1005 is hard-coded; presumably the node count of the full data set -- confirm.
all_nodes = np.arange(1005)
train_nodes = np.array(G.nodes) #(348,)
nontrain_nodes = np.setdiff1d(all_nodes, train_nodes) #(657,)
nontrain = nontrain_nodes.tolist()

# np.delete removes rows BY POSITION, so this assumes row i of cond_list
# describes node ID i -- TODO confirm against the label file.
subset_condlist = np.delete(cond_list, nontrain, 0)

# reindexing the condlist subset: column 0 becomes 0..n-1, and column 1 is
# rewritten from department id to that id's position in dept_list.
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])
for i in np.arange(len(dept_list)):
    subset_condlist[:,1][subset_condlist[:,1]==dept_list[i]] = i

print("subset_condlist: ", np.unique(subset_condlist[:,1], return_counts=True))

# Relabel nodes indices in G to match the generated indicies
# NOTE(review): the rows of subset_condlist are in ascending original-ID order,
# while networkx documents the default relabelling order as following
# G.nodes(). Confirm the two indexings agree (ordering="sorted" would make the
# order explicit, but would also change the ids an existing checkpoint saw).
G = nx.convert_node_labels_to_integers(G)

print("Number of nodes in G: " ,G.number_of_nodes())
print("Number of edges in G: " ,G.number_of_edges())
# NOTE(review): presumably Graph.number_of_selfloops is unavailable in newer
# networkx releases -- confirm against the installed version.
print("Number of selfloops in G: " ,G.number_of_selfloops())

## Preparing data
Adjtraining = nx.adjacency_matrix(G)
Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')
_A_obs = Adjtraining
# Symmetrise, then clip so that every stored entry is at most 1.
_A_obs = _A_obs + _A_obs.T # square, one row per node of G (348 in the recorded run)
_A_obs[_A_obs > 1] = 1 # Max value of 1

""" Reduce input graph to a subgraph where only the nodes in largest n_components are kept. """ 
# NOTE(review): subset_condlist is not filtered by lcc; if lcc drops nodes the
# condition table would no longer line up with _A_obs -- confirm.
lcc = utils.largest_connected_components(_A_obs) # indices of the kept nodes
_A_obs = _A_obs[lcc,:][:,lcc] # restrict rows and columns to lcc
_N = _A_obs.shape[0] # number of nodes after the reduction

#### Separate the edges into train, test, validation
val_share = 0.1
test_share = 0.05
seed = 2020 #481516234  
"""
Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. 
"""
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False) 

## EGGen
# Sparse adjacency matrix holding a 1 for every training edge; the assert
# checks that it is symmetric.
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()

#### Parameters
""" Adjustable parameters for training. """ 
# setting GPU id 
gpu_id = 1
# setting the number of nodes
_N = _A_obs.shape[0]
# setting the length of random walks
rw_len = 16 #8 #32 #
# setting the training data batch size
batch_size = 128 #512 #
# getting the number of departments: distinct values over both department
# columns of the filtered table
# n_conds=np.unique(data[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0] #42
n_conds=np.unique(data_small[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0] #5
# n_conds=len(dept_list)
print("n_conds: ", n_conds)
sample_batch = 1000 #128 #2048 #256 #512 #1024 #
# log_num = 13 #99 #

# Random-walk sampler over the training graph, given the per-node condition table.
walker = utils.RandomWalker(train_graph, subset_condlist, rw_len, p=1, q=1, batch_size=batch_size, sample_batch=sample_batch)

subset_condlist:  (array([0, 1, 2, 3, 4]), array([ 60, 103,  48,  90,  47]))
Number of nodes in G:  348
Number of edges in G:  3342
Number of selfloops in G:  0
Selecting 1 largest connected components
n_conds:  5


## ====== Create our generative model ======

In [6]:
# L2 regularisation strengths
l2_gen = 1e-7
l2_disc = 5e-5  # 1e-4

# Layer sizes
gencond_lay = [10]
gen_lay = [50]
disc_lay = [40]

# Learning rates
lr_gencond = 0.01
lr_gen = 0.0003
lr_disc = 0.0003  # 0.0002

# Update steps per training iteration
gencond_iters = 1
gen_iters = 1
disc_iters = 3

# Down-projection sizes
discWdown_size = 128
genWdown_size = 128

# Build the model from the graph size, walk length and walker prepared above.
eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker,
    n_conds=n_conds,
    condgenerator_layers=gencond_lay,
    generator_layers=gen_lay,
    discriminator_layers=disc_lay,
    W_down_discriminator_size=discWdown_size,
    W_down_generator_size=genWdown_size,
    batch_size=batch_size,
    sample_batch=sample_batch,
    condition_dim=n_conds,
    gencond_iters=gencond_iters,
    gen_iters=gen_iters,
    disc_iters=disc_iters,
    wasserstein_penalty=10,  # 20
    l2_penalty_generator=l2_gen,
    l2_penalty_discriminator=l2_disc,
    lr_gencond=lr_gencond,
    lr_gen=lr_gen,
    lr_disc=lr_disc,
    noise_dim=16,  # 32
    noise_type="Uniform",  # "Gaussian"
    temp_start=10.0,  # 5.0, 30.0
    min_temperature=0.5,
    temperature_decay=1 - 5e-5,
    seed=15,  # seed
    use_gumbel=True,
    legacy_generator=False,
    gpu_id=gpu_id,
    plot_show=False,
)

intermediate:  Tensor("Generator/Generator.int_1/Tanh:0", shape=(128, 50), dtype=float32)
h:  Tensor("Generator/Generator.h_1/Tanh:0", shape=(128, 50), dtype=float32)
c:  Tensor("Generator/Generator.c_1/Tanh:0", shape=(128, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator/unstack:0", shape=(128, 5), dtype=float32)


In [7]:
""" ===================================================================================== """

model_name = "model_best_19"
print("Model: ", model_name)

# Load the saved variables for this snapshot into the session owned by the
# model built above.
checkpoint_path = "snapshots_shadow/" + model_name + ".ckpt"
saver = tf.train.Saver()
saver.restore(eggen.session, checkpoint_path)
# Alternative snapshot: "snapshots/" + "model_best_17" + ".ckpt"


Model:  model_best_19
INFO:tensorflow:Restoring parameters from snapshots_shadow/model_best_19.ckpt


## ====== generate graphs to evaluate performance ======

In [8]:
# Draw 1000 samples of length rw_len from the restored model.
# NOTE(review): presumably conds=True makes generate_discrete also return the
# per-walk conditions bound to explain_conds -- confirm in EGGen.generate_discrete.
sample_many, explain_conds = eggen.generate_discrete(1000, conds=True, rw_len=rw_len, reuse=True) 

intermediate:  Tensor("Generator_1/Generator.int_1/Tanh:0", shape=(1000, 50), dtype=float32)
h:  Tensor("Generator_1/Generator.h_1/Tanh:0", shape=(1000, 50), dtype=float32)
c:  Tensor("Generator_1/Generator.c_1/Tanh:0", shape=(1000, 50), dtype=float32)
Generator initial_states:  1
Initial cond:  Tensor("Generator_1/unstack:0", shape=(1000, 5), dtype=float32)


In [46]:
# The training adjacency matrix already exists as `train_graph` (built from
# train_ones in an earlier cell), so it is reused here. The previous version
# rebuilt an identical matrix and immediately overwrote it.
A_select = train_graph
# Dense ndarray copy, so that adj[node] yields one row per node.
adj = A_select.toarray()
adj.shape

(348, 348)

In [49]:
# Turn the adjacency matrix into per-row transition probabilities by dividing
# each row by its sum. A row summing to zero would produce NaNs here.
u = adj.sum(axis=1)
print(u.shape)
adj = adj / u.reshape(-1, 1)

(348,)


In [50]:
# n_walks = 3 # number of walks
# N = adj.shape[0]
# rw_len= 16

def rand_walking(adj, n_walks, N, rw_len):
    """Sample random walks from a row-normalised transition matrix.

    Each walk starts at a node drawn uniformly from ``range(N)`` and then
    repeatedly moves to a node drawn with the probabilities in the current
    node's row of ``adj``. Draws use NumPy's global random state, in the same
    order as before (start node, then each step, walk by walk), so seeded runs
    are unchanged.

    Parameters
    ----------
    adj : numpy.ndarray, shape (n_nodes, n_nodes)
        Transition matrix; every row that can be visited must be a valid
        probability vector (``np.random.choice`` validates this).
    n_walks : int
        Number of walks to sample. Zero (or a negative value) yields an empty
        result instead of raising.
    N : int
        Start nodes are drawn from ``0 .. N-1``. This may differ from
        ``adj.shape[0]``; pass ``adj.shape[0]`` to allow every node as a start.
    rw_len : int
        Number of nodes per walk. Values below 1 are treated as 1, because a
        walk always contains its start node.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_walks, max(rw_len, 1))``, one walk per row.
    """
    n_nodes = adj.shape[0]
    steps = max(rw_len, 1)

    walks = []
    for _ in range(n_walks):
        node = np.random.choice(N)
        path = [node]
        for _ in range(steps - 1):
            # Next node is sampled from the current node's transition row.
            node = np.random.choice(n_nodes, p=adj[node])
            path.append(node)
        walks.append(path)

    # An explicit target shape keeps the empty case well defined
    # (reshape(..., (0, -1)) cannot infer the second dimension).
    return np.asarray(walks, dtype=int).reshape(max(n_walks, 0), steps)

# walks = rand_walking(adj, n_walks, N, rw_len)
# walks

In [55]:
import time

# Time the sampling of 60,000 walks of length 16.
# Start nodes are drawn from every row of `adj`; the previously hard-coded
# N=346 did not match the 348-node matrix and excluded its last two nodes.
# perf_counter() is a monotonic clock intended for measuring intervals.
t0 = time.perf_counter()
rand_walking(adj, n_walks=1000*60, N=adj.shape[0], rw_len=16)
t1 = time.perf_counter()

total = t1 - t0
total

28.221086978912354

In [43]:
# Peek at the first nine rows of the re-indexed table: column 0 is the new
# node index, column 1 the department's position in dept_list.
subset_condlist[:9]

array([[0, 0],
       [1, 0],
       [2, 4],
       [3, 4],
       [4, 4],
       [5, 3],
       [6, 3],
       [7, 3],
       [8, 3]])