In [1]:
from eggen.eggen_saved import *
from eggen import utils

import tensorflow as tf
import scipy.sparse as sp
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pickle

  return f(*args, **kwds)


In [2]:
## Load data
# Full e-mail table; each row is one message.
data = pd.read_csv('./data/EU-core-join.csv')
print("Data chunk size: " ,len(data))

#### Dept data (conditions)
# The label file has no header row, so the two columns are named while
# reading; only the underlying array is kept.
cond_list = pd.read_csv(
    './data/raw/email-Eu-core-department-labels.txt',
    sep=" ",
    header=None,
    names=['ID', 'DEPT'],
).values

def no_selfloop(df, x):
    """Remove self-addressed messages (SENDER == RECEIVER) from ``df``.

    Only the first ``x`` rows (by position) are examined; rows beyond that
    are always kept. The number of removed rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'SENDER' and 'RECEIVER' columns.
    x : int
        Number of leading rows to check. Pass ``len(df)`` to check every row.
        Values larger than ``len(df)`` check every row; values <= 0 check none.

    Returns
    -------
    pandas.DataFrame
        A new frame without the self-loop rows. The surviving rows keep their
        original index labels, so reset the index afterwards if a contiguous
        0..n-1 index is needed.
    """
    # Vectorised comparison instead of one .iloc lookup per row.
    is_selfloop = np.array(df['SENDER'] == df['RECEIVER'], dtype=bool)
    # Honour the "first x rows only" contract of the original loop.
    is_selfloop[max(int(x), 0):] = False

    print(int(is_selfloop.sum()))
    # Positional boolean mask: unlike drop-by-label, this removes exactly the
    # matched rows even when index labels are duplicated.
    return df[~is_selfloop]

#### Clean data
# Drop self-addressed messages, then renumber the rows (reset_index keeps the
# previous row labels in an 'index' column).
x = len(data.index)
data = no_selfloop(data, x).reset_index()
n_removed = x - len(data.index)
pct_removed = np.round((n_removed / x) * 100, decimals=2)
print("Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(n_removed, pct_removed))

# Row count after cleaning.
x = len(data.index)


# ############################ All departments ############################
# No department filter is applied: the working set is the whole cleaned frame.
data_small = data

Data chunk size:  25571
642
Got rid of 642 useless emails! That's 2.51% of the total number of messages in this dataset.


In [3]:
####################################################################################

import networkx as nx
import nxviz as nv

G = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'

# Get the subset of cond_list before reindexing from 0: remove the label rows
# of every node id in 0..1004 that has no edge in G.
# NOTE(review): np.delete uses the node ids as row positions, i.e. this
# assumes row i of cond_list describes node i — confirm against the label file.
all_nodes = np.arange(1005)
train_nodes = np.array(G.nodes)
nontrain_nodes = np.setdiff1d(all_nodes, train_nodes)
nontrain = nontrain_nodes.tolist()

subset_condlist = np.delete(cond_list, nontrain, 0)

# reindexing the condlist subset: ids become 0..n-1 and every department
# label is replaced by its position among the distinct labels of cond_list
subset_condlist[:,0] = np.arange(subset_condlist.shape[0])
dept_list = np.unique(cond_list[:,1])
for i in np.arange(len(dept_list)):
    subset_condlist[:,1][subset_condlist[:,1]==dept_list[i]] = i

print("subset_condlist: ", np.unique(subset_condlist[:,1], return_counts=True))

# Relabel nodes indices in G to match the generated indicies
# NOTE(review): the new integer labels follow G's node iteration order, while
# the rows of subset_condlist keep the row order of cond_list. Confirm the two
# orders agree before relying on subset_condlist[k] as the label of node k.
G = nx.convert_node_labels_to_integers(G)

print("Number of nodes in G: " ,G.number_of_nodes())
print("Number of edges in G: " ,G.number_of_edges())
# Function form: the Graph.number_of_selfloops() method is deprecated and no
# longer exists in current networkx releases.
print("Number of selfloops in G: " ,nx.number_of_selfloops(G))

## Preparing data
Adjtraining = nx.adjacency_matrix(G)
Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')
_A_obs = Adjtraining
# Symmetrise, then clip so every stored entry is at most 1.
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1

# Reduce input graph to a subgraph where only the nodes in largest n_components are kept.
# NOTE(review): subset_condlist is not filtered by `lcc`; if the largest
# component ever drops nodes, label rows and matrix rows stop lining up.
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

#### Separate the edges into train, test, validation
val_share = 0.1
test_share = 0.05
seed = 2020 #481516234
# Split the edges of the adjacency matrix into train, validation and test edges
# and randomly samples equal amount of validation and test non-edges.
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False)

## EGGen
# Training adjacency matrix built from the training edge pairs; it must be symmetric.
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()

subset_condlist:  (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41]), array([ 49,  62,  10,  12, 107,  18,  28,  49,  19,  31,  39,  29,   3,
        26,  91,  54,  24,  34,   1,  29,  13,  56,  25,  27,   6,   6,
         9,  10,   8,   5,   4,   8,   9,   1,  12,  13,  22,  15,  13,
         3,   4,   2]))
Number of nodes in G:  986
Number of edges in G:  16064
Number of selfloops in G:  0
Selecting 1 largest connected components


In [4]:
#### Parameters
# Adjustable parameters for training.

gpu_id = 0                 # GPU id
_N = _A_obs.shape[0]       # number of nodes
rw_len = 16                # length of random walks (alternative noted by the author: 5)
batch_size = 128           # training data batch size

# Number of distinct department labels appearing in either the sender or the
# receiver column of the working set.
dept_columns = data_small[['SENDER DEPT', 'RECEIVER DEPT']].values
n_conds = len(np.unique(dept_columns))
# sample_batch=5000

# Random-walk sampler over the training graph, given the per-node labels.
walker = utils.RandomWalker(
    train_graph,
    subset_condlist,
    rw_len,
    p=1,
    q=1,
    batch_size=batch_size,
)

#### Create our generative model

In [5]:
# Modified keepdims to tf.reduce_max(y, 1, keep_dims=True) in eggen/eggen.py
# Model construction; one keyword argument per line so individual settings
# are easy to diff.
eggen = EGGen(
    _N,
    rw_len,
    walk_generator=walker.walk,
    gpu_id=gpu_id,
    use_gumbel=True,
    disc_iters=3,
    W_down_discriminator_size=128,
    W_down_generator_size=128,
    l2_penalty_generator=1e-7,
    l2_penalty_discriminator=5e-5,
    generator_layers=[40],
    discriminator_layers=[30],
    temp_start=5,
    learning_rate=0.0003,
)

intermediate:  Tensor("Generator/Generator.int_1/Tanh:0", shape=(128, 40), dtype=float32)
h:  Tensor("Generator/Generator.h_1/Tanh:0", shape=(128, 40), dtype=float32)
c:  Tensor("Generator/Generator.c_1/Tanh:0", shape=(128, 40), dtype=float32)
Generator initial_states:  1
Generator state:  [(<tf.Tensor 'Generator/Generator.c_1/Tanh:0' shape=(128, 40) dtype=float32>, <tf.Tensor 'Generator/Generator.h_1/Tanh:0' shape=(128, 40) dtype=float32>)]
Generator inputs:  Tensor("Generator/zeros:0", shape=(128, 128), dtype=float32)
LSTM output:  Tensor("Generator/cell_0/cell_0/basic_lstm_cell/mul_2:0", shape=(128, 40), dtype=float32)
LSTM state:  (LSTMStateTuple(c=<tf.Tensor 'Generator/cell_0/cell_0/basic_lstm_cell/add_1:0' shape=(128, 40) dtype=float32>, h=<tf.Tensor 'Generator/cell_0/cell_0/basic_lstm_cell/mul_2:0' shape=(128, 40) dtype=float32>),)
output_bef:  Tensor("Generator/add:0", shape=(128, 986), dtype=float32)
softmax output:  Tensor("Generator/add_4:0", shape=(128, 986), dtype=float32)

LSTM output:  Tensor("Generator_1/cell_0/cell_0/basic_lstm_cell/mul_8:0", shape=(128, 40), dtype=float32)
LSTM state:  (LSTMStateTuple(c=<tf.Tensor 'Generator_1/cell_0/cell_0/basic_lstm_cell/add_5:0' shape=(128, 40) dtype=float32>, h=<tf.Tensor 'Generator_1/cell_0/cell_0/basic_lstm_cell/mul_8:0' shape=(128, 40) dtype=float32>),)
output_bef:  Tensor("Generator_1/add_10:0", shape=(128, 986), dtype=float32)
softmax output:  Tensor("Generator_1/add_14:0", shape=(128, 986), dtype=float32)
size-reduced inputs:  Tensor("Generator_1/MatMul_5:0", shape=(128, 128), dtype=float32)
LSTM output:  Tensor("Generator_1/cell_0/cell_0/basic_lstm_cell/mul_11:0", shape=(128, 40), dtype=float32)
LSTM state:  (LSTMStateTuple(c=<tf.Tensor 'Generator_1/cell_0/cell_0/basic_lstm_cell/add_7:0' shape=(128, 40) dtype=float32>, h=<tf.Tensor 'Generator_1/cell_0/cell_0/basic_lstm_cell/mul_11:0' shape=(128, 40) dtype=float32>),)
output_bef:  Tensor("Generator_1/add_15:0", shape=(128, 986), dtype=float32)
softmax outpu

#### Load pretrained model

In [6]:
# Restore the trained weights into the model's session.
# (Alternative checkpoint kept for reference: "snapshots/model_best_33.ckpt")
uncond_saver = tf.train.Saver()
uncond_saver.restore(
    sess=eggen.session,
    save_path="snapshots/model_best_15.ckpt",
)

INFO:tensorflow:Restoring parameters from snapshots/model_best_15.ckpt


#### Generate random walks on the trained model

In [7]:
# Build the sampling op once; later cells evaluate it repeatedly.
# NOTE(review): 10000 is presumably the number of walks produced per
# evaluation (the logged tensor shapes have a leading dimension of 10000) —
# confirm against EGGen.generate_discrete.
sample_many = eggen.generate_discrete(10000, reuse=True)

intermediate:  Tensor("Generator_2/Generator.int_1/Tanh:0", shape=(10000, 40), dtype=float32)
h:  Tensor("Generator_2/Generator.h_1/Tanh:0", shape=(10000, 40), dtype=float32)
c:  Tensor("Generator_2/Generator.c_1/Tanh:0", shape=(10000, 40), dtype=float32)
Generator initial_states:  1
Generator state:  [(<tf.Tensor 'Generator_2/Generator.c_1/Tanh:0' shape=(10000, 40) dtype=float32>, <tf.Tensor 'Generator_2/Generator.h_1/Tanh:0' shape=(10000, 40) dtype=float32>)]
Generator inputs:  Tensor("Generator_2/zeros:0", shape=(10000, 128), dtype=float32)
LSTM output:  Tensor("Generator_2/cell_0/cell_0/basic_lstm_cell/mul_2:0", shape=(10000, 40), dtype=float32)
LSTM state:  (LSTMStateTuple(c=<tf.Tensor 'Generator_2/cell_0/cell_0/basic_lstm_cell/add_1:0' shape=(10000, 40) dtype=float32>, h=<tf.Tensor 'Generator_2/cell_0/cell_0/basic_lstm_cell/mul_2:0' shape=(10000, 40) dtype=float32>),)
output_bef:  Tensor("Generator_2/add:0", shape=(10000, 986), dtype=float32)
softmax output:  Tensor("Generator_2/

In [8]:
# Evaluate the sampling op once with eggen.tau fed as 0.5 and display the
# resulting array of node ids as a quick sanity check.
sample_many.eval({eggen.tau: 0.5})

array([[269, 584, 269, ..., 142, 689,  14],
       [114,  38,  40, ...,  31, 601, 515],
       [772,  13, 607, ..., 499, 695, 493],
       ...,
       [247,  41, 121, ..., 126,   6, 648],
       [723, 530, 568, ..., 145,  86, 141],
       [287,  86, 178, ..., 647,  82, 170]])

In [9]:
""" Retrieving node condition attr to get community statistics """
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

G_tmp = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'

# Set the source and target nodes' department attribute for each message.
# Plain column iteration performs the same assignments in the same order as
# DataFrame.iterrows() without building a Series per row.
for sender, receiver, sender_dept, receiver_dept in zip(
        data_small['SENDER'], data_small['RECEIVER'],
        data_small['SENDER DEPT'], data_small['RECEIVER DEPT']):
    G_tmp.nodes[sender]['attr'] = sender_dept
    G_tmp.nodes[receiver]['attr'] = receiver_dept

# One department per node, in G_tmp's node iteration order.
eval_condlist = list(nx.get_node_attributes(G_tmp,'attr').values())

# one-hot matrix of departments for train node ids
values = np.array(eval_condlist)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# one hot encode
encoded = to_categorical(integer_encoded) #cond_list[:,1])

# No rows are deleted from `encoded`: it already has exactly one row per node
# of the edge list. The previous np.delete step derived its indices from
# np.setdiff1d(np.arange(1005), G.nodes) after G had been relabelled to
# consecutive integers, so every index pointed past the last row of `encoded`
# (ignored by some numpy versions, an IndexError in others). It also
# overwrote all_nodes / train_nodes / nontrain_nodes / nontrain from the
# graph-construction cell.


Using TensorFlow backend.


In [10]:
# One independent accumulator list per graph statistic; each receives one
# value per trial.
# NOTE(review): `lcc` rebinds the name that the graph-construction cell uses
# for the largest-connected-component node indices.
(dmax, dmin, deg, lcc, wc, cc, tc, sc, law,
 gini, rel, assrt, coe, ncomp, intra, inter, cpl, eo) = [[] for _ in range(18)]

num_trials = 5 #2 #1 #
num_paths = 1000 #6000

def compute_stats(samples):
    """Assemble a graph from sampled random walks and summarise it.

    Parameters
    ----------
    samples : array-like
        Sampled walks; flattened into rows of ``rw_len`` node ids.

    Returns
    -------
    tuple
        ``(stats, EO)`` where ``stats`` is the result of
        ``utils.compute_graph_statistics`` on the assembled graph and ``EO``
        is ``utils.edge_overlap`` between the training graph and the
        assembled graph divided by ``train_graph.sum()``.

    Reads the notebook globals ``rw_len``, ``_N`` and ``train_graph``.
    """
    rws = np.array(samples).reshape([-1, rw_len])
    scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()

    # The reference graph is the training graph. (A matrix rebuilt from
    # train_ones used to be assigned here and was overwritten on the next
    # line without being used.)
    A_select = train_graph
    n_edges = A_select.sum()
    sampled_graph = utils.graph_from_scores(scores_matrix, n_edges)
    EO = utils.edge_overlap(A_select.toarray(), sampled_graph) / n_edges

    stats = utils.compute_graph_statistics(sampled_graph) #, encoded)
    return stats, EO

# Report progress about five times per trial. The step is clamped to at least
# 1 because round(num_paths / 5) is 0 for num_paths <= 2, which would make the
# modulo below raise ZeroDivisionError.
progress_every = max(1, round(num_paths / 5))

for trials in range(num_trials):
    print("trial: ", trials)
    samples = []
    # A named counter is used instead of `_`, which IPython reserves for the
    # last cell output.
    for path_idx in range(num_paths):
        if (path_idx + 1) % progress_every == 0:
            print(path_idx + 1)
        samples.append(sample_many.eval({eggen.tau: 0.5}))

    stats, EO = compute_stats(samples)

    # Record this trial's value of every statistic.
    dmax.append(stats['d_max'])
    dmin.append(stats['d_min'])
    deg.append(stats['d'])
    lcc.append(stats['LCC'])
    wc.append(stats['wedge_count'])
    cc.append(stats['claw_count'])
    tc.append(stats['triangle_count'])
    sc.append(stats['square_count'])
    law.append(stats['power_law_exp'])
    gini.append(stats['gini'])
    rel.append(stats['rel_edge_distr_entropy'])
    assrt.append(stats['assortativity'])
    coe.append(stats['clustering_coefficient'])
    ncomp.append(stats['n_components'])
    # Community densities are not collected.
    # NOTE(review): presumably because compute_stats calls
    # compute_graph_statistics without the `encoded` matrix — confirm.
#     intra.append(stats['intra_community_density'])
#     inter.append(stats['inter_community_density'])
    cpl.append(stats['cpl'])
    eo.append(EO)

trial:  0
200
400
600
800
1000


  (Theoretical_CDF * (1 - Theoretical_CDF))


trial:  1
200
400
600
800
1000


  (Theoretical_CDF * (1 - Theoretical_CDF))


trial:  2
200
400
600
800
1000


  (Theoretical_CDF * (1 - Theoretical_CDF))


trial:  3
200
400
600
800
1000


  (Theoretical_CDF * (1 - Theoretical_CDF))


trial:  4
200
400
600
800
1000


  (Theoretical_CDF * (1 - Theoretical_CDF))


In [11]:
# Import only `sem`: `from scipy import stats` would rebind the global name
# `stats`, which the trial loop uses for the per-trial statistics dict.
from scipy.stats import sem

# Row order of the exported table:
# ====== ASST CLUST CPL GINI MD PLE EO ====== followed by the remaining statistics.
all_stats = [assrt, coe, cpl, gini, dmax, law, eo, dmin, deg, lcc, wc, cc, tc, sc, rel, ncomp]

# Mean over trials, derived from all_stats so the order cannot drift.
avg_stats = [np.mean(trial_values) for trial_values in all_stats]
print("avg_stats: ", avg_stats)

# Standard error of the mean over trials, same order as all_stats.
stderror_stats = [sem(trial_values) for trial_values in all_stats]

avg_stats:  [-0.061118106897072554, 0.004925651524263908, 2.5514972045180757, 0.5022828757430451, 273.2, 1.359997364315166, 0.30830525853229823, 1.0, 27.69574036511156, 986.0, 774728.4, 23604610.2, 38726.8, 41246.8, 0.9368882596583876, 1.0]


In [12]:
import os

save_directory = "./generate_stats" #"./snapshots_gencond"  #"./snapshots_gencond2"
log_num = 0
data_name  = "NetGAN_EUcore-top" #"EUcore" #

save_stats = "{}/{}_stats{}.txt".format(save_directory, data_name, log_num)

# np.savetxt does not create missing directories; make sure the target exists.
os.makedirs(save_directory, exist_ok=True)

# One row per statistic: mean, standard error, then the per-trial values.
np.savetxt(save_stats, np.c_[avg_stats,stderror_stats, all_stats])

## Original Graph

In [7]:
# Statistics of the training graph itself, displayed for comparison with the
# generated graphs. (A matrix rebuilt from train_ones used to be assigned to
# A_select first and was overwritten without being used.)
A_select = train_graph
utils.compute_graph_statistics(A_select.toarray())

  (Theoretical_CDF * (1 - Theoretical_CDF))


{'d_max': 285.0,
 'd_min': 1.0,
 'd': 27.69574036511156,
 'LCC': 986,
 'wedge_count': 846822.0,
 'claw_count': 28308535.0,
 'triangle_count': 63226,
 'square_count': 151542,
 'power_law_exp': 1.3790943393384585,
 'gini': 0.5384692862815612,
 'rel_edge_distr_entropy': 0.92655643321839,
 'assortativity': -0.024215720231687856,
 'clustering_coefficient': 0.006700382057919988,
 'n_components': 1,
 'cpl': 2.654223082546514}