In [1]:
# import warnings
# warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import scipy.sparse as sp
from scipy.sparse import load_npz
import pandas as pd

import torch

from cell import utils
from cell.utils import link_prediction_performance, edge_overlap
from cell.cell import Cell, EdgeOverlapCriterion, LinkPredictionCriterion
from cell.graph_statistics import compute_graph_statistics

# CORA ML

In [2]:
#train_graph 
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/cora_ml.npz')
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

Selecting 1 largest connected components


In [3]:
val_share = 0.05
test_share = 0.1
seed = 42 #481516234

In [4]:
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=True)

In [5]:
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()

### Edge overlap

#### CELL

In [6]:
# initialize model with EO-criterion
model = Cell(A=train_graph,
             H=9,
             g_type='cell',
             callbacks=[EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)])

In [7]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-5})

Step:  10/200 Loss: 6.02955 Edge-Overlap: 0.037 Total-Time: 4
Step:  20/200 Loss: 4.03502 Edge-Overlap: 0.282 Total-Time: 8
Step:  30/200 Loss: 3.32699 Edge-Overlap: 0.430 Total-Time: 11
Step:  40/200 Loss: 3.02071 Edge-Overlap: 0.491 Total-Time: 15
Step:  50/200 Loss: 2.87482 Edge-Overlap: 0.538 Total-Time: 19


In [8]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.943928659259619, 0.9510697129231935)

In [9]:
generated_graphs = [model.sample_graph() for _ in range(5)]
stats = [compute_graph_statistics(gg) for gg in generated_graphs]
stat_df = pd.DataFrame({k: [s[k] for s in stats] for k in stats[0].keys()})

In [10]:
df = pd.DataFrame(stat_df.mean(), columns=['cell'])

#### our CELL

In [19]:
# initialize model with EO-criterion
model = Cell(A=train_graph,
             H=9,
             g_type='fc',
             callbacks=[EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)])

In [20]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-4})

Step:  10/200 Loss: 7.37374 Edge-Overlap: 0.020 Total-Time: 4
Step:  20/200 Loss: 6.85635 Edge-Overlap: 0.025 Total-Time: 7
Step:  30/200 Loss: 6.37722 Edge-Overlap: 0.028 Total-Time: 11
Step:  40/200 Loss: 5.81307 Edge-Overlap: 0.069 Total-Time: 15
Step:  50/200 Loss: 5.15663 Edge-Overlap: 0.121 Total-Time: 19
Step:  60/200 Loss: 4.65556 Edge-Overlap: 0.156 Total-Time: 23
Step:  70/200 Loss: 4.39197 Edge-Overlap: 0.208 Total-Time: 27
Step:  80/200 Loss: 4.05709 Edge-Overlap: 0.229 Total-Time: 31
Step:  90/200 Loss: 3.84849 Edge-Overlap: 0.287 Total-Time: 35
Step: 100/200 Loss: 3.79800 Edge-Overlap: 0.193 Total-Time: 39
Step: 110/200 Loss: 3.74362 Edge-Overlap: 0.317 Total-Time: 43
Step: 120/200 Loss: 3.31573 Edge-Overlap: 0.431 Total-Time: 47
Step: 130/200 Loss: 3.22700 Edge-Overlap: 0.431 Total-Time: 51
Step: 140/200 Loss: 3.06328 Edge-Overlap: 0.495 Total-Time: 56
Step: 150/200 Loss: 3.25868 Edge-Overlap: 0.488 Total-Time: 59
Step: 160/200 Loss: 2.89066 Edge-Overlap: 0.546 Total-Tim

In [21]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9606955502889251, 0.9628740761304896)

In [22]:
generated_graphs = [model.sample_graph() for _ in range(5)]
stats = [compute_graph_statistics(gg) for gg in generated_graphs]
stat_df = pd.DataFrame({k: [s[k] for s in stats] for k in stats[0].keys()})

In [23]:
df['cell+fc'] = stat_df.mean().T

df['gt'] = compute_graph_statistics(_A_obs).values()

In [24]:
df

Unnamed: 0,cell,cell+fc,gt
d_max,185.0,201.6,246.0
d_min,1.0,1.0,1.0
d,4.827758,4.827758,5.680427
LCC,2803.0,2798.2,2810.0
wedge_count,79180.4,95355.8,137719.0
claw_count,1584960.0,2273562.0,3930163.0
triangle_count,1291.0,1778.2,5247.0
square_count,6186.2,11609.2,34507.0
power_law_exp,1.812574,1.85085,1.767268
gini,0.4431508,0.47943,0.4964733


### Link Prediction Criterion

#### CELL

In [39]:
# initialize model with LP-criterion
#
model = Cell(A=train_graph,
             g_type='cell',
             H=9,
             callbacks=[LinkPredictionCriterion(invoke_every=2,
                                                val_ones=val_ones,
                                            val_zeros=val_zeros,
                                                max_patience=3)])

In [40]:
# train model 
model.train(steps=300,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-6})

Step:   2/300 Loss: 7.92603 ROC-AUC Score: 0.572 Average Precision: 0.554 Total-Time: 0
Step:   4/300 Loss: 7.72035 ROC-AUC Score: 0.636 Average Precision: 0.628 Total-Time: 1
Step:   6/300 Loss: 7.29653 ROC-AUC Score: 0.685 Average Precision: 0.687 Total-Time: 1
Step:   8/300 Loss: 6.68282 ROC-AUC Score: 0.731 Average Precision: 0.735 Total-Time: 2
Step:  10/300 Loss: 6.00020 ROC-AUC Score: 0.781 Average Precision: 0.791 Total-Time: 3
Step:  12/300 Loss: 5.38965 ROC-AUC Score: 0.832 Average Precision: 0.850 Total-Time: 3
Step:  14/300 Loss: 4.92797 ROC-AUC Score: 0.870 Average Precision: 0.887 Total-Time: 4
Step:  16/300 Loss: 4.58454 ROC-AUC Score: 0.888 Average Precision: 0.902 Total-Time: 5
Step:  18/300 Loss: 4.29505 ROC-AUC Score: 0.896 Average Precision: 0.908 Total-Time: 5
Step:  20/300 Loss: 4.03938 ROC-AUC Score: 0.903 Average Precision: 0.916 Total-Time: 6
Step:  22/300 Loss: 3.83683 ROC-AUC Score: 0.911 Average Precision: 0.924 Total-Time: 7
Step:  24/300 Loss: 3.67245 ROC-

In [41]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9414584250337953, 0.9457385717379865)

#### our CELL

In [42]:
# initialize model with LP-criterion
#
model = Cell(A=train_graph,
             g_type='fc',
             H=9,
             callbacks=[LinkPredictionCriterion(invoke_every=2,
                                                val_ones=val_ones,
                                            val_zeros=val_zeros,
                                                max_patience=3)])

In [43]:
# train model 
model.train(steps=300,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 5e-5})

Step:   2/300 Loss: 7.90985 ROC-AUC Score: 0.815 Average Precision: 0.815 Total-Time: 0
Step:   4/300 Loss: 7.54912 ROC-AUC Score: 0.841 Average Precision: 0.854 Total-Time: 1
Step:   6/300 Loss: 7.38563 ROC-AUC Score: 0.843 Average Precision: 0.857 Total-Time: 2
Step:   8/300 Loss: 7.35279 ROC-AUC Score: 0.835 Average Precision: 0.855 Total-Time: 3
Step:  10/300 Loss: 7.30924 ROC-AUC Score: 0.837 Average Precision: 0.859 Total-Time: 3
Step:  12/300 Loss: 7.26639 ROC-AUC Score: 0.846 Average Precision: 0.866 Total-Time: 4
Step:  14/300 Loss: 7.19245 ROC-AUC Score: 0.855 Average Precision: 0.874 Total-Time: 5
Step:  16/300 Loss: 7.11766 ROC-AUC Score: 0.863 Average Precision: 0.882 Total-Time: 6
Step:  18/300 Loss: 7.03637 ROC-AUC Score: 0.870 Average Precision: 0.886 Total-Time: 6
Step:  20/300 Loss: 6.94362 ROC-AUC Score: 0.868 Average Precision: 0.883 Total-Time: 7
Step:  22/300 Loss: 6.97236 ROC-AUC Score: 0.867 Average Precision: 0.879 Total-Time: 8
Step:  24/300 Loss: 6.80680 ROC-

In [44]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9542058988002837, 0.9606755845126247)

# Citeseer

In [45]:
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/citeseer.npz')
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

Selecting 1 largest connected components


In [46]:
val_share = 0.05
test_share = 0.1
seed = 48

In [47]:
#there are self loops!

train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False)

In [48]:
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()

In [49]:
# initialize model with EO-criterion
model = Cell(A=_A_obs,
             H=9,
             g_type='cell',
             callbacks=[EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)])

In [50]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-7})

Step:  10/200 Loss: 5.33630 Edge-Overlap: 0.044 Total-Time: 2
Step:  20/200 Loss: 3.15112 Edge-Overlap: 0.360 Total-Time: 4
Step:  30/200 Loss: 2.32722 Edge-Overlap: 0.593 Total-Time: 7


In [51]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.972684703433923, 0.9861373933995713)

In [52]:
generated_graphs = [model.sample_graph() for _ in range(5)]
stats = [compute_graph_statistics(gg) for gg in generated_graphs]
stat_df = pd.DataFrame({k: [s[k] for s in stats] for k in stats[0].keys()})

In [53]:
df = pd.DataFrame(stat_df.mean(), columns=['cell'])

#### our CELL

In [54]:
# initialize model with EO-criterion
model = Cell(A=train_graph,
             H=9,
             g_type='fc',
             callbacks=[EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)])

In [55]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.05,
                            'weight_decay': 1e-5})

Step:  10/200 Loss: 6.52337 Edge-Overlap: 0.015 Total-Time: 2
Step:  20/200 Loss: 4.37955 Edge-Overlap: 0.142 Total-Time: 4
Step:  30/200 Loss: 2.81670 Edge-Overlap: 0.442 Total-Time: 7
Step:  40/200 Loss: 2.17366 Edge-Overlap: 0.635 Total-Time: 9


In [56]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.869638108451844, 0.9003855901926014)

In [57]:
generated_graphs = [model.sample_graph() for _ in range(5)]
stats = [compute_graph_statistics(gg) for gg in generated_graphs]
stat_df = pd.DataFrame({k: [s[k] for s in stats] for k in stats[0].keys()})

In [58]:
df['cell+fc'] = stat_df.mean().T

df['gt'] = compute_graph_statistics(_A_obs).values()

In [59]:
df

Unnamed: 0,cell,cell+fc,gt
d_max,68.6,87.8,99.0
d_min,1.0,1.0,1.0
d,3.501422,2.997156,3.501422
LCC,2096.2,2004.8,2110.0
wedge_count,20062.8,18441.4,26160.0
claw_count,108169.2,163957.6,251101.0
triangle_count,401.8,312.0,1083.0
square_count,1519.0,1395.0,6130.0
power_law_exp,2.00654,2.24656,2.058416
gini,0.38811,0.41874,0.426981


In [60]:
# initialize model with LP-criterion
#
model = Cell(A=_A_obs,
             H=9,
             g_type='cell',
             callbacks=[LinkPredictionCriterion(invoke_every=2,
                                                val_ones=val_ones,
                                            val_zeros=val_zeros,
                                                max_patience=3)])

In [61]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-6})

Step:   2/200 Loss: 7.63371 ROC-AUC Score: 0.911 Average Precision: 0.922 Total-Time: 0
Step:   4/200 Loss: 7.38415 ROC-AUC Score: 0.955 Average Precision: 0.960 Total-Time: 0
Step:   6/200 Loss: 6.87651 ROC-AUC Score: 0.964 Average Precision: 0.969 Total-Time: 1
Step:   8/200 Loss: 6.14298 ROC-AUC Score: 0.969 Average Precision: 0.972 Total-Time: 1
Step:  10/200 Loss: 5.33960 ROC-AUC Score: 0.976 Average Precision: 0.979 Total-Time: 2
Step:  12/200 Loss: 4.63752 ROC-AUC Score: 0.986 Average Precision: 0.988 Total-Time: 2
Step:  14/200 Loss: 4.13953 ROC-AUC Score: 0.993 Average Precision: 0.993 Total-Time: 3
Step:  16/200 Loss: 3.77724 ROC-AUC Score: 0.996 Average Precision: 0.996 Total-Time: 3
Step:  18/200 Loss: 3.45594 ROC-AUC Score: 0.998 Average Precision: 0.998 Total-Time: 3
Step:  20/200 Loss: 3.18722 ROC-AUC Score: 0.999 Average Precision: 0.999 Total-Time: 4
Step:  22/200 Loss: 2.96647 ROC-AUC Score: 1.000 Average Precision: 1.000 Total-Time: 4
Step:  24/200 Loss: 2.77052 ROC-

In [62]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9997904382009482, 0.9997938009327869)

In [67]:
# initialize model with LP-criterion
#
model = Cell(A=_A_obs,
             H=9,
             g_type='fc',
             callbacks=[LinkPredictionCriterion(invoke_every=2,
                                                val_ones=val_ones,
                                            val_zeros=val_zeros,
                                                max_patience=3)])

In [68]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 5e-5})

Step:   2/200 Loss: 8.19954 ROC-AUC Score: 0.802 Average Precision: 0.805 Total-Time: 0
Step:   4/200 Loss: 7.56162 ROC-AUC Score: 0.820 Average Precision: 0.835 Total-Time: 0
Step:   6/200 Loss: 7.40063 ROC-AUC Score: 0.870 Average Precision: 0.878 Total-Time: 1
Step:   8/200 Loss: 7.23446 ROC-AUC Score: 0.871 Average Precision: 0.880 Total-Time: 1
Step:  10/200 Loss: 7.11778 ROC-AUC Score: 0.889 Average Precision: 0.894 Total-Time: 1
Step:  12/200 Loss: 7.07394 ROC-AUC Score: 0.905 Average Precision: 0.902 Total-Time: 2
Step:  14/200 Loss: 6.97687 ROC-AUC Score: 0.918 Average Precision: 0.913 Total-Time: 2
Step:  16/200 Loss: 6.90188 ROC-AUC Score: 0.928 Average Precision: 0.924 Total-Time: 3
Step:  18/200 Loss: 6.83477 ROC-AUC Score: 0.923 Average Precision: 0.918 Total-Time: 3
Step:  20/200 Loss: 6.78433 ROC-AUC Score: 0.931 Average Precision: 0.929 Total-Time: 3
Step:  22/200 Loss: 6.73645 ROC-AUC Score: 0.932 Average Precision: 0.932 Total-Time: 4
Step:  24/200 Loss: 6.79900 ROC-

In [69]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9988365707018153, 0.998733427881796)

# Prolblogs

#### CELL

In [120]:
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/polblogs.npz')
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]
_N = _A_obs.shape[0]

Selecting 1 largest connected components


In [121]:
val_share = 0.05
test_share = 0.1
seed = 48

In [122]:
train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False)

In [123]:
train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()
assert (train_graph.toarray() == train_graph.toarray().T).all()

In [125]:
# initialize model with EO-criterion
model = Cell(A=_A_obs,
             H=9,
             g_type='cell',
             callbacks=[EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)])

In [126]:
# train model 
model.train(steps=200,
            optimizer_fn=torch.optim.Adam,
            optimizer_args={'lr': 0.1,
                            'weight_decay': 1e-7})

Step:  10/200 Loss: 5.85955 Edge-Overlap: 0.324 Total-Time: 1
Step:  20/200 Loss: 5.52497 Edge-Overlap: 0.334 Total-Time: 2
Step:  30/200 Loss: 5.35665 Edge-Overlap: 0.371 Total-Time: 4
Step:  40/200 Loss: 5.26537 Edge-Overlap: 0.381 Total-Time: 5
Step:  50/200 Loss: 5.21497 Edge-Overlap: 0.398 Total-Time: 7
Step:  60/200 Loss: 5.18568 Edge-Overlap: 0.405 Total-Time: 8
Step:  70/200 Loss: 5.16670 Edge-Overlap: 0.410 Total-Time: 10
Step:  80/200 Loss: 5.15340 Edge-Overlap: 0.412 Total-Time: 11
Step:  90/200 Loss: 5.14335 Edge-Overlap: 0.414 Total-Time: 13
Step: 100/200 Loss: 5.13535 Edge-Overlap: 0.414 Total-Time: 15
Step: 110/200 Loss: 5.12871 Edge-Overlap: 0.414 Total-Time: 16
Step: 120/200 Loss: 5.12317 Edge-Overlap: 0.420 Total-Time: 18
Step: 130/200 Loss: 5.11827 Edge-Overlap: 0.420 Total-Time: 19
Step: 140/200 Loss: 5.11391 Edge-Overlap: 0.420 Total-Time: 21
Step: 150/200 Loss: 5.11006 Edge-Overlap: 0.426 Total-Time: 23
Step: 160/200 Loss: 5.10672 Edge-Overlap: 0.423 Total-Time: 2

In [127]:
# roc-auc, avg-precision

link_prediction_performance(scores_matrix=model._scores_matrix, val_ones=test_ones, val_zeros=test_zeros)

(0.9890624235198535, 0.9872647970639767)