In [1]:
# import warnings
# warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import scipy.sparse as sp
from scipy.sparse import load_npz
import pandas as pd

import torch

from cell import utils
from cell.utils import link_prediction_performance, edge_overlap
from cell.cell import Cell, EdgeOverlapCriterion, LinkPredictionCriterion
from cell.graph_statistics import compute_graph_statistics

# CORA ML

In [2]:
# Load the Cora-ML graph, symmetrize it and keep only the largest connected component.
# NOTE(review): unlike the Citeseer / Polblogs cells, no self-loop removal is done here —
# presumably Cora-ML has none (the split below runs with asserts=True); confirm.
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/cora_ml.npz')
_A_obs = _A_obs + _A_obs.T  # add the transpose so every edge appears in both directions
_A_obs[_A_obs > 1] = 1  # entries that were set in both directions summed to 2; clip back to 1
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]  # restrict rows and columns to the selected nodes
_N = _A_obs.shape[0]  # number of nodes kept

Selecting 1 largest connected components


In [3]:
# Split configuration: shares of the edges held out for validation and testing,
# and the seed handed to the train/val/test split helper.
val_share = 0.05
test_share = 0.1
seed = 42 #481516234

# Seed the global NumPy and PyTorch generators as well. Model initialisation, training
# and graph sampling are presumably stochastic, so without this the recorded numbers
# cannot be reproduced with Restart Kernel -> Run All.
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' suppresses the Generator repr in the cell output

In [4]:
# Split the edges of the observed graph into train / validation / test sets
# (the *_zeros arrays are presumably sampled non-edges; verify against the helper).
edge_split = utils.train_val_test_split_adjacency(
    _A_obs, val_share, test_share, seed,
    undirected=True, connected=True, asserts=True,
)
train_ones, val_ones, val_zeros, test_ones, test_zeros = edge_split

In [5]:
# Build the training adjacency matrix from the training edge list.
# The shape is pinned to that of the observed graph so the matrix always has one
# row/column per node, instead of being inferred from the largest index in train_ones.
train_graph = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1])),
    shape=_A_obs.shape,
).tocsr()
# Symmetry check done on the sparse matrices (no N x N dense copies needed).
assert (train_graph != train_graph.T).nnz == 0

### Edge overlap

#### CELL

In [6]:
# Baseline model (g_type='cell') with the edge-overlap (EO) criterion as callback.
# Judging by the recorded log the criterion is evaluated every 10 steps and presumably
# stops training once the edge overlap passes 0.5.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='cell', callbacks=[eo_criterion])

In [7]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-5}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 6.02955 Edge-Overlap: 0.037 Total-Time: 4
Step:  20/200 Loss: 4.03502 Edge-Overlap: 0.282 Total-Time: 8
Step:  30/200 Loss: 3.32699 Edge-Overlap: 0.430 Total-Time: 11
Step:  40/200 Loss: 3.02071 Edge-Overlap: 0.491 Total-Time: 15
Step:  50/200 Loss: 2.87482 Edge-Overlap: 0.538 Total-Time: 19


In [8]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.943928659259619, 0.9510697129231935)

In [9]:
# Sample five graphs from the trained model first, then compute statistics for each
# and collect them in a frame with one row per sample and one column per statistic.
n_samples = 5
generated_graphs = [model.sample_graph() for _ in range(n_samples)]
stats = list(map(compute_graph_statistics, generated_graphs))
stat_names = stats[0].keys()
stat_df = pd.DataFrame({name: [entry[name] for entry in stats] for name in stat_names})

In [10]:
# Start the comparison table: one row per statistic, averaged over the sampled graphs.
mean_stats = stat_df.mean()
df = pd.DataFrame(mean_stats, columns=['cell'])

#### our CELL

In [19]:
# Variant model (g_type='fc', reported as "our CELL" / 'cell+fc' in this notebook)
# with the same edge-overlap (EO) criterion as the baseline.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='fc', callbacks=[eo_criterion])

In [20]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-4}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 7.37374 Edge-Overlap: 0.020 Total-Time: 4
Step:  20/200 Loss: 6.85635 Edge-Overlap: 0.025 Total-Time: 7
Step:  30/200 Loss: 6.37722 Edge-Overlap: 0.028 Total-Time: 11
Step:  40/200 Loss: 5.81307 Edge-Overlap: 0.069 Total-Time: 15
Step:  50/200 Loss: 5.15663 Edge-Overlap: 0.121 Total-Time: 19
Step:  60/200 Loss: 4.65556 Edge-Overlap: 0.156 Total-Time: 23
Step:  70/200 Loss: 4.39197 Edge-Overlap: 0.208 Total-Time: 27
Step:  80/200 Loss: 4.05709 Edge-Overlap: 0.229 Total-Time: 31
Step:  90/200 Loss: 3.84849 Edge-Overlap: 0.287 Total-Time: 35
Step: 100/200 Loss: 3.79800 Edge-Overlap: 0.193 Total-Time: 39
Step: 110/200 Loss: 3.74362 Edge-Overlap: 0.317 Total-Time: 43
Step: 120/200 Loss: 3.31573 Edge-Overlap: 0.431 Total-Time: 47
Step: 130/200 Loss: 3.22700 Edge-Overlap: 0.431 Total-Time: 51
Step: 140/200 Loss: 3.06328 Edge-Overlap: 0.495 Total-Time: 56
Step: 150/200 Loss: 3.25868 Edge-Overlap: 0.488 Total-Time: 59
Step: 160/200 Loss: 2.89066 Edge-Overlap: 0.546 Total-Tim

In [21]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.9606955502889251, 0.9628740761304896)

In [22]:
# Sample five graphs from the trained model first, then compute statistics for each
# and collect them in a frame with one row per sample and one column per statistic.
n_samples = 5
generated_graphs = [model.sample_graph() for _ in range(n_samples)]
stats = list(map(compute_graph_statistics, generated_graphs))
stat_names = stats[0].keys()
stat_df = pd.DataFrame({name: [entry[name] for entry in stats] for name in stat_names})

In [23]:
# Mean statistics of the graphs sampled from the 'fc' variant.
# (The former `.T` on this Series was a no-op and has been dropped.)
df['cell+fc'] = stat_df.mean()

# Statistics of the observed graph. Wrapping the result in a Series makes pandas align
# the values with the table by statistic name rather than by position.
# NOTE(review): this uses the full graph `_A_obs`, while the models are fit on
# `train_graph`, so edge-count-dependent statistics are not directly comparable.
df['gt'] = pd.Series(compute_graph_statistics(_A_obs))

In [24]:
# Display the comparison table (columns 'cell', 'cell+fc' and 'gt').
df

Unnamed: 0,cell,cell+fc,gt
d_max,185.0,201.6,246.0
d_min,1.0,1.0,1.0
d,4.827758,4.827758,5.680427
LCC,2803.0,2798.2,2810.0
wedge_count,79180.4,95355.8,137719.0
claw_count,1584960.0,2273562.0,3930163.0
triangle_count,1291.0,1778.2,5247.0
square_count,6186.2,11609.2,34507.0
power_law_exp,1.812574,1.85085,1.767268
gini,0.4431508,0.47943,0.4964733


### Link Prediction Criterion

#### CELL

In [39]:
# Baseline model (g_type='cell') with the link-prediction (LP) criterion as callback.
# Per the recorded log the criterion is evaluated every 2 steps on the validation edges;
# max_patience=3 presumably bounds the number of non-improving evaluations.
lp_criterion = LinkPredictionCriterion(invoke_every=2,
                                       val_ones=val_ones,
                                       val_zeros=val_zeros,
                                       max_patience=3)
model = Cell(A=train_graph, g_type='cell', H=9, callbacks=[lp_criterion])

In [40]:
# Train with Adam for at most 300 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-6}
model.train(steps=300, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:   2/300 Loss: 7.92603 ROC-AUC Score: 0.572 Average Precision: 0.554 Total-Time: 0
Step:   4/300 Loss: 7.72035 ROC-AUC Score: 0.636 Average Precision: 0.628 Total-Time: 1
Step:   6/300 Loss: 7.29653 ROC-AUC Score: 0.685 Average Precision: 0.687 Total-Time: 1
Step:   8/300 Loss: 6.68282 ROC-AUC Score: 0.731 Average Precision: 0.735 Total-Time: 2
Step:  10/300 Loss: 6.00020 ROC-AUC Score: 0.781 Average Precision: 0.791 Total-Time: 3
Step:  12/300 Loss: 5.38965 ROC-AUC Score: 0.832 Average Precision: 0.850 Total-Time: 3
Step:  14/300 Loss: 4.92797 ROC-AUC Score: 0.870 Average Precision: 0.887 Total-Time: 4
Step:  16/300 Loss: 4.58454 ROC-AUC Score: 0.888 Average Precision: 0.902 Total-Time: 5
Step:  18/300 Loss: 4.29505 ROC-AUC Score: 0.896 Average Precision: 0.908 Total-Time: 5
Step:  20/300 Loss: 4.03938 ROC-AUC Score: 0.903 Average Precision: 0.916 Total-Time: 6
Step:  22/300 Loss: 3.83683 ROC-AUC Score: 0.911 Average Precision: 0.924 Total-Time: 7
Step:  24/300 Loss: 3.67245 ROC-

In [41]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.9414584250337953, 0.9457385717379865)

#### our CELL

In [42]:
# Variant model (g_type='fc', reported as "our CELL" in this notebook) with the
# link-prediction (LP) criterion evaluated on the validation edges.
lp_criterion = LinkPredictionCriterion(invoke_every=2,
                                       val_ones=val_ones,
                                       val_zeros=val_zeros,
                                       max_patience=3)
model = Cell(A=train_graph, g_type='fc', H=9, callbacks=[lp_criterion])

In [43]:
# Train with Adam for at most 300 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 5e-5}
model.train(steps=300, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:   2/300 Loss: 7.90985 ROC-AUC Score: 0.815 Average Precision: 0.815 Total-Time: 0
Step:   4/300 Loss: 7.54912 ROC-AUC Score: 0.841 Average Precision: 0.854 Total-Time: 1
Step:   6/300 Loss: 7.38563 ROC-AUC Score: 0.843 Average Precision: 0.857 Total-Time: 2
Step:   8/300 Loss: 7.35279 ROC-AUC Score: 0.835 Average Precision: 0.855 Total-Time: 3
Step:  10/300 Loss: 7.30924 ROC-AUC Score: 0.837 Average Precision: 0.859 Total-Time: 3
Step:  12/300 Loss: 7.26639 ROC-AUC Score: 0.846 Average Precision: 0.866 Total-Time: 4
Step:  14/300 Loss: 7.19245 ROC-AUC Score: 0.855 Average Precision: 0.874 Total-Time: 5
Step:  16/300 Loss: 7.11766 ROC-AUC Score: 0.863 Average Precision: 0.882 Total-Time: 6
Step:  18/300 Loss: 7.03637 ROC-AUC Score: 0.870 Average Precision: 0.886 Total-Time: 6
Step:  20/300 Loss: 6.94362 ROC-AUC Score: 0.868 Average Precision: 0.883 Total-Time: 7
Step:  22/300 Loss: 6.97236 ROC-AUC Score: 0.867 Average Precision: 0.879 Total-Time: 8
Step:  24/300 Loss: 6.80680 ROC-

In [44]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.9542058988002837, 0.9606755845126247)

# Citeseer

In [71]:
# Load the Citeseer graph, symmetrize it, clear the diagonal and keep only the
# largest connected component.
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/citeseer.npz')
_A_obs = _A_obs + _A_obs.T  # add the transpose so every edge appears in both directions
_A_obs[_A_obs > 1] = 1  # entries that were set in both directions summed to 2; clip back to 1
_A_obs = _A_obs - sp.eye(_A_obs.shape[0], _A_obs.shape[0])  # subtract identity: diagonal 1 -> 0, diagonal 0 -> -1
_A_obs[_A_obs < 0] = 0  # reset the -1 entries created by the subtraction
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]  # restrict rows and columns to the selected nodes
_N = _A_obs.shape[0]  # number of nodes kept

Selecting 1 largest connected components


In [72]:
# Split configuration: shares of the edges held out for validation and testing,
# and the seed handed to the train/val/test split helper.
val_share = 0.05
test_share = 0.1
seed = 48

# Seed the global NumPy and PyTorch generators as well. Model initialisation, training
# and graph sampling are presumably stochastic, so without this the recorded numbers
# cannot be reproduced with Restart Kernel -> Run All.
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' suppresses the Generator repr in the cell output

In [73]:
# there are self loops!
# NOTE(review): presumably the reason the helper's checks are disabled (asserts=False)
# for this dataset — confirm against utils.train_val_test_split_adjacency.
edge_split = utils.train_val_test_split_adjacency(
    _A_obs, val_share, test_share, seed,
    undirected=True, connected=True, asserts=False,
)
train_ones, val_ones, val_zeros, test_ones, test_zeros = edge_split

In [74]:
# Build the training adjacency matrix from the training edge list.
# The shape is pinned to that of the observed graph so the matrix always has one
# row/column per node, instead of being inferred from the largest index in train_ones.
train_graph = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1])),
    shape=_A_obs.shape,
).tocsr()
# Symmetry check done on the sparse matrices (no N x N dense copies needed).
assert (train_graph != train_graph.T).nnz == 0

In [83]:
# Baseline model (g_type='cell') with the edge-overlap (EO) criterion as callback.
# Judging by the recorded log the criterion is evaluated every 10 steps and presumably
# stops training once the edge overlap passes 0.5.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='cell', callbacks=[eo_criterion])

In [84]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-7}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 5.26347 Edge-Overlap: 0.044 Total-Time: 2
Step:  20/200 Loss: 2.96395 Edge-Overlap: 0.370 Total-Time: 4
Step:  30/200 Loss: 2.08671 Edge-Overlap: 0.620 Total-Time: 7


In [85]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.8816088728733459, 0.9059670666723801)

In [86]:
# Sample five graphs from the trained model first, then compute statistics for each
# and collect them in a frame with one row per sample and one column per statistic.
n_samples = 5
generated_graphs = [model.sample_graph() for _ in range(n_samples)]
stats = list(map(compute_graph_statistics, generated_graphs))
stat_names = stats[0].keys()
stat_df = pd.DataFrame({name: [entry[name] for entry in stats] for name in stat_names})

In [87]:
# Start the comparison table: one row per statistic, averaged over the sampled graphs.
mean_stats = stat_df.mean()
df = pd.DataFrame(mean_stats, columns=['cell'])

#### our CELL

In [88]:
# Variant model (g_type='fc', reported as "our CELL" / 'cell+fc' in this notebook)
# with the same edge-overlap (EO) criterion as the baseline.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='fc', callbacks=[eo_criterion])

In [89]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
# Note the smaller learning rate (0.05) used for this run.
adam_args = {'lr': 0.05, 'weight_decay': 1e-5}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 6.61125 Edge-Overlap: 0.017 Total-Time: 2
Step:  20/200 Loss: 4.16881 Edge-Overlap: 0.182 Total-Time: 5
Step:  30/200 Loss: 2.70832 Edge-Overlap: 0.475 Total-Time: 8
Step:  40/200 Loss: 2.04511 Edge-Overlap: 0.662 Total-Time: 11


In [90]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.8869993501890359, 0.9129411569997657)

In [91]:
# Sample five graphs from the trained model first, then compute statistics for each
# and collect them in a frame with one row per sample and one column per statistic.
n_samples = 5
generated_graphs = [model.sample_graph() for _ in range(n_samples)]
stats = list(map(compute_graph_statistics, generated_graphs))
stat_names = stats[0].keys()
stat_df = pd.DataFrame({name: [entry[name] for entry in stats] for name in stat_names})

In [92]:
# Mean statistics of the graphs sampled from the 'fc' variant.
# (The former `.T` on this Series was a no-op and has been dropped.)
df['cell+fc'] = stat_df.mean()

# Statistics of the observed graph. Wrapping the result in a Series makes pandas align
# the values with the table by statistic name rather than by position.
# NOTE(review): this uses the full graph `_A_obs`, while the models are fit on
# `train_graph`, so edge-count-dependent statistics are not directly comparable.
df['gt'] = pd.Series(compute_graph_statistics(_A_obs))

In [93]:
# Display the comparison table (columns 'cell', 'cell+fc' and 'gt').
df

Unnamed: 0,cell,cell+fc,gt
d_max,61.6,103.4,99.0
d_min,1.0,1.0,1.0
d,2.954502,2.954502,3.476777
LCC,2071.8,1972.6,2110.0
wedge_count,13612.4,20481.8,25943.0
claw_count,69661.2,245217.6,250348.0
triangle_count,151.2,392.6,1083.0
square_count,405.4,2378.6,5977.0
power_law_exp,2.183324,2.287204,2.068238
gini,0.37424,0.42678,0.42826


In [94]:
# Baseline model (g_type='cell') with the link-prediction (LP) criterion as callback.
# Per the recorded log the criterion is evaluated every 2 steps on the validation edges;
# max_patience=3 presumably bounds the number of non-improving evaluations.
lp_criterion = LinkPredictionCriterion(invoke_every=2,
                                       val_ones=val_ones,
                                       val_zeros=val_zeros,
                                       max_patience=3)
model = Cell(A=train_graph, H=9, g_type='cell', callbacks=[lp_criterion])

In [95]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-6}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:   2/200 Loss: 7.63163 ROC-AUC Score: 0.541 Average Precision: 0.561 Total-Time: 0
Step:   4/200 Loss: 7.37045 ROC-AUC Score: 0.623 Average Precision: 0.639 Total-Time: 0
Step:   6/200 Loss: 6.84914 ROC-AUC Score: 0.666 Average Precision: 0.681 Total-Time: 1
Step:   8/200 Loss: 6.09813 ROC-AUC Score: 0.697 Average Precision: 0.710 Total-Time: 1
Step:  10/200 Loss: 5.28319 ROC-AUC Score: 0.730 Average Precision: 0.741 Total-Time: 2
Step:  12/200 Loss: 4.56484 ROC-AUC Score: 0.767 Average Precision: 0.784 Total-Time: 2
Step:  14/200 Loss: 4.02190 ROC-AUC Score: 0.797 Average Precision: 0.823 Total-Time: 2
Step:  16/200 Loss: 3.60988 ROC-AUC Score: 0.816 Average Precision: 0.841 Total-Time: 3
Step:  18/200 Loss: 3.26524 ROC-AUC Score: 0.830 Average Precision: 0.849 Total-Time: 3
Step:  20/200 Loss: 2.97353 ROC-AUC Score: 0.842 Average Precision: 0.858 Total-Time: 4
Step:  22/200 Loss: 2.72824 ROC-AUC Score: 0.852 Average Precision: 0.868 Total-Time: 4
Step:  24/200 Loss: 2.51697 ROC-

In [96]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.8893401465028355, 0.9067387228433713)

In [110]:
# Variant model (g_type='fc') with the link-prediction (LP) criterion evaluated
# on the validation edges.
lp_criterion = LinkPredictionCriterion(invoke_every=2,
                                       val_ones=val_ones,
                                       val_zeros=val_zeros,
                                       max_patience=3)
model = Cell(A=train_graph, H=9, g_type='fc', callbacks=[lp_criterion])

In [111]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-4}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:   2/200 Loss: 7.85740 ROC-AUC Score: 0.784 Average Precision: 0.781 Total-Time: 0
Step:   4/200 Loss: 7.41183 ROC-AUC Score: 0.783 Average Precision: 0.801 Total-Time: 0
Step:   6/200 Loss: 7.28172 ROC-AUC Score: 0.760 Average Precision: 0.791 Total-Time: 1
Step:   8/200 Loss: 7.16212 ROC-AUC Score: 0.774 Average Precision: 0.814 Total-Time: 1
Step:  10/200 Loss: 7.09157 ROC-AUC Score: 0.784 Average Precision: 0.822 Total-Time: 2
Step:  12/200 Loss: 6.97190 ROC-AUC Score: 0.802 Average Precision: 0.833 Total-Time: 2
Step:  14/200 Loss: 6.83995 ROC-AUC Score: 0.797 Average Precision: 0.824 Total-Time: 2
Step:  16/200 Loss: 6.73701 ROC-AUC Score: 0.808 Average Precision: 0.837 Total-Time: 3
Step:  18/200 Loss: 6.58835 ROC-AUC Score: 0.800 Average Precision: 0.834 Total-Time: 3
Step:  20/200 Loss: 6.40314 ROC-AUC Score: 0.795 Average Precision: 0.831 Total-Time: 4
Step:  22/200 Loss: 6.24258 ROC-AUC Score: 0.812 Average Precision: 0.842 Total-Time: 4
Step:  24/200 Loss: 6.07048 ROC-

In [112]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.881453804347826, 0.8975038026834091)

# Polblogs

#### CELL

In [115]:
# Load the Polblogs graph, symmetrize it, clear the diagonal and keep only the
# largest connected component.
_A_obs, _X_obs, _z_obs = utils.load_npz('../data/polblogs.npz')
_A_obs = _A_obs + _A_obs.T  # add the transpose so every edge appears in both directions
_A_obs[_A_obs > 1] = 1  # entries that were set in both directions summed to 2; clip back to 1
_A_obs = _A_obs - sp.eye(_A_obs.shape[0], _A_obs.shape[0])  # subtract identity: diagonal 1 -> 0, diagonal 0 -> -1
_A_obs[_A_obs < 0] = 0  # reset the -1 entries created by the subtraction
lcc = utils.largest_connected_components(_A_obs)
_A_obs = _A_obs[lcc,:][:,lcc]  # restrict rows and columns to the selected nodes
_N = _A_obs.shape[0]  # number of nodes kept

Selecting 1 largest connected components


In [116]:
# Split configuration: shares of the edges held out for validation and testing,
# and the seed handed to the train/val/test split helper.
val_share = 0.05
test_share = 0.1
seed = 48

# Seed the global NumPy and PyTorch generators as well. Model initialisation, training
# and graph sampling are presumably stochastic, so without this the recorded numbers
# cannot be reproduced with Restart Kernel -> Run All.
np.random.seed(seed)
torch.manual_seed(seed);  # trailing ';' suppresses the Generator repr in the cell output

In [117]:
# Split the edges of the observed graph into train / validation / test sets
# (the helper's internal checks are switched off here via asserts=False).
edge_split = utils.train_val_test_split_adjacency(
    _A_obs, val_share, test_share, seed,
    undirected=True, connected=True, asserts=False,
)
train_ones, val_ones, val_zeros, test_ones, test_zeros = edge_split

In [118]:
# Build the training adjacency matrix from the training edge list.
# The shape is pinned to that of the observed graph so the matrix always has one
# row/column per node, instead of being inferred from the largest index in train_ones.
train_graph = sp.coo_matrix(
    (np.ones(len(train_ones)), (train_ones[:, 0], train_ones[:, 1])),
    shape=_A_obs.shape,
).tocsr()
# Symmetry check done on the sparse matrices (no N x N dense copies needed).
assert (train_graph != train_graph.T).nnz == 0

In [119]:
# Baseline model (g_type='cell') with the edge-overlap (EO) criterion as callback;
# the criterion is configured with invoke_every=10 and an overlap limit of 0.5.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='cell', callbacks=[eo_criterion])

In [120]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-7}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 5.83401 Edge-Overlap: 0.288 Total-Time: 1
Step:  20/200 Loss: 5.47092 Edge-Overlap: 0.299 Total-Time: 2
Step:  30/200 Loss: 5.29989 Edge-Overlap: 0.334 Total-Time: 3
Step:  40/200 Loss: 5.20937 Edge-Overlap: 0.356 Total-Time: 4
Step:  50/200 Loss: 5.16049 Edge-Overlap: 0.359 Total-Time: 6
Step:  60/200 Loss: 5.13118 Edge-Overlap: 0.366 Total-Time: 7
Step:  70/200 Loss: 5.11152 Edge-Overlap: 0.373 Total-Time: 8
Step:  80/200 Loss: 5.09732 Edge-Overlap: 0.372 Total-Time: 9
Step:  90/200 Loss: 5.08634 Edge-Overlap: 0.380 Total-Time: 11
Step: 100/200 Loss: 5.07748 Edge-Overlap: 0.380 Total-Time: 12
Step: 110/200 Loss: 5.07006 Edge-Overlap: 0.386 Total-Time: 13
Step: 120/200 Loss: 5.06377 Edge-Overlap: 0.378 Total-Time: 14
Step: 130/200 Loss: 5.05826 Edge-Overlap: 0.386 Total-Time: 16
Step: 140/200 Loss: 5.05343 Edge-Overlap: 0.383 Total-Time: 17
Step: 150/200 Loss: 5.04905 Edge-Overlap: 0.391 Total-Time: 18
Step: 160/200 Loss: 5.04510 Edge-Overlap: 0.392 Total-Time: 19


In [121]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.960215496713207, 0.960696351282322)

In [125]:
# Variant model (g_type='fc') with the same edge-overlap (EO) criterion as the baseline.
eo_criterion = EdgeOverlapCriterion(invoke_every=10, edge_overlap_limit=.5)
model = Cell(A=train_graph, H=9, g_type='fc', callbacks=[eo_criterion])

In [126]:
# Train with Adam for at most 200 steps; the callback may end training earlier.
adam_args = {'lr': 0.1, 'weight_decay': 1e-5}
model.train(steps=200, optimizer_fn=torch.optim.Adam, optimizer_args=adam_args)

Step:  10/200 Loss: 6.42485 Edge-Overlap: 0.123 Total-Time: 1
Step:  20/200 Loss: 6.34362 Edge-Overlap: 0.132 Total-Time: 2
Step:  30/200 Loss: 6.21646 Edge-Overlap: 0.158 Total-Time: 4
Step:  40/200 Loss: 5.91915 Edge-Overlap: 0.211 Total-Time: 5
Step:  50/200 Loss: 5.79610 Edge-Overlap: 0.211 Total-Time: 7
Step:  60/200 Loss: 5.73338 Edge-Overlap: 0.228 Total-Time: 8
Step:  70/200 Loss: 5.67114 Edge-Overlap: 0.241 Total-Time: 10
Step:  80/200 Loss: 5.62350 Edge-Overlap: 0.250 Total-Time: 11
Step:  90/200 Loss: 5.57992 Edge-Overlap: 0.276 Total-Time: 12
Step: 100/200 Loss: 5.53117 Edge-Overlap: 0.286 Total-Time: 14
Step: 110/200 Loss: 5.49798 Edge-Overlap: 0.280 Total-Time: 15
Step: 120/200 Loss: 5.47121 Edge-Overlap: 0.277 Total-Time: 16
Step: 130/200 Loss: 5.43454 Edge-Overlap: 0.293 Total-Time: 18
Step: 140/200 Loss: 5.42513 Edge-Overlap: 0.276 Total-Time: 19
Step: 150/200 Loss: 5.40097 Edge-Overlap: 0.306 Total-Time: 20
Step: 160/200 Loss: 5.37051 Edge-Overlap: 0.309 Total-Time: 2

In [127]:
# Held-out performance as (roc-auc, avg-precision).
# The helper's keywords are named "val_*", but the test split is what is passed here.
link_prediction_performance(
    scores_matrix=model._scores_matrix,
    val_ones=test_ones,
    val_zeros=test_zeros,
)

(0.9588410424130088, 0.9587745897560999)