In [2]:
from collections import defaultdict
from utils.DataLoader import get_link_prediction_data 

In [4]:
def calc_surprise_factor(dataset_name, val_ratio=0.15, test_ratio=0.15):
  """Print the edge-level surprise factor of the validation and test splits.

  An interaction is a "surprise" when its (src, dst) pair, in either
  orientation, occurred neither in the training split nor in an earlier
  validation/test interaction. A pair becomes known as soon as it has been
  counted, so repeats of a new pair are counted only once.

  For 'Val', 'Test' and 'Val + Test' the function prints the number of
  surprises, the number of interactions and the surprise percentage rounded
  to two decimals. A split without interactions is reported as 0.0 percent.

  Args:
    dataset_name: dataset identifier forwarded to get_link_prediction_data.
    val_ratio: validation ratio forwarded to get_link_prediction_data.
    test_ratio: test ratio forwarded to get_link_prediction_data.
  """
  # Only the train/val/test splits are used; the full unpacking is kept so an
  # unexpected number of return values still fails loudly.
  (
    _node_raw_features,
    _edge_raw_features,
    _full_data,
    train_data,
    val_data,
    test_data,
    _new_node_val_data,
    _new_node_test_data,
  ) = get_link_prediction_data(dataset_name=dataset_name, val_ratio=val_ratio, test_ratio=test_ratio)

  known_edges = set()

  def remember(src, dst):
    # Both orientations are stored so that (a, b) and (b, a) are the same pair.
    known_edges.add(f'{src} {dst}')
    known_edges.add(f'{dst} {src}')

  def count_new_edges(split):
    # Returns (surprises, total) for one split and marks new pairs as known.
    surprises, total = 0, 0
    for src, dst in zip(split.src_node_ids, split.dst_node_ids):
      if f'{src} {dst}' not in known_edges:
        remember(src, dst)
        surprises += 1
      total += 1
    return surprises, total

  def report(label, surprises, total):
    # An empty split has no defined percentage; print 0.0 rather than raising
    # ZeroDivisionError.
    percent = round(100 * surprises / total, 2) if total else 0.0
    print(label)
    print(surprises, total, percent)

  for src, dst in zip(train_data.src_node_ids, train_data.dst_node_ids):
    remember(src, dst)

  # Validation is scanned before test, so pairs first seen in validation are
  # already known when the test split is scanned.
  surp_val, tot_val = count_new_edges(val_data)
  report('Val', surp_val, tot_val)
  surp_test, tot_test = count_new_edges(test_data)
  report('Test', surp_test, tot_test)
  report('Val + Test', surp_val + surp_test, tot_val + tot_test)

In [5]:
# Print each dataset name followed by the statistics reported by
# calc_surprise_factor for that dataset.
for dataset in (
  "ia-retweet-pol",
  "ia-reality-call",
  "ia-movielens-user2tags-10m",
  "ia-slashdot-reply-dir",
  "ia-escorts-dynamic",
  "ia-digg-reply",
):
  print(dataset)
  calc_surprise_factor(dataset)

ia-retweet-pol


100%|██████████| 61156/61156 [00:00<00:00, 1801059.23it/s]


The dataset has 61156 interactions, involving 18470 different nodes
The training dataset has 30070 interactions, involving 12678 different nodes
The validation dataset has 9173 interactions, involving 5479 different nodes
The test dataset has 9174 interactions, involving 5328 different nodes
The new node validation dataset has 4957 interactions, involving 4196 different nodes
The new node test dataset has 5073 interactions, involving 4153 different nodes
1847 nodes were used for the inductive testing, i.e. are never seen during training
Val
7290 9173 79.47
Test
6894 9174 75.15
Val + Test
14184 18347 77.31
ia-reality-call


100%|██████████| 52049/52049 [00:00<00:00, 2270153.68it/s]


The dataset has 52049 interactions, involving 6809 different nodes
The training dataset has 23625 interactions, involving 3838 different nodes
The validation dataset has 7807 interactions, involving 1715 different nodes
The test dataset has 7808 interactions, involving 1937 different nodes
The new node validation dataset has 4011 interactions, involving 1185 different nodes
The new node test dataset has 4611 interactions, involving 1531 different nodes
680 nodes were used for the inductive testing, i.e. are never seen during training
Val
1286 7807 16.47
Test
1270 7808 16.27
Val + Test
2556 15615 16.37
ia-movielens-user2tags-10m


100%|██████████| 95577/95577 [00:00<00:00, 2056042.76it/s]


The dataset has 95577 interactions, involving 16527 different nodes
The training dataset has 48189 interactions, involving 10566 different nodes
The validation dataset has 14336 interactions, involving 5209 different nodes
The test dataset has 14337 interactions, involving 5377 different nodes
The new node validation dataset has 6745 interactions, involving 3518 different nodes
The new node test dataset has 7521 interactions, involving 3885 different nodes
1652 nodes were used for the inductive testing, i.e. are never seen during training
Val
6977 14336 48.67
Test
6690 14337 46.66
Val + Test
13667 28673 47.67
ia-slashdot-reply-dir


100%|██████████| 140777/140777 [00:00<00:00, 2030829.11it/s]


The dataset has 140777 interactions, involving 51083 different nodes
The training dataset has 76599 interactions, involving 34496 different nodes
The validation dataset has 21116 interactions, involving 10542 different nodes
The test dataset has 21117 interactions, involving 10424 different nodes
The new node validation dataset has 15534 interactions, involving 9790 different nodes
The new node test dataset has 16568 interactions, involving 9911 different nodes
5108 nodes were used for the inductive testing, i.e. are never seen during training
Val
17584 21116 83.27
Test
17427 21117 82.53
Val + Test
35011 42233 82.9
ia-escorts-dynamic


100%|██████████| 50631/50631 [00:00<00:00, 2068895.77it/s]


The dataset has 50631 interactions, involving 10106 different nodes
The training dataset has 29100 interactions, involving 7154 different nodes
The validation dataset has 7596 interactions, involving 4118 different nodes
The test dataset has 7577 interactions, involving 4144 different nodes
The new node validation dataset has 3845 interactions, involving 2930 different nodes
The new node test dataset has 4829 interactions, involving 3346 different nodes
1010 nodes were used for the inductive testing, i.e. are never seen during training
Val
5817 7596 76.58
Test
5727 7577 75.58
Val + Test
11544 15173 76.08
ia-digg-reply


100%|██████████| 87626/87626 [00:00<00:00, 1889877.17it/s]

The dataset has 87626 interactions, involving 30398 different nodes
The training dataset has 47297 interactions, involving 21540 different nodes
The validation dataset has 13144 interactions, involving 9241 different nodes
The test dataset has 13144 interactions, involving 9511 different nodes
The new node validation dataset has 7995 interactions, involving 7321 different nodes
The new node test dataset has 8239 interactions, involving 7732 different nodes
3039 nodes were used for the inductive testing, i.e. are never seen during training
Val
12941 13144 98.46
Test
12950 13144 98.52
Val + Test
25891 26288 98.49





In [1]:
## Node surprise factor = share of val/test interactions in which at least one endpoint has not been seen before (in training or in an earlier val/test interaction)
def calc_surprise_factor(dataset_name, val_ratio=0.15, test_ratio=0.15):
  """Print the node-level surprise factor of the validation and test splits.

  An interaction is a "surprise" when at least one of its endpoints appeared
  neither in the training split nor in an earlier validation/test
  interaction. Both endpoints become known once the interaction is counted.

  For 'Val', 'Test' and 'Val + Test' the function prints the number of
  surprises, the number of interactions and the surprise percentage rounded
  to two decimals. A split without interactions is reported as 0.0 percent.

  NOTE(review): an edge-level function with the same name is defined in an
  earlier cell of this notebook; whichever cell ran last is the one bound to
  the name.

  Args:
    dataset_name: dataset identifier forwarded to get_link_prediction_data.
    val_ratio: validation ratio forwarded to get_link_prediction_data.
    test_ratio: test ratio forwarded to get_link_prediction_data.
  """
  # Only the train/val/test splits are used; the full unpacking is kept so an
  # unexpected number of return values still fails loudly.
  (
    _node_raw_features,
    _edge_raw_features,
    _full_data,
    train_data,
    val_data,
    test_data,
    _new_node_val_data,
    _new_node_test_data,
  ) = get_link_prediction_data(dataset_name=dataset_name, val_ratio=val_ratio, test_ratio=test_ratio)

  known_nodes = set()
  for src, dst in zip(train_data.src_node_ids, train_data.dst_node_ids):
    known_nodes.update((src, dst))

  def count_new_node_edges(split):
    # Returns (surprises, total) for one split and marks new nodes as known.
    surprises, total = 0, 0
    for src, dst in zip(split.src_node_ids, split.dst_node_ids):
      if src not in known_nodes or dst not in known_nodes:
        known_nodes.update((src, dst))
        surprises += 1
      total += 1
    return surprises, total

  def report(label, surprises, total):
    # An empty split has no defined percentage; print 0.0 rather than raising
    # ZeroDivisionError.
    percent = round(100 * surprises / total, 2) if total else 0.0
    print(label)
    print(surprises, total, percent)

  # Validation is scanned before test, so nodes first seen in validation are
  # already known when the test split is scanned.
  surp_val, tot_val = count_new_node_edges(val_data)
  report('Val', surp_val, tot_val)
  surp_test, tot_test = count_new_node_edges(test_data)
  report('Test', surp_test, tot_test)
  report('Val + Test', surp_val + surp_test, tot_val + tot_test)

In [3]:
# Print each dataset name followed by the statistics reported by
# calc_surprise_factor for that dataset.
for dataset in (
  "ia-retweet-pol",
  "ia-reality-call",
  "ia-movielens-user2tags-10m",
  "ia-slashdot-reply-dir",
  "ia-escorts-dynamic",
  "ia-digg-reply",
):
  print(dataset)
  calc_surprise_factor(dataset)

ia-retweet-pol


100%|██████████| 61156/61156 [00:00<00:00, 1869937.86it/s]

The dataset has 61156 interactions, involving 18470 different nodes
The training dataset has 30070 interactions, involving 12678 different nodes
The validation dataset has 9173 interactions, involving 5479 different nodes
The test dataset has 9174 interactions, involving 5328 different nodes
The new node validation dataset has 4957 interactions, involving 4196 different nodes
The new node test dataset has 5073 interactions, involving 4153 different nodes
1847 nodes were used for the inductive testing, i.e. are never seen during training
Val
2509 9173 27.35
Test
1883 9174 20.53
Val + Test
4392 18347 23.94
ia-reality-call



100%|██████████| 52049/52049 [00:00<00:00, 2274387.19it/s]


The dataset has 52049 interactions, involving 6809 different nodes
The training dataset has 23625 interactions, involving 3838 different nodes
The validation dataset has 7807 interactions, involving 1715 different nodes
The test dataset has 7808 interactions, involving 1937 different nodes
The new node validation dataset has 4011 interactions, involving 1185 different nodes
The new node test dataset has 4611 interactions, involving 1531 different nodes
680 nodes were used for the inductive testing, i.e. are never seen during training
Val
1068 7807 13.68
Test
1176 7808 15.06
Val + Test
2244 15615 14.37
ia-movielens-user2tags-10m


100%|██████████| 95577/95577 [00:00<00:00, 2210270.62it/s]


The dataset has 95577 interactions, involving 16527 different nodes
The training dataset has 48189 interactions, involving 10566 different nodes
The validation dataset has 14336 interactions, involving 5209 different nodes
The test dataset has 14337 interactions, involving 5377 different nodes
The new node validation dataset has 6745 interactions, involving 3518 different nodes
The new node test dataset has 7521 interactions, involving 3885 different nodes
1652 nodes were used for the inductive testing, i.e. are never seen during training
Val
2579 14336 17.99
Test
2368 14337 16.52
Val + Test
4947 28673 17.25
ia-slashdot-reply-dir


100%|██████████| 140777/140777 [00:00<00:00, 2059761.51it/s]


The dataset has 140777 interactions, involving 51083 different nodes
The training dataset has 76599 interactions, involving 34496 different nodes
The validation dataset has 21116 interactions, involving 10542 different nodes
The test dataset has 21117 interactions, involving 10424 different nodes
The new node validation dataset has 15534 interactions, involving 9790 different nodes
The new node test dataset has 16568 interactions, involving 9911 different nodes
5108 nodes were used for the inductive testing, i.e. are never seen during training
Val
7562 21116 35.81
Test
6652 21117 31.5
Val + Test
14214 42233 33.66
ia-escorts-dynamic


100%|██████████| 50631/50631 [00:00<00:00, 2087996.83it/s]


The dataset has 50631 interactions, involving 10106 different nodes
The training dataset has 29100 interactions, involving 7154 different nodes
The validation dataset has 7596 interactions, involving 4118 different nodes
The test dataset has 7577 interactions, involving 4144 different nodes
The new node validation dataset has 3845 interactions, involving 2930 different nodes
The new node test dataset has 4829 interactions, involving 3346 different nodes
1010 nodes were used for the inductive testing, i.e. are never seen during training
Val
1524 7596 20.06
Test
1262 7577 16.66
Val + Test
2786 15173 18.36
ia-digg-reply


100%|██████████| 87626/87626 [00:00<00:00, 2051853.68it/s]

The dataset has 87626 interactions, involving 30398 different nodes
The training dataset has 47297 interactions, involving 21540 different nodes
The validation dataset has 13144 interactions, involving 9241 different nodes
The test dataset has 13144 interactions, involving 9511 different nodes
The new node validation dataset has 7995 interactions, involving 7321 different nodes
The new node test dataset has 8239 interactions, involving 7732 different nodes
3039 nodes were used for the inductive testing, i.e. are never seen during training
Val
4040 13144 30.74
Test
3303 13144 25.13
Val + Test
7343 26288 27.93





In [7]:
### surprises
## Weighted node surprise factor = sum over val/test interactions of 1 / (1 + number of earlier appearances of the less frequently seen endpoint)
def calc_surprise_factor_weird(dataset_name, val_ratio=0.15, test_ratio=0.15):
  """Print a frequency-weighted node surprise score for validation and test.

  Every node carries a counter that starts at 1 and grows by 1 each time the
  node appears as an endpoint (training interactions first, then validation
  and test interactions as they are scanned). An interaction contributes
  max(1/count[src], 1/count[dst]), evaluated before its own endpoints are
  counted: 1 when an endpoint has never been seen, less the more often both
  endpoints were seen.

  For 'Val', 'Test' and 'Val + Test' the function prints the summed score,
  the number of interactions and 100 * score / interactions rounded to two
  decimals. A split without interactions is reported as 0.0 percent.

  Args:
    dataset_name: dataset identifier forwarded to get_link_prediction_data.
    val_ratio: validation ratio forwarded to get_link_prediction_data.
    test_ratio: test ratio forwarded to get_link_prediction_data.
  """
  # Only the train/val/test splits are used; the full unpacking is kept so an
  # unexpected number of return values still fails loudly.
  (
    _node_raw_features,
    _edge_raw_features,
    _full_data,
    train_data,
    val_data,
    test_data,
    _new_node_val_data,
    _new_node_test_data,
  ) = get_link_prediction_data(dataset_name=dataset_name, val_ratio=val_ratio, test_ratio=test_ratio)

  # Counters start at 1 so that a never-seen node contributes exactly 1/1.
  known_nodes = defaultdict(lambda: 1)
  for src, dst in zip(train_data.src_node_ids, train_data.dst_node_ids):
    known_nodes[src] += 1
    known_nodes[dst] += 1

  def score_split(split, running_score):
    # Returns (split score, interaction count, updated running score). The
    # running score is accumulated term by term so the combined total is
    # summed in the same order as the interactions are scanned.
    split_score, total = 0, 0
    for src, dst in zip(split.src_node_ids, split.dst_node_ids):
      surprise = max(1/known_nodes[src], 1/known_nodes[dst])
      split_score += surprise
      running_score += surprise
      known_nodes[src] += 1
      known_nodes[dst] += 1
      total += 1
    return split_score, total, running_score

  def report(label, score, total):
    # An empty split has no defined percentage; print 0.0 rather than raising
    # ZeroDivisionError.
    percent = round(100 * score / total, 2) if total else 0.0
    print(label)
    print(score, total, percent)

  surp_val, tot_val, surp_all = score_split(val_data, 0)
  report('Val', surp_val, tot_val)
  surp_test, tot_test, surp_all = score_split(test_data, surp_all)
  report('Test', surp_test, tot_test)
  report('Val + Test', surp_all, tot_val + tot_test)

In [8]:
# Print each dataset name followed by the statistics reported by
# calc_surprise_factor_weird for that dataset.
for dataset in (
  "ia-retweet-pol",
  "ia-reality-call",
  "ia-movielens-user2tags-10m",
  "ia-slashdot-reply-dir",
  "ia-escorts-dynamic",
  "ia-digg-reply",
):
  print(dataset)
  calc_surprise_factor_weird(dataset)

ia-retweet-pol


100%|██████████| 61156/61156 [00:00<00:00, 2153673.79it/s]

The dataset has 61156 interactions, involving 18470 different nodes
The training dataset has 30070 interactions, involving 12678 different nodes
The validation dataset has 9173 interactions, involving 5479 different nodes
The test dataset has 9174 interactions, involving 5328 different nodes
The new node validation dataset has 4957 interactions, involving 4196 different nodes
The new node test dataset has 5073 interactions, involving 4153 different nodes
1847 nodes were used for the inductive testing, i.e. are never seen during training
Val
3714.88091121497 9173 40.5
Test
3000.662525356272 9174 32.71
Val + Test
6715.543436571139 18347 36.6
ia-reality-call



100%|██████████| 52049/52049 [00:00<00:00, 2273321.42it/s]


The dataset has 52049 interactions, involving 6809 different nodes
The training dataset has 23625 interactions, involving 3838 different nodes
The validation dataset has 7807 interactions, involving 1715 different nodes
The test dataset has 7808 interactions, involving 1937 different nodes
The new node validation dataset has 4011 interactions, involving 1185 different nodes
The new node test dataset has 4611 interactions, involving 1531 different nodes
680 nodes were used for the inductive testing, i.e. are never seen during training
Val
1796.2238918964222 7807 23.01
Test
1857.866896450933 7808 23.79
Val + Test
3654.0907883473997 15615 23.4
ia-movielens-user2tags-10m


100%|██████████| 95577/95577 [00:00<00:00, 2155274.99it/s]


The dataset has 95577 interactions, involving 16527 different nodes
The training dataset has 48189 interactions, involving 10566 different nodes
The validation dataset has 14336 interactions, involving 5209 different nodes
The test dataset has 14337 interactions, involving 5377 different nodes
The new node validation dataset has 6745 interactions, involving 3518 different nodes
The new node test dataset has 7521 interactions, involving 3885 different nodes
1652 nodes were used for the inductive testing, i.e. are never seen during training
Val
4148.147302151732 14336 28.94
Test
3875.265995530015 14337 27.03
Val + Test
8023.4132976814735 28673 27.98
ia-slashdot-reply-dir


100%|██████████| 140777/140777 [00:00<00:00, 2039076.62it/s]


The dataset has 140777 interactions, involving 51083 different nodes
The training dataset has 76599 interactions, involving 34496 different nodes
The validation dataset has 21116 interactions, involving 10542 different nodes
The test dataset has 21117 interactions, involving 10424 different nodes
The new node validation dataset has 15534 interactions, involving 9790 different nodes
The new node test dataset has 16568 interactions, involving 9911 different nodes
5108 nodes were used for the inductive testing, i.e. are never seen during training
Val
10343.7467207607 21116 48.99
Test
9439.20044393669 21117 44.7
Val + Test
19782.94716469744 42233 46.84
ia-escorts-dynamic


100%|██████████| 50631/50631 [00:00<00:00, 1918666.14it/s]


The dataset has 50631 interactions, involving 10106 different nodes
The training dataset has 29100 interactions, involving 7154 different nodes
The validation dataset has 7596 interactions, involving 4118 different nodes
The test dataset has 7577 interactions, involving 4144 different nodes
The new node validation dataset has 3845 interactions, involving 2930 different nodes
The new node test dataset has 4829 interactions, involving 3346 different nodes
1010 nodes were used for the inductive testing, i.e. are never seen during training
Val
2789.257301540414 7596 36.72
Test
2535.0940927336865 7577 33.46
Val + Test
5324.351394274045 15173 35.09
ia-digg-reply


100%|██████████| 87626/87626 [00:00<00:00, 1899939.94it/s]

The dataset has 87626 interactions, involving 30398 different nodes
The training dataset has 47297 interactions, involving 21540 different nodes
The validation dataset has 13144 interactions, involving 9241 different nodes
The test dataset has 13144 interactions, involving 9511 different nodes
The new node validation dataset has 7995 interactions, involving 7321 different nodes
The new node test dataset has 8239 interactions, involving 7732 different nodes
3039 nodes were used for the inductive testing, i.e. are never seen during training
Val
6316.393472657191 13144 48.06
Test
5551.457537697504 13144 42.24
Val + Test
11867.851010355827 26288 45.15



