In [6]:
import sys
import os

# Make modules that live one directory above the current working directory importable.
sys.path.append(os.path.abspath(os.pardir))

In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from common import construct_adjacency_matrix
from tSVD import calculate_rr, error_func_frobenius

In [10]:
def find_all_rr(data, error_func=error_func_frobenius):
    """Run ``calculate_rr`` on every graph in ``data`` and collect the results.

    Args:
        data: Sized, integer-indexable collection of graphs. Each item must
            expose ``.x`` with a ``shape`` and be accepted by
            ``construct_adjacency_matrix``.
        error_func: Error function forwarded unchanged to ``calculate_rr``.

    Returns:
        list[dict]: One record per graph, in dataset order, with the keys
        ``graph_id`` (index into ``data``), ``n_nodes`` (``x.shape[0]``) and
        ``rr``, ``errors``, ``time`` (the values returned by ``calculate_rr``).
    """
    results = []
    for i in tqdm(range(len(data))):
        # Fetch the graph once and reuse it; indexing the dataset twice per
        # iteration repeats whatever work the dataset does to build an item.
        graph = data[i]
        A = construct_adjacency_matrix(graph)
        t, rr, errors = calculate_rr(A, error_func)
        results.append(
            {
                "graph_id": i,
                "n_nodes": graph.x.shape[0],
                "rr": rr,
                "errors": errors,
                "time": t,
            }
        )
    return results

# ZINC

In [9]:
from torch_geometric.datasets import ZINC

# subset=True selects the small version of the dataset.
# The split parameter chooses between the test/train/validation sets;
# for the SVD analysis it's probably best to just use train, as it's the largest.
data_zinc = ZINC(subset=True, root='data', split='train')

Downloading https://www.dropbox.com/s/feo9qle74kg48gy/molecules.zip?dl=1
Extracting data/molecules.zip
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/train.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/val.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/test.index
Processing...
Processing train dataset: 100%|██████████| 10000/10000 [00:01<00:00, 8034.61it/s]
Processing val dataset: 100%|██████████| 1000/1000 [00:00<00:00, 1713.04it/s]
Processing test dataset: 100%|██████████| 1000/1000 [00:00<00:00, 2599.68it/s]
Done!
  return torch.load(f, map_location)


In [12]:
# Show the first graph of the ZINC training split.
data_zinc[0]

Data(x=[29, 1], edge_index=[2, 64], edge_attr=[64], y=[1])

In [23]:
# Per-graph size, taken as the first dimension of each graph's x tensor.
n_nodes = [graph.x.shape[0] for graph in data_zinc]

In [28]:
# Shape of x for the first graph; its first dimension is what this notebook uses as the node count.
data_zinc[0].x.shape

torch.Size([29, 1])

In [33]:
# Show the edge_index tensor of the first ZINC graph.
data_zinc[0].edge_index

tensor([[ 0,  1,  1,  2,  2,  2,  3,  3,  4,  4,  5,  5,  5,  6,  6,  7,  7,  8,
          8,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 14, 14, 15, 15, 15,
         16, 16, 16, 16, 17, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 23, 23, 24,
         24, 25, 25, 26, 26, 27, 27, 27, 28, 28],
        [ 1,  0,  2,  1,  3, 28,  2,  4,  3,  5,  4,  6, 27,  5,  7,  6,  8,  7,
          9, 10,  8,  8, 11, 27, 10, 12, 11, 13, 26, 12, 14, 13, 15, 14, 16, 25,
         15, 17, 18, 19, 16, 16, 16, 20, 24, 19, 21, 20, 22, 23, 21, 21, 24, 19,
         23, 15, 26, 12, 25,  5, 10, 28,  2, 27]])

In [25]:
# Maximum of the per-graph node counts collected in n_nodes.
np.max(n_nodes)

np.int64(37)

In [5]:
# Compute the results for every ZINC training graph (the recorded run took roughly 3 minutes).
zinc_results = find_all_rr(data_zinc)

100%|██████████| 10000/10000 [03:16<00:00, 50.80it/s]


In [6]:
# Persist the per-graph results. The target directory is created first because
# to_parquet does not create missing parent directories and would raise instead.
os.makedirs('results/ZINC', exist_ok=True)
pd.DataFrame(zinc_results).to_parquet('results/ZINC/updated.parquet')

# Peptides

In [13]:
from torch_geometric.datasets import LRGBDataset

# Here we have the name parameter because LRGB has multiple benchmarks.
# For now let's focus on Peptides-func.
data_peptides = LRGBDataset(name='Peptides-func', root='data', split='train')

# Disabled visualisation snippet. NOTE(review): it references `nx` and `d`,
# neither of which is defined in the visible cells of this notebook.
# G = nx.Graph()
# # edges for the first graph in the dataset (index 0)
# edges = zip(*d[0].edge_index.tolist())
# G.add_edges_from(edges)
# nx.draw(G)

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


In [16]:
# Show the first graph of the Peptides-func training split.
data_peptides[0]

Data(x=[338, 9], edge_index=[2, 682], edge_attr=[682, 3], y=[1, 10])

In [2]:
# Number of graphs in the Peptides-func training split.
len(data_peptides)

10873

In [16]:
# Per-graph size, taken as the first dimension of each graph's x tensor.
n_nodes = [graph.x.shape[0] for graph in data_peptides]

In [21]:
# Median of the per-graph node counts collected in n_nodes.
np.median(n_nodes)

np.float64(137.0)

In [10]:
# Compute the results for every Peptides-func training graph (the recorded run took over 10 hours).
peptides_results = find_all_rr(data_peptides)

100%|██████████| 10873/10873 [10:14:25<00:00,  3.39s/it]   


In [11]:
# Persist the per-graph results. The target directory is created first because
# to_parquet does not create missing parent directories and would raise instead.
os.makedirs('results', exist_ok=True)
pd.DataFrame(peptides_results).to_parquet('results/peptides_updated.parquet')

# CIFAR

In [3]:
from torch_geometric.datasets import GNNBenchmarkDataset

# We specifically want the CIFAR10 benchmark from this dataset collection.
data_cifar = GNNBenchmarkDataset(name='CIFAR10', root='data', split='train')

# Disabled visualisation snippet. NOTE(review): it references `nx` and `d`,
# neither of which is defined in the visible cells of this notebook.
# G = nx.Graph()
# # edges for the first graph in the dataset (index 0)
# edges = zip(*d[0].edge_index.tolist())
# G.add_edges_from(edges)

In [10]:
# Show the first graph of the CIFAR10 training split.
data_cifar[0]

Data(x=[110, 3], edge_index=[2, 880], edge_attr=[880], y=[1], pos=[110, 2])

In [4]:
# Per-graph size, taken as the first dimension of each graph's x tensor.
n_nodes = [graph.x.shape[0] for graph in data_cifar]

In [10]:
import numpy as np

In [14]:
# Standard deviation of the per-graph node counts collected in n_nodes.
np.array(n_nodes).std()

np.float64(4.28053064894333)

In [5]:
# Compute the results for every CIFAR10 training graph (the recorded run took over 7 hours).
cifar_results = find_all_rr(data_cifar)

100%|██████████| 45000/45000 [7:14:10<00:00,  1.73it/s]   


In [6]:
# Persist the per-graph results. The target directory is created first because
# to_parquet does not create missing parent directories and would raise instead.
os.makedirs('results', exist_ok=True)
pd.DataFrame(cifar_results).to_parquet('results/cifar_updated.parquet')

In [7]:
# Number of result records produced for CIFAR10.
len(cifar_results)

45000

In [8]:
# Read the saved CIFAR10 results back from disk and display them.
pd.read_parquet('results/cifar_updated.parquet')

Unnamed: 0,graph_id,n_nodes,rr,errors,time
0,0,110,68,"[1.0, 1.0, 0.9925861538059417, 0.9717603520509...",0.943575
1,1,117,70,"[1.0, 1.0, 0.994643775085452, 0.97785304683095...",0.800947
2,2,114,59,"[1.0, 1.0, 1.0, 0.9795048874617234, 0.97332852...",0.286410
3,3,114,63,"[1.0, 0.9994516040168878, 0.9994516040168878, ...",1.007908
4,4,118,62,"[1.0, 1.0, 1.0, 0.9845200155491758, 0.96055686...",1.034828
...,...,...,...,...,...
44995,44995,117,75,"[1.0, 1.0, 0.996789718842316, 0.96686558519220...",0.601059
44996,44996,119,64,"[1.0, 1.0, 0.9936774919374964, 0.9750026933813...",0.359792
44997,44997,117,63,"[1.0, 1.0, 0.9924931914339438, 0.9635449151019...",0.803766
44998,44998,123,68,"[1.0, 1.0, 1.0, 0.9830887114773246, 0.96482864...",0.433006
