In [None]:
import scipy as sp
from common import construct_adjacency_matrix
from lpca import decomposition_at_k
from tqdm import tqdm
import pandas as pd

In [None]:
def decompose_all_at_k(data, k, save_dir=None, start_idx=0):
    """Run ``decomposition_at_k`` on every graph in ``data`` and collect per-graph stats.

    Parameters
    ----------
    data : sized, integer-indexable collection of graphs
        Each item is passed to ``construct_adjacency_matrix`` and must expose
        ``.x.shape[0]``, which is recorded as the node count.
    k : int
        Forwarded unchanged to ``decomposition_at_k`` (presumably the
        factorization rank -- confirm against the ``lpca`` module).
    save_dir : str or None, optional
        When not ``None``, each graph gets the file path
        ``<save_dir>/idx_<start_idx + i>.mat``, which is handed to
        ``decomposition_at_k``. The directory is created if it is missing.
        When ``None``, ``None`` is passed as the path instead.
    start_idx : int, optional
        Offset added to each graph's position within ``data``. Use it when
        ``data`` is a slice of a larger dataset so that both the saved file
        names and the reported ``graph_id`` refer to the position in the
        full dataset.

    Returns
    -------
    list of dict
        One record per graph with keys ``graph_id``, ``n_nodes``, ``nit``,
        ``error`` and ``time``; the last three are returned by
        ``decomposition_at_k``.
    """
    import os  # local import so this cell does not depend on the import cell changing

    # Create the output directory up front so a missing folder is not
    # discovered only once a write is attempted. Skipped for None / "".
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    results = []
    for i in tqdm(range(len(data))):
        graph = data[i]  # index once; reused for adjacency and node count
        # Same id for the record and the file name, so parquet rows can be
        # matched to their .mat files even when start_idx != 0.
        graph_id = start_idx + i

        A = sp.sparse.csr_matrix(construct_adjacency_matrix(graph))
        # os.path.join tolerates save_dir with or without a trailing separator.
        file_path = (
            os.path.join(save_dir, f'idx_{graph_id}.mat')
            if save_dir is not None
            else None
        )

        t, error, nit, _ = decomposition_at_k(A, k, file_path)
        results.append(
            {
                "graph_id": graph_id,
                "n_nodes": graph.x.shape[0],
                "nit": nit,
                "error": error,
                "time": t
            }
        )

    return results

# Peptides

In [11]:
from torch_geometric.datasets import LRGBDataset

# LRGB bundles several benchmarks, so the one to load is selected via `name`.
# For now, only Peptides-func (train split) is used.
data_peptides = LRGBDataset(root='data', name='Peptides-func', split='train')

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


In [14]:
# Peptides-func at k=8, passing a save directory for the per-graph .mat files.
# Long-running: the recorded run took about 6h10m.
r = decompose_all_at_k(
    data_peptides,
    k=8,
    save_dir='lpca_out/Peptides/k8/',
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_8_save.parquet')

100%|██████████| 10873/10873 [6:09:57<00:00,  2.04s/it]  


In [5]:
# Peptides-func at k=8 without a save directory (stats only).
# Long-running: the recorded run took about 3h13m.
r = decompose_all_at_k(
    data_peptides,
    k=8,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_8.parquet')

100%|██████████| 10873/10873 [3:12:56<00:00,  1.06s/it]  


In [5]:
# Peptides-func at k=16 (stats only).
# Long-running: the recorded run took about 3h20m.
r = decompose_all_at_k(
    data_peptides,
    k=16,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_16.parquet')

100%|██████████| 10873/10873 [3:19:34<00:00,  1.10s/it]  


In [6]:
# Peptides-func at k=32 (stats only).
# Long-running: the recorded run took about 8h16m.
r = decompose_all_at_k(
    data_peptides,
    k=32,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_32.parquet')

100%|██████████| 10873/10873 [8:16:16<00:00,  2.74s/it]  


In [None]:
# Peptides-func at k=64 (stats only).
# NOTE(review): the saved output for this cell stops at 47% (5077/10873)
# after roughly 6h, so the parquet file below may not have been written.
r = decompose_all_at_k(
    data_peptides,
    k=64,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_64.parquet')

 47%|████▋     | 5077/10873 [6:04:45<5:00:15,  3.11s/it] 

In [None]:
# Peptides-func at k=5 (stats only).
r = decompose_all_at_k(
    data_peptides,
    k=5,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_5.parquet')

In [None]:
# Peptides-func at k=4 (stats only).
r = decompose_all_at_k(
    data_peptides,
    k=4,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_4.parquet')

In [None]:
# Peptides-func at k=3 (stats only).
r = decompose_all_at_k(
    data_peptides,
    k=3,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_3.parquet')

In [None]:
# Peptides-func at k=2 (stats only).
r = decompose_all_at_k(
    data_peptides,
    k=2,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_2.parquet')

In [None]:
# Peptides-func at k=1 (stats only).
r = decompose_all_at_k(
    data_peptides,
    k=1,
)
pd.DataFrame(data=r).to_parquet('results/peptides_LPCA_k_1.parquet')

# ZINC

In [3]:
from torch_geometric.datasets import ZINC

# `subset=True` selects the small version of the dataset, and `split`
# chooses between the train/validation/test sets. The train split is used
# for this analysis because, per the original note, it is the largest.
data_zinc = ZINC(root='data', subset=True, split='train')

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


In [4]:
# Display the first graph of the loaded ZINC split to inspect its fields.
data_zinc[0]

Data(x=[29, 1], edge_index=[2, 64], edge_attr=[64], y=[1])

In [9]:
# Number of graphs in the loaded ZINC split.
# NOTE(review): the saved output of this cell (1000) disagrees with the
# 10000-iteration progress bars of the ZINC runs below; presumably it was
# produced with a different split or kernel session -- confirm.
len(data_zinc)

1000

In [5]:
# ZINC at k=8 (stats only). The recorded run took about 7 minutes.
r = decompose_all_at_k(
    data_zinc,
    k=8,
)
pd.DataFrame(data=r).to_parquet('results/zinc_LPCA_k_8.parquet')

100%|██████████| 10000/10000 [07:11<00:00, 23.20it/s]


In [10]:
# ZINC at k=8, passing a save directory for the per-graph .mat files.
# The recorded run took about 12 minutes.
r = decompose_all_at_k(
    data_zinc,
    k=8,
    save_dir='lpca_out/ZINC/k8/',
)
pd.DataFrame(data=r).to_parquet('results/zinc_LPCA_k_8_save.parquet')

100%|██████████| 10000/10000 [11:45<00:00, 14.18it/s]


In [5]:
# ZINC at k=4 (stats only).
# Long-running: the recorded run took about 2h38m.
r = decompose_all_at_k(
    data_zinc,
    k=4,
)
pd.DataFrame(data=r).to_parquet('results/zinc_LPCA_k_4.parquet')

100%|██████████| 10000/10000 [2:37:58<00:00,  1.05it/s] 


In [5]:
# ZINC at k=2 (stats only).
# Long-running: the recorded run took about 3h49m.
r = decompose_all_at_k(
    data_zinc,
    k=2,
)
pd.DataFrame(data=r).to_parquet('results/zinc_LPCA_k_2.parquet')

100%|██████████| 10000/10000 [3:48:43<00:00,  1.37s/it] 


# CIFAR10

In [1]:
from torch_geometric.datasets import GNNBenchmarkDataset

# Of the benchmarks in this dataset collection, only CIFAR10 is needed here.
data_cifar = GNNBenchmarkDataset(root='data', name='CIFAR10', split='train')

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


In [2]:
# Display the first graph of the loaded CIFAR10 split to inspect its fields.
data_cifar[0]

Data(x=[110, 3], edge_index=[2, 880], edge_attr=[880], y=[1], pos=[110, 2])

In [14]:
# Value forwarded as `k` to decompose_all_at_k and embedded in the output
# paths by the CIFAR10 run cell that follows.
k = 16

In [16]:
# CIFAR10, first 5000 graphs only, at the `k` set above; a save directory is
# passed for the per-graph .mat files. The recorded run took about 29 minutes.
r = decompose_all_at_k(
    data_cifar[:5000],
    k=k,
    save_dir=f'lpca_out/CIFAR10/k{k}/',
)
pd.DataFrame(data=r).to_parquet(f'results/CIFAR10_LPCA_k_{k}_save.parquet')

100%|██████████| 5000/5000 [28:40<00:00,  2.91it/s]
