# Graph example datasets

This notebook explores some standard graph datasets, e.g. those provided by PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html

In [1]:
# System imports
import os
import sys

In [2]:
# External imports
import torch
import torch_geometric

In [3]:
# Local imports
sys.path.append('..')
from data import get_data_loaders

In [4]:
# Dataset storage area: substitute the $SCRATCH environment variable into the path.
# NOTE(review): os.path.expandvars leaves '$SCRATCH' unexpanded when the variable
# is not set — confirm it is defined in the kernel's environment.
data_path = os.path.expandvars('$SCRATCH/gnn-protmd/datasets')
data_path

'/global/cscratch1/sd/sfarrell/gnn-protmd/datasets'

## TUD Datasets
https://chrsmrrs.github.io/datasets/docs/datasets/

In [5]:
# IPython help lookup: show the TUDataset constructor signature and docstring
torch_geometric.datasets.TUDataset?

[0;31mInit signature:[0m
[0mtorch_geometric[0m[0;34m.[0m[0mdatasets[0m[0;34m.[0m[0mTUDataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mroot[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransform[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpre_transform[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpre_filter[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_node_attr[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_edge_attr[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
A variety of graph kernel benchmark datasets, *.e.g.* "IMDB-BINARY",
"REDDIT-BINARY" or "PROTEINS", collected from the `TU Dortmund University
<http://graphkernels.cs.tu-dortmund.de>`_.

.. note::
    Some datasets may not come with any node labels.
    You ca

In [6]:
def get_tudata(data_path, name):
    """Construct the TUDataset called `name`.

    The dataset root is the subdirectory `name` inside `data_path`, so each
    dataset gets its own folder. Node and edge attributes are both enabled.

    Args:
        data_path: base directory holding all datasets.
        name: TU dataset name, e.g. 'PROTEINS'.

    Returns:
        The `torch_geometric.datasets.TUDataset` instance.
    """
    dataset_root = os.path.join(data_path, name)
    # Request the attributes in addition to whatever the dataset provides by default
    attr_options = dict(use_node_attr=True, use_edge_attr=True)
    return torch_geometric.datasets.TUDataset(
        root=dataset_root, name=name, **attr_options)

In [7]:
def summarize_dataset(dataset):
    """Print a short overview of a dataset.

    Prints the number of samples, the first sample, and the values of the
    dataset's `num_node_attributes`, `num_node_labels`,
    `num_edge_attributes` and `num_edge_labels` attributes, one per line.

    Args:
        dataset: object supporting `len()`, integer indexing and the four
            `num_*` attributes listed above.
    """
    print('Number of samples:', len(dataset))
    print('First example contents:', dataset[0])
    # The remaining lines share one shape: a label and the attribute holding the count.
    # Attributes are read lazily inside the loop so output order matches access order.
    count_fields = (
        ('Node features:', 'num_node_attributes'),
        ('Node labels:', 'num_node_labels'),
        ('Edge features:', 'num_edge_attributes'),
        ('Edge labels:', 'num_edge_labels'),
    )
    for label, attr_name in count_fields:
        print(label, getattr(dataset, attr_name))

In [8]:
# Load the PROTEINS dataset via get_tudata and print its summary
protein_dataset = get_tudata(data_path, 'PROTEINS')
summarize_dataset(protein_dataset)

Number of samples: 1113
First example contents: Data(edge_index=[2, 162], x=[42, 4], y=[1])
Node features: 1
Node labels: 3
Edge features: 0
Edge labels: 0


In [9]:
# Inspect the `x` tensor of sample 1000
# (presumably per-node attributes and one-hot labels — confirm against the TUDataset docs)
protein_dataset[1000].x

tensor([[6., 0., 1., 0.],
        [6., 0., 1., 0.],
        [6., 0., 1., 0.],
        [4., 0., 1., 0.],
        [3., 0., 1., 0.]])

In [10]:
# Load the BZR_MD dataset via get_tudata and print its summary
bzrmd_dataset = get_tudata(data_path, 'BZR_MD')
summarize_dataset(bzrmd_dataset)

Number of samples: 306
First example contents: Data(edge_attr=[342, 6], edge_index=[2, 342], x=[19, 8], y=[1])
Node features: 0
Node labels: 8
Edge features: 1
Edge labels: 5


In [11]:
# Inspect the `edge_attr` tensor of the first sample
# (presumably per-edge attributes and one-hot labels — confirm against the TUDataset docs)
bzrmd_dataset[0].edge_attr

tensor([[1.3866, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [2.4155, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000],
        [2.8045, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000],
        ...,
        [2.7932, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000],
        [2.4173, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000],
        [1.3960, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000]])

In [12]:
# Load the Fingerprint dataset via get_tudata and print its summary
fingerprint_dataset = get_tudata(data_path, 'Fingerprint')
summarize_dataset(fingerprint_dataset)

Number of samples: 2800
First example contents: Data(edge_attr=[2, 2], edge_index=[2, 2], x=[2, 2], y=[1])
Node features: 2
Node labels: 0
Edge features: 2
Edge labels: 0


## ModelNet

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.ModelNet

In [22]:
# Load ModelNet from its own subdirectory of data_path; only the root is passed,
# so every other constructor option keeps its default
modelnet_dataset = torch_geometric.datasets.ModelNet(os.path.join(data_path, 'ModelNet'))

In [33]:
# Show the dataset's repr
modelnet_dataset

ModelNet10(3991)

In [34]:
# Show the first sample
modelnet_dataset[0]

Data(face=[3, 5594], pos=[6550, 3], y=[1])

In [29]:
# Number of edge features reported by the dataset
modelnet_dataset.num_edge_features

0

## MNISTSuperpixels

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.MNISTSuperpixels

In [35]:
# Load MNISTSuperpixels from its own subdirectory of data_path; only the root is
# passed, so every other constructor option keeps its default
mnist_dataset = torch_geometric.datasets.MNISTSuperpixels(
    os.path.join(data_path, 'MNISTSuperpixels'))

Downloading http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz
Extracting /global/cscratch1/sd/sfarrell/gnn-protmd/datasets/MNISTSuperpixels/raw/mnist_superpixels.tar.gz
Processing...
Done!


In [36]:
# Show the dataset's repr
mnist_dataset

MNISTSuperpixels(60000)

In [37]:
# Show the first sample
mnist_dataset[0]

Data(edge_index=[2, 1399], pos=[75, 2], x=[75, 1], y=[1])

## Testing the data pipeline

In [13]:
# Keyword arguments that are forwarded to get_data_loaders
data_config = {
    'name': 'tud',
    'tud_name': 'BZR_MD',
    'n_train': 256,
    'n_valid': 128,
    'batch_size': 1,
    'download_path': '$SCRATCH/gnn-protmd/datasets',
}

In [14]:
# Build the training and validation loaders, passing data_config as keyword arguments
train_loader, valid_loader = get_data_loaders(**data_config)

In [16]:
# Draw one batch from a fresh iterator over the training loader
next(iter(train_loader))

Batch(batch=[19], edge_attr=[342, 6], edge_index=[2, 342], x=[19, 8], y=[1])

In [17]:
# Draw a batch from another fresh iterator over the training loader
next(iter(train_loader))

Batch(batch=[22], edge_attr=[462, 6], edge_index=[2, 462], x=[22, 8], y=[1])