In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import copy

import os
import sys
import torch
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from utils import filter_out_isolate, draw_cluster_info, draw_isolate_cluster_info, draw_trainer_info, print_data_info


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def print_data_info(data):
    
    print('Info (attributes) of a single data instance')
    print(data, '\n number of nodes: ', data.num_nodes, '\n number of edges: ', data.num_edges, \
      '\n number of features per ndoe: ', data.num_node_features, '\n number of edge features: ', data.num_edge_features, \
      '\n number of classifying labels of dataset: ', dataset.num_classes, \
      '\n all the attributes of data: ', data.keys)

In [4]:
local_data_root = '/media/xiangli/storage1/projects/tmpdata/'

### Cora dataset

In [5]:
from torch_geometric.datasets import Planetoid
data_name = 'Cora'
dataset = Planetoid(root = local_data_root + 'Planetoid/Cora', name=data_name)
data = dataset[0]
print_data_info(data)

Info (attributes) of a single data instance
Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708]) 
 number of nodes:  2708 
 number of edges:  10556 
 number of features per ndoe:  1433 
 number of edge features:  0 
 number of classifying labels of dataset:  7 
 all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']


### Citeseer Dataset

In [5]:
from torch_geometric.datasets import Planetoid
data_name = 'CiteSeer'
dataset = Planetoid(root = local_data_root + 'Planetoid/CiteSeer', name=data_name)
data = dataset[0]
print_data_info(data)

Info (attributes) of a single data instance
Data(edge_index=[2, 9104], test_mask=[3327], train_mask=[3327], val_mask=[3327], x=[3327, 3703], y=[3327]) 
 number of nodes:  3327 
 number of edges:  9104 
 number of features per ndoe:  3703 
 number of edge features:  0 
 number of classifying labels of dataset:  6 
 all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']


### PubMed Dataset

In [6]:
from torch_geometric.datasets import Planetoid
data_name = 'PubMed'
dataset = Planetoid(root = local_data_root + 'Planetoid/PubMed', name=data_name)
data = dataset[0]
print_data_info(data)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!
Info (attributes) of a single data instance
Data(edge_index=[2, 88648], test_mask=[19717], train_mask=[19717], val_mask=[19717], x=[19717, 500], y=[19717]) 
 number of nodes:  19717 
 number of edges:  88648 
 number of features per ndoe:  500 
 number of edge features:  0 
 number of classifying labels of dataset:  3

### CoraFull dataset

In [12]:
from torch_geometric.datasets import CoraFull
data_name = 'CoraFull'
dataset = CoraFull(root = local_data_root + 'CoralFull')
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 126842], x=[19793, 8710], y=[19793]) 
 number of nodes:  19793 
 number of edges:  126842 
 number of features per ndoe:  8710 
 number of edge features:  0 
 number of classifying labels of dataset:  70 
 all the attributes of data:  ['x', 'edge_index', 'y']


### Coauthor dataset

In [20]:
from torch_geometric.datasets import Coauthor
data_name = 'cs'
dataset = Coauthor(root = local_data_root + 'Coauthor/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://github.com/shchur/gnn-benchmark/raw/master/data/npz/ms_academic_cs.npz
Processing...
Done!
number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 163788], x=[18333, 6805], y=[18333]) 
 number of nodes:  18333 
 number of edges:  163788 
 number of features per ndoe:  6805 
 number of edge features:  0 
 number of classifying labels of dataset:  15 
 all the attributes of data:  ['x', 'edge_index', 'y']


### Amazon dataset

In [21]:
from torch_geometric.datasets import Amazon
data_name = 'computers'    # can also be 'computers'
dataset = Amazon(root = local_data_root + 'Amazon/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 491722], x=[13752, 767], y=[13752]) 
 number of nodes:  13752 
 number of edges:  491722 
 number of features per ndoe:  767 
 number of edge features:  0 
 number of classifying labels of dataset:  10 
 all the attributes of data:  ['x', 'edge_index', 'y']


In [23]:
from torch_geometric.datasets import PPI
data_name = 'PPI'    # can also be 'computers'
dataset = PPI(root = local_data_root + 'PPI/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/ppi.zip
Extracting /media/xiangli/storage/projects/tmpdata/PPI/PPI/ppi.zip
Processing...
Done!
number of data:  20
Info (attributes) of a single data instance
Data(edge_index=[2, 32318], x=[1767, 50], y=[1767, 121]) 
 number of nodes:  1767 
 number of edges:  32318 
 number of features per ndoe:  50 
 number of edge features:  0 
 number of classifying labels of dataset:  121 
 all the attributes of data:  ['x', 'edge_index', 'y']


### Reddit dataset

In [None]:
from torch_geometric.datasets import Reddit
data_name = 'Reddit'    # can also be 'computers'
dataset = Reddit(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip
Extracting /media/xiangli/storage1/projects/tmpdata/Reddit/raw/reddit.zip
Processing...


### QM7b dataset

In [25]:
from torch_geometric.datasets import QM7b
data_name = 'QM7b'    # can also be 'computers'
dataset = QM7b(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7b.mat
Processing...
Done!
number of data:  7211
Info (attributes) of a single data instance
Data(edge_attr=[25], edge_index=[2, 25], y=[1, 14]) 
 number of nodes:  5 
 number of edges:  25 
 number of features per ndoe:  0 
 number of edge features:  1 
 number of classifying labels of dataset:  14 
 all the attributes of data:  ['edge_index', 'edge_attr', 'y']


### QM9 dataset

In [26]:
from torch_geometric.datasets import QM9
data_name = 'QM9'    # can also be 'computers'
dataset = QM9(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading http://www.roemisch-drei.de/qm9.tar.gz
Extracting /media/xiangli/storage/projects/tmpdata/QM9/raw/qm9.tar.gz
Processing...
Done!
number of data:  133246
Info (attributes) of a single data instance
Data(edge_attr=[8, 4], edge_index=[2, 8], pos=[5, 3], x=[5, 13], y=[1, 12]) 
 number of nodes:  5 
 number of edges:  8 
 number of features per ndoe:  13 
 number of edge features:  4 
 number of classifying labels of dataset:  12 
 all the attributes of data:  ['x', 'edge_index', 'edge_attr', 'y', 'pos']


### Entities

In [28]:
from torch_geometric.datasets import Entities
data_name = 'AIFB'    # can also be 'computers'
dataset = Entities(root = local_data_root + 'Entities/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/aifb.tgz
Extracting /media/xiangli/storage/projects/tmpdata/Entities/AIFB/aifb.tgz
Processing...
Done!
number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 58086], edge_norm=[58086], edge_type=[58086], test_idx=[36], test_y=[36], train_idx=[140], train_y=[140]) 
 number of nodes:  8285 
 number of edges:  58086 
 number of features per ndoe:  0 
 number of edge features:  0 
 number of classifying labels of dataset:  4 
 all the attributes of data:  ['edge_index', 'edge_type', 'edge_norm', 'train_idx', 'train_y', 'test_idx', 'test_y']


#### Large data scale :  Entities  AM

In [29]:
data_name = 'AM'    # can also be 'computers'
dataset = Entities(root = local_data_root + 'Entities/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/am.tgz
Extracting /media/xiangli/storage/projects/tmpdata/Entities/AM/am.tgz
Processing...


RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 25486294176 bytes. Error code 12 (Cannot allocate memory)


In [30]:
data_name = 'MUTAG'    # can also be 'computers'
dataset = Entities(root = local_data_root + 'Entities/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/mutag.tgz
Extracting /media/xiangli/storage/projects/tmpdata/Entities/MUTAG/mutag.tgz
Processing...
Done!
number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 148454], edge_norm=[148454], edge_type=[148454], test_idx=[68], test_y=[68], train_idx=[272], train_y=[272]) 
 number of nodes:  23644 
 number of edges:  148454 
 number of features per ndoe:  0 
 number of edge features:  0 
 number of classifying labels of dataset:  2 
 all the attributes of data:  ['edge_index', 'edge_type', 'edge_norm', 'train_idx', 'train_y', 'test_idx', 'test_y']


In [31]:
data_name = 'BGS'    # can also be 'computers'
dataset = Entities(root = local_data_root + 'Entities/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/bgs.tgz
Extracting /media/xiangli/storage/projects/tmpdata/Entities/BGS/bgs.tgz
Processing...


http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+^PYSD does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+^PYSD does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^PRS does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^PRS does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^PRS does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^RSR does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^PRS does not look like a valid URI, trying to serialize this will break.
http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/+#^RSR does not look like a valid URI, tryin

Done!
number of data:  1
Info (attributes) of a single data instance
Data(edge_index=[2, 1832398], edge_norm=[1832398], edge_type=[1832398], test_idx=[29], test_y=[29], train_idx=[117], train_y=[117]) 
 number of nodes:  333845 
 number of edges:  1832398 
 number of features per ndoe:  0 
 number of edge features:  0 
 number of classifying labels of dataset:  2 
 all the attributes of data:  ['edge_index', 'edge_type', 'edge_norm', 'train_idx', 'train_y', 'test_idx', 'test_y']


### GEDDataset

In [33]:
from torch_geometric.datasets import GEDDataset
data_name = 'LINUX'    # can also be 'computers'
dataset = GEDDataset(root = local_data_root + 'GEDDataset/' + data_name, name=data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print('Info (attributes) of a single data instance')
print(data, '\n number of nodes: ', data.num_nodes, '\n number of edges: ', data.num_edges, \
  '\n number of features per ndoe: ', data.num_node_features, '\n number of edge features: ', data.num_edge_features, \
  '\n all the attributes of data: ', data.keys)

number of data:  800
Info (attributes) of a single data instance
Data(edge_index=[2, 18], i=[1]) 
 number of nodes:  8 
 number of edges:  18 
 number of features per ndoe:  0 
 number of edge features:  0 
 all the attributes of data:  ['edge_index', 'i']


### MNISTSuperpixels

In [41]:
from torch_geometric.datasets import MNISTSuperpixels
data_name = 'MNISTSuperpixels'    # can also be 'computers'
dataset = MNISTSuperpixels(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz
Extracting /media/xiangli/storage/projects/tmpdata/MNISTSuperpixels/raw/mnist_superpixels.tar.gz
Processing...
Done!
number of data:  60000
Info (attributes) of a single data instance
Data(edge_index=[2, 1399], pos=[75, 2], x=[75, 1], y=[1]) 
 number of nodes:  75 
 number of edges:  1399 
 number of features per ndoe:  1 
 number of edge features:  0 
 number of classifying labels of dataset:  10 
 all the attributes of data:  ['x', 'edge_index', 'y', 'pos']


### ShapeNet

In [43]:
from torch_geometric.datasets import ShapeNet
data_name = 'ShapeNet'    # can also be 'computers'
dataset = ShapeNet(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/train_data.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/train_data.zip
Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/train_label.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/train_label.zip
Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/val_data.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/val_data.zip
Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/val_label.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/val_label.zip
Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/test_data.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/test_data.zip
Downloading https://shapenet.cs.stanford.edu/iccv17/partseg/test_label.zip
Extracting /media/xiangli/storage/projects/tmpdata/ShapeNet/raw/test_label.zip
Processing...
Done!
number of data:  14007
Info (attributes) of a single data inst

### PCPNetDataset

In [45]:
from torch_geometric.datasets import PCPNetDataset
data_name = 'Noisy'    # can also be 'computers'
dataset = PCPNetDataset(root = local_data_root + 'PCPNetDataset/' + data_name, category = 'Noisy')
print('number of data: ', len(dataset))
data = dataset[0]
print('Info (attributes) of a single data instance')
print(data, '\n number of nodes: ', data.num_nodes, '\n number of edges: ', data.num_edges, \
  '\n number of features per ndoe: ', data.num_node_features, '\n number of edge features: ', data.num_edge_features, \
  '\n all the attributes of data: ', data.keys)

number of data:  32
Info (attributes) of a single data instance
Data(pos=[100000, 3], test_idx=[5000], x=[100000, 5]) 
 number of nodes:  100000 
 number of edges:  None 
 number of features per ndoe:  5 
 number of edge features:  0 
 all the attributes of data:  ['x', 'pos', 'test_idx']


### S3DIS

All the data set here contains 4096 nodes, with a total number of 20291 data

This dataset can be used as the mini-batch directly

In [34]:
from torch_geometric.datasets import S3DIS
data_name = 'S3DIS'    # can also be 'computers'
dataset = S3DIS(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

Downloading https://shapenet.cs.stanford.edu/media/indoor3d_sem_seg_hdf5_data.zip
Extracting /media/xiangli/storage/projects/tmpdata/S3DIS/indoor3d_sem_seg_hdf5_data.zip
Processing...
Done!
number of data:  20291
Info (attributes) of a single data instance
Data(pos=[4096, 3], x=[4096, 6], y=[4096]) 
 number of nodes:  4096 
 number of edges:  None 
 number of features per ndoe:  6 
 number of edge features:  0 
 number of classifying labels of dataset:  13 
 all the attributes of data:  ['x', 'y', 'pos']


In [40]:
select = [(idx, data.num_nodes) for idx, data in enumerate(dataset) if data.num_nodes > 5000]
print(select)
    

[]
