In [1]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:90% !important; }</style>"))

### Data used in Geometric Package

In [3]:
import copy

import os
import sys
import torch
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from utils import filter_out_isolate_normalize_feature, draw_cluster_info, draw_trainer_info, print_data_info, print_edge_index_info


In [4]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def print_data_info(data, dataset=None):
    """Print a short textual summary of one data instance.

    NOTE: this definition shadows the ``print_data_info`` imported from ``utils``.

    Args:
        data: object exposing ``num_nodes``, ``num_edges``, ``num_node_features``,
            ``num_edge_features`` and ``keys`` (either an attribute or a method).
        dataset: optional object whose ``num_classes`` is reported. When omitted,
            the module-level ``dataset`` variable is used if one exists (the
            original behaviour); when neither is available the class-count line
            is left out instead of raising ``NameError``.
    """
    if dataset is None:
        # Legacy call style ``print_data_info(data)``: look up the notebook global.
        dataset = globals().get('dataset')

    # ``keys`` is read as an attribute in the original code; call it when the
    # object exposes it as a method instead, so the key list is still printed.
    keys = data.keys
    if callable(keys):
        keys = keys()

    print('Info (attributes) of a single data instance')
    parts = [
        data,
        '\n number of nodes: ', data.num_nodes,
        '\n number of edges: ', data.num_edges,
        '\n number of features per ndoe: ', data.num_node_features,
        '\n number of edge features: ', data.num_edge_features,
    ]
    if dataset is not None:
        parts += ['\n number of classifying labels of dataset: ', dataset.num_classes]
    parts += ['\n all the attributes of data: ', keys]
    print(*parts)

In [5]:
# Root directory under which every dataset in this notebook is stored.
# Set the GEOMETRIC_DATA_ROOT environment variable to relocate it; otherwise the
# original machine-specific location is used.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations in the cells below rely on.
local_data_root = os.path.join(
    os.environ.get('GEOMETRIC_DATA_ROOT') or '/home/xiangli/projects/tmpdata/GCN/Geometric/',
    '',
)

## Datasets belonging to the Planetoid series

### Cora dataset

In [7]:
from torch_geometric.datasets import Planetoid

# Load Cora from the Planetoid collection and summarise its first graph.
data_name = 'Cora'
dataset = Planetoid(
    root=f'{local_data_root}Planetoid/Cora',
    name=data_name,
)
data = dataset[0]
print_data_info(data)

# Pull out the tensors inspected below: connectivity, features and labels.
print('\n This is multi-class task')
edge_index = data.edge_index
features = data.x
label = data.y
print(label.shape, type(label), label[:5])
print(edge_index.shape)

# Only the helper's printed report matters here; its three return values are discarded.
print('\n isolated nodes in the graph:')
_, _, _ = filter_out_isolate_normalize_feature(edge_index, features, label)

print_edge_index_info(edge_index)

Info (attributes) of a single data instance
Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708]) 
 number of nodes:  2708 
 number of edges:  10556 
 number of features per ndoe:  1433 
 number of edge features:  0 
 number of classifying labels of dataset:  7 
 all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']

 This is multi-class task
torch.Size([2708]) <class 'torch.Tensor'> tensor([3, 4, 4, 0, 3])
torch.Size([2, 10556])

 isolated nodes in the graph:
No isolated nodes number is found 
Label shape is: torch.Size([2708])
edge index shape is :  torch.Size([2, 10556])
number of self-loops:  0
number of unique edges:  5278
Length of the odds:  0


### Citeseer Dataset

In [None]:
from torch_geometric.datasets import Planetoid

# Load CiteSeer from the Planetoid collection and summarise its first graph.
data_name = 'CiteSeer'
dataset = Planetoid(
    root=f'{local_data_root}Planetoid/CiteSeer',
    name=data_name,
)
data = dataset[0]
print_data_info(data)

### PubMed Dataset

In [None]:
from torch_geometric.datasets import Planetoid

# Load PubMed from the Planetoid collection and summarise its first graph.
data_name = 'PubMed'
dataset = Planetoid(
    root=f'{local_data_root}Planetoid/PubMed',
    name=data_name,
)
data = dataset[0]
print_data_info(data)

### Stanford Education Dataset

In [None]:
from torch_geometric.datasets import SNAPDataset

# Author's notes on the SNAP dataset names and their archive files:
#   'ego-facebook'     -> facebook.tar.gz
#   'ego-gplus'        -> gplus.tar.gz                     (data format failure)
#   'ego-twitter'      -> twitter.tar.gz                   (too large for processing)
#   'soc-epinions1'    -> soc-Epinions1.txt.gz
#   'soc-livejournal1' -> soc-LiveJournal1.txt.gz
#   'soc-pokec'        -> soc-pokec-relationships.txt.gz
#   'soc-slashdot0811' -> soc-Slashdot0811.txt.gz
#   'soc-slashdot0922' -> soc-Slashdot0902.txt.gz
#   'wiki-vote'        -> wiki-Vote.txt.gz

# Load 'ego-facebook', report how many data instances it holds, then summarise the first.
data_name = 'ego-facebook'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# SNAP 'ego-twitter': instance count plus a summary of the first instance.
# Relies on SNAPDataset having been imported by an earlier cell.
data_name = 'ego-twitter'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# SNAP 'wiki-vote': instance count plus a summary of the first instance.
# Relies on SNAPDataset having been imported by an earlier cell.
data_name = 'wiki-vote'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# SNAP 'soc-pokec': instance count plus a summary of the first instance.
# Relies on SNAPDataset having been imported by an earlier cell.
data_name = 'soc-pokec'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# SNAP 'soc-livejournal1': instance count plus a summary of the first instance.
# Relies on SNAPDataset having been imported by an earlier cell.
data_name = 'soc-livejournal1'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# SNAP 'soc-slashdot0811': instance count plus a summary of the first instance.
# Relies on SNAPDataset having been imported by an earlier cell.
data_name = 'soc-slashdot0811'
dataset = SNAPDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)


### TUD dataset

In [None]:
from torch_geometric.datasets import TUDataset

# TUDataset 'FIRSTMM_DB': instance count plus a summary of the first instance.
data_name = 'FIRSTMM_DB'
dataset = TUDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)


In [None]:
from torch_geometric.datasets import TUDataset

# TUDataset 'REDDIT-MULTI-12K': instance count plus a summary of the first instance.
data_name = 'REDDIT-MULTI-12K'
dataset = TUDataset(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)


## Datasets belonging to the CitationFull series

In [None]:
from torch_geometric.datasets import CitationFull

# Names noted by the author as accepted by CitationFull:
#   'cora', 'cora_ml', 'citeseer', 'dblp', 'pubmed'
# This cell loads 'cora_ml', prints the instance count and summarises the first instance.
data_name = 'cora_ml'
dataset = CitationFull(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# CitationFull 'citeseer': instance count plus a summary of the first instance.
# Relies on CitationFull having been imported by an earlier cell.
data_name = 'citeseer'
dataset = CitationFull(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# CitationFull 'dblp': instance count plus a summary of the first instance.
# Relies on CitationFull having been imported by an earlier cell.
data_name = 'dblp'
dataset = CitationFull(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# CitationFull 'pubmed': instance count plus a summary of the first instance.
# Relies on CitationFull having been imported by an earlier cell.
data_name = 'pubmed'
dataset = CitationFull(
    root=f'{local_data_root}{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### CoraFull dataset (alias for CitationFull with name "Cora")

In [None]:
from torch_geometric.datasets import CoraFull

# Load CoraFull, print the instance count and summarise the first instance.
data_name = 'CoraFull'
# The root is now derived from data_name. The previous literal 'CoralFull' was a
# misspelling that also left data_name unused. Data already stored under the old
# 'CoralFull' folder is not picked up from the new location.
dataset = CoraFull(root = local_data_root + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

### Coauthor dataset

In [None]:
from torch_geometric.datasets import Coauthor

# Coauthor 'cs': instance count plus a summary of the first instance.
data_name = 'cs'
dataset = Coauthor(
    root=f'{local_data_root}Coauthor/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### Amazon dataset

In [None]:
from torch_geometric.datasets import Amazon

# Set data_name explicitly. The root used to be built from whatever value an
# earlier cell had left in data_name, so the 'computers' data was stored under
# an unrelated folder name.
data_name = 'computers'    # the other Amazon dataset, loaded in the next cell, is 'photo'
dataset = Amazon(root = local_data_root + 'Amazon/' + data_name, name=data_name)
# print('number of data: ', len(dataset))
data = dataset[0]
# print_data_info(data)

In [None]:
# Amazon 'photo': instance count plus a summary of the first instance.
# Relies on Amazon having been imported by an earlier cell.
data_name = 'photo'
dataset = Amazon(
    root=f'{local_data_root}Amazon/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### PPI dataset

In [None]:
from torch_geometric.datasets import PPI

# PPI with the loader's default split: summarise every instance in turn.
data_name = 'PPI'
dataset = PPI(root=f'{local_data_root}PPI/{data_name}')
print(f'number of data:  {len(dataset)}')
for i in range(len(dataset)):
    print(f'Infor for the data #[{i}]')
    data = dataset[i]
    print_data_info(data)

In [None]:
from torch_geometric.datasets import PPI

# PPI, split="test": summarise every instance in turn.
data_name = 'PPI'
dataset = PPI(
    root=f'{local_data_root}PPI/{data_name}',
    split="test",
)
print(f'number of data:  {len(dataset)}')
for i in range(len(dataset)):
    print(f'Infor for the data #[{i}]')
    data = dataset[i]
    print_data_info(data)

In [None]:
from torch_geometric.datasets import PPI

# PPI, split="val": instance count plus a summary of the first instance only.
data_name = 'PPI'
dataset = PPI(
    root=f'{local_data_root}PPI/{data_name}',
    split="val",
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### Reddit dataset

In [None]:
from torch_geometric.datasets import Reddit

# Load Reddit, print the instance count and summarise the first instance.
data_name = 'Reddit'
dataset = Reddit(root = local_data_root + '/' + data_name)
print('number of data: ', len(dataset))
data = dataset[0]
print_data_info(data)

# Tensors inspected below: connectivity, features and labels.
edge_index, features, label = data.edge_index, data.x, data.y
print(label.shape, type(label), label[:5])
print(edge_index.shape)

print('\n isolated nodes in the graph:')
# `filter_out_isolate` is not imported or defined anywhere in this notebook and
# raised NameError. The helper imported from utils, and used the same way in the
# Cora cell, is `filter_out_isolate_normalize_feature`.
_, _, _ = filter_out_isolate_normalize_feature(edge_index, features, label)

Info (attributes) of a single data instance, from HPC Pitzer:

    Data(edge_index=[2, 114615892], test_mask=[232965], train_mask=[232965], val_mask=[232965], x=[232965, 602], y=[232965]) 
     number of nodes:  232965 
     number of edges:  114615892 
     number of features per ndoe:  602 
     number of edge features:  0 
     number of classifying labels of dataset:  41 
     all the attributes of data:  ['x', 'edge_index', 'y', 'train_mask', 'val_mask', 'test_mask']
    Start running for train batch num: 75

    ====================================================================================================
    Start to generate the clustering machine:
    No isolated nodes number is found 
    Batch machine creation costs a total of 740.3321 seconds!

    Edge number:  114615892 
    Node number:  232965 
    Feature number:  602

### QM7b dataset

In [None]:
from torch_geometric.datasets import QM7b

# QM7b: instance count plus a summary of the first instance.
data_name = 'QM7b'
dataset = QM7b(root=f'{local_data_root}/{data_name}')
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### QM9 dataset

In [None]:
from torch_geometric.datasets import QM9

# QM9: instance count plus a summary of the first instance.
data_name = 'QM9'
dataset = QM9(root=f'{local_data_root}/{data_name}')
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### Entities

In [None]:
from torch_geometric.datasets import Entities

# Entities 'AIFB': instance count plus a summary of the first instance.
data_name = 'AIFB'
dataset = Entities(
    root=f'{local_data_root}Entities/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

#### Large data scale :  Entities  AM

In [None]:
# Entities 'MUTAG': instance count plus a summary of the first instance.
# Relies on Entities having been imported by an earlier cell.
data_name = 'MUTAG'
dataset = Entities(
    root=f'{local_data_root}Entities/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

##### No node features but with labels

In [None]:
# Entities 'BGS': instance count plus a summary of the first instance.
# Relies on Entities having been imported by an earlier cell.
data_name = 'BGS'
dataset = Entities(
    root=f'{local_data_root}Entities/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### GEDDataset

In [None]:
from torch_geometric.datasets import GEDDataset

# GEDDataset 'LINUX': instance count, then an inline summary of the first instance.
data_name = 'LINUX'
dataset = GEDDataset(
    root=f'{local_data_root}GEDDataset/{data_name}',
    name=data_name,
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]

# Same report layout as print_data_info, minus the class-count line.
print('Info (attributes) of a single data instance')
print(
    data,
    '\n number of nodes: ', data.num_nodes,
    '\n number of edges: ', data.num_edges,
    '\n number of features per ndoe: ', data.num_node_features,
    '\n number of edge features: ', data.num_edge_features,
    '\n all the attributes of data: ', data.keys,
)

### MNISTSuperpixels

In [None]:
from torch_geometric.datasets import MNISTSuperpixels

# MNISTSuperpixels: instance count plus a summary of the first instance.
data_name = 'MNISTSuperpixels'
dataset = MNISTSuperpixels(root=f'{local_data_root}/{data_name}')
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### ShapeNet

In [None]:
from torch_geometric.datasets import ShapeNet

# ShapeNet: instance count plus a summary of the first instance.
data_name = 'ShapeNet'
dataset = ShapeNet(root=f'{local_data_root}/{data_name}')
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

### PCPNetDataset

In [None]:
from torch_geometric.datasets import PCPNetDataset

# PCPNetDataset, category 'Noisy': instance count, then an inline summary of the first instance.
data_name = 'Noisy'
dataset = PCPNetDataset(
    root=f'{local_data_root}PCPNetDataset/{data_name}',
    category='Noisy',
)
print(f'number of data:  {len(dataset)}')
data = dataset[0]

# Same report layout as print_data_info, minus the class-count line.
print('Info (attributes) of a single data instance')
print(
    data,
    '\n number of nodes: ', data.num_nodes,
    '\n number of edges: ', data.num_edges,
    '\n number of features per ndoe: ', data.num_node_features,
    '\n number of edge features: ', data.num_edge_features,
    '\n all the attributes of data: ', data.keys,
)

### S3DIS

Every sample in this dataset contains 4096 nodes, and the dataset holds 20291 samples in total.

The samples of this dataset can be used directly as mini-batches.

In [None]:
from torch_geometric.datasets import S3DIS

# S3DIS: instance count plus a summary of the first instance.
data_name = 'S3DIS'
dataset = S3DIS(root=f'{local_data_root}/{data_name}')
print(f'number of data:  {len(dataset)}')
data = dataset[0]
print_data_info(data)

In [None]:
# Collect (index, node count) for every instance of the currently loaded
# `dataset` that has more than 5000 nodes.
select = list(
    (graph_id, graph.num_nodes)
    for graph_id, graph in enumerate(dataset)
    if graph.num_nodes > 5000
)
print(select)
    

In [None]:
# free GPU memory
# WARNING: sends SIGKILL to every GPU compute process reported by nvidia-smi whose
# process name contains 'python'. That can include this notebook's own kernel.
# The PIDs come from the machine-readable --query-compute-apps output rather than
# from a fixed column of the human-readable table, whose layout is not stable.
# `xargs -r` skips the kill when nothing matches.
!(nvidia-smi --query-compute-apps=pid,process_name --format=csv,noheader | grep 'python' | cut -d, -f1 | xargs -r -n1 kill -9)