# Lab 02 : Loading graph datasets from DGL, PyG and OGB - demo

### Xavier Bresson   


In [1]:
# For Google Colaboratory  
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/GML_May23_codes/codes/08_Datasets'
    print(path_to_file)
    # change current path to the folder containing "path_to_file"
    os.chdir(path_to_file)
    !pwd
    !pip install dgl # Install DGL
    !pip install torch_geometric # Install PyG
    !pip install ogb # Install OGB

## 1. DGL datasets

https://docs.dgl.ai/api/python/dgl.data.html 

In [2]:
from dgl.data import CoraGraphDataset

# Load the Cora dataset; its first (index 0) element is the graph.
dataset = CoraGraphDataset()
g = dataset[0]
print(g)
num_class = dataset.num_classes

# Size of the node feature tensor stored under 'feat'.
print(g.ndata['feat'].shape)

# Node masks for the train / validation / test split, fetched in one pass.
train_mask, val_mask, test_mask = (
    g.ndata[mask_key] for mask_key in ('train_mask', 'val_mask', 'test_mask')
)

# Size of the node label tensor stored under 'label'.
print(g.ndata['label'].shape)


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})
torch.Size([2708, 1433])
torch.Size([2708])


In [3]:
from dgl.data import QM9Dataset

# Load QM9, keeping only the 'mu' and 'gap' labels, with a cutoff of 5.0.
data = QM9Dataset(label_keys=['mu', 'gap'], cutoff=5.0)
print(data.num_tasks)

# Each sample is a (graph, label) pair; inspect the first one.
g, label = data[0]
print(g)
print(label.shape)

# 'R' holds the coordinates of each atom, 'Z' the atomic number of each atom.
print(
    g.ndata['R'].shape,
    g.ndata['Z'].shape,
    sep='\n',
)
      

2
Graph(num_nodes=5, num_edges=20,
      ndata_schemes={'R': Scheme(shape=(3,), dtype=torch.float32), 'Z': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
torch.Size([2])
torch.Size([5, 3])
torch.Size([5])


## 2. PyG datasets

https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html 

In [4]:
from torch_geometric.datasets import RelLinkPredDataset

# Load the 'FB15k-237' relational link-prediction dataset, rooted at the
# current directory, and take its first (index 0) data object.
dataset = RelLinkPredDataset(root='./', name='FB15k-237')
data = dataset[0]

# Summary statistics, one per line.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Number of node features: {data.num_node_features}',
    f'Number of edge features: {data.num_edge_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of relations: {dataset.num_relations// 2}',
    sep='\n',
)

# Full description of the data object.
print(data)


Downloading https://raw.githubusercontent.com/MichSchli/RelationPrediction/master/data/FB-Toutanova/entities.dict
Downloading https://raw.githubusercontent.com/MichSchli/RelationPrediction/master/data/FB-Toutanova/relations.dict
Downloading https://raw.githubusercontent.com/MichSchli/RelationPrediction/master/data/FB-Toutanova/test.txt
Downloading https://raw.githubusercontent.com/MichSchli/RelationPrediction/master/data/FB-Toutanova/train.txt
Downloading https://raw.githubusercontent.com/MichSchli/RelationPrediction/master/data/FB-Toutanova/valid.txt
Processing...


Number of nodes: 14541
Number of edges: 544230
Number of node features: 0
Number of edge features: 0
Number of classes: 0
Number of relations: 237
Data(edge_index=[2, 544230], num_nodes=14541, edge_type=[544230], train_edge_index=[2, 272115], train_edge_type=[272115], valid_edge_index=[2, 17535], valid_edge_type=[17535], test_edge_index=[2, 20466], test_edge_type=[20466])


Done!


In [5]:
from torch_geometric.datasets import WikiCS

# Load the WikiCS dataset, rooted at the current directory, and take its
# first (index 0) data object.
dataset = WikiCS(root='./')
data = dataset[0]

# Summary statistics, one per line.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Number of node features: {data.num_node_features}',
    f'Number of edge features: {data.num_edge_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

# Full description of the data object.
print(data)


Downloading https://github.com/pmernyei/wiki-cs-dataset/raw/master/dataset/data.json
Processing...


Number of nodes: 11701
Number of edges: 431726
Number of node features: 300
Number of edge features: 0
Number of classes: 10
Data(x=[11701, 300], edge_index=[2, 431726], y=[11701], train_mask=[11701, 20], val_mask=[11701, 20], test_mask=[11701], stopping_mask=[11701, 20])


Done!


## 3. OGB datasets

https://ogb.stanford.edu 


In [1]:
# DGL Loader
from ogb.graphproppred import DglGraphPropPredDataset, collate_dgl
from torch.utils.data import DataLoader
dataset = DglGraphPropPredDataset(name = 'ogbg-molhiv')
print(dataset)
split_idx = dataset.get_idx_split()
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True, collate_fn=collate_dgl)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False, collate_fn=collate_dgl)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False, collate_fn=collate_dgl)

# Pytorch Geometric Loader
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.data import DataLoader
dataset = PygGraphPropPredDataset(name = 'ogbg-molhiv') 
print(dataset)
split_idx = dataset.get_idx_split() 
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False)


Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip


Downloaded 0.00 GB: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.19it/s]


Extracting dataset/hiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41127/41127 [00:00<00:00, 44696.55it/s]


Converting graphs into DGL objects...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41127/41127 [00:08<00:00, 5063.57it/s]


Saving...
DglGraphPropPredDataset(41127)
Loading necessary files...
This might take a while.


Processing...


Processing graphs...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41127/41127 [00:00<00:00, 89816.36it/s]


Converting graphs into PyG objects...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41127/41127 [00:01<00:00, 28148.24it/s]


Saving...


Done!


PygGraphPropPredDataset(41127)


In [2]:
# DGL Loader
from ogb.linkproppred import DglLinkPropPredDataset
dataset = DglLinkPropPredDataset(name ='ogbl-biokg')
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # dgl graph object containing only training edges
print(graph)
#print(graph.ndata)
#print(graph.edata)
#print(graph.ndata['feat'].size())

# Pytorch Geometric Loader
from ogb.linkproppred import PygLinkPropPredDataset
dataset = PygLinkPropPredDataset(name = 'ogbl-biokg') 
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # pyg graph object containing only training edges
print(dataset)


Downloading http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip


Downloaded 0.90 GB: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 920/920 [01:57<00:00,  7.85it/s]


Extracting dataset/biokg.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3998.38it/s]


Converting graphs into DGL objects...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.99s/it]


Saving...
Graph(num_nodes={'disease': 10687, 'drug': 10533, 'function': 45085, 'protein': 17499, 'sideeffect': 9969},
      num_edges={('disease', 'disease-protein', 'protein'): 73547, ('drug', 'drug-disease', 'disease'): 5147, ('drug', 'drug-drug_acquired_metabolic_disease', 'drug'): 63430, ('drug', 'drug-drug_bacterial_infectious_disease', 'drug'): 18554, ('drug', 'drug-drug_benign_neoplasm', 'drug'): 30348, ('drug', 'drug-drug_cancer', 'drug'): 48514, ('drug', 'drug-drug_cardiovascular_system_disease', 'drug'): 94842, ('drug', 'drug-drug_chromosomal_disease', 'drug'): 316, ('drug', 'drug-drug_cognitive_disorder', 'drug'): 34660, ('drug', 'drug-drug_cryptorchidism', 'drug'): 128, ('drug', 'drug-drug_developmental_disorder_of_mental_health', 'drug'): 14314, ('drug', 'drug-drug_endocrine_system_disease', 'drug'): 55994, ('drug', 'drug-drug_fungal_infectious_disease', 'drug'): 36114, ('drug', 'drug-drug_gastrointestinal_system_disease', 'drug'): 83210, ('drug', 'drug-drug_hematopoietic_

Processing...


Processing graphs...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1825.20it/s]


Converting graphs into PyG objects...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1300.16it/s]

Saving...



Done!


PygLinkPropPredDataset()
