# Make Your Own Dataset

In [1]:
import urllib.request
import pandas as pd

import os
os.environ['DGLBACKEND'] = 'pytorch'

import dgl
import torch
from dgl.data import DGLDataset

## `DGLDataset` Object Overview

A custom graph dataset should inherit (继承) from the `dgl.data.DGLDataset` class and implement the following methods:
 - `__getitem__(self, i)`: retrieve the `i`-th example of the dataset
 - `__len__(self)`: the number of examples in the dataset
 - `process(self)`: load and process raw data from disk

## Creating a Dataset for Node Classification or Link Prediction from CSV

 - **SKIP**

## Creating a Dataset for Graph Classification from CSV

The `__getitem__` method returns both the graph and its graph-level label.

In [2]:
# urlretrieve() writes straight to the given filename and does not create
# missing folders, so make sure 'data/' exists before downloading into it.
os.makedirs('data', exist_ok=True)
urllib.request.urlretrieve('https://data.dgl.ai/tutorial/dataset/graph_edges.csv',
                           'data/graph_edges.csv')

('data/graph_edges.csv', <http.client.HTTPMessage at 0x7f9ffa301820>)

In [3]:
# Download the per-graph properties table into the local data folder.
urllib.request.urlretrieve(
    url='https://data.dgl.ai/tutorial/dataset/graph_properties.csv',
    filename='data/graph_properties.csv',
)

('data/graph_properties.csv', <http.client.HTTPMessage at 0x7f9ffa301be0>)

In [4]:
# Read the two downloaded tables into DataFrames (edge table first, then the
# per-graph properties table).
edges, properties = (
    pd.read_csv('data/graph_edges.csv'),
    pd.read_csv('data/graph_properties.csv'),
)

In [5]:
# Preview the edge table: each row is one edge (src -> dst) tagged with the
# graph_id of the graph it belongs to.
edges.head(2)

Unnamed: 0,graph_id,src,dst
0,0,0,1
1,0,0,14


In [6]:
# Group the edge rows by graph_id so the edges of a single graph can be
# looked up by that graph's ID.
edges_group = edges.groupby('graph_id')
edges_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9fb8836790>

In [7]:
# Mapping from each graph_id to the row labels of its edges in `edges`.
edges_group.groups

{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44], 1: [45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74], 2: [75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113], 3: [114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152], 4: [153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203], 5: [204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 

In [8]:
# All edge rows that belong to the graph whose graph_id is 0.
edges_group.get_group(0)

Unnamed: 0,graph_id,src,dst
0,0,0,1
1,0,0,14
2,0,1,0
3,0,1,2
4,0,2,1
5,0,2,3
6,0,3,2
7,0,3,4
8,0,4,3
9,0,4,5


In [9]:
# Preview the properties table: one row per graph with its label and node count.
properties.head(2)

Unnamed: 0,graph_id,label,num_nodes
0,0,0,15
1,1,0,10


In [10]:
# iterrows() returns a generator; no rows are produced until it is iterated.
properties.iterrows()

<generator object DataFrame.iterrows at 0x7f9fb8849660>

In [11]:
# Look at the first (index, row) pair produced by iterrows(): the index label
# and the row as a Series. The index gets a real name because it is printed;
# `_` conventionally marks an ignored value and is also IPython's
# "last output" variable, which should not be rebound in a notebook.
for row_index, row in properties.iterrows():
    print(row_index)
    print(row)
    break

0
graph_id      0
label         0
num_nodes    15
Name: 0, dtype: int64


In [12]:
class SyntheticDataset(DGLDataset):
    """Graph-classification dataset assembled from two CSV tables.

    The edge table must provide ``graph_id``, ``src`` and ``dst`` columns; the
    properties table must provide ``graph_id``, ``label`` and ``num_nodes``
    columns. One DGL graph is built for every ``graph_id`` that occurs in the
    edge table; a graph listed only in the properties table (i.e. with no
    edge rows) is not created.

    Parameters
    ----------
    edges_path : str
        CSV file holding the edges of all graphs.
    properties_path : str
        CSV file holding the label and node count of each graph.

    Attributes
    ----------
    graphs : list
        The constructed graphs, in the order the edge groups are iterated.
    labels : torch.LongTensor
        Graph-level labels, aligned with ``graphs``.
    """

    def __init__(self, edges_path='data/graph_edges.csv',
                 properties_path='data/graph_properties.csv'):
        # Constructing the dataset already runs process() through the base
        # class, so the paths have to be stored before calling super().
        self._edges_path = edges_path
        self._properties_path = properties_path
        super().__init__(name='synthetic')

    def process(self):
        """Read both CSV files and populate ``self.graphs`` / ``self.labels``.

        Raises
        ------
        KeyError
            If a graph ID in the edge table is missing from the properties table.
        """
        edges = pd.read_csv(self._edges_path)
        properties = pd.read_csv(self._properties_path)

        # Lookup tables keyed by graph ID. They are built column by column
        # rather than with iterrows(), which converts every row to a single
        # dtype and could turn the integer columns into floats.
        graph_ids = properties['graph_id'].tolist()
        label_dict = dict(zip(graph_ids, properties['label'].tolist()))
        num_nodes_dict = dict(zip(graph_ids, properties['num_nodes'].tolist()))

        self.graphs = []
        labels = []

        # for the edges, group the table by graph IDs; iterating the GroupBy
        # yields each graph ID together with that graph's edge rows
        for graph_id, edges_of_id in edges.groupby('graph_id'):
            src = edges_of_id['src'].to_numpy()
            dst = edges_of_id['dst'].to_numpy()

            # the node count comes from the properties table instead of being
            # inferred from the edge endpoints
            g = dgl.graph((src, dst), num_nodes=num_nodes_dict[graph_id])
            self.graphs.append(g)
            labels.append(label_dict[graph_id])

        # convert the label list to tensor for saving
        self.labels = torch.LongTensor(labels)

    def __getitem__(self, i):
        """Return the ``i``-th sample as a ``(graph, label)`` pair."""
        return self.graphs[i], self.labels[i]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

In [13]:
# Instantiate the dataset; the CSV files are read and the graphs are built
# during construction, so the object is ready to index right away.
dataset = SyntheticDataset()
dataset

Dataset("synthetic", num_graphs=40, save_path=/Users/qinzijian/.dgl/synthetic)

In [14]:
# Indexing the dataset returns a (graph, label) pair, as defined by __getitem__.
graph, label = dataset[0]
print(graph, label)

Graph(num_nodes=15, num_edges=45,
      ndata_schemes={}
      edata_schemes={}) tensor(0)
