# Data handling of graphs

In [None]:
# Source: https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html

In [2]:
import torch
from torch_geometric.data import Data

In [3]:
# Toy graph with 3 nodes. Each column of edge_index is one directed edge;
# both directions of every connection are listed.
edge_index = torch.tensor(
    [
        [0, 1, 1, 2],  # source nodes
        [1, 0, 2, 1],  # target nodes
    ],
    dtype=torch.long,
)
# One scalar feature per node.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)

In [8]:
# Display the graph summary: attribute names with their tensor shapes.
data

Data(x=[3, 1], edge_index=[2, 4])

In [5]:
# Peek at the raw instance dict; the tensors themselves sit under the '_store' entry.
data.__dict__

{'_edge_attr_cls': torch_geometric.data.data.DataEdgeAttr,
 '_tensor_attr_cls': torch_geometric.data.data.DataTensorAttr,
 '_store': {'x': tensor([[-1.],
         [ 0.],
         [ 1.]]), 'edge_index': tensor([[0, 1, 1, 2],
         [1, 0, 2, 1]])}}

### Note that `edge_index`, i.e. the tensor defining the source and target nodes of all edges, is not a list of index tuples. If you want to write your indices this way, you should transpose the tensor and call `contiguous()` on it before passing it to the `Data` constructor:

In [9]:
# Same toy graph, this time written as one (source, target) pair per row.
edge_index = torch.tensor(
    [[0, 1], [1, 0], [1, 2], [2, 1]],
    dtype=torch.long,
)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

# Transpose the [4, 2] pair list into the [2, 4] layout used above and make
# the result contiguous in memory before handing it to Data.
data = Data(x=x, edge_index=edge_index.t().contiguous())

In [10]:
# The summary matches the first construction: same shapes for x and edge_index.
data

Data(x=[3, 1], edge_index=[2, 4])

### Note that it is necessary that the elements in edge_index only hold indices in the range { 0, ..., num_nodes - 1}. This is needed as we want our final data representation to be as compact as possible, e.g., we want to index the source and destination node features of the first edge (0, 1) via x[0] and x[1], respectively. You can always check that your final Data objects fulfill these requirements by running validate():

In [11]:
# Sanity-check the graph. Returns True here; with raise_on_error=True a failed
# check is expected to raise instead of returning False (confirm in the PyG docs).
data.validate(raise_on_error=True)


True

### Besides holding a number of node-level, edge-level or graph-level attributes, Data provides a number of useful utility functions, e.g.:

In [15]:
# List the stored attribute names and fetch one attribute by name.
# NOTE(review): newer PyG releases appear to expose `keys` as a method
# (`data.keys()`) rather than a property; confirm against the installed version.
data.keys, data["x"]

(['edge_index', 'x'],
 tensor([[-1.],
         [ 0.],
         [ 1.]]))

In [16]:
# Iterating over a Data object yields (attribute name, value) pairs.
for key, item in data:
    print(f'{key} found in data')

x found in data
edge_index found in data


In [17]:
# Membership test on attribute names; no edge attributes were set on this graph.
'edge_attr' in data

False

In [18]:

# Basic graph statistics, shown together as one tuple.
(
    data.num_nodes,
    data.num_edges,
    data.num_node_features,
    data.has_isolated_nodes(),
    data.has_self_loops(),
    data.is_directed(),
)

(3, 4, 1, False, False, False)

### Transfer data object to GPU

In [20]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# .to() returns the moved object, so rebind `data` to it.
data = data.to(device)

# Common benchmarking datasets
To load the ENZYMES dataset (consisting of 600 graphs within 6 classes), type:

In [30]:
from torch_geometric.datasets import TUDataset


In [31]:
# Directory passed to TUDataset as `root` (relative to the notebook's working directory).
path = "../data_sl/gat"

In [32]:
# Load the ENZYMES benchmark with `path` as the dataset root.
# NOTE(review): presumably downloads and caches the files under `root` on first use; confirm.
dataset = TUDataset(root=path, name='ENZYMES')


In [29]:
# Dataset summary: repr, number of graphs, number of classes.
# The last entry is the node count of `data`, which at this point is still the
# 3-node toy graph built earlier in the notebook.
# (A dangling `data.` at the end of this line was a SyntaxError and is removed.)
dataset, len(dataset), dataset.num_classes, data.num_nodes

(ENZYMES(600), 600, 6, 3)

### We now have access to all 600 graphs in the dataset:

In [34]:
# Rebind `data` to the first graph of the dataset (it previously held the toy graph)
# and display its summary.
data = dataset[0]
data

Data(edge_index=[2, 168], x=[37, 3], y=[1])

### We can see that the first graph in the dataset contains 37 nodes, each one having 3 features. There are 168/2 = 84 undirected edges and the graph is assigned to exactly one class. In addition, the data object is holding exactly one graph-level target.

In [36]:
# Dump the raw instance dict of the first graph; the output prints the full
# edge_index tensor and is therefore long.
data.__dict__

{'_edge_attr_cls': torch_geometric.data.data.DataEdgeAttr,
 '_tensor_attr_cls': torch_geometric.data.data.DataTensorAttr,
 '_store': {'edge_index': tensor([[ 0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
           3,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
           7,  8,  8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
          12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
          16, 16, 17, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20,
          21, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25,
          25, 25, 25, 25, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 28, 28, 28, 28,
          28, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 31, 31, 31, 32,
          32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35,
          35, 35, 36, 36, 36, 36],
         [ 1,  2,  3,  0,  2,  3, 24, 27,  0,  1,  3, 27, 28,  0,  1,  2,  4,  5

In [35]:
# Check that the graph is undirected, consistent with counting 168/2 = 84 undirected edges.
data.is_undirected()

True

### We can even use slices, long or bool tensors to split the dataset. E.g., to create a 90/10 train/test split, type:

In [38]:
# 90/10 train/test split. The boundary is derived from the dataset size
# (600 graphs -> 540) instead of being hard-coded, so the ratio holds for any
# dataset length. Integer arithmetic avoids float rounding.
num_train = len(dataset) * 9 // 10
train_dataset = dataset[:num_train]
test_dataset = dataset[num_train:]

### Use `shuffle()` to randomly permute the dataset

In [39]:
# shuffle() returns the permuted dataset, so rebind `dataset` to the result.
dataset = dataset.shuffle()


### This is equivalent to doing:

In [40]:
# Draw a random permutation of the indices 0 .. len(dataset) - 1 ...
perm = torch.randperm(len(dataset))
# ... and index the dataset with that long tensor to reorder it.
dataset = dataset[perm]

In [41]:
# The reordered dataset still reports all 600 graphs.
dataset

ENZYMES(600)