# Import, preprocess, and store data splits

## Import relevant packages

In [6]:
import bz2
import os
import pickle
import time

import torch
import torch_geometric as tg
from torch_geometric.data import Data
from torch_geometric.datasets import Coauthor, Planetoid, WikipediaNetwork
from torch_geometric.transforms import RandomLinkSplit, RandomNodeSplit
from torch_geometric.utils import subgraph

# Seed torch's global RNG, presumably so the random node/link splits below are
# repeatable across runs of this notebook.
# NOTE(review): only torch is seeded here; if any split transform also draws
# from Python's `random` or NumPy, those would need seeding too — verify.
torch.manual_seed(10)

<torch._C.Generator at 0x10c03c5d0>

## Import homogeneous datasets and process

In [2]:
# Directory the datasets are downloaded to / cached in.
root = '../data'
wiki_datasets = ["chameleon","crocodile"]
planetoid_dataset = "PubMed"
coauthor_dataset = "CS"

# Each dataset below contains a single graph, so take it with `[0]` rather than
# reaching into the dataset's internal collated `.data` storage, which PyG
# discourages (newer releases emit a warning on access).
wiki_chameleon = WikipediaNetwork(root=root, name=wiki_datasets[0])[0]
# NOTE(review): crocodile is loaded with geom_gcn_preprocess=False; PyG
# reportedly ships no Geom-GCN pre-processed version of it — confirm against
# the WikipediaNetwork documentation.
wiki_crocodile = WikipediaNetwork(root=root, name=wiki_datasets[1], geom_gcn_preprocess=False)[0]
pubmed = Planetoid(root=root, name=planetoid_dataset)[0]
cs = Coauthor(root=root, name=coauthor_dataset)[0]

Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/chameleon/out1_node_feature_label.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/chameleon/out1_graph_edges.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_0.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_1.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_2.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_3.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14

## Split each dataset and save

In [9]:
def inductive_split(data: tg.data.Data, num_val: float = 0.1, num_test: float = 0.1,
                    out_dir: str = '../data', name=None):
    '''
    Split a graph into node-disjoint train / validation / test subgraphs for
    inductive link prediction and pickle each subgraph to disk.

    Nodes are randomly assigned to one of the three splits. Each split keeps
    only the edges whose two endpoints both belong to it, with nodes renumbered
    so that `edge_index` indexes the rows of that split's `x`. RandomLinkSplit
    is then applied to every subgraph to attach the link-prediction labels
    (`edge_label` / `edge_label_index`).

    NOTE(review): an earlier version of this docstring recommended
    dataset-specific ratios ("0.8, 0.9" for the Wikipedia data, "0.30, 0.65"
    for PubMed and CS), but the code always used 10% validation / 10% test.
    Pass `num_val` / `num_test` explicitly if other ratios are intended.

    Args:
        data: graph with `x`, `edge_index` and `num_nodes`; it is not modified.
        num_val: validation nodes, forwarded to RandomNodeSplit.
        num_test: test nodes, forwarded to RandomNodeSplit.
        out_dir: directory the three pickles are written to.
        name: optional dataset label embedded in the file names so the
            outputs of different datasets can be told apart.

    Writes `train_data_<tag>.p`, `valid_data_<tag>.p` and `test_data_<tag>.p`
    (bz2-compressed pickles) into `out_dir`, where `<tag>` is the current
    timestamp, prefixed with `name` when given.
    '''
    # Assign every node to exactly one of train / val / test. Work on a clone
    # so the caller's Data object does not gain mask attributes.
    node_split = RandomNodeSplit(num_val=num_val, num_test=num_test)
    data = node_split(data.clone())

    # num_val=0 / num_test=0: the transform is used only to create the
    # edge_label_index for each subgraph, not to hold out further edges.
    link_split = RandomLinkSplit(num_val=0, num_test=0)

    splits = {
        'train_data': _induced_link_data(data, data.train_mask, link_split),
        'valid_data': _induced_link_data(data, data.val_mask, link_split),
        'test_data': _induced_link_data(data, data.test_mask, link_split),
    }

    # Save each train, valid, test subgraph using pickling.
    os.makedirs(out_dir or '.', exist_ok=True)
    tag = _unused_file_tag(out_dir, splits, name)
    for prefix, split in splits.items():
        path = os.path.join(out_dir, '{0}_{1}.p'.format(prefix, tag))
        # `with` guarantees the compressed stream is flushed and closed.
        with bz2.BZ2File(path, 'wb') as handle:
            pickle.dump(split, handle)


def _induced_link_data(data, node_mask, link_split):
    '''Build the link-prediction Data object for the subgraph induced by `node_mask`.'''
    # Keep only edges with BOTH endpoints inside the mask and renumber the
    # nodes 0..k-1 in mask order, so edge_index lines up with the rows of `x`.
    # This works on the sparse edge list; no dense N x N adjacency is built.
    edge_index, _ = subgraph(node_mask, data.edge_index, relabel_nodes=True,
                             num_nodes=data.num_nodes)
    # Drop duplicate edges and order them row-major, i.e. the same edge list
    # that reading the non-zeros of an adjacency matrix would produce.
    edge_index = torch.unique(edge_index, dim=1)

    x = data.x[node_mask]
    split_data, _, _ = link_split(
        Data(x=x, edge_index=edge_index, num_nodes=x.size(0))
    )
    return split_data


def _unused_file_tag(out_dir, prefixes, name=None):
    '''Return a timestamp-based file tag that no existing split file in `out_dir` uses.'''
    stamp = time.strftime("%Y%m%d-%H%M%S")
    base = stamp if name is None else '{0}_{1}'.format(name, stamp)

    def taken(tag):
        return any(
            os.path.exists(os.path.join(out_dir, '{0}_{1}.p'.format(prefix, tag)))
            for prefix in prefixes
        )

    # The timestamp only has one-second resolution, so two datasets processed
    # back to back would otherwise overwrite each other's files.
    tag, attempt = base, 0
    while taken(tag):
        attempt += 1
        tag = '{0}-{1}'.format(base, attempt)
    return tag

In [10]:
# Split and save every loaded graph in turn.
datasets = [wiki_chameleon, wiki_crocodile, pubmed, cs]
# The loop variable must not be called `set`: at notebook top level that would
# rebind the builtin `set` for every cell executed afterwards.
for dataset in datasets:
    inductive_split(dataset)