## Load Data from Raw Citation (Cora)

In [27]:
import argparse
import pdb
import pickle
import time
from collections import defaultdict

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

def encode_onehot(labels):
    """Convert a sequence of class labels into a one-hot integer matrix.

    Classes are assigned to columns in sorted order, so the encoding is
    reproducible across interpreter runs (iterating a plain ``set`` of
    strings is not guaranteed to give the same order each time).

    :param labels: iterable of hashable, mutually comparable class labels,
        one entry per sample.
    :return: ``np.ndarray`` of dtype ``int32`` with one row per sample and
        one column per distinct class; each row has a single 1.
    """
    classes = sorted(set(labels))
    # Build the identity matrix once and hand out its rows, instead of
    # re-creating it for every class.
    identity = np.identity(len(classes))
    classes_dict = {c: identity[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array([classes_dict[label] for label in labels],
                             dtype=np.int32)
    return labels_onehot

In [37]:
# raw data version
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

# Location and base name of the raw Cora citation files.
path = "../data/ori_cora_data/cora/"
dataset = "cora"

# Read every field of <dataset>.content as a string; the slices below and in
# the following cells pick out the id, feature and label columns.
content_file = "{}{}.content".format(path, dataset)
idx_features_labels = np.genfromtxt(content_file, dtype=np.dtype(str))

# All columns except the first and the last form the sparse feature matrix.
feature_columns = idx_features_labels[:, 1:-1]
features = sp.csr_matrix(feature_columns, dtype=np.float32)
# Normalization is intentionally skipped (no normalization in Planetoid):
# features = normalize(features)

In [31]:
# One-hot class matrix built from the last column of the .content table.
labels = encode_onehot(idx_features_labels[:, -1])

# build graph
# Map each paper id (first column) to its row position in the table.
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
idx_map = {paper_id: row for row, paper_id in enumerate(idx)}

# Read the citation pairs and translate both endpoints into row positions.
cites_file = "{}{}.cites".format(path, dataset)
edges_unordered = np.genfromtxt(cites_file, dtype=np.int32)
edge_rows = [idx_map.get(paper_id) for paper_id in edges_unordered.flatten()]
edges = np.array(edge_rows, dtype=np.int32).reshape(edges_unordered.shape)

# One unit-weight entry per listed citation pair.
num_nodes = labels.shape[0]
edge_weights = np.ones(edges.shape[0])
adj = sp.coo_matrix((edge_weights, (edges[:, 0], edges[:, 1])),
                    shape=(num_nodes, num_nodes),
                    dtype=np.float32)

# build symmetric adjacency matrix
# Wherever the transposed entry is larger, take it instead of the original.
transpose_larger = adj.T > adj
adj = adj + adj.T.multiply(transpose_larger) - adj.multiply(transpose_larger)

# Fixed index ranges for the train / validation / test splits.
idx_train = range(140)
idx_val = range(200, 500)
idx_test = range(500, 1500)

## Save as Planetoid Format

In [None]:
"""
Loads input data from gcn/data directory

ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
    (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
    object;
ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

All objects above must be saved using python pickle module.

:param dataset_str: Dataset name
:return: All data input files loaded (as well the training/test data).
"""
# Directory prefix under which the following cells write the ind.cora.* files.
save_root = "../data/ori_cora_data"

In [164]:
# Write the three feature matrices of the Planetoid layout. Each file is
# opened in a `with` block so the handle is flushed and closed.

# x: feature rows of the training nodes.
with open(f"{save_root}/ind.cora.x", "wb") as f:
    pickle.dump(features[idx_train], f)

# allx: every feature row outside the test range, kept in original order.
# format="csr" pins the stacked result to CSR so it can be row-sliced on reload.
allx_features = sp.vstack(
    (features[:idx_test[0]], features[idx_test[-1]+1:]), format="csr"
)
with open(f"{save_root}/ind.cora.allx", "wb") as f:
    pickle.dump(allx_features, f)

# tx: feature rows of the test nodes.
with open(f"{save_root}/ind.cora.tx", "wb") as f:
    pickle.dump(features[idx_test], f)

In [165]:
# Write the three one-hot label arrays of the Planetoid layout. Each file is
# opened in a `with` block so the handle is flushed and closed.

# y: labels of the training nodes.
with open(f"{save_root}/ind.cora.y", "wb") as f:
    pickle.dump(labels[idx_train], f)

# ty: labels of the test nodes.
with open(f"{save_root}/ind.cora.ty", "wb") as f:
    pickle.dump(labels[idx_test], f)

# ally: labels of every node outside the test range, kept in original order.
ally_labels = np.vstack((labels[:idx_test[0]], labels[idx_test[-1]+1:]))
with open(f"{save_root}/ind.cora.ally", "wb") as f:
    pickle.dump(ally_labels, f)

In [166]:
# Write one test-node index per line. The path must be an f-string so that
# save_root is interpolated; the previous plain literal was not.
with open(f"{save_root}/ind.cora.test.index", "w") as f:
    for item in idx_test:
        f.write("%s\n" % item)

In [167]:
# ori_graph
array_adj = np.argwhere(adj.toarray())
ori_graph = defaultdict(list)
for edge in array_adj:
    ori_graph[edge[0]].append(edge[1])
pickle.dump(ori_graph, open( "(f"{save_root}/ind.cora.graph", "wb" ) )

## Validation of our format transfer

In [25]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

def parse_index_file(filename):
    """Parse index file.

    Reads ``filename`` as text and converts each non-blank line to an int.
    The file is opened in a ``with`` block so the handle is always closed.

    :param filename: path of a text file holding one integer per line.
    :return: list of ints in file order.
    :raises ValueError: if a non-blank line is not a valid integer.
    """
    with open(filename) as index_file:
        # Blank lines (e.g. a trailing newline at end of file) are skipped.
        return [int(line.strip()) for line in index_file if line.strip()]


def sample_mask(idx, l):
    """Create mask.

    :param idx: positions to mark (anything NumPy accepts as an index,
        e.g. a list or ``range``).
    :param l: length of the mask.
    :return: boolean ``np.ndarray`` of length ``l`` that is True at ``idx``
        and False elsewhere.
    """
    # The builtin `bool` replaces the `np.bool` alias, which no longer
    # exists in current NumPy releases.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

dataset_str='cora'
# NOTE(review): this directory differs from the "../data/ori_cora_data" prefix
# that the save cells write to — confirm the files were moved or renamed.
load_root = "../data/ori_cora_data_nonormalize"
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
objects = []
for name in names:
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally by this notebook.
    with open("{}/ind.{}.{}".format(load_root, dataset_str, name), 'rb') as f:
        objects.append(pkl.load(f, encoding='latin1'))

x, y, tx, ty, allx, ally, graph = tuple(objects)
test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(load_root, dataset_str))
test_idx_range = np.sort(test_idx_reorder)

# Re-insert the test block (tx) between the two parts of allx so the rows are
# back in their original positions.
p_features = sp.vstack((allx[:test_idx_range[0]], tx, allx[test_idx_range[0]:])).tolil()
# Reorder the test rows using the reloaded data itself. Copying rows from the
# raw `features` matrix here would make the comparison below trivially true
# for the test rows.
p_features[test_idx_reorder, :] = p_features[test_idx_range, :]

# NOTE(review): row/column order of o_adj follows the node order of the
# networkx graph — confirm it matches 0..N-1 when comparing against `adj`.
o_adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

# Same reconstruction for the labels, again using only reloaded data.
o_labels = np.vstack((ally[:test_idx_range[0]], ty, ally[test_idx_range[0]:]))
o_labels[test_idx_reorder, :] = o_labels[test_idx_range, :]

idx_test = test_idx_range.tolist()
idx_train = range(len(y))
idx_val = range(len(y), len(y)+500)

# Masks and per-split label matrices are derived from the reloaded labels so
# this cell does not depend on the raw-pipeline `labels` array.
num_nodes = o_labels.shape[0]
train_mask = sample_mask(idx_train, num_nodes)
val_mask = sample_mask(idx_val, num_nodes)
test_mask = sample_mask(idx_test, num_nodes)

y_train = np.zeros(o_labels.shape)
y_val = np.zeros(o_labels.shape)
y_test = np.zeros(o_labels.shape)
y_train[train_mask, :] = o_labels[train_mask, :]
y_val[val_mask, :] = o_labels[val_mask, :]
y_test[test_mask, :] = o_labels[test_mask, :]

In [65]:
# Row indices of the non-zero entries must agree between the reconstructed and
# the raw feature matrix (only positions are compared here, not values).
(p_features.nonzero()[0] == features.nonzero()[0]).all()

True

In [66]:
# Column indices of the non-zero entries must agree as well (positions only).
(p_features.nonzero()[1] == features.nonzero()[1]).all()

True

In [70]:
# Element-wise inequality between the raw-pipeline adjacency and the one
# rebuilt from the pickled graph; a result with 0 stored elements means the
# two matrices are equal everywhere.
(adj!=o_adj)

<2708x2708 sparse matrix of type '<class 'numpy.bool_'>'
	with 0 stored elements in Compressed Sparse Row format>

### Attention: the two loaded graphs should be identical if their adjacency matrices and features are identical. Their labels may (with high probability) not match, because the class-to-column assignment depends on the iteration order of a `set` in:
```
classes = set(idx_features_labels[:, -1])
# classes.sort()
# labels.nonzero()[1][:10]
classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                enumerate(classes)}
```