* Analyzing the differences between the StellarGraph datasets and the ARGA repository datasets.

In [None]:
# Verify that the installed StellarGraph matches the version this notebook targets ("1.2.1").
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    # An AttributeError here means the lookup/call above failed on this install
    # (presumably the helper is absent in that release — TODO confirm). Report it as a
    # version mismatch instead; `from None` hides the original AttributeError traceback.
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

In [35]:
import matplotlib.pyplot as plt
from math import isclose
from sklearn.decomposition import PCA
import os
import networkx as nx
import numpy as np
import pandas as pd
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter
from collections import Counter
import multiprocessing
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from scipy import sparse

%matplotlib inline

In [36]:
# Load the Cora dataset shipped with the stellargraph library and show its description.
dataset = datasets.Cora()
display(HTML(dataset.description))
# Alternative call kept for reference; it additionally passes str_node_ids=True:
# graph, _ = dataset.load(largest_connected_component_only=True, str_node_ids=True)
# Keep only the largest connected component; the second return value is not used here.
graph, _ = dataset.load(largest_connected_component_only=True)

In [37]:
# Print the graph summary (node/edge counts, node and edge types) as a sanity check on the load.
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 2485, Edges: 5209

 Node types:
  paper: [2485]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5209]
        Weights: all 1 (default)
        Features: none


In [38]:
# Feature matrix for every node (nodes=None selects all of them).
features = graph.node_features(nodes=None)
# Re-wrap as a SciPy CSR matrix: addHopFeatures feeds it to pd.DataFrame.sparse.from_spmatrix.
features = sparse.csr_matrix(features)
# Adjacency matrix over all nodes; convert() later calls .todense() on it, so it is
# expected to be a SciPy sparse matrix — TODO confirm against the StellarGraph docs.
adj = graph.to_adjacency_matrix(nodes=None)

In [41]:
# Author: Tonni

import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pickle
import math
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from collections import defaultdict
from scipy import sparse


def getNHopNeighbors(node, hop, adjList):
    """Return the set of nodes reachable from ``node`` in 1..``hop`` steps.

    This is a breadth-first expansion that stops after ``hop`` levels rather
    than traversing the whole graph.

    Args:
        node: Identifier of the starting node.
        hop: Number of levels to expand; ``0`` yields an empty set.
        adjList: Mapping from node identifier to an iterable of neighbours.
            Nodes missing from the mapping are treated as having no neighbours.

    Returns:
        A set with every node seen on any level. The starting node is only
        included if some path leads back to it (e.g. via a neighbour when
        ``hop >= 2``).
    """
    reached = set()
    frontier = {node}

    for _ in range(hop):
        next_frontier = set()
        for current in frontier:
            # Membership test first so a defaultdict is never grown by the lookup.
            if current in adjList:
                next_frontier.update(adjList[current])

        reached |= next_frontier
        frontier = next_frontier

    return reached

# Converts a sparse adjacency matrix into an adjacency list.
def convert(numNodes, adj):
    """Build an adjacency list from the leading ``numNodes`` x ``numNodes`` block of ``adj``.

    Any non-zero entry ``adj[i, j]`` is treated as an edge ``i -> j``. The
    previous implementation only accepted entries equal to exactly 1, which
    silently dropped edges whose entry was larger (for example parallel edges
    of a multigraph that sum to 2) or carried a non-unit weight.

    The sparse structure is read directly instead of densifying the matrix and
    scanning all ``numNodes ** 2`` cells in Python.

    Args:
        numNodes: Number of leading rows/columns to consider.
        adj: SciPy sparse adjacency matrix (anything ``sparse.csr_matrix`` accepts).

    Returns:
        ``defaultdict(list)`` mapping each node that has at least one neighbour
        to its neighbours in ascending order. Nodes without neighbours get no
        key, but looking them up yields an empty list.
    """
    # Work on a private CSR copy so canonicalising it never mutates the caller's matrix.
    csr = sparse.csr_matrix(adj, copy=True)
    csr.sum_duplicates()   # merge repeated (i, j) entries, as densifying would
    csr.eliminate_zeros()  # explicitly stored zeros are not edges
    csr.sort_indices()     # ascending column order per row, matching a left-to-right scan

    adjList = defaultdict(list)  # Type: default value is an empty list
    for i in range(numNodes):
        neighbours = csr.indices[csr.indptr[i]:csr.indptr[i + 1]]
        # Ignore columns outside the requested numNodes x numNodes block.
        neighbours = neighbours[neighbours < numNodes]
        if neighbours.size:
            adjList[i] = neighbours.tolist()
    return adjList

def addHopFeatures(features, adj, hop_count):
    """Aggregate, for every node, the features of its neighbours within ``hop_count`` hops.

    For each node the feature rows of all nodes returned by
    ``getNHopNeighbors`` are summed column-wise; every positive sum is then
    replaced by 1 (non-positive sums are left as they are). A node's own
    features only contribute if the node is reachable from itself.

    Side effects: prints start/done markers and pickles the result to
    ``features_n_hop.pickle`` in the current working directory.

    Args:
        features: SciPy sparse matrix with one row per node.
        adj: Sparse adjacency matrix passed on to ``convert``.
        hop_count: Neighbourhood radius in hops.

    Returns:
        A ``scipy.sparse.csr_matrix`` with the same shape as ``features``.
    """
    print('features_n_hop start')

    node_total = features.shape[0]
    neighbour_lists = convert(node_total, adj)

    feature_frame = pd.DataFrame.sparse.from_spmatrix(features)
    aggregated = np.zeros((node_total, len(feature_frame.columns)))

    for node_id in range(node_total):
        # Every node reachable within hop_count hops of node_id.
        reachable = getNHopNeighbors(node_id, hop_count, neighbour_lists)
        column_sums = feature_frame.iloc[list(reachable)].sum()
        column_sums = column_sums.to_numpy()
        column_sums[column_sums > 0] = 1  # collapse positive counts to an indicator
        aggregated[node_id] = column_sums

    features_n_hop = sparse.csr_matrix(aggregated)  # back to a sparse representation

    with open('features_n_hop.pickle', 'wb') as handle:
        pickle.dump(features_n_hop, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('features_n_hop done')

    return features_n_hop

def addHopAdjacency(adj, hop_count):
    """Build the adjacency matrix that links each node to all nodes within ``hop_count`` hops.

    Entry ``(i, j)`` of the result is 1 when ``j`` is returned by
    ``getNHopNeighbors(i, hop_count, ...)`` and 0 otherwise. The matrix is
    assembled directly in sparse form from (row, column) pairs, so no dense
    ``numNodes x numNodes`` array is allocated along the way.

    Side effects: prints progress markers and the node count.

    Args:
        adj: Square SciPy sparse adjacency matrix.
        hop_count: Neighbourhood radius in hops.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(numNodes, numNodes)`` with
        integer entries.
    """
    print('adjacency_n_hop start')
    print("---------------------")

    numNodes = adj.shape[0]           # e.g. 6271 for the "mmu" data, per the original note
    print('numNodes:', numNodes)

    adjList = convert(numNodes, adj)

    # Collect the coordinates of every non-zero entry instead of filling a dense array.
    rows, cols = [], []
    for eachNode in range(numNodes):
        reachable = getNHopNeighbors(eachNode, hop_count, adjList)  # nodes within hop_count hops
        rows.extend([eachNode] * len(reachable))
        cols.extend(reachable)

    # Each (row, col) pair is unique because `reachable` is a set, so every entry is exactly 1.
    nHopAdj = sparse.csr_matrix(
        (np.ones(len(rows), dtype=int), (rows, cols)),
        shape=(numNodes, numNodes),
    )

    print('adj_n_hop done')

    return nHopAdj
    

In [42]:
def store(features, adj, hop_count, dataset_name=None):
    """Compute the hop-expanded features and adjacency and pickle both under ``pickles/``.

    Args:
        features: Sparse node-feature matrix passed to ``addHopFeatures``.
        adj: Adjacency matrix; it is wrapped in ``sparse.csr_matrix`` before use.
        hop_count: Hop radius used for the features and recorded in the file names.
        dataset_name: Label used in the file names. When omitted, the
            notebook-level global ``data_name`` is used, as before.

    Side effects:
        Creates the ``pickles`` directory if it does not exist and writes
        ``pickles/<name>_features_hop_<hop>_stellergraph.pickle`` and
        ``pickles/<name>_adj_hop_<hop>_stellergraph.pickle``.
    """
    # Resolve the label and output directory first, so a missing global or an
    # unwritable location fails before the expensive computation, not after it.
    if dataset_name is None:
        dataset_name = data_name
    os.makedirs('pickles', exist_ok=True)

    # Manually store hopped info in pickle
    features = addHopFeatures(features, adj, hop_count)
    adj = sparse.csr_matrix(adj)
    # NOTE(review): the adjacency is expanded with hop_count + 1 while the file
    # name records hop_count — confirm this offset is intended.
    adj = addHopAdjacency(adj, hop_count + 1)

    f1 = 'pickles/' + dataset_name + '_features_hop_' + str(hop_count) + '_stellergraph' + '.pickle'
    a1 = 'pickles/' + dataset_name + '_adj_hop_' + str(hop_count) + '_stellergraph' + '.pickle'
    with open(f1, 'wb') as handle:
        pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(a1, 'wb') as handle:
        pickle.dump(adj, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [43]:
# Hop radii to process; the driver loop calls store() once per entry.
hop_count = [2]
# Dataset label; store() reads this global when building the pickle file names.
data_name = 'cora'

In [44]:
# Generate and pickle the hop-expanded features/adjacency for every configured hop radius.
for each in hop_count:
    store(features, adj, each)

features_n_hop start
features_n_hop done
adjacency_n_hop start
---------------------
numNodes: 2485
adj_n_hop done
