# RGCN with StellarGraph

- dataset: AIFB dataset

## start with node-classification

In [1]:
import networkx as nx
import stellargraph as sg
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf

from sklearn import preprocessing, model_selection

from tensorflow.keras import layers, Model, optimizers, losses, callbacks

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Seed the global NumPy and TensorFlow random generators.
np.random.seed(0)
tf.random.set_seed(0)

### graph setting

In [2]:
from stellargraph import datasets

# Load the AIFB dataset: `load()` returns the graph `G` and the `affiliation` targets.
dataset = datasets.AIFB()
G, affiliation = dataset.load()
# Print the graph summary (node / edge counts and types).
print(G.info())

StellarDiGraph: Directed multigraph
 Nodes: 8285, Edges: 29043

 Node types:
  default: [8285]
    Features: float32 vector, length 8285
    Edge types: default-http://swrc.ontoware.org/ontology#abstract->default, default-http://swrc.ontoware.org/ontology#address->default, default-http://swrc.ontoware.org/ontology#author->default, default-http://swrc.ontoware.org/ontology#booktitle->default, default-http://swrc.ontoware.org/ontology#carriedOutBy->default, ... (40 more)

 Edge types:
    default-http://swrc.ontoware.org/ontology#publication->default: [4163]
    default-http://www.w3.org/1999/02/22-rdf-syntax-ns#type->default: [4124]
    default-http://swrc.ontoware.org/ontology#author->default: [3986]
    default-http://swrc.ontoware.org/ontology#isAbout->default: [2477]
    default-http://swrc.ontoware.org/ontology#name->default: [1302]
    default-http://swrc.ontoware.org/ontology#year->default: [1227]
    default-http://swrc.ontoware.org/ontology#title->default: [1227]
    default-ht

### train/val/test split (targets are already one-hot encoded)

In [3]:
# Three-way split of the labelled nodes: 20% train, then the remaining 80%
# is halved into validation and test.
# An explicit random_state makes the cell idempotent: re-running it yields the
# same split regardless of how much of the global RNG stream was consumed.
SPLIT_SEED = 0  # same value as the global seeds set in the first cell

tr_target, holdout_target = model_selection.train_test_split(
    affiliation, train_size=0.2, random_state=SPLIT_SEED
)

# `holdout_target` is kept under its own name so `val_target` is bound once.
val_target, test_target = model_selection.train_test_split(
    holdout_target, train_size=0.5, random_state=SPLIT_SEED
)

### node generator
- RGCN is intended for relational data

In [4]:
from stellargraph.mapper import RelationalFullBatchNodeGenerator

# One full-batch generator over the graph, shared by every flow below.
generator = RelationalFullBatchNodeGenerator(G, sparse=True)

# Each flow pairs a set of node ids with the matching target rows.
tr_flow = generator.flow(tr_target.index, targets=tr_target)
val_flow = generator.flow(val_target.index, targets=val_target)
test_flow = generator.flow(test_target.index, targets=test_target)
tot_flow = generator.flow(affiliation.index, targets=affiliation)

### build implemented model

In [5]:
from stellargraph.layer import RGCN

# Configured like a Keras model: two 32-unit ELU layers with dropout.
rgcn = RGCN(
    layer_sizes=[32, 32],
    activations=["elu", "elu"],
    generator=generator,
    dropout=0.5,
    num_bases=20,
    bias=True,
)

# Input and output tensors of the RGCN stack, used to assemble Keras models.
nc_inp, nc_out = rgcn.in_out_tensors()

### add a fully connected (FC) output layer for classification

In [10]:
# Softmax read-out on top of the RGCN output: one unit per target column.
# (A 16-unit ReLU Dense layer could be inserted before it; it is left out here.)
nc_layer = layers.Dense(
    units=tr_target.shape[-1],
    activation='softmax',
)(nc_out)

In [11]:
# End-to-end node classifier: RGCN inputs -> softmax read-out.
nc_model = Model(inputs=nc_inp, outputs=nc_layer)
nc_model.compile(
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=["acc"],
)

In [12]:
# NOTE(review): the saved output of this cell shows the run aborting with
# `InternalError: Unsupported numpy type: NPY_INT`, so `nc_hist` was never
# produced in that run. Resolve this before relying on the cells below.

# Stop once validation accuracy has not improved for 10 epochs and roll back
# to the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_acc',
    patience=10,
    restore_best_weights=True,
)

nc_hist = nc_model.fit(
    tr_flow,
    epochs=200,
    validation_data=val_flow,
    shuffle=False,  # must stay False
    callbacks=[es],
    verbose=1,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 1 steps, validate for 1 steps
Epoch 1/200


InternalError: 2 root error(s) found.
  (0) Internal:  Unsupported numpy type: NPY_INT
	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
  (1) Internal:  Unsupported numpy type: NPY_INT
	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_166]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_18028]

Function call stack:
distributed_function -> distributed_function


In [None]:
# Plot the training history returned by `nc_model.fit`.
sg.utils.plot_history(nc_hist)

In [None]:
from sklearn.metrics import f1_score

# Decode the one-hot targets and the predicted class scores to class indices
# once, instead of running `predict` separately for each average.
nc_true = test_target.values.argmax(axis=-1)
nc_pred = nc_model.predict(test_flow).squeeze().argmax(axis=-1)

f1_micro = f1_score(nc_true, nc_pred, average='micro')
# The name `f1_marco` (sic) is kept because other cells may read this global.
f1_marco = f1_score(nc_true, nc_pred, average='macro')

print('f1_micro:', round(f1_micro,3), '\nf1_macro:', round(f1_marco, 3))


### comparison with non-graph classification

In [None]:
from sklearn.linear_model import LogisticRegression

# Feature rows for the labelled nodes, pulled from the graph itself
# (`features` is not defined by any earlier cell of this notebook).
# NOTE(review): assumes `G.node_features` returns one row per requested node
# id, in the order given - confirm against the installed StellarGraph version.
labelled_features = pd.DataFrame(
    G.node_features(affiliation.index), index=affiliation.index
)

# One-hot targets -> integer class ids.
tr_tar = tr_target.values.argmax(axis=-1)
lr = LogisticRegression().fit(labelled_features.loc[tr_target.index], tr_tar)

lr_true = test_target.values.argmax(axis=-1)
# `predict` returns class labels directly. An argmax over `predict_proba` is
# only an index into `lr.classes_`, which differs from the label whenever a
# class is missing from the training split.
lr_pred = lr.predict(labelled_features.loc[test_target.index])

f1_micro = f1_score(lr_true, lr_pred, average='micro')
f1_macro = f1_score(lr_true, lr_pred, average='macro')

# Print the macro score computed in this cell, not the RGCN cell's `f1_marco`.
print('f1_micro:', round(f1_micro,3), '\nf1_macro:', round(f1_macro, 3))


### visualize how it classifies

In [None]:
# Model from the RGCN inputs to the RGCN output `nc_out`, i.e. without the softmax read-out layer.
emb_model = Model(nc_inp, nc_out)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the RGCN outputs of all labelled nodes to 2-D.
tsne = TSNE(n_components=2)
tsne_x = tsne.fit_transform(emb_model.predict(tot_flow).squeeze())

# Colour by class. `target` is not defined anywhere in this notebook, so the
# class name of each node is recovered from the one-hot `affiliation` frame,
# whose row order is the order `tot_flow` was built with.
lbe = preprocessing.LabelEncoder()
col = lbe.fit_transform(affiliation.idxmax(axis=1))

alpha = 0.7

plt.figure(figsize=(10, 8))
plt.scatter(
    tsne_x[:, 0],
    tsne_x[:, 1],
    cmap="rainbow",
    c=col,
    alpha=alpha,
)
plt.title("t-SNE of RGCN node embeddings, coloured by affiliation")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.show()

## link-prediction

### use EdgeSplitter to separate the edges into train/test sets

In [None]:
from stellargraph.data import EdgeSplitter

# NOTE(review): neither `gx` nor `features` is defined by any earlier cell shown
# in this notebook, so this line raises NameError on a fresh kernel.
# NOTE(review): it also rebinds `G`, which later cells pass to
# RelationalFullBatchNodeGenerator - confirm which graph those cells expect.
G = sg.StellarGraph(gx, node_features=features)

# Define an edge splitter on the original graph G:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, and the same number of negative links, from G, and obtain the
# reduced graph G_test with the sampled links removed:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True
)


In [None]:
# Define an edge splitter on the reduced graph G_test:
edge_splitter_train = EdgeSplitter(G_test)

G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=True
)


### Link Generator

In [None]:
from stellargraph.mapper import FullBatchLinkGenerator

# Training links are scored against the graph with both held-out edge sets removed.
train_gen = FullBatchLinkGenerator(G_train, method="gcn")
train_flow = train_gen.flow(edge_ids_train, edge_labels_train)

# Test links are scored against G_test. The flow must come from `test_gen`:
# building it from `train_gen` would evaluate on G_train and leave `test_gen` unused.
test_gen = FullBatchLinkGenerator(G_test, method="gcn")
test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

In [None]:
# `GCN` is not imported by any earlier cell of this notebook, so import it here.
from stellargraph.layer import GCN

# Two-layer GCN (16 -> 8 units, ELU) fed by the training link generator.
gcn = GCN(layer_sizes=[16, 8], activations=["elu", "elu"], generator=train_gen, dropout=0.5)

# Same tensor-building call the RGCN cell uses, replacing the older `build()` spelling.
inp, out = gcn.in_out_tensors()

### add further layers
- in particular, a Reshape layer is needed

In [None]:
from stellargraph.layer import link_classification, LinkEmbedding

# Score each candidate link from its two node embeddings
# (method "ip", sigmoid activation), then flatten the scores.
prediction = LinkEmbedding(activation="sigmoid", method="ip")(out)
prediction = layers.Reshape((-1,))(prediction)

lp_model = Model(inp, prediction)

lp_model.compile(
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer=optimizers.Adam(learning_rate=1e-2),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)

In [None]:
# Early stopping on validation accuracy (5 epochs of patience), restoring the
# best weights. Note that the test flow is what gets passed as validation data.
es = callbacks.EarlyStopping(
    monitor='val_acc',
    patience=5,
    restore_best_weights=True,
)

lp_hist = lp_model.fit(
    train_flow,
    epochs=100,
    validation_data=test_flow,
    shuffle=False,
    callbacks=[es],
    verbose=0,
)

In [None]:
# Plot the training history returned by `lp_model.fit`.
sg.utils.plot_history(lp_hist)

In [None]:
# Threshold the predicted link scores at 0.5 once and reuse the labels for
# both averages, instead of running `predict` twice.
lp_pred = (lp_model.predict(test_flow).squeeze() > 0.5).astype(int)

f1_micro = f1_score(edge_labels_test, lp_pred, average='micro')
f1_macro = f1_score(edge_labels_test, lp_pred, average='macro')

print('f1_micro:', round(f1_micro,3), '\nf1_macro:', round(f1_macro, 3))


In [None]:
# 80% of the labelled nodes for training; `test_size=None` leaves the
# remainder as the test split.
train_targets, test_targets = model_selection.train_test_split(
    affiliation,
    train_size=0.8,
    test_size=None,
)

In [None]:
# Fresh full-batch generator on whatever `G` currently refers to.
generator = RelationalFullBatchNodeGenerator(G, sparse=True)

# Despite the `_gen` suffix these two names hold flows (node ids + targets).
train_gen = generator.flow(
    train_targets.index,
    targets=train_targets,
)
test_gen = generator.flow(
    test_targets.index,
    targets=test_targets,
)


In [None]:
# Second RGCN configuration: two 32-unit layers as before, but with ReLU
# activations instead of ELU. This rebinds the earlier `rgcn` global.
rgcn = RGCN(
    layer_sizes=[32, 32],
    activations=["relu", "relu"],
    generator=generator,
    bias=True,
    num_bases=20,
    dropout=0.5,
)

In [None]:
# Same tensor-building call the first RGCN cell uses, replacing the older `build()` spelling.
x_in, x_out = rgcn.in_out_tensors()
# Softmax read-out with one unit per target column.
predictions = layers.Dense(train_targets.shape[-1], activation="softmax")(x_out)
model = Model(inputs=x_in, outputs=predictions)
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizers.Adam(0.01),
    metrics=["acc"],
)

In [None]:
# Train for 20 epochs; the test flow (`test_gen`) is passed as the validation data.
history = model.fit(train_gen, validation_data=test_gen, epochs=20)

In [None]:
from tensorflow.keras import Sequential

# Small dense network used as a scratch experiment.
nn = Sequential()
nn.add(layers.Dense(13))

# The output layer is sized from the targets rather than hard-coded, and uses a
# softmax so its outputs are the probabilities the categorical cross-entropy
# loss (set in the compile cell) expects.
nn.add(layers.Dense(affiliation.shape[-1], activation='softmax'))

In [None]:
# Compile the scratch network with categorical cross-entropy and the Adam optimizer.
nn.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Fit the scratch network with `affiliation.values` as both the inputs and the targets.
nn.fit(affiliation.values, affiliation.values,
      epochs=10)

In [None]:
# Bare attribute reference: this only displays the bound method, it does not call it.
# NOTE(review): looks like leftover interactive inspection - consider removing.
tr_flow.on_epoch_end

In [None]:
pip install rdflib
