# Graph Feature Engineering and Classifier Training + Evaluation

### Set up & Initialization

In [19]:
%%capture
pip install graphdatascience==1.1.0rc1 ipywidgets 

In [20]:
# Client import
from graphdatascience import GraphDataScience

# Replace with the actual URI, username and password
CONNECTION_URI = "neo4j+s://demo2.graphconnect.app:7687"
USERNAME = "neo4j"
with open('pass.txt', mode='r') as f:
    PASSWORD = f.readline().strip()

# Client instantiation
gds = GraphDataScience(
    CONNECTION_URI,
    auth=(USERNAME, PASSWORD)
)

### Bind the graph projection to a graph object 
The GDS Python Client works with graph objects in Python. If we were constructing the graph from a neo4j database (or a pandas dataframe), that would automatically return a graph object. Since we're using the graph that we just created with custom Arrow import code, we need to assign it to a graph object using `get`

In [21]:
G=gds.graph.get("gcdemo")

## Engineer FastRP Features

In [22]:
res=gds.fastRP.mutate(
    G,
    #nodeLabels=['Paper', 'Author'],
    #relationshipTypes=['CITES', 'AUTHORED'],
    embeddingDimension=256,
    concurrency=224,
    randomSeed=7474,
    mutateProperty="graphEmbedding"
)

res

FastRP:   0%|          | 0/100 [00:00<?, ?%/s]

nodePropertiesWritten                                            244160499
mutateMillis                                                             1
nodeCount                                                        244160499
preProcessingMillis                                                      1
computeMillis                                                       175540
configuration            {'nodeSelfInfluence': 0, 'relationshipWeightPr...
Name: 0, dtype: object

## Export Labeled Papers with FastRP Features

In [33]:
# start with subgraph projection
g_labeled, res = gds.beta.graph.project.subgraph(
  'labeledProjection',
  G,
  'n:Paper AND (n.flag >= 0)',
  '*',
  concurrency=224
)

res

fromGraphName                            gcdemo
nodeFilter            n:Paper AND (n.flag >= 0)
relationshipFilter                            *
graphName                     labeledProjection
nodeCount                               1251341
relationshipCount                       4035688
projectMillis                             12651
Name: 0, dtype: object

In [34]:
import time
import neo4j_arrow as na

In [35]:
with open('pass.txt', mode='r') as f:
    password = f.readline().strip()

client = na.Neo4jArrowClient('demo2.graphconnect.app', graph="labeledProjection", password=password, concurrency=224)

In [36]:
%%time
dfs = []
for chunk in client.read_nodes(["graphEmbedding", "flag", "years"]):
    dfs.append(chunk.to_pandas())

CPU times: user 1.94 s, sys: 1.34 s, total: 3.28 s
Wall time: 16.8 s


In [37]:
import pandas as pd
df = pd.concat(dfs)
df

Unnamed: 0,nodeId,graphEmbedding,flag,years
0,255702663,"[0.06717815, -0.092532516, -0.1189363, -0.0300...",28,2018
1,255702719,"[0.04853616, 0.05288233, -0.03917888, 0.079376...",60,2018
2,255702776,"[0.07652574, 0.049551904, 0.071946, 0.03915725...",141,2015
3,255702981,"[0.16148047, -0.053723894, 0.0099196695, 0.060...",43,2011
4,255702983,"[-0.024817739, 0.019410789, -0.2168279, -0.092...",141,2016
...,...,...,...,...
9995,162410689,"[0.019068487, -0.0447315, 0.15278411, 0.049438...",41,2016
9996,162411050,"[-0.13176966, -0.21428567, -0.0612996, -0.1759...",62,2017
9997,162411140,"[0.18016608, -0.038522035, -0.044146292, -0.05...",37,2017
9998,162411224,"[0.1963172, 0.008663094, -0.08452889, -0.03905...",0,2018


In [38]:
g_labeled.drop()

graphName                                            labeledProjection
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                      1251341
relationshipCount                                              4035688
configuration        {'jobId': 'a1258606-0dd1-4fd4-b56f-a905b1f46cb...
density                                                       0.000003
creationTime                       2022-06-11T15:22:15.776788000+00:00
modificationTime                   2022-06-11T15:22:28.428540000+00:00
schema               {'graphProperties': {}, 'relationships': {'CIT...
Name: 0, dtype: object

## Train Nueral Network

In [39]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.8.1


In [40]:
df_train = df[df.years < 2019]
df_test = df[df.years >= 2019]

In [41]:
y_train = df_train.flag
y_test = df_test.flag
X_train = np.stack(df_train.graphEmbedding, axis=0)
X_test = np.stack(df_test.graphEmbedding, axis=0)

In [42]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, kernel_regularizer=tf.keras.regularizers.l2(0.00005)),
    tf.keras.layers.Dense(180, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.00005)),
    tf.keras.layers.Dense(153)
])

In [43]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [44]:
model.fit(X_train, y_train, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f03f536fb50>

## Evaluate and Save Nueral Network

In [45]:
test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=2)

print('\nTest accuracy:', test_acc)

4343/4343 - 13s - loss: 3.1050 - accuracy: 0.3476 - 13s/epoch - 3ms/step

Test accuracy: 0.34762394428253174


In [46]:
model.save('simple-paper-classifier-5')

INFO:tensorflow:Assets written to: simple-paper-classifier-5/assets
