# Run fullchain tests

See: finnet-pipeline/docker-tests/fullchain/run_tests.py

To add packages, append to `dags/requirements_py3.txt` and run `!pip3 install -r /usr/local/dags/requirements_py3.txt`

In [1]:
# !pip3 install -r /usr/local/dags/requirements_py3.txt

## Init Spark

In [2]:
import findspark
# Initialise findspark against the Spark installation at /usr/local/spark.
# This runs before the first `pyspark` import in the following cell.
findspark.init("/usr/local/spark")

In [3]:
from pyspark import SparkContext

# Stop whatever SparkContext is currently active: the test run below assumes
# that no SparkContext exists when it starts. getOrCreate() hands back the
# active context (or a fresh one), which is then shut down immediately.
sc = SparkContext.getOrCreate()
sc.stop()

## Imports and Env

In [4]:
import os
# Neo4j settings for this test environment: the bolt URI of the graph
# database and the SSH port/user that are read back from the environment
# later in the notebook.
os.environ.update({
    "GRAPH_DB": "bolt://neo4j:test@neo4j:7687",
    "NEO4J_SSH_PORT": "22",
    "NEO4J_SSH_USERNAME": "root",
})

In [5]:
# Location and storage format of the pipeline datasets, exposed through the
# environment.
os.environ.update({
    'PIPELINE_DATA_PATH': "/datasets/finnet",
    'PIPELINE_DATA_FORMAT': "parquet",
})

In [6]:
import sys
# Put the DAGs directory at the front of the import path so that the
# `fncore_py3` imports in the next cell resolve (presumably the package
# lives under /usr/local/dags -- confirm against the container layout).
sys.path.insert(0, "/usr/local/dags")

import pandas as pd

In [7]:
from pyspark import SparkConf

import json
import logging
import re

from fncore_py3.utils.graph_specification import GraphSpec
from fncore_py3.utils.neo4j_conf import get_neo4j_context
from fncore_py3.utils.spark_tools import SparkConfFactory, get_spark_context

# Directory, relative to the notebook's working directory, that is scanned
# for the graph specification JSON file.
LOCAL_DATA_PATH = os.path.join(os.getcwd(), 'data')
# Root of the pipeline datasets as configured through the environment.
# NOTE(review): in this notebook it is only referenced by commented-out code.
DATA_PATH = os.environ['PIPELINE_DATA_PATH']

## Run tests

In [8]:
# Tasks covered by this test run. Later cells check membership in this list
# before executing the corresponding step (e.g. the graph purge).
dolist = [
    'build_lists', 'resolve_entities',
    'neo4j_purger', 'neo4j_writer',
    'graph_tools',
]

# SSH user and port of the Neo4j host, taken from the environment with
# fallbacks for when the variables are not set.
neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

# Spark configuration used by the test: local master, 1 GiB executor memory.
config = {}
config['SparkConfiguration'] = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName("test create data")
    .set("spark.executor.memory", "1024m")
)

# Collect the graph specification files (*.json) from the local data folder.
# os.listdir() returns entries in arbitrary order, so the result is sorted to
# make the "first" specification picked later deterministic.
datalist = os.listdir(LOCAL_DATA_PATH)
jsonlist = sorted(name for name in datalist if name.endswith('.json'))

In [9]:
# Exactly one graph specification JSON is expected in the data directory.
# Fail with an explicit message when none is present, and make it visible
# when the assumption of a single file does not hold.
if not jsonlist:
    raise FileNotFoundError(
        "No graph specification (*.json) found in {}".format(LOCAL_DATA_PATH)
    )
if len(jsonlist) > 1:
    logging.warning(
        "Expected a single graph specification, found %d; using %s",
        len(jsonlist), jsonlist[0],
    )
gspec = jsonlist[0]

### Load Graph Spec

In [10]:
# Parse the selected graph specification. The file is read explicitly as
# UTF-8 so the result does not depend on the locale's default encoding.
with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r', encoding='utf-8') as f:
    graph_spec = GraphSpec.from_dict(json.load(f))

# Plain-dict view of the specification, used for the key lookups below.
spec = graph_spec.to_dict()

# tables_path = os.path.join(DATA_PATH, graph_spec.name, 'tables')
# n_path = os.path.join(DATA_PATH, graph_spec.name, 'node_list')
# e_path = os.path.join(DATA_PATH, graph_spec.name, 'edge_list')
# n_path_res = os.path.join(DATA_PATH, graph_spec.name, 'node_list_resolved')
# e_path_res = os.path.join(DATA_PATH, graph_spec.name, 'edge_list_resolved')

logging.info("Processing %s", gspec)

# Neo4j connection settings. The URI comes from the graph specification
# itself; retry and batch limits are looked up in `config` and fall back to
# the defaults given here when the keys are absent.
neo_config = {
    'uri': spec['graph_uri'],
    'max_retries': config.get('neo4j.max_retries', 5),
    'max_batchsize': config.get('neo4j.max_batchsize', 10000),
}

In [11]:
# spec

# """
# Note:
# test_data_chocolate_node_list: index_column: hidden: True is ignored as intended
# """

### Entity resolution:

How should we disambiguate `S1234567G:Person` and `S1234567G:SoleProprietor`? Do we need to generate a canonical ID? If so, that should be done before Hive ingestion, not as part of graph building. Alternatively, let them be separate nodes but with 0 distance.

### Purge existing Graph

In [12]:
# Empty the graph database, but only when the purger task is part of this run.
if 'neo4j_purger' in dolist:
    with get_neo4j_context(neo_config['uri']) as neo_context:
        # Delete every node together with its relationships.
        neo_context.run("""MATCH (n) DETACH DELETE n""")

        # Collect the description of every constraint the database reports,
        # then drop them one by one by prefixing each with DROP.
        cursor_constraints = neo_context.run("CALL db.constraints")
        descriptions = [row['description'] for row in cursor_constraints.data()]
        for constraint_stmt in descriptions:
            drop_statement = "DROP {}".format(constraint_stmt)
            neo_context.run(drop_statement)

In [13]:
# Sanity check after the purge: print the total number of nodes in the graph.
# The recorded output below shows a count of 0.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH (n) RETURN count(n) as count")
    print(cursor.data())

[{'count': 0}]


### Write nodes and edges to graph

In [14]:
# Spark settings handed to the Neo4j writer below: local master, 1 GiB driver
# result size and 1 GiB executor memory. The chain is wrapped in parentheses
# instead of backslash continuations so that no dangling `\` is left after
# the last call.
spark_config = (
    SparkConfFactory()
    .set_master('local[*]')
    .set_app_name('write neo4j nodes')
    .set("spark.driver.maxResultSize", "1g")
    .set('spark.executor.memory', '1g')
)

In [15]:
from fncore_py3.tasks.spark2neo4j_writer import graph_to_neo4j_cql

# Push the nodes and edges described by the graph specification into Neo4j.
# The recorded output shows one "Pushing ..." message for the nodes and for
# each of four edge batches.
graph_to_neo4j_cql(
    graph_spec,
    spark_config,
    data_format='parquet',
    array_delimiter=';',
    debug_write=True,
)

Pushing nodes...
Pushing...
Pushing edges1...
Pushing...
Pushing edges2...
Pushing...
Pushing edges3...
Pushing...
Pushing edges4...
Pushing...


### Set up constraints and index nodes

In [16]:
from fncore_py3.utils.neo4j_tools import create_index, create_uniqueness_constraint

# Gather every label declared on the node lists of the graph specification,
# plus the '_searchable' label.
# NOTE: according to the original author some labels are still not covered.
global_labels = set()
for node_kind in graph_spec.node_lists:
    global_labels.update(node_kind.labels)
global_labels.add('_searchable')

# Declare `_canonical_id` unique for each collected label.
with get_neo4j_context(neo_config['uri']) as neo_context:
    for label in global_labels:
        create_uniqueness_constraint(neo_context, label, '_canonical_id')
        # NOTE(review): a separate index is presumably redundant once the
        # uniqueness constraint exists -- confirm against the Neo4j docs.
        # create_index(neo_context, label, '_canonical_id')

## Checks

In [17]:
# Open a py2neo connection to the same Neo4j instance.
# NOTE(review): the URI and credentials repeat the GRAPH_DB value set at the
# top of the notebook, and `graph` is only referenced by the commented-out
# visualisation cell further down -- consider reusing the configured URI.
from py2neo import Graph
graph = Graph("bolt://neo4j:test@neo4j:7687", user="neo4j", password="test")

In [18]:
# Print the total number of nodes after the write step.
# The recorded output below shows a count of 45.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH (n) RETURN count(n) as count")
    print(cursor.data())

[{'count': 45}]


In [19]:
# Spot-check the node whose canonical id is "1": both its label set and its
# properties must match the expected values exactly.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """MATCH (n {_canonical_id: "1"}) RETURN n"""
    )

    # First (and expected only) record; 'n' is the returned node.
    node = cursor.data()[0]['n']

    expected_labels = {
        'chocolate', 'toffee', 'sweets', 'is_target',
        '8', '_searchable', 'toffee_groups',
    }
    print(node.labels)
    assert node.labels == expected_labels

    expected_items = {
        ('_canonical_id', '1'),
        ('edge_metadata', 'foo'),
        ('sweetness number', '1'),
    }
    print(node.items())
    assert set(node.items()) == expected_items

frozenset({'_searchable', 'sweets', '8', 'is_target', 'toffee', 'chocolate', 'toffee_groups'})
dict_items([('_canonical_id', '1'), ('edge_metadata', 'foo'), ('sweetness number', '1')])


In [20]:
# Print the total number of relationships, matched with a directed pattern.
# The recorded output below shows a count of 131.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH ()-[r]->() RETURN count(r) as count")
    print(cursor.data())

[{'count': 131}]


In [21]:
# Summarise the edges: number of distinct node ids that appear as an edge
# endpoint, and number of distinct (source, target) pairs.
import pandas as pd

with get_neo4j_context(neo_config['uri']) as neo_context:
    # The pattern has no arrow, so a relationship is matched regardless of
    # its direction.
    cursor = neo_context.run(
        """
        MATCH (s)-[r]-(t)
        RETURN s._canonical_id as source, t._canonical_id as target
        """)

    edge_table = pd.DataFrame(cursor.data())

    endpoint_ids = edge_table["source"].tolist() + edge_table["target"].tolist()
    print(len(set(endpoint_ids)))

    distinct_pairs = edge_table.drop_duplicates()
    print(len(distinct_pairs))
    

44
232


In [22]:
# List the node pairs joined by more than one relationship. The pattern has
# no arrow, so relationships are counted irrespective of direction; in the
# recorded output each such pair appears in both orders.
import pandas as pd

with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """
        MATCH (s)-[r]-(t)
        WITH s, t, count(r) as rel_cnt
        WHERE rel_cnt > 1
        RETURN s._canonical_id as source, t._canonical_id as target
        """)
    
    print(cursor.data())

[{'source': '18', 'target': '3'}, {'source': '7', 'target': '5'}, {'source': '3', 'target': '2'}, {'source': '6', 'target': '9'}, {'source': '19', 'target': '3'}, {'source': '1', 'target': '0'}, {'source': '18', 'target': '28'}, {'source': '6', 'target': '5'}, {'source': '1', 'target': '9'}, {'source': '2', 'target': '1'}, {'source': '14', 'target': '13'}, {'source': '6', 'target': '4'}, {'source': '5', 'target': '7'}, {'source': '3', 'target': '18'}, {'source': '5', 'target': '6'}, {'source': '13', 'target': '14'}, {'source': '9', 'target': '1'}, {'source': '1', 'target': '2'}, {'source': '13', 'target': '29'}, {'source': '28', 'target': '18'}, {'source': '3', 'target': '19'}, {'source': '29', 'target': '13'}, {'source': '13', 'target': '2'}, {'source': '2', 'target': '13'}, {'source': '2', 'target': '3'}, {'source': '4', 'target': '6'}, {'source': '9', 'target': '6'}, {'source': '0', 'target': '1'}]


In [23]:
# Check the multi-edge between nodes 9 and 6 (matched without direction):
# exactly two relationships are expected.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """
        MATCH (s)-[r]-(t)
        WHERE s._canonical_id='9' AND t._canonical_id='6'
        RETURN r
        """)
    d = cursor.data()
    print(d)
    
    assert len(d) == 2
    
    # The two relationships run in opposite directions: the recorded output
    # shows the same two nodes with start and end swapped.

[{'r': <Relationship id=2228 nodes=(<Node id=1224 labels=set() properties={}>, <Node id=1217 labels=set() properties={}>) type='sweets' properties={}>}, {'r': <Relationship id=2252 nodes=(<Node id=1217 labels=set() properties={}>, <Node id=1224 labels=set() properties={}>) type='sweets' properties={}>}]


In [24]:
# Print every relationship between nodes 1 and 0 (matched without
# direction) so their types and properties can be inspected. The recorded
# output shows three relationships, two of which carry properties.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """
        MATCH (s)-[r]-(t)
        WHERE s._canonical_id='1' AND t._canonical_id='0'
        RETURN r
        """)
    d = cursor.data()
    print(d)

[{'r': <Relationship id=2259 nodes=(<Node id=1227 labels=set() properties={}>, <Node id=1048 labels=set() properties={}>) type='toffee_extended' properties={'edge_prop_2': 'prop2'}>}, {'r': <Relationship id=2253 nodes=(<Node id=1048 labels=set() properties={}>, <Node id=1227 labels=set() properties={}>) type='toffee' properties={'edge_prop_friendly': 'ep2'}>}, {'r': <Relationship id=2208 nodes=(<Node id=1048 labels=set() properties={}>, <Node id=1227 labels=set() properties={}>) type='edge_group_type' properties={}>}]


In [25]:
# Count the relationships of type `edge_group_type`.
# The recorded output below shows a count of 9.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """
        MATCH ()-[r :edge_group_type]->()
        RETURN count(r)
        """)
    d = cursor.data()
    print(d)

[{'count(r)': 9}]


In [26]:
# Print the relationship(s) directed from node 8 to node 9. Unlike the
# earlier checks this pattern is directed; the recorded output shows a single
# 'toffee' relationship carrying a 'reverse_order' property.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run(
        """
        MATCH (s)-[r]->(t)
        WHERE s._canonical_id='8' AND t._canonical_id='9'
        RETURN r
        """)
    d = cursor.data()
    print(d)

[{'r': <Relationship id=2263 nodes=(<Node id=1043 labels=set() properties={}>, <Node id=1224 labels=set() properties={}>) type='toffee' properties={'edge_prop_friendly': 'ep2', 'reverse_order': 'reverse'}>}]


## Viz

In [27]:
# ### Connect with py2neo
# from py2neo import Graph
# graph = Graph("bolt://neo4j:test@neo4j:7687", user="neo4j", password="test")

# ### Plot with neo4jupyter
# import neo4jupyter
# neo4jupyter.init_notebook_mode()

# neo4jupyter.draw(graph, {"User": "id"})

In [28]:
# Spark settings for the scratch cell below: local master, 1 GiB driver
# result size and 1 GiB executor memory. The chain is wrapped in parentheses
# instead of backslash continuations so that no dangling `\` is left after
# the last call.
spark_config = (
    SparkConfFactory()
    .set_master('local[*]')
    .set_app_name('write neo4j nodes')
    .set("spark.driver.maxResultSize", "1g")
    .set('spark.executor.memory', '1g')
)

In [29]:
from fncore_py3.utils.spark_tools import get_spark_context
from pyspark import SQLContext
from pyspark.sql.functions import array, col

# Scratch check: build a two-row DataFrame in a fresh Spark context and show
# the rows that pass a combined (AND) column filter.
with get_spark_context(spark_config.create()) as spark_context:
    sql_context = SQLContext(spark_context)

    sample_rows = [(1, 2), (1, 2)]
    df = sql_context.createDataFrame(sample_rows)

    # NOTE(review): both operands test column `_1`; the second one may have
    # been meant to test `_2` -- confirm the intent.
    left_condition = col("_1") > 0
    right_condition = col("_1") > 0
    df.filter(left_condition & right_condition).show()

+---+---+
| _1| _2|
+---+---+
|  1|  2|
|  1|  2|
+---+---+

