# Run fullchain tests

See: finnet-pipeline/docker-tests/fullchain/run_tests.py

## Init Spark

In [1]:
import os
# PYSPARK_PYTHON selects the Python executable PySpark uses; pin it to Python 2
# before any Spark context is created.
os.environ["PYSPARK_PYTHON"] = "python2"

In [2]:
import findspark
# Point findspark at the local Spark installation so `pyspark` can be imported below.
findspark.init("/usr/local/spark")

In [3]:
from pyspark import SparkContext

### Stop current SC, test assumes no existing SC
# getOrCreate() hands back the running context (creating one if none exists),
# which is then stopped so the cells below can open their own contexts.
sc = SparkContext.getOrCreate()
sc.stop()

## Imports and Env

In [4]:
import os

# Connection settings for the neo4j test instance, applied in a single call.
os.environ.update({
    "GRAPH_DB": "bolt://neo4j:test@neo4j:7687",
    "NEO4J_SSH_PORT": "22",
    "NEO4J_SSH_USERNAME": "root",
})

In [5]:
# Location and storage format of the pipeline's datasets.
os.environ.update({
    'PIPELINE_DATA_PATH': "/datasets/finnet",
    'PIPELINE_DATA_FORMAT': "parquet",
})

In [6]:
import sys
# Put the DAG directory first on the import path so `run_tests` can be imported.
sys.path.insert(0, "/usr/local/dags")

# NOTE(review): names used later without an explicit import in this notebook
# (SparkConf, SparkConfFactory, GraphSpec, re, json, logging, LOCAL_DATA_PATH,
# DATA_PATH, DATA_FORMAT, the test_* helpers, ...) presumably arrive through
# this star import — confirm against run_tests.py.
from run_tests import *

In [7]:
from pyspark.sql import SparkSession

## Run tests

In [8]:
# Pipeline tasks exercised by this notebook
dolist = [
    'build_lists',
    'resolve_entities',
    'neo4j_purger',
    'neo4j_writer',
    'graph_tools',
]

# SSH user and port for the neo4j host (defaults apply when the env vars are unset)
neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

# Spark configuration used by the verification cells
config = {
    'SparkConfiguration': (
        SparkConf()
        .setMaster('local[*]')
        .setAppName("test create data")
        .set("spark.executor.memory", "1024m")
    ),
}

# Graph specifications are the *.json files found in the local data directory
datalist = os.listdir(LOCAL_DATA_PATH)
jsonlist = [k for k in datalist if re.match(r'.*\.json$', k)]

In [9]:
# Only 1 json
# (the first match is used; an empty jsonlist raises IndexError here)
gspec = jsonlist[0]

### Load Graph Spec

In [10]:
# Parse the selected specification file; keep both the object and its dict form
with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
    graph_spec = GraphSpec.from_dict(json.load(f))
spec = graph_spec.to_dict()

# All datasets of this graph live under <DATA_PATH>/<graph name>/
_graph_root = os.path.join(DATA_PATH, graph_spec.name)
tables_path = os.path.join(_graph_root, 'tables')
n_path = os.path.join(_graph_root, 'node_list')
e_path = os.path.join(_graph_root, 'edge_list')
n_path_res = os.path.join(_graph_root, 'node_list_resolved')
e_path_res = os.path.join(_graph_root, 'edge_list_resolved')

logging.info("Processing " + gspec)

# neo4j connection taken from the graph specification itself
neo_config = dict(
    uri=spec['graph_uri'],
    max_retries=config.get('neo4j.max_retries', 5),
    max_batchsize=config.get('neo4j.max_batchsize', 10000),
)

In [11]:
# Display the connection, poll frequency and table/column pairs of the graph spec
graph_spec.table_details

{'connection': u'data_uri_value',
 'poll_frequency': '0 2 * * *',
 'tables': {(u'test_data_chocolate_edge_list',
   u'fn_test_data_chocolate_edge_list'): {(u'chocolate_s', u'fn_chocolate_s'),
   (u'chocolate_t', u'fn_chocolate_t')},
  (u'test_data_chocolate_node_list',
   u'fn_test_data_chocolate_node_list'): {(u'id', u'fn_id')},
  (u'test_data_sweets_edge_list',
   u'fn_test_data_sweets_edge_list'): {(u'sweets_s', u'fn_sweets_s'),
   (u'sweets_t', u'fn_sweets_t')},
  (u'test_data_sweets_node_list',
   u'fn_test_data_sweets_node_list'): {(u'id', u'fn_id'), (u'prop',
    u'fn_prop')},
  (u'test_data_toffee_edge_list',
   u'fn_test_data_toffee_edge_list'): {(u'toffee_s',
    u'fn_toffee_s'), (u'toffee_t', u'fn_toffee_t')},
  (u'test_data_toffee_node_list',
   u'fn_test_data_toffee_node_list'): {(u'hide', u'fn_hide'), (u'id',
    u'fn_id'), (u'prop', u'fn_prop')}}}

### Build Lists

In [12]:
# Build the node and edge lists from the source tables, then verify them
if 'build_lists' in dolist:
    logging.info("Building lists...")

    node_list_conf = (
        SparkConfFactory()
        .set_master('local[*]')
        .set_app_name('test create data')
        .set('spark.executor.memory', '1g')
    )
    build_node_lists(
        graph_specification=graph_spec,
        spark_config=node_list_conf,
        tables_path=tables_path,
        node_path=n_path,
        data_format=DATA_FORMAT,
    )

    edge_list_conf = (
        SparkConfFactory()
        .set_master('local[*]')
        .set_app_name('test create data')
        .set('spark.executor.memory', '1g')
    )
    build_edge_lists(
        graph_specification=graph_spec,
        spark_config=edge_list_conf,
        tables_path=tables_path,
        edge_path=e_path,
        data_format=DATA_FORMAT,
    )

    logging.info("Checking build_lists...")
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        spark_session = SparkSession(spark_ctx)
        sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
        assert test_build_lists(spark_ctx, sql_context, spec)

In [13]:
# build_lists reads only the needed columns and writes to HDFS, one file per
# node kind and per edge kind; display the chocolate node list it wrote.
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark_session = SparkSession(spark_ctx)
    sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
    chocolate_nodes = sql_context.read.parquet(
        "/datasets/finnet/test_data/node_list/fn_chocolate_nodes"
    )
    chocolate_nodes.show()

+-------------+
|        fn_id|
+-------------+
|            0|
|            1|
|            2|
|            3|
|            4|
|            5|
|            6|
|            7|
|            8|
|            9|
|     test ΄id|
|           11|
|           12|
|           13|
|           14|
|illegal ΁code|
|           16|
|           17|
|           18|
|           19|
+-------------+
only showing top 20 rows



### Resolve Entities

In [14]:
# Entity resolution for the node lists and then the edge lists
if 'resolve_entities' in dolist:
    logging.info("Resolving entities...")

    node_resolve_conf = (
        SparkConfFactory()
        .set_master('local[*]')
        .set_app_name('test create data')
        .set('spark.executor.memory', '1g')
    )
    resolve_node_entities(
        graph_specification=graph_spec,
        spark_config=node_resolve_conf,
        entity_maps={},
        input_node_path=n_path,
        output_node_path=n_path_res,
        output_node_id='_canonical_id',
        data_format=DATA_FORMAT,
    )

    edge_resolve_conf = (
        SparkConfFactory()
        .set_master('local[*]')
        .set_app_name('test create data')
        .set('spark.executor.memory', '1g')
    )
    resolve_edge_entities(
        graph_specification=graph_spec,
        spark_config=edge_resolve_conf,
        entity_maps={},
        input_edge_path=e_path,
        output_edge_path=e_path_res,
        output_edge_source_id='_canonical_id_source',
        output_edge_target_id='_canonical_id_target',
        data_format=DATA_FORMAT,
    )

In [15]:
# resolve_entities produces the _canonical_id columns and writes to HDFS, one
# file per node kind and per edge kind; display the resolved chocolate nodes.
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark_session = SparkSession(spark_ctx)
    sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
    resolved_nodes = sql_context.read.parquet(
        "/datasets/finnet/test_data/node_list_resolved/fn_chocolate_nodes"
    )
    resolved_nodes.show()

+-------------+-------------+
|        fn_id|_canonical_id|
+-------------+-------------+
|illegal ΁code|illegal ΁code|
|     test ΄id|     test ΄id|
|           11|           11|
|           29|           29|
|           30|           30|
|           34|           34|
|           31|           31|
|           18|           18|
|           27|           27|
|           17|           17|
|           26|           26|
|           19|           19|
|           23|           23|
|           38|           38|
|           25|           25|
|           33|           33|
|           32|           32|
|           20|           20|
|           36|           36|
|           37|           37|
+-------------+-------------+
only showing top 20 rows



In [16]:
# Same resolve_entities output as above, this time for an edge list: display
# the resolved chocolate relations with their canonical source/target ids.
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark_session = SparkSession(spark_ctx)
    sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
    resolved_edges = sql_context.read.parquet(
        "/datasets/finnet/test_data/edge_list_resolved/fn_chocolate_relations"
    )
    resolved_edges.show()

+--------------+--------------+--------------------+--------------------+
|fn_chocolate_s|fn_chocolate_t|_canonical_id_source|_canonical_id_target|
+--------------+--------------+--------------------+--------------------+
|             8| illegal ΁code|                   8|       illegal ΁code|
|            17| illegal ΁code|                  17|       illegal ΁code|
|            34|            14|                  34|                  14|
|             2|            14|                   2|                  14|
|            30|            21|                  30|                  21|
|            14|            21|                  14|                  21|
| illegal ΁code|            21|       illegal ΁code|                  21|
|      test ΄id|            26|            test ΄id|                  26|
|            28|            26|                  28|                  26|
|            32|            26|                  32|                  26|
|      test ΄id|            25|       

In [17]:
# What does EntityMapper do???
# Exploratory cell: build an EntityMapper from an empty mapping and look at
# its private attributes (_from, _to, _map) to see what it holds.
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    from fncore.tasks.resolve_entities import EntityMapper
    
    entityMapper = EntityMapper(spark_ctx, {})
    # Think these are just placeholder column names
    print(entityMapper._from)
    print(entityMapper._to)
    # With an empty mapping the table printed here has no rows
    entityMapper._map.show()

from-22a129b0-d1e6-4355-9698-55666d5c1892
to-10791e2a-bbb7-4d3b-8057-3e4070131f2d
+-----------------------------------------+---------------------------------------+
|from-22a129b0-d1e6-4355-9698-55666d5c1892|to-10791e2a-bbb7-4d3b-8057-3e4070131f2d|
+-----------------------------------------+---------------------------------------+
+-----------------------------------------+---------------------------------------+



In [18]:
# EntityMapper appears to do nothing here: because the `entity_maps` argument is empty,
# _canonical_id simply takes its value from the index column.

### Purge existing Graph

In [19]:
# Purging the graph
# Runs only when 'neo4j_purger' is enabled in dolist: purge via neo4j_manager
# using the SSH username/port read earlier, then assert the purge check passes.
if 'neo4j_purger' in dolist:
    logging.info("Purging Neo4j...")
    neo4j_manager.purge(graph_spec,
                        username=neo4j_ssh_username,
                        port=neo4j_ssh_port)
    logging.info("Checking purging neo4j...")
    with get_neo4j_context(neo_config['uri']) as neo_context:
        assert test_neo4j_purger(neo_context)

  m.add_string(self.Q_C.public_numbers().encode_point())
  self.curve, Q_S_bytes
  hm.add_string(self.Q_C.public_numbers().encode_point())
  algorithm=hashes.SHA1(),


In [20]:
# Check nodes in graph
# Counts every node in the database; the purge above should leave a count of 0.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH (n) RETURN count(n) as count")
    print(cursor.data())

[{u'count': 0}]


### Write nodes and edges to graph

In [21]:
# Write the resolved node and edge lists into Neo4j
if 'neo4j_writer' in dolist:
    logging.info("Writing to Neo4j...")

    graph_writer_conf = (
        SparkConfFactory()
        .set_master('local[*]')
        .set_app_name('write neo4j nodes')
        .set("spark.driver.maxResultSize", "1g")
        .set('spark.executor.memory', '1g')
    )
    graph_to_neo4j.graph_to_neo4j(
        graph_specification=graph_spec,
        spark_config=graph_writer_conf,
        input_node_path=n_path_res,
        input_edge_path=e_path_res,
        username=neo4j_ssh_username,
        port=neo4j_ssh_port,
        debug_write=True,
        verbose=True,
    )
    

Wrote temp nodes .csv to /tmp/tmp_djZSj/tmpc6XnOw.csv
Wrote temp edges .csv to /tmp/tmp_djZSj/tmpJ1eGlK.csv
Wrote temp edges .csv to /tmp/tmp_djZSj/tmpT59rGp.csv
Wrote temp edges .csv to /tmp/tmp_djZSj/tmp8Pjvad.csv


In [22]:
# Load the combined node table into pandas for inspection
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark_session = SparkSession(spark_ctx)
    sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
    combined_nodes_df = sql_context.read.parquet(
        "/datasets/finnet/test_data/node_list_resolved/combined_nodes"
    )
    pdf_combinednodes = combined_nodes_df.toPandas()

In [23]:
# Display the node list definitions from the graph specification
graph_spec.to_dict()["node_lists"]

[{u'index_column': {u'hidden': False,
   u'name': u'id',
   u'resolution_alias': u'chocolate',
   u'safe_name': u'fn_id',
   u'use_as_label': False,
   u'variable_definition': u'String'},
  u'metadata_columns': [],
  u'name': u'chocolate nodes',
  u'safe_name': u'fn_chocolate_nodes',
  u'safe_table_name': u'fn_test_data_chocolate_node_list',
  u'table_name': u'test_data_chocolate_node_list',
  u'tags': [u'chocolate']},
 {u'index_column': {u'hidden': False,
   u'name': u'id',
   u'resolution_alias': u'sweets',
   u'safe_name': u'fn_id',
   u'use_as_label': False,
   u'variable_definition': u'String'},
  u'metadata_columns': [{u'friendly_name': u'sweetness number',
    u'hidden': False,
    u'name': u'prop',
    u'safe_name': u'fn_prop',
    u'use_as_label': True,
    u'variable_definition': u'String'}],
  u'name': u'sweets nodes',
  u'safe_name': u'fn_sweets_nodes',
  u'safe_table_name': u'fn_test_data_sweets_node_list',
  u'table_name': u'test_data_sweets_node_list',
  u'tags': [u'swee

In [24]:
# Author's notes on how the combined node table shown below comes about:
# - node_kind.index_column.safe_name becomes the `_canonical_id:ID` column.
# - chocolate nodes specify no label column, so their label defaults to none,
#   but ids from the chocolate nodes also appear in other node lists.
# - Rows are grouped by the same `_canonical_id:ID`, collecting all `_label`
#   and `:LABEL` values ("line 230" of the writer code — TODO confirm which file).
# - sweets nodes use fn_prop as their `_label` too.

pdf_combinednodes

Unnamed: 0,_canonical_id:ID,_label,:LABEL
0,3,3;6,_searchable;sweets;chocolate;toffee
1,8,1;8,_searchable;sweets;chocolate;toffee
2,1,1;8,_searchable;sweets;chocolate;toffee
3,2,2;7,_searchable;sweets;chocolate;toffee
4,4,5;4,_searchable;sweets;chocolate;toffee
5,9,9;0,_searchable;sweets;chocolate;toffee
6,7,2;7,_searchable;sweets;chocolate;toffee
7,0,9;0,_searchable;sweets;chocolate;toffee
8,6,3;6,_searchable;sweets;chocolate;toffee
9,5,5;4,_searchable;sweets;chocolate;toffee


In [25]:
# Check nodes in graph
# Prints every node now in the database, one result record per line.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH (n) RETURN n")
    for n in cursor.data():
        print(n)

{u'n': <Node id=0 labels=set([u'_searchable', u'sweets', u'chocolate']) properties={u'_label': u'16', u'_canonical_id': u'16'}>}
{u'n': <Node id=1 labels=set([u'_searchable', u'chocolate']) properties={u'_canonical_id': u'35'}>}
{u'n': <Node id=2 labels=set([u'_searchable', u'chocolate']) properties={u'_canonical_id': u'20'}>}
{u'n': <Node id=3 labels=set([u'_searchable', u'chocolate']) properties={u'_canonical_id': u'36'}>}
{u'n': <Node id=4 labels=set([u'_searchable', u'sweets', u'chocolate']) properties={u'_label': u'12', u'_canonical_id': u'12'}>}
{u'n': <Node id=5 labels=set([u'_searchable', u'chocolate']) properties={u'_canonical_id': u'illegal \u0381code'}>}
{u'n': <Node id=6 labels=set([u'_searchable', u'sweets', u'toffee', u'chocolate']) properties={u'_label': u'3;6', u'_canonical_id': u'3'}>}
{u'n': <Node id=7 labels=set([u'_searchable', u'sweets', u'toffee', u'chocolate']) properties={u'_label': u'1;8', u'_canonical_id': u'8'}>}
{u'n': <Node id=8 labels=set([u'_searchable', 

In [26]:
# Load the transformed chocolate edge table into pandas for inspection
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark_session = SparkSession(spark_ctx)
    sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
    transformed_edges_df = sql_context.read.parquet(
        "/datasets/finnet/test_data/edge_list_resolved/transformed_edges_fn_chocolate_relations"
    )
    pdf_edges = transformed_edges_df.toPandas()

In [27]:
# Preview the first 10 transformed chocolate edges
pdf_edges.head(10)

Unnamed: 0,chocolate_s,chocolate_t,_canonical_id_source,_canonical_id_target,:START_ID,:END_ID,:TYPE
0,17,illegal ΁code,17,illegal ΁code,17,illegal ΁code,chocolate
1,test ΄id,26,test ΄id,26,test ΄id,26,chocolate
2,test ΄id,25,test ΄id,25,test ΄id,25,chocolate
3,14,9,14,9,14,9,chocolate
4,17,19,17,19,17,19,chocolate
5,18,28,18,28,18,28,chocolate
6,illegal ΁code,21,illegal ΁code,21,illegal ΁code,21,chocolate
7,17,8,17,8,17,8,chocolate
8,30,21,30,21,30,21,chocolate
9,8,illegal ΁code,8,illegal ΁code,8,illegal ΁code,chocolate


## Viz

In [28]:
### Connect with py2neo
from py2neo import Graph
# NOTE(review): this URI repeats the literal assigned to the GRAPH_DB environment
# variable earlier in the notebook, and credentials are given both inside the URI
# and as keyword arguments — confirm which ones py2neo actually uses.
graph = Graph("bolt://neo4j:test@neo4j:7687", user="neo4j", password="test")

In [29]:
### Plot with neo4jupyter
import neo4jupyter
neo4jupyter.init_notebook_mode()

# The options dict maps a node label to the property displayed as that node's
# caption. The nodes written by this notebook carry the labels `_searchable`,
# `chocolate`, `sweets` and `toffee`, and each has a `_canonical_id` property,
# so map every label to it; a label missing from the dict gets no caption.
neo4jupyter.draw(graph, {
    "_searchable": "_canonical_id",
    "chocolate": "_canonical_id",
    "sweets": "_canonical_id",
    "toffee": "_canonical_id",
})

<IPython.core.display.Javascript object>

In [30]:
# Second part?
# Runs only when 'neo4j_writer' is enabled. Steps, in order:
#   1. write the remaining node properties,
#   2. back up the database, purge it, and restore the backup,
#   3. assert that the writer check passes against the restored data.
if 'neo4j_writer' in dolist:
    # This part inserts the remainder of node properties that were not captured above
    # NOTE(review): unlike the other task calls in this notebook, this one is
    # given the dict form (`spec`) rather than `graph_spec` — confirm intended.
    neo4j_writer.write_neo4j_nodes(graph_specification=spec,
                                   spark_config=SparkConfFactory()
                                   .set_master('local[*]')
                                   .set_app_name('write neo4j nodes')
                                   .set('spark.executor.memory',
                                        '1g')
                                   )

    # The same timestamp is passed to backup and restore below
    datetime_now = datetime.now()
    logging.info("Backing up db, then purge it...")
    neo4j_manager.backup(graph_spec, datetime_now,
                         username=neo4j_ssh_username,
                         port=neo4j_ssh_port)
    neo4j_manager.purge(graph_spec,
                        username=neo4j_ssh_username,
                        port=neo4j_ssh_port)
    logging.info("Restoring the backup to db...")
    neo4j_manager.restore(graph_spec,
                          datetime_now,
                          username=neo4j_ssh_username,
                          port=neo4j_ssh_port)

    logging.info("Checking write neo4j...")
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
        with get_neo4j_context(neo_config['uri']) as neo_context:
            assert test_neo4j_writer(
                spark_ctx, sql_context, neo_context, spec
            )

In [31]:
# Check nodes in graph
# Prints every node after the property write and backup/restore cycle above.
with get_neo4j_context(neo_config['uri']) as neo_context:
    cursor = neo_context.run("MATCH (n) RETURN n")
    for n in cursor.data():
        print(n)

{u'n': <Node id=0 labels=set([u'_searchable', u'sweets', u'chocolate']) properties={u'sweetness number': [u'16'], u'fn_id': u'16', u'_label': u'16', u'_canonical_id': u'16', u'_node_id': u'16'}>}
{u'n': <Node id=1 labels=set([u'_searchable', u'chocolate']) properties={u'fn_id': u'35', u'_canonical_id': u'35', u'_node_id': u'35'}>}
{u'n': <Node id=2 labels=set([u'_searchable', u'chocolate']) properties={u'fn_id': u'20', u'_canonical_id': u'20', u'_node_id': u'20'}>}
{u'n': <Node id=3 labels=set([u'_searchable', u'chocolate']) properties={u'fn_id': u'36', u'_canonical_id': u'36', u'_node_id': u'36'}>}
{u'n': <Node id=4 labels=set([u'_searchable', u'sweets', u'chocolate']) properties={u'sweetness number': [u'12'], u'fn_id': u'12', u'_label': u'12', u'_canonical_id': u'12', u'_node_id': u'12'}>}
{u'n': <Node id=5 labels=set([u'_searchable', u'chocolate']) properties={u'fn_id': u'illegal \u0381code', u'_canonical_id': u'illegal \u0381code', u'_node_id': u'ILLEGAL \u0381CODE'}>}
{u'n': <Node

In [None]:
# if 'graph_tools' in dolist:
#     # Test graph_construction_coi.get_graph_dataframes
#     data_path = os.environ['PIPELINE_DATA_PATH']
#     graph_name = graph_spec.name
#     node_path_resolved = os.path.join(data_path, graph_name, 'node_list_resolved')
#     edge_path_resolved = os.path.join(data_path, graph_name, 'edge_list_resolved')
#     with get_spark_context(config['SparkConfiguration']) as spark_ctx:
#         sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
#         graph = get_graph_dataframes(graph_spec, sql_context,
#                                      node_path_resolved, edge_path_resolved,
#                                      DATA_FORMAT)

#         assert 'node_list' in graph
#         assert 'edge_list' in graph
#         assert len(graph['node_list']) == len(graph_spec.node_lists)
#         for cur_node_list in graph_spec.node_lists:
#             assert cur_node_list.safe_name in graph['node_list']
#         assert len(graph['edge_list']) == len(graph_spec.edge_lists)
#         for cur_edge_list in graph_spec.edge_lists:
#             assert cur_edge_list.safe_name in graph['edge_list']

#     # Test graph_construction_coi.data_loading
#     with get_spark_context(config['SparkConfiguration']) as spark_ctx:
#         sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
#         tables = load_node_edge_lists(sql_context, graph_spec,
#                                       node_path_resolved, edge_path_resolved,
#                                       DATA_FORMAT)
#         for cur_edge_list in graph_spec.edge_lists:
#             assert (cur_edge_list.safe_table_name,
#                     cur_edge_list.source_column.safe_name,
#                     cur_edge_list.target_column.safe_name) in tables
#         assert len(tables) == len(graph_spec.node_lists) + len(graph_spec.edge_lists)