# Run fullchain tests

See: finnet-pipeline/docker-tests/fullchain/run_tests.py

## Init Spark

In [1]:
import os

# Select the interpreter PySpark should launch (PYSPARK_PYTHON is the
# environment variable Spark consults for this).
os.environ.update({"PYSPARK_PYTHON": "python2"})

In [2]:
import findspark

# Tell findspark where the Spark installation lives.
findspark.init(spark_home="/usr/local/spark")

In [3]:
from pyspark import SparkContext

### Stop current SC, test assumes no existing SC
# getOrCreate() is called only to obtain a handle that can be stopped; the
# later cells open their own contexts through get_spark_context(...).
sc = SparkContext.getOrCreate()
sc.stop()

## Imports and Env

In [4]:
import os

# Neo4j connection settings: the bolt URI (credentials embedded) plus the SSH
# port/username that a later cell reads back from the environment.
os.environ.update({
    "GRAPH_DB": "bolt://neo4j:test@neo4j:7687",
    "NEO4J_SSH_PORT": "22",
    "NEO4J_SSH_USERNAME": "root",
})

In [5]:
# Pipeline data location and data format settings.
os.environ.update(
    PIPELINE_DATA_PATH="/datasets/finnet",
    PIPELINE_DATA_FORMAT="parquet",
)

In [6]:
import sys
# Put the DAGs directory at the front of the import path before importing
# run_tests, so that directory is searched first.
sys.path.insert(0, "/usr/local/dags")

# Star import: the cells below use names they never import themselves
# (e.g. SparkConf, SparkConfFactory, GraphSpec, LOCAL_DATA_PATH, DATA_PATH,
# DATA_FORMAT, re, json, logging), so they presumably come from run_tests --
# confirm against that module.
from run_tests import *

In [7]:
from pyspark.sql import SparkSession

## Run tests

In [8]:
# Pipeline tasks exercised by this notebook; each later cell checks for its
# own task name in this list before running.
dolist = [
    'build_lists',
    'resolve_entities',
    'neo4j_purger',
    'neo4j_writer',
    'graph_tools',
]

# SSH login for the Neo4j host, with fallbacks when the variables are unset
neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

# Spark configuration used by the verification steps
config = {
    'SparkConfiguration': (
        SparkConf()
        .setMaster('local[*]')
        .setAppName("test create data")
        .set("spark.executor.memory", "1024m")
    ),
}

# Graph specs are the *.json entries of the local data directory
datalist = os.listdir(LOCAL_DATA_PATH)
jsonlist = [entry for entry in datalist if re.match(r'.*\.json$', entry)]

In [9]:
# Only 1 json is expected. Fail with an explicit message when none was found,
# rather than a bare "list index out of range" (same exception type).
if not jsonlist:
    raise IndexError(
        "No graph specification (*.json) found in %s" % LOCAL_DATA_PATH
    )
gspec = jsonlist[0]

In [10]:
# Parse the chosen JSON file into a GraphSpec and keep its dict form as well
with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
    graph_spec = GraphSpec.from_dict(json.load(f))
spec = graph_spec.to_dict()

# Every artefact of this graph lives under <DATA_PATH>/<graph name>/
_graph_dir = os.path.join(DATA_PATH, graph_spec.name)
tables_path = os.path.join(_graph_dir, 'tables')
n_path = os.path.join(_graph_dir, 'node_list')
e_path = os.path.join(_graph_dir, 'edge_list')
n_path_res = os.path.join(_graph_dir, 'node_list_resolved')
e_path_res = os.path.join(_graph_dir, 'edge_list_resolved')

logging.info("Processing " + gspec)

# Neo4j connection taken from the graph specification itself; the retry and
# batch-size values fall back to the defaults unless `config` holds the keys.
neo_config = dict(
    uri=spec['graph_uri'],
    max_retries=config.get('neo4j.max_retries', 5),
    max_batchsize=config.get('neo4j.max_batchsize', 10000),
)

In [11]:
# Build list
if 'build_lists' in dolist:
    logging.info("Building lists...")
    # The node and edge builders take the same arguments apart from the
    # keyword naming their output path, so both are driven from one table.
    for list_builder, output_kwarg in (
            (build_node_lists, {'node_path': n_path}),
            (build_edge_lists, {'edge_path': e_path})):
        list_builder(
            graph_specification=graph_spec,
            # A fresh Spark configuration is built for every call
            spark_config=(SparkConfFactory()
                          .set_master('local[*]')
                          .set_app_name('test create data')
                          .set('spark.executor.memory', '1g')),
            tables_path=tables_path,
            data_format=DATA_FORMAT,
            **output_kwarg
        )

    logging.info("Checking build_lists...")
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        spark_session = SparkSession(spark_ctx)
        sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
        build_ok = test_build_lists(spark_ctx, sql_context, spec)
        assert build_ok

In [12]:
# Resolve entities
if 'resolve_entities' in dolist:
    logging.info("Resolving entities...")

    # Nodes: read the built node list, write the resolved one
    node_conf = (SparkConfFactory()
                 .set_master('local[*]')
                 .set_app_name('test create data')
                 .set('spark.executor.memory', '1g'))
    resolve_node_entities(
        graph_specification=graph_spec,
        spark_config=node_conf,
        entity_maps={},
        input_node_path=n_path,
        output_node_path=n_path_res,
        output_node_id='_canonical_id',
        data_format=DATA_FORMAT
    )

    # Edges: same treatment, with separate id columns for source and target
    edge_conf = (SparkConfFactory()
                 .set_master('local[*]')
                 .set_app_name('test create data')
                 .set('spark.executor.memory', '1g'))
    resolve_edge_entities(
        graph_specification=graph_spec,
        spark_config=edge_conf,
        entity_maps={},
        input_edge_path=e_path,
        output_edge_path=e_path_res,
        output_edge_source_id='_canonical_id_source',
        output_edge_target_id='_canonical_id_target',
        data_format=DATA_FORMAT
    )

In [13]:
# Purging the graph
if 'neo4j_purger' in dolist:
    logging.info("Purging Neo4j...")
    purge_login = {'username': neo4j_ssh_username, 'port': neo4j_ssh_port}
    neo4j_manager.purge(graph_spec, **purge_login)

    # Verify the purge against the database named in the graph spec
    logging.info("Checking purging neo4j...")
    with get_neo4j_context(neo_config['uri']) as neo_context:
        purge_ok = test_neo4j_purger(neo_context)
        assert purge_ok

  self.ecdsa_curve.curve_class(), pointinfo
  m.add_string(self.Q_C.public_numbers().encode_point())
  self.curve, Q_S_bytes
  hm.add_string(self.Q_C.public_numbers().encode_point())
  algorithm=hashes.SHA1(),


In [14]:
# Graph writer
if 'neo4j_writer' in dolist:
    logging.info("Writing to Neo4j...")

    # SSH login passed to graph_to_neo4j and every neo4j_manager call below
    ssh_login = {'username': neo4j_ssh_username, 'port': neo4j_ssh_port}

    graph_conf = (SparkConfFactory()
                  .set_master('local[*]')
                  .set_app_name('write neo4j nodes')
                  .set("spark.driver.maxResultSize", "1g")
                  .set('spark.executor.memory', '1g'))
    graph_to_neo4j.graph_to_neo4j(
        graph_specification=graph_spec,
        spark_config=graph_conf,
        input_node_path=n_path_res,
        input_edge_path=e_path_res,
        **ssh_login
    )

    # NOTE(review): this call is handed the dict form (`spec`), whereas the
    # surrounding calls take the GraphSpec object -- confirm that is intended.
    nodes_conf = (SparkConfFactory()
                  .set_master('local[*]')
                  .set_app_name('write neo4j nodes')
                  .set('spark.executor.memory', '1g'))
    neo4j_writer.write_neo4j_nodes(graph_specification=spec,
                                   spark_config=nodes_conf)

    # Backup -> purge -> restore, all keyed on the same timestamp
    datetime_now = datetime.now()
    logging.info("Backing up db, then purge it...")
    neo4j_manager.backup(graph_spec, datetime_now, **ssh_login)
    neo4j_manager.purge(graph_spec, **ssh_login)
    logging.info("Restoring the backup to db...")
    neo4j_manager.restore(graph_spec, datetime_now, **ssh_login)

    logging.info("Checking write neo4j...")
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        spark_session = SparkSession(spark_ctx)
        sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
        with get_neo4j_context(neo_config['uri']) as neo_context:
            writer_ok = test_neo4j_writer(
                spark_ctx, sql_context, neo_context, spec
            )
            assert writer_ok

In [15]:
if 'graph_tools' in dolist:
    # Resolved node/edge list locations under PIPELINE_DATA_PATH
    data_path = os.environ['PIPELINE_DATA_PATH']
    graph_name = graph_spec.name
    node_path_resolved = os.path.join(data_path, graph_name, 'node_list_resolved')
    edge_path_resolved = os.path.join(data_path, graph_name, 'edge_list_resolved')

    # Test graph_construction_coi.get_graph_dataframes
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        spark_session = SparkSession(spark_ctx)
        sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
        graph = get_graph_dataframes(graph_spec, sql_context,
                                     node_path_resolved, edge_path_resolved,
                                     DATA_FORMAT)

        assert 'node_list' in graph
        assert 'edge_list' in graph
        # Each section must hold exactly one entry per list in the spec,
        # stored under that list's safe_name.
        for section, spec_lists in (('node_list', graph_spec.node_lists),
                                    ('edge_list', graph_spec.edge_lists)):
            loaded = graph[section]
            assert len(loaded) == len(spec_lists)
            for spec_list in spec_lists:
                assert spec_list.safe_name in loaded

    # Test graph_construction_coi.data_loading
    with get_spark_context(config['SparkConfiguration']) as spark_ctx:
        spark_session = SparkSession(spark_ctx)
        sql_context = SQLContext(spark_ctx, sparkSession=spark_session)
        tables = load_node_edge_lists(sql_context, graph_spec,
                                      node_path_resolved, edge_path_resolved,
                                      DATA_FORMAT)
        # Edge tables are looked up by (table name, source column, target column)
        for cur_edge_list in graph_spec.edge_lists:
            edge_key = (cur_edge_list.safe_table_name,
                        cur_edge_list.source_column.safe_name,
                        cur_edge_list.target_column.safe_name)
            assert edge_key in tables
        expected_table_count = (len(graph_spec.node_lists)
                                + len(graph_spec.edge_lists))
        assert len(tables) == expected_table_count

## Viz

In [18]:
### Connect with py2neo
from py2neo import Graph
# Same bolt URI that an earlier cell stored in GRAPH_DB. Note that this
# rebinds `graph`, which the graph_tools cell used for the result of
# get_graph_dataframes(...).
graph = Graph("bolt://neo4j:test@neo4j:7687", user="neo4j", password="test")

In [19]:
### Plot with neo4jupyter
import neo4jupyter
# Presumably loads the notebook-side JavaScript that draw() relies on (the
# cell output is a Javascript display object) -- confirm in neo4jupyter docs.
neo4jupyter.init_notebook_mode()

# NOTE(review): the dict presumably maps a node label ("User") to the
# property used as its caption ("id") -- confirm against neo4jupyter docs.
neo4jupyter.draw(graph, {"User": "id"})

<IPython.core.display.Javascript object>