# Neo4J APOC test

A notebook to test the implementation of the Neo4j APOC library.

In [1]:
# Enable these lines if live changes are being made to the codebase
%load_ext autoreload
%autoreload 2

In [2]:
# Specific instruction to run the notebooks from a sub-folder.
# Appends the parent directory to the module search path (presumably where the
# `bugfinder` package lives -- confirm against the repository layout).
# NOTE: ".." is resolved against the kernel's current working directory, so the
# kernel must be started from the notebook's own folder for this to work.
import sys
sys.path.append("..")

In [3]:
import logging
from bugfinder.settings import LOGGER
from bugfinder.dataset import CWEClassificationDataset as Dataset
from bugfinder.dataset.processing.dataset_ops import CopyDataset, RightFixer
from bugfinder.features.any_hop.all_flows import FeatureExtractor as AnyHopAllFlowsExtractor
from bugfinder.features.any_hop.single_flow import FeatureExtractor as AnyHopSingleFlowExtractor
from bugfinder.features.single_hop.raw import FeatureExtractor as SingleHopRawExtractor
from bugfinder.features.pca import FeatureExtractor as PCA

In [4]:
# Set the logger threshold to INFO: messages at INFO level and above are
# emitted, lower-severity (DEBUG) messages are dropped. This assumes LOGGER is
# a standard `logging.Logger` -- confirm in bugfinder.settings.
LOGGER.setLevel(logging.INFO)

In [5]:
# Dataset directories (DO NOT EDIT)
# One list per "v__N" group; every list holds the directories whose version
# suffix ends in N. The v3xx directories and the whole v__4 group are
# currently disabled and kept below as comments only.
cwe121_v__0_dataset_path = [
    "../data/cwe121_v110",
    "../data/cwe121_v120",
    "../data/cwe121_v210",
    "../data/cwe121_v220",
    # disabled: "../data/cwe121_v310", "../data/cwe121_v320"
]

cwe121_v__1_dataset_path = [
    "../data/cwe121_v111",
    "../data/cwe121_v121",
    "../data/cwe121_v211",
    "../data/cwe121_v221",
    # disabled: "../data/cwe121_v311", "../data/cwe121_v321"
]

cwe121_v__2_dataset_path = [
    "../data/cwe121_v112",
    "../data/cwe121_v122",
    "../data/cwe121_v212",
    "../data/cwe121_v222",
    # disabled: "../data/cwe121_v312", "../data/cwe121_v322"
]

cwe121_v__3_dataset_path = [
    "../data/cwe121_v113",
    "../data/cwe121_v123",
    "../data/cwe121_v213",
    "../data/cwe121_v223",
    # disabled: "../data/cwe121_v313", "../data/cwe121_v323"
]

# Disabled group (not defined at runtime):
# cwe121_v__4_dataset_path = [
#     "../data/cwe121_v114", "../data/cwe121_v124", "../data/cwe121_v214", "../data/cwe121_v224",
#     "../data/cwe121_v314", "../data/cwe121_v324"
# ]

## Next section


In [6]:
from py2neo import Graph
from bugfinder.utils.statistics import get_time

# Connection settings for the Neo4j database queried by the cells below.
# NOTE(review): scheme, host and port are hard-coded; update them to match the
# database instance that is actually running before executing the notebook.
neo4j_connection = {"scheme": "http", "host": "0.0.0.0", "port": 55487}
neo4j_db = Graph(**neo4j_connection)

In [7]:
# Cypher query returning the internal id of every node whose `ast` property is
# "CFGEntryNode"; these ids are the starting points for the sub-graph expansion.
entrypoints_cmd = """
    MATCH (n)
    WHERE n.ast = "CFGEntryNode"
    RETURN id(n) as id_n
"""
start_time = get_time()

# Collect the `id_n` column of every returned row.
entrypoints_list = []
for record in neo4j_db.run(entrypoints_cmd).data():
    entrypoints_list.append(record["id_n"])

# The message labels the difference as milliseconds -- presumably get_time()
# returns a millisecond timestamp; confirm in bugfinder.utils.statistics.
elapsed = get_time() - start_time
print("Found %d entrypoints in %dms" % (len(entrypoints_list), elapsed))

Found 2035 entrypoints in 816ms


In [11]:
# Cypher query expanding the sub-graph connected to one entrypoint node through
# FLOWS_TO, REACHES and CONTROLS relationships (APOC `subgraphAll`).
# The node id is supplied as the query parameter `$entrypoint_id` instead of
# being formatted into the query text, so the statement is identical for every
# entrypoint and no value is ever spliced into the Cypher source.
exec_chain_cmd = """
    MATCH (n) WHERE id(n)=$entrypoint_id
    CALL apoc.path.subgraphAll(n, {relationshipFilter: "FLOWS_TO|REACHES|CONTROLS"})
    YIELD nodes, relationships
    RETURN nodes, relationships
"""

start_time = get_time()

# One query per entrypoint. Each element of `exec_chains` is the list of result
# rows for that entrypoint, every row exposing the "nodes" and "relationships"
# columns returned by the query.
exec_chains = [
    neo4j_db.run(exec_chain_cmd, {"entrypoint_id": entrypoint_id}).data()
    for entrypoint_id in entrypoints_list
]

print("Spent %dms" % (get_time() - start_time))

Spent 20950ms


In [12]:
start_time = get_time()

# Reduce every query result to plain Python data:
#   "nodes": mapping of node identity -> value of the node's `ast` property
#   "rels":  list of (start node identity, relationship type name, end node identity)
# Only the first result row of each query is used.
final_chains = []

for chain in exec_chains:
    subgraph = chain[0]
    final_chains.append({
        "nodes": {
            node.identity: node.get("ast")
            for node in subgraph["nodes"]
        },
        "rels": [
            (rel.start_node.identity, type(rel).__name__, rel.end_node.identity)
            for rel in subgraph["relationships"]
        ],
    })

print("Spent %dms" % (get_time() - start_time))

Spent 182ms


In [9]:
import sys

def get_size(obj, seen=None):
    """Return the summed ``sys.getsizeof`` of ``obj`` and everything it references.

    The object graph is walked with an explicit stack rather than recursion, so
    arbitrarily deep structures do not hit the interpreter's recursion limit.

    Args:
        obj: Root object to measure.
        seen: Optional set of ``id()`` values of objects that were already
            counted. Objects whose id is in the set contribute 0. The set is
            updated in place with every object visited by this call.

    Returns:
        Total size in bytes of all objects visited that were not in ``seen``.

    Traversal rules, applied per object:
        * dict: its values and keys are visited;
        * otherwise, an object exposing ``__dict__``: that ``__dict__`` is visited;
        * otherwise, any iterable other than str/bytes/bytearray: its items are
          visited (note that this consumes one-shot iterators such as generators).
    """
    if seen is None:
        seen = set()

    total = 0
    pending = [obj]
    while pending:
        current = pending.pop()
        current_id = id(current)
        if current_id in seen:
            continue
        # Mark as seen *before* expanding children so that self-referential
        # objects are counted exactly once.
        seen.add(current_id)
        total += sys.getsizeof(current)

        if isinstance(current, dict):
            pending.extend(current.values())
            pending.extend(current.keys())
        elif hasattr(current, '__dict__'):
            pending.append(current.__dict__)
        elif hasattr(current, '__iter__') and not isinstance(current, (str, bytes, bytearray)):
            pending.extend(current)
    return total


# Compare the footprint, in bytes as summed by get_size(), of the raw query
# results (exec_chains) against the reduced plain-Python chains (final_chains).
print(get_size(exec_chains))
print(get_size(final_chains))

47046319
13051308
