In [1]:
# Enable these lines if live changes are made to the codebase
%load_ext autoreload
%autoreload 2

In [2]:
# Specific instruction to run the notebooks from a sub-folder.
import sys
sys.path.append("..")

In [3]:
import logging
from bugfinder.settings import LOGGER
from bugfinder.dataset import CWEClassificationDataset as Dataset
from bugfinder.dataset.processing.dataset_ops import CopyDataset, RightFixer
from bugfinder.features.any_hop.all_flows import FeatureExtractor as AnyHopAllFlowsExtractor
from bugfinder.features.any_hop.single_flow import FeatureExtractor as AnyHopSingleFlowExtractor
from bugfinder.features.single_hop.raw import FeatureExtractor as SingleHopRawExtractor
from bugfinder.features.pca import FeatureExtractor as PCA

In [None]:
# Set up logging to output messages at INFO level and above
LOGGER.setLevel(logging.INFO)

In [None]:
# Dataset directories (DO NOT EDIT)
# One list per value of the last digit of the directory suffix (v__0 .. v__3).
# Each list holds the v11x, v12x, v21x and v22x directories, in that order.
# The v31x/v32x entries and the whole v__4 list are currently commented out.
cwe121_v__0_dataset_path = [
    "../data/cwe121_v110", "../data/cwe121_v120", "../data/cwe121_v210", "../data/cwe121_v220", 
#     "../data/cwe121_v310", "../data/cwe121_v320"
]
cwe121_v__1_dataset_path = [
    "../data/cwe121_v111", "../data/cwe121_v121", "../data/cwe121_v211", "../data/cwe121_v221", 
#     "../data/cwe121_v311", "../data/cwe121_v321"
]
cwe121_v__2_dataset_path = [
    "../data/cwe121_v112", "../data/cwe121_v122", "../data/cwe121_v212", "../data/cwe121_v222", 
#     "../data/cwe121_v312", "../data/cwe121_v322"
]
cwe121_v__3_dataset_path = [
    "../data/cwe121_v113", "../data/cwe121_v123", "../data/cwe121_v213", "../data/cwe121_v223", 
#     "../data/cwe121_v313", "../data/cwe121_v323"
]
# cwe121_v__4_dataset_path = [
#     "../data/cwe121_v114", "../data/cwe121_v124", "../data/cwe121_v214", "../data/cwe121_v224", 
#     "../data/cwe121_v314", "../data/cwe121_v324"
# ]

## Next section


In [None]:
from sklearn import preprocessing, cluster, metrics

In [None]:
# Load the first v__1 dataset and take its feature table.
dataset = Dataset(cwe121_v__1_dataset_path[0])
feats = dataset.features

# Pull the "result" column out as the labels, then remove the columns that
# must not be fed to the scaler.
labels = feats["result"].to_numpy()
for non_feature_column in ("name", "result"):
    del feats[non_feature_column]

# Standardise the remaining columns and report the resulting matrix shape.
features = preprocessing.scale(feats.to_numpy())
feature_shape = str(features.shape)
print("Feature size: %s" % feature_shape)

In [None]:
# Candidate cluster counts: 200, 210, ..., 490. The range holds the real
# number of clusters (not a tenth of it), so plotting a per-model metric
# against kmeans_range puts the true cluster count on the x-axis.
kmeans_range = range(200, 500, 10)
kmeans_args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 1000,
    "random_state": 42,  # fixed seed so the search is repeatable
}

# One fitted model per entry of kmeans_range, in the same order.
kmeans_results = list()

for k in kmeans_range:
    # Report progress once every hundred clusters (200, 300, 400).
    if k % 100 == 0:
        print("Searching for %d clusters..." % k)

    kmeans = cluster.KMeans(n_clusters=k, **kmeans_args)
    kmeans.fit(features)
    kmeans_results.append(kmeans)

print("Search done!")

In [None]:
# Per-model quality metrics, listed in the same order as kmeans_results:
# the inertia of each fit, and the silhouette score of its labelling.
sse = [model.inertia_ for model in kmeans_results]
sil = [
    metrics.silhouette_score(features, model.labels_)
    for model in kmeans_results
]

In [None]:
import matplotlib.pyplot as plt

# Plot the SSE (the inertia of each fitted KMeans model) against the values
# of kmeans_range, using the "fivethirtyeight" matplotlib style.
plt.style.use("fivethirtyeight")
plt.plot(kmeans_range, sse)
plt.xlabel("Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Plot the silhouette score of each fitted KMeans model against the values
# of kmeans_range, using the "fivethirtyeight" matplotlib style.
plt.style.use("fivethirtyeight")
plt.plot(kmeans_range, sil)
plt.xlabel("Clusters")
plt.ylabel("Sil. score")
plt.show()

In [4]:
from py2neo import Graph
from bugfinder.utils.statistics import get_time

# Handle to the Neo4j database queried by the cells below, reached over HTTP.
# NOTE(review): host "0.0.0.0" and port 55487 look specific to one local
# setup — confirm they match the running Neo4j instance before executing.
neo4j_db = Graph(
    scheme="http",
    host="0.0.0.0",
    port=55487,
)

In [5]:
# Cypher query returning the internal id of every node whose "ast" property
# equals "CFGEntryNode".
entrypoints_cmd = """
    MATCH (n)
    WHERE n.ast = "CFGEntryNode"
    RETURN id(n) as id_n
"""
start_time = get_time()

# Run the query, then keep only the id column of each returned record.
entrypoint_records = neo4j_db.run(entrypoints_cmd).data()
entrypoints_list = [record["id_n"] for record in entrypoint_records]

elapsed = get_time() - start_time
print("Found %d entrypoints in %dms" % (len(entrypoints_list), elapsed))

Found 2035 entrypoints in 1523ms


In [95]:
# Cypher template: every distinct node reachable from the node with the given
# id by following one or more FLOWS_TO, CONTROLS or REACHES relationships.
# The %s placeholder is filled with an entrypoint id before the query runs.
exec_chain_cmd1 = """
    MATCH (n)-[:FLOWS_TO|CONTROLS|REACHES*]->(m)
    WHERE id(n) = %s
    RETURN distinct m as nodes
"""

start_time = get_time()

# Only the first 15 entrypoints are queried with this variable-length match.
exec_chains1 = [neo4j_db.run(exec_chain_cmd1 % entrypoint_id).data() for entrypoint_id in entrypoints_list[:15]]

print("Spent %dms" % (get_time() - start_time))

Spent 1663ms


In [148]:
# Cypher template: expand the subgraph around the node with the given id via
# apoc.path.subgraphAll, restricted to FLOWS_TO, REACHES and CONTROLS
# relationships, and return both its nodes and its relationships.
exec_chain_cmd2 = """
    MATCH (n) 
    WHERE id(n)=%s 
    CALL apoc.path.subgraphAll(n, {relationshipFilter: "FLOWS_TO|REACHES|CONTROLS"})
    yield nodes, relationships
    return nodes, relationships
"""

start_time = get_time()

# One query per entrypoint; results are kept in entrypoint order.
exec_chains2 = []
for entrypoint_id in entrypoints_list:
    subgraph_query = exec_chain_cmd2 % entrypoint_id
    exec_chains2.append(neo4j_db.run(subgraph_query).data())

elapsed_ms = get_time() - start_time
print("Spent %dms" % elapsed_ms)

Spent 39378ms


In [108]:
# Distinct nodes returned for each entrypoint by the two queries, and the
# ids of those nodes.
ch1_nodes = [{record["nodes"] for record in chain} for chain in exec_chains1]
ch2_nodes = [set(chain[0]["nodes"]) for chain in exec_chains2]
ch1_ids = [{n.identity for n in chain} for chain in ch1_nodes]
ch2_ids = [{n.identity for n in chain} for chain in ch2_nodes]

# Lookup table from stringified node id to node, used when printing below.
all_nodes = ch1_nodes + ch2_nodes
node_dict = {
    str(node.identity): node
    for node_set in all_nodes
    for node in node_set
}

# Compare the two result sets entrypoint by entrypoint. Only the samples
# present in both lists are compared, since the two queries may have been
# run over a different number of entrypoints.
sample_count = min(len(ch1_ids), len(ch2_ids))
for sample_no, (ids1, ids2) in enumerate(zip(ch1_ids, ch2_ids), start=1):
    print("Sample: %02d/%02d" % (sample_no, sample_count))

    # Symmetric difference: ids found by exactly one of the two queries.
    # Sorted so the report is printed in a stable order.
    for node_id in sorted(ids1 ^ ids2):
        print("Id %s not present in both: %s" % (str(node_id), str(node_dict[str(node_id)])))

    print("******************")

Sample: 01/15
Id 99 not present in both: (_99:GenericNode:UpstreamNode {ast: 'CFGEntryNode', code: 'ENTRY', command: 'ANR', functionId: '101177', isCFGNode: 'True', type: 'CFGEntryNode'})
******************
Sample: 02/15
Id 197 not present in both: (_197:GenericNode:UpstreamNode {ast: 'CFGEntryNode', code: 'ENTRY', command: 'ANR', functionId: '101284', isCFGNode: 'True', type: 'CFGEntryNode'})
******************
Sample: 03/15
Id 222 not present in both: (_222:GenericNode:UpstreamNode {ast: 'CFGEntryNode', code: 'ENTRY', command: 'ANR', functionId: '101382', isCFGNode: 'True', type: 'CFGEntryNode'})
******************
Sample: 04/15
Id 316 not present in both: (_316:GenericNode:UpstreamNode {ast: 'CFGEntryNode', code: 'ENTRY', command: 'ANR', functionId: '115827', isCFGNode: 'True', type: 'CFGEntryNode'})
******************
Sample: 05/15
Id 400 not present in both: (_400:GenericNode:UpstreamNode {ast: 'CFGEntryNode', code: 'ENTRY', command: 'ANR', functionId: '115927', isCFGNode: 'True',

In [131]:
# Inspect the first relationship of the first subgraph: print its type name
# followed by the ids of the nodes at its start and end.
first_subgraph = exec_chains2[0][0]
rel = first_subgraph["relationships"][0]

print("TYPE: %s" % type(rel).__name__)
for endpoint in (rel.start_node, rel.end_node):
    print(endpoint.identity)

TYPE: DOM
99
4


In [151]:
import sys

def get_size(obj, seen=None):
    """Recursively estimate the memory footprint of ``obj`` in bytes.

    The result is ``sys.getsizeof(obj)`` plus the size of everything reachable
    from it: keys and values of a dict, the instance ``__dict__`` of an object
    and the elements of an iterable. ``str``, ``bytes`` and ``bytearray`` are
    treated as leaves. Each object is counted once, so shared references and
    reference cycles do not inflate the total.

    Args:
        obj: Object to measure.
        seen: Set of ``id()`` values already counted. Used by the recursion;
            callers normally leave it as ``None``.

    Returns:
        The estimated size in bytes, or 0 when ``obj`` was already counted.
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Mark as seen *before* recursing so self-referential objects terminate.
    seen.add(obj_id)

    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
        return size

    has_attrs = hasattr(obj, '__dict__')
    if has_attrs:
        size += get_size(obj.__dict__, seen)

    if isinstance(obj, (list, tuple, set, frozenset)):
        # Subclasses of the built-in containers can carry attributes *and*
        # elements; both have to be counted.
        has_elements = True
    else:
        # Other objects with attributes are not iterated, so stateful
        # iterables are never consumed just to measure them.
        has_elements = (
            not has_attrs
            and hasattr(obj, '__iter__')
            and not isinstance(obj, (str, bytes, bytearray))
        )
    if has_elements:
        size += sum(get_size(i, seen) for i in obj)
    return size


# Report the estimated footprint of each query's results, one per line.
for chains in (exec_chains1, exec_chains2):
    print(get_size(chains))

183248
47046319


In [1]:
# Compact form of exec_chains2. For every chain, keep only each node's "ast"
# property keyed by node id, and each relationship as a
# (start node id, relationship type name, end node id) tuple.
exec_chains3 = []

start_time = get_time()

for chain in exec_chains2:
    subgraph = chain[0]
    exec_chains3.append({
        "nodes": {
            node.identity: node.get("ast")
            for node in subgraph["nodes"]
        },
        "rels": [
            (rel.start_node.identity, type(rel).__name__, rel.end_node.identity)
            for rel in subgraph["relationships"]
        ],
    })

print("Spent %dms" % (get_time() - start_time))

NameError: name 'get_time' is not defined

In [None]:
# These are the usual ipython objects, including this one you are creating
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([
#     (x, get_size(globals().get(x))) 
#     for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], 
#     key=lambda x: x[1], reverse=True)

In [153]:
get_size(exec_chains3)

13051308