# Dataset processing

Notebook to interface with the raw CWE121 Juliet data. Make sure the data has been downloaded using `./tools/download_cwe121.sh`.

## 1. Prepare the dataset

In [1]:
# Setup logging
from os.path import realpath
import logging
from tools.settings import LOGGER

LOGGER.setLevel(logging.INFO)

In [2]:
# Import dataset classes and processing operations
from tools.dataset import CWEClassificationDataset as Dataset
from tools.dataset.processing.dataset_ops import *
from tools.dataset.processing.file_ops import *
from tools.dataset.processing.content_ops import *

# Dataset directories
extracted_dataset_path = "./data/cwe121_annot"
cleaned_dataset_path = "./data/cwe121_dataset"
cwe121_1000_ref_dataset_path = "./data/cwe121_1000"
cwe121_1000_dataset_path = "./data/cwe121_1000a"

In [17]:
# Create a copy of the annotated dataset to avoid overwriting
extracted_dataset = Dataset(extracted_dataset_path)
extracted_dataset.queue_operation(CopyDataset, {"to_path": cleaned_dataset_path, "force": True})
extracted_dataset.process()

[2019-09-06 17:34:20][INFO] Dataset index build in 524ms. 9888 test_cases, 2 classes.
[2019-09-06 17:34:20][INFO] Running operation 1/1 (CopyDataset)...
[2019-09-06 17:34:24][INFO] 1 operations run in 4626ms.


In [18]:
# Cleanup new dataset
cleaned_dataset = Dataset(cleaned_dataset_path)

cleaned_dataset.queue_operation(RemoveCppFiles)
cleaned_dataset.queue_operation(RemoveMainFunction)
cleaned_dataset.queue_operation(ReplaceLitterals)

cleaned_dataset.process()

[2019-09-06 17:34:38][INFO] Dataset index build in 558ms. 9888 test_cases, 2 classes.
[2019-09-06 17:34:38][INFO] Running operation 1/3 (RemoveCppFiles)...
[2019-09-06 17:34:40][INFO] Dataset index build in 421ms. 8684 test_cases, 2 classes.
[2019-09-06 17:34:40][INFO] Running operation 2/3 (RemoveMainFunction)...
[2019-09-06 17:34:46][INFO] Dataset index build in 431ms. 8684 test_cases, 2 classes.
[2019-09-06 17:34:46][INFO] Running operation 3/3 (ReplaceLitterals)...
[2019-09-06 17:35:00][INFO] 3 operations run in 21445ms.


In [19]:
# Extract a subset of 1000 samples for training, test and validation purposes. 
cleaned_dataset.queue_operation(
    ExtractSampleDataset, {"to_path": cwe121_1000_ref_dataset_path, "sample_nb": 1000, "force": True}
)
cleaned_dataset.process()

[2019-09-06 17:35:04][INFO] Running operation 1/1 (ExtractSampleDataset)...
[2019-09-06 17:35:04][INFO] 1 operations run in 365ms.


In [20]:
# Copy the dataset for future references.
cwe121_1000_ref_dataset = Dataset(cwe121_1000_ref_dataset_path)
cwe121_1000_ref_dataset.queue_operation(CopyDataset, {"to_path": cwe121_1000_dataset_path, "force": True})

cwe121_1000_ref_dataset.process()

[2019-09-06 17:35:07][INFO] Dataset index build in 103ms. 1000 test_cases, 2 classes.
[2019-09-06 17:35:07][INFO] Running operation 1/1 (CopyDataset)...
[2019-09-06 17:35:08][INFO] 1 operations run in 425ms.


In [3]:
# Build the dataset that is going to be used
cwe121_1000_dataset = Dataset(cwe121_1000_dataset_path)

[2019-09-06 18:15:40][INFO] Dataset index build in 70ms. 1000 test_cases, 2 classes.


## 2. Apply joern

In this step, the code will be transform in a graph and stored in a Neo4J database.

In [6]:
from tools.libs.joern.v040 import main as run_joern_v040

In [7]:
run_joern_v040(realpath(cwe121_1000_dataset.path))

[2019-09-06 17:40:31][INFO] Starting Joern 0.4.0...
[2019-09-06 17:40:31][INFO] Starting joern-lite:0.4.0 (Z0d8IL0G9A)...
[2019-09-06 17:41:04][INFO] Joern execution finished.
[2019-09-06 17:41:04][INFO] Joern database generated. Formatting CSV files...
[2019-09-06 17:41:05][INFO] CSV files formatted. Preparing import in Neo4j 3.5...
[2019-09-06 17:41:05][INFO] Import to Neo4j 3.5 ready. Importing...
[2019-09-06 17:41:05][INFO] Starting neo4j-v3-GleMU...
[2019-09-06 17:41:25][INFO] neo4j-v3-GleMU started.
[2019-09-06 17:41:41][INFO] CSV file imported.
[2019-09-06 17:41:42][INFO] Successful import to Neo4j 3.5.
[2019-09-06 17:41:42][INFO] Starting neo4j-v3-S06cw...
[2019-09-06 17:42:00][INFO] neo4j-v3-S06cw started.
[2019-09-06 17:42:01][INFO] Starting neo4j-v3-ZpBwU...
[2019-09-06 17:42:23][INFO] neo4j-v3-ZpBwU started.
[2019-09-06 17:42:23][INFO] Running commands...
[2019-09-06 17:42:38][INFO] Command 1 out of 5 run in 15161ms
[2019-09-06 17:42:39][INFO] Command 2 out of 5 run in 428m

## 3. Markup AST

In [8]:
from tools.libs.ast.v02 import main as ast_v02

In [9]:
ast_v02(realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path))

[2019-09-06 17:43:08][INFO] Starting neo4j-v3-V8LxN...
[2019-09-06 17:43:28][INFO] neo4j-v3-V8LxN started.
[2019-09-06 17:43:28][INFO] Connected to Neo4j. Retrieving nodes...
[2019-09-06 17:43:33][INFO] 20637 nodes found. Processing...
[2019-09-06 17:43:33][INFO] Querying nodes...
[2019-09-06 17:43:44][INFO] Node info retrieved. Querying links...
[2019-09-06 17:44:00][INFO] Links info retrieved. Building tree...
[2019-09-06 17:46:00][INFO] Tree built. Uploading ASTs...
[2019-09-06 17:46:00][INFO] Update dict generated (20637 entries). Uploading...
[2019-09-06 17:46:00][INFO] 11 commands prepared
[2019-09-06 17:46:00][INFO] Prepping command...
[2019-09-06 17:46:00][INFO] Updating AST...
[2019-09-06 17:46:08][INFO] Prepping command...
[2019-09-06 17:46:08][INFO] Updating AST...
[2019-09-06 17:46:12][INFO] Prepping command...
[2019-09-06 17:46:13][INFO] Updating AST...
[2019-09-06 17:46:16][INFO] Prepping command...
[2019-09-06 17:46:16][INFO] Updating AST...
[2019-09-06 17:46:21][INFO] P

## 4. Extract features

In [4]:
from tools.features.rel_count_single_hop_v02 import extract_features
from tools.libs.neo4j.ai import start_container as run_neo4j_v3
from tools.utils.containers import stop_container_by_name
from py2neo import Graph

In [6]:
db_path = realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path)

neo4j_container_obj, neo4j_container_name = run_neo4j_v3(db_path, stop_after_execution=False)

# Neo4j database pre-loaded with Joern
neo4j_db = Graph(
    scheme="http",
    host="0.0.0.0",
    port="7474"
)

extract_features(neo4j_db, cwe121_1000_dataset.path)

stop_container_by_name(neo4j_container_name)

[2019-09-06 18:16:17][INFO] Starting neo4j-v3-aUOHG...
[2019-09-06 18:16:39][INFO] neo4j-v3-aUOHG started.
[2019-09-06 18:16:39][INFO] Retrieving test cases from the database...
[2019-09-06 18:16:44][INFO] 2188 test cases to process (31212 features)...
[2019-09-06 18:18:02][INFO] Processed 10% of the dataset.
[2019-09-06 18:19:12][INFO] Processed 20% of the dataset.
[2019-09-06 18:20:32][INFO] Processed 30% of the dataset.
[2019-09-06 18:21:43][INFO] Processed 40% of the dataset.
[2019-09-06 18:23:19][INFO] Processed 50% of the dataset.
[2019-09-06 18:24:25][INFO] Processed 60% of the dataset.
[2019-09-06 18:25:42][INFO] Processed 70% of the dataset.
[2019-09-06 18:26:57][INFO] Processed 80% of the dataset.
[2019-09-06 18:28:10][INFO] Processed 90% of the dataset.
[2019-09-06 18:29:14][INFO] Processed 100% of the dataset.
[2019-09-06 18:29:14][INFO] Analyzed 2188 test cases


## Format feature files

This step will convert feature files to csv for easy import in pandas

In [13]:
import numpy as np
import pandas as pd
from scipy.io import mmread

In [8]:
features_filename = "%s/features/features.mtx" % cwe121_1000_dataset.path
labels_filename = "%s/features/labels.txt" % cwe121_1000_dataset.path

features_csv_filename = "%s/features/features.csv" % cwe121_1000_dataset.path

In [41]:
labels = np.genfromtxt(labels_filename, delimiter=',', dtype=None)
labels = pd.DataFrame(labels)
labels["results"] = labels["f0"].apply(lambda it: 1 if it else 0)

labels.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,f0,f1,results
0,True,b'CWE121_Stack_Based_Buffer_Overflow__CWE193_w...,1
1,True,b'CWE121_Stack_Based_Buffer_Overflow__CWE193_w...,1
2,True,b'CWE121_Stack_Based_Buffer_Overflow__CWE193_w...,1
3,True,b'CWE121_Stack_Based_Buffer_Overflow__CWE193_w...,1
4,True,b'CWE121_Stack_Based_Buffer_Overflow__CWE193_w...,1


In [38]:
features = mmread(features_filename).tocsr()
features = pd.DataFrame(features.todense())
features.columns = [str(c) for c in features.columns]

features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31202,31203,31204,31205,31206,31207,31208,31209,31210,31211
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
non_empty_cols = list()

for col in features:
    for item in features[col]:
        if item != 0:
            non_empty_cols.append(col)
            break

empty_cols = [col for col in features if col not in non_empty_cols]

In [42]:
simple_features = features.drop(empty_cols, axis=1)
simple_features["results"] = labels["results"]

simple_features.head()

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30271,30272,30274,30277,30279,30281,30292,30704,31010,results
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [43]:
simple_features.to_csv(features_csv_filename, index=False)

# Feature analysis

In [44]:
# Reload the feature and dump them in a CSV file
feature_filename = "%s/features/features.csv" % cwe121_1000_dataset.path
feats = pd.read_csv(feature_filename)

In [45]:
feats.head(n=10)

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30271,30272,30274,30277,30279,30281,30292,30704,31010,results
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [46]:
output_data = feats["results"]
output_data.head(n=10)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: results, dtype: int64

In [47]:
input_data = feats.drop("results", axis=1)
input_data.head(n=10)

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30264,30271,30272,30274,30277,30279,30281,30292,30704,31010
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train test split

In [48]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.33, random_state=101
)

# Model creation and training

In [50]:
feat_cols = list()
feat_cols_append = feat_cols.append

for x in X_train.columns:
    feat_cols_append(tf.feature_column.numeric_column(x))

In [51]:
len(feat_cols)

3540

In [52]:
X_train.head()

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30264,30271,30272,30274,30277,30279,30281,30292,30704,31010
1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
y_train.head()

1961    0
698     1
964     0
2184    1
124     1
Name: results, dtype: int64

In [57]:
input_fn = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, shuffle=True, batch_size=100, num_epochs=100)

In [58]:
lin_cls_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpexd01u72', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f99c1572198>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [59]:
lin_cls_model.train(input_fn=input_fn, steps=100)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpexd01u72/model.ckpt.
INFO:tensorflow:loss = 69.31472, step = 1
INFO:tensorflow:Saving checkpoints for 100 into /tmp/tmpexd01u72/model.ckpt.
INFO:tensorflow:Loss for final step: 39.807228.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f99c1572048>

# Evaluation

In [60]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)

In [61]:
results = lin_cls_model.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-09-09-13:22:00
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpexd01u72/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-09-09-13:26:14
INFO:tensorflow:Saving dict for global step 100: accuracy = 0.66390043, accuracy_baseline = 0.6500692, auc = 0.7518838, auc_precision_recall = 0.8805074, average_loss = 0.5158318, global_step = 100, label/mean = 0.6500692, loss = 5.108855, precision = 0.6907563, prediction/mean = 0.6777051, recall = 0.8744681
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: /tmp/tmpexd01u72/model.ckpt-100


In [62]:
results

{'accuracy': 0.66390043,
 'accuracy_baseline': 0.6500692,
 'auc': 0.7518838,
 'auc_precision_recall': 0.8805074,
 'average_loss': 0.5158318,
 'label/mean': 0.6500692,
 'loss': 5.108855,
 'precision': 0.6907563,
 'prediction/mean': 0.6777051,
 'recall': 0.8744681,
 'global_step': 100}