# Dataset processing

Notebook to interface with the raw CWE121 Juliet data. Make sure the data has been downloaded using `./tools/download_cwe121.sh`.

## 1. Prepare the dataset

In [1]:
# Setup logging
# `realpath` is imported here as well; later cells use it to resolve the dataset
# and database paths.
from os.path import realpath
import logging
from tools.settings import LOGGER

# Set the project logger's level to INFO
LOGGER.setLevel(logging.INFO)

In [2]:
# Import dataset classes and processing operations
from tools.dataset import CWEClassificationDataset as Dataset
from tools.dataset.processing.dataset_ops import *
from tools.dataset.processing.file_ops import *
from tools.dataset.processing.content_ops import *

# Dataset directories
# Annotated source dataset; it is copied before any processing
extracted_dataset_path = "./data/cwe121_annot"
# Copy of the source dataset on which the cleanup operations run
cleaned_dataset_path = "./data/cwe121_dataset"
# Destination of the 1000-sample extraction (kept as a reference)
cwe121_1000_ref_dataset_path = "./data/cwe121_1000"
# Copy of the 1000-sample dataset used by the rest of the notebook
cwe121_1000_dataset_path = "./data/cwe121_1000a"

In [None]:
# Work on a copy of the annotated dataset to avoid overwriting it
extracted_dataset = Dataset(extracted_dataset_path)
extracted_dataset.queue_operation(
    CopyDataset,
    dict(to_path=cleaned_dataset_path, force=True),
)
extracted_dataset.process()

In [None]:
# Clean up the copied dataset: queue each cleanup operation in order, then run them
cleaned_dataset = Dataset(cleaned_dataset_path)

for cleanup_op in (RemoveCppFiles, RemoveMainFunction, ReplaceLitterals):
    cleaned_dataset.queue_operation(cleanup_op)

cleaned_dataset.process()

In [None]:
# Extract a subset of 1000 samples for training, test and validation purposes.
sample_options = dict(to_path=cwe121_1000_ref_dataset_path, sample_nb=1000, force=True)
cleaned_dataset.queue_operation(ExtractSampleDataset, sample_options)
cleaned_dataset.process()

In [None]:
# Keep the extracted sample as a reference and copy it to the working location.
cwe121_1000_ref_dataset = Dataset(cwe121_1000_ref_dataset_path)
cwe121_1000_ref_dataset.queue_operation(
    CopyDataset,
    dict(to_path=cwe121_1000_dataset_path, force=True),
)
cwe121_1000_ref_dataset.process()

In [3]:
# Build the dataset that is going to be used
# (every later step reads its location from `cwe121_1000_dataset.path`)
cwe121_1000_dataset = Dataset(cwe121_1000_dataset_path)

[2019-09-10 15:55:27][INFO] Dataset index build in 49ms. 1000 test_cases, 2 classes.


## 2. Apply joern

In this step, the code is transformed into a graph and stored in a Neo4j database.

In [None]:
from tools.libs.joern.v040 import main as run_joern_v040

In [None]:
# Run the `v040` Joern wrapper on the absolute path of the working dataset
run_joern_v040(realpath(cwe121_1000_dataset.path))

## 3. Markup AST

In [None]:
from tools.libs.ast.v02 import main as ast_v02

In [None]:
# Run the `v02` AST markup on `neo4j_v3.db` located inside the working dataset
ast_v02(realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path))

## 4. Extract features

In [None]:
from tools.features.rel_count_single_hop_v02 import extract_features
from tools.libs.neo4j.ai import start_container as run_neo4j_v3
from tools.utils.containers import stop_container_by_name
from py2neo import Graph

In [None]:
# Absolute path of the database stored inside the working dataset
db_path = realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path)

# The container is started with stop_after_execution=False and is stopped
# explicitly in the `finally` clause below.
neo4j_container_obj, neo4j_container_name = run_neo4j_v3(db_path, stop_after_execution=False)

try:
    # Neo4j database pre-loaded with Joern
    neo4j_db = Graph(
        scheme="http",
        host="0.0.0.0",
        port="7474"
    )

    extract_features(neo4j_db, cwe121_1000_dataset.path)
finally:
    # Always stop the container, even when the connection or the extraction fails,
    # so that a failed run does not leave it running.
    stop_container_by_name(neo4j_container_name)

## 5. Format feature files

This step converts the feature files to CSV for easy import into pandas.

In [16]:
import numpy as np
import pandas as pd
from scipy.io import mmread
from sklearn.decomposition import PCA

In [5]:
# Files read from the `features` directory of the working dataset
features_filename = "%s/features/features.mtx" % cwe121_1000_dataset.path
labels_filename = "%s/features/labels.txt" % cwe121_1000_dataset.path

# CSV file this section writes (features plus the `results` column)
features_csv_filename = "%s/features/features.csv" % cwe121_1000_dataset.path

In [None]:
# Read the comma-separated label file, letting numpy infer the column types
raw_labels = np.genfromtxt(labels_filename, delimiter=',', dtype=None)
labels = pd.DataFrame(raw_labels)
# Encode column "f0" as 1 when its value is truthy and 0 otherwise
labels["results"] = labels["f0"].apply(lambda flag: int(bool(flag)))

labels.head()

In [None]:
# Load the feature matrix and densify it into a DataFrame
sparse_features = mmread(features_filename).tocsr()
features = pd.DataFrame(sparse_features.todense())
# Column labels are converted to strings
features.columns = list(map(str, features.columns))

features.head()

In [None]:
# Flag every column that holds at least one non-zero value. This is a single
# vectorised pass instead of a Python loop over each cell.
has_non_zero = (features != 0).any(axis=0).values

# Column labels split into two lists, both kept in column order
non_empty_cols = features.columns[has_non_zero].tolist()
empty_cols = features.columns[~has_non_zero].tolist()

In [None]:
# Keep only the columns that carry information and attach the label column
simple_features = (
    features
    .drop(columns=empty_cols)
    .assign(results=labels["results"])
)

simple_features.head()

In [None]:
# Write the reduced features and the `results` column to CSV, without the row index
simple_features.to_csv(features_csv_filename, index=False)

# Feature analysis

In [6]:
# Load the features back from the CSV file in the dataset's `features` directory
feature_filename = "%s/features/features.csv" % cwe121_1000_dataset.path
feats = pd.read_csv(feature_filename)

In [7]:
# Preview the first 10 rows of the reloaded features
feats.head(n=10)

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30271,30272,30274,30277,30279,30281,30292,30704,31010,results
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Target column: the `results` label of each row
output_data = feats["results"]
output_data.head(n=10)

In [9]:
# Model inputs: every column except the `results` label
input_data = feats.drop(columns=["results"])
input_data.head(n=10)

Unnamed: 0,102,104,105,106,111,112,114,116,118,119,...,30264,30271,30272,30274,30277,30279,30281,30292,30704,31010
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# PCA for dimension reduction
n_pca_components = 500
# A fixed seed keeps the projection reproducible if scikit-learn selects its
# randomized SVD solver.
pca = PCA(n_components=n_pca_components, random_state=101)
pca.fit(input_data)

# Name the columns after the number of components that were actually fitted
cols = ["pca_%d" % i for i in range(pca.n_components_)]
input_data_pca = pd.DataFrame(pca.transform(input_data), columns=cols, index=input_data.index)
input_data_pca.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,...,pca_490,pca_491,pca_492,pca_493,pca_494,pca_495,pca_496,pca_497,pca_498,pca_499
0,0.896416,0.060043,-0.034811,-0.005252,-0.000471,-0.003054,0.00027,-0.007055,-0.004201,-0.001093,...,8.950012e-07,6.701585e-07,-6.302654e-07,5.665932e-07,2.666466e-06,-1.416175e-06,-1.151046e-07,-8.086482e-07,4.729194e-09,9.134162e-07
1,-0.226815,-0.146523,-0.043333,-0.007771,0.053339,-0.026843,0.000928,-0.023489,0.140703,0.094888,...,0.0008697118,-0.0001600627,0.001438556,-0.0002238752,0.0004912793,-0.0001725252,-0.001309704,-0.001719222,0.0002922743,-0.0004554892
2,-0.223007,-0.148615,-0.038521,-0.019957,0.034818,-0.033748,-0.000379,-0.019782,0.133967,0.090829,...,0.001435756,6.094176e-05,0.002407791,-0.00116651,-0.0005847401,0.001643681,-0.001633848,-0.0001124024,0.0009159506,0.0008824962
3,-0.442159,1.095995,0.000691,0.044418,0.017183,0.047564,0.005103,0.014592,0.00684,-0.02067,...,-4.497221e-07,2.854191e-07,3.774753e-07,6.792047e-08,4.735262e-07,-9.336907e-07,4.134144e-07,-1.18634e-07,6.596757e-07,-1.87813e-07
4,-0.442159,1.095995,0.000691,0.044418,0.017183,0.047564,0.005103,0.014592,0.00684,-0.02067,...,-4.497221e-07,2.854191e-07,3.774753e-07,6.792047e-08,4.735262e-07,-9.336907e-07,4.134144e-07,-1.18634e-07,6.596757e-07,-1.87813e-07


# Train test split

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Hold out 33% of the rows for evaluation; random_state makes the split repeatable.
# NOTE(review): the split uses `input_data`, not the PCA projection
# `input_data_pca` computed above — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.33, random_state=101
)

# Model creation and training

In [None]:
# One numeric feature column per column of the training frame
feat_cols = [tf.feature_column.numeric_column(col_name) for col_name in X_train.columns]

In [None]:
# Number of feature columns that will be passed to the classifier
len(feat_cols)

In [None]:
# Preview the training inputs
X_train.head()

In [None]:
# Preview the training labels
y_train.head()

In [None]:
# Training input function: shuffled batches of 100 rows, up to 100 epochs
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=100,
    num_epochs=100,
    shuffle=True,
)

In [None]:
# Two-class linear classifier over the numeric feature columns
lin_cls_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

In [None]:
# Train the classifier on the training input function, with steps=100
lin_cls_model.train(input_fn=input_fn, steps=100)

# Evaluation

In [None]:
# Evaluation input function: a single unshuffled pass over the held-out rows,
# in batches of 10
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)

In [None]:
# Evaluate the trained classifier on the held-out rows
results = lin_cls_model.evaluate(eval_input_fn)

In [None]:
# Display the value returned by `evaluate`
results