# Dataset processing

Notebook to interface with the raw CWE121 Juliet data. Make sure the data has been downloaded using `./tools/download_cwe121.sh`.

## 1. Prepare the dataset

In [None]:
# Setup logging
# `realpath` is imported here because later cells use it to build absolute
# paths for the Joern, AST and Neo4j steps.
from os.path import realpath
import logging
from tools.settings import LOGGER

# Set the project LOGGER (imported from tools.settings) to INFO level.
LOGGER.setLevel(logging.INFO)

In [None]:
# Import dataset classes and processing operations
from tools.dataset import CWEClassificationDataset as Dataset
from tools.dataset.processing.dataset_ops import *
from tools.dataset.processing.file_ops import *
from tools.dataset.processing.content_ops import *

# Dataset directories
# Annotated dataset; only used as the source of the first copy.
extracted_dataset_path = "./data/cwe121_annot"
# Copy of the annotated dataset on which the cleanup operations are run.
cleaned_dataset_path = "./data/cwe121_dataset"
# Destination of the 1000-sample extraction (kept as a reference copy).
cwe121_1000_ref_dataset_path = "./data/cwe121_1000"
# Copy of the reference subset; this is the dataset the later steps work on.
cwe121_1000_dataset_path = "./data/cwe121_1000a"

In [None]:
# Duplicate the annotated dataset into the cleaned-dataset directory so the
# annotated data is never overwritten by the later processing steps.
annotated_copy_args = {"to_path": cleaned_dataset_path, "force": True}

extracted_dataset = Dataset(extracted_dataset_path)
extracted_dataset.queue_operation(CopyDataset, annotated_copy_args)
extracted_dataset.process()

In [None]:
# Cleanup new dataset: queue the cleanup operations in order, then run them.
cleaned_dataset = Dataset(cleaned_dataset_path)

for cleanup_operation in (RemoveCppFiles, RemoveMainFunction, ReplaceLitterals):
    cleaned_dataset.queue_operation(cleanup_operation)

cleaned_dataset.process()

In [None]:
# Extract a subset of 1000 samples for training, test and validation purposes.
sample_extraction_args = {
    "to_path": cwe121_1000_ref_dataset_path,
    "sample_nb": 1000,
    "force": True,
}
cleaned_dataset.queue_operation(ExtractSampleDataset, sample_extraction_args)
cleaned_dataset.process()

In [None]:
# Keep the extracted subset untouched as a reference and work on a copy of it.
reference_copy_args = {"to_path": cwe121_1000_dataset_path, "force": True}

cwe121_1000_ref_dataset = Dataset(cwe121_1000_ref_dataset_path)
cwe121_1000_ref_dataset.queue_operation(CopyDataset, reference_copy_args)
cwe121_1000_ref_dataset.process()

In [None]:
# Build the dataset that is going to be used
# (opened on the copy of the 1000-sample subset; every later step reads
# `cwe121_1000_dataset.path`).
cwe121_1000_dataset = Dataset(cwe121_1000_dataset_path)

## 2. Apply joern

In this step, the code will be transformed into a graph and stored in a Neo4j database.

In [None]:
from tools.libs.joern.v040 import main as run_joern_v040

In [None]:
# Run the Joern wrapper on the absolute path of the working dataset.
run_joern_v040(realpath(cwe121_1000_dataset.path))

## 3. Markup AST

In [None]:
from tools.libs.ast.v02 import main as ast_v02

In [None]:
# Run the AST markup step on the `neo4j_v3.db` directory located inside the
# working dataset (presumably produced by the Joern step above - TODO confirm).
ast_v02(realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path))

## 4. Extract features

In [None]:
from tools.features.rel_count_single_hop_v02 import extract_features
from tools.libs.neo4j.ai import start_container as run_neo4j_v3
from tools.utils.containers import stop_container_by_name
from py2neo import Graph

In [None]:
# Absolute path of the Neo4j database stored inside the working dataset.
db_path = realpath("%s/neo4j_v3.db" % cwe121_1000_dataset.path)

# Start the Neo4j container for that database. It is started with
# stop_after_execution=False, so this cell is responsible for stopping it.
neo4j_container_obj, neo4j_container_name = run_neo4j_v3(db_path, stop_after_execution=False)

try:
    # Neo4j database pre-loaded with Joern
    # NOTE(review): "0.0.0.0" is used as the client-side host; this resolves to
    # the local machine on Linux but is not portable - confirm before reuse.
    neo4j_db = Graph(
        scheme="http",
        host="0.0.0.0",
        port="7474"
    )

    # Extract the features from the graph for the working dataset.
    extract_features(neo4j_db, cwe121_1000_dataset.path)
finally:
    # Always stop the container, even when the connection or the extraction
    # fails, so a failed run does not leave a Neo4j container behind.
    stop_container_by_name(neo4j_container_name)

## Format feature files

This step converts the feature files to CSV so they can be easily imported into pandas.

In [None]:
import numpy as np
import pandas as pd
from scipy.io import mmread

In [None]:
# Input files, read from the dataset's `features` directory:
# the feature matrix (loaded with scipy's mmread) and its labels.
features_filename = "%s/features/features.mtx" % cwe121_1000_dataset.path
labels_filename = "%s/features/labels.txt" % cwe121_1000_dataset.path

# Output file: CSV export of the non-empty features plus the label column.
features_csv_filename = "%s/features/features.csv" % cwe121_1000_dataset.path

In [None]:
# Read the comma-separated label file and let NumPy infer the column types.
# The code below relies on the result exposing a field named "f0".
raw_labels = np.genfromtxt(labels_filename, delimiter=',', dtype=None)
labels = pd.DataFrame(raw_labels)

# Map the first field to an integer flag: 1 when truthy, 0 otherwise.
labels["results"] = labels["f0"].apply(lambda flag: int(bool(flag)))

labels.head()

In [None]:
# Load the feature matrix and expand it into a dense DataFrame.
sparse_features = mmread(features_filename).tocsr()
features = pd.DataFrame(sparse_features.todense())

# Use string column labels instead of the default integer positions.
features.columns = features.columns.map(str)

features.head()

In [None]:
# Split the feature columns into "non-empty" (at least one non-zero entry)
# and "empty" (all entries are zero), preserving the column order.
# The check is vectorized rather than looping over every cell in Python,
# which also avoids a quadratic list-membership test.
has_nonzero = (features != 0).any(axis=0).values

non_empty_cols = features.columns[has_nonzero].tolist()
empty_cols = features.columns[~has_nonzero].tolist()

In [None]:
# Drop the all-zero columns and attach the labels as a "results" column
# (the labels are aligned on the row index).
simple_features = features.drop(empty_cols, axis="columns").assign(
    results=labels["results"]
)

simple_features.head()

In [None]:
# Write the reduced feature table to CSV without the row index column.
simple_features.to_csv(features_csv_filename, index=False)

# Feature analysis

In [None]:
# Reload the features from the CSV file written in the previous section.
# (Same path as `features_csv_filename`; rebuilt here from the dataset path.)
feature_filename = "%s/features/features.csv" % cwe121_1000_dataset.path
feats = pd.read_csv(feature_filename)

In [None]:
# Preview the first 10 rows of the reloaded feature table.
feats.head(n=10)

In [None]:
# Target values: the "results" column of the feature table.
output_data = feats["results"]
output_data.head(n=10)

In [None]:
# Model inputs: every column except the "results" target.
input_data = feats.drop("results", axis=1)
input_data.head(n=10)

# Train test split

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Hold out 33% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.33, random_state=101
)

# Model creation and training

In [None]:
# Declare one numeric TensorFlow feature column per input column, keyed by
# the DataFrame column name.
feat_cols = [
    tf.feature_column.numeric_column(column_name)
    for column_name in X_train.columns
]

In [None]:
# Number of feature columns passed to the model.
len(feat_cols)

In [None]:
# Preview the training inputs.
X_train.head()

In [None]:
# Preview the training targets.
y_train.head()

In [None]:
# Training input function: shuffled batches of 100 rows drawn from the
# training split, configured with num_epochs=100.
input_fn = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, shuffle=True, batch_size=100, num_epochs=100)

In [None]:
# Two-class linear classifier over the numeric feature columns.
lin_cls_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

In [None]:
# Train the classifier on the training input function, limited to steps=100.
lin_cls_model.train(input_fn=input_fn, steps=100)

# Evaluation

In [None]:
# Evaluation input function: a single, unshuffled pass over the test split
# in batches of 10 rows.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)

In [None]:
# Evaluate the trained classifier on the test split and keep the result.
results = lin_cls_model.evaluate(eval_input_fn)

In [None]:
# Display the value returned by evaluate().
results