# Dataset processing

This notebook processes the raw CWE121 Juliet data. Before running it, make sure the data has been downloaded using `./tools/download_cwe121.sh`.

## 1. Prepare the dataset

In [1]:
# Setup logging
# LOGGER is the logger object exposed by the project's tools.settings module.
import logging
from tools.settings import LOGGER

# Set the logger threshold to INFO. The cell outputs in this notebook are
# "[INFO] ..." lines; presumably they are emitted through this logger.
LOGGER.setLevel(logging.INFO)

In [24]:
# Import dataset classes and processing operations
from tools.dataset import CWEClassificationDataset as Dataset
# NOTE(review): the operation classes queued later in this notebook
# (CopyDataset, ExtractSampleDataset, RemoveCppFiles, RemoveMainFunction,
# ReplaceLitterals) are not imported by name anywhere, so they presumably come
# from these wildcard imports. Which module provides which class cannot be
# told from this notebook -- confirm before replacing with explicit imports.
from tools.dataset.processing.dataset_ops import *
from tools.dataset.processing.file_ops import *
from tools.dataset.processing.content_ops import *

# Dataset directories
# Annotated dataset; used as the source of the first CopyDataset operation.
extracted_dataset_path = "./data/cwe121_annot"
# Destination of that copy; the cleanup operations are run on this dataset.
cleaned_dataset_path = "./data/cwe121_dataset"
# Destination of the 1000-sample ExtractSampleDataset operation.
cwe121_1000_ref_dataset_path = "./data/cwe121_1000"
# Destination of the copy made from the 1000-sample dataset.
cwe121_1000_dataset_path = "./data/cwe121_1000a"

In [3]:
# Work on a copy of the annotated dataset so the annotated data is not overwritten.
extracted_dataset = Dataset(extracted_dataset_path)

# Options handed to CopyDataset: destination directory and the "force" flag.
clean_copy_options = {"to_path": cleaned_dataset_path, "force": True}
extracted_dataset.queue_operation(CopyDataset, clean_copy_options)

extracted_dataset.process()

[2019-08-22 17:53:11][INFO] Dataset index build in 928ms. 9888 test_cases, 2 classes.
[2019-08-22 17:53:11][INFO] Running operation 1/1 (CopyDataset)...
[2019-08-22 17:53:20][INFO] 1 operations run in 8713ms.


In [4]:
# Clean up the freshly copied dataset.
cleaned_dataset = Dataset(cleaned_dataset_path)

# Queue the cleanup operations in the order listed, then run the whole queue.
cleanup_operations = [RemoveCppFiles, RemoveMainFunction, ReplaceLitterals]
for cleanup_operation in cleanup_operations:
    cleaned_dataset.queue_operation(cleanup_operation)

cleaned_dataset.process()

[2019-08-22 17:53:30][INFO] Dataset index build in 485ms. 9888 test_cases, 2 classes.
[2019-08-22 17:53:30][INFO] Running operation 1/3 (RemoveCppFiles)...
[2019-08-22 17:53:31][INFO] Dataset index build in 377ms. 8684 test_cases, 2 classes.
[2019-08-22 17:53:31][INFO] Running operation 2/3 (RemoveMainFunction)...
[2019-08-22 17:53:38][INFO] Dataset index build in 740ms. 8684 test_cases, 2 classes.
[2019-08-22 17:53:38][INFO] Running operation 3/3 (ReplaceLitterals)...
[2019-08-22 17:53:59][INFO] 3 operations run in 29281ms.


In [25]:
# Extract a 1000-sample subset to use for training, testing and validation.
sample_extraction_options = {
    "to_path": cwe121_1000_ref_dataset_path,
    "sample_nb": 1000,
    "force": True,
}
cleaned_dataset.queue_operation(ExtractSampleDataset, sample_extraction_options)
cleaned_dataset.process()

[2019-08-23 12:07:39][INFO] Running operation 1/1 (ExtractSampleDataset)...
[2019-08-23 12:07:39][INFO] 1 operations run in 465ms.
[2019-08-23 12:07:39][INFO] Dataset index build in 66ms. 1000 test_cases, 2 classes.
[2019-08-23 12:07:39][INFO] Running operation 1/1 (CopyDataset)...
[2019-08-23 12:07:39][INFO] 1 operations run in 301ms.
[2019-08-23 12:07:39][INFO] Dataset index build in 59ms. 1000 test_cases, 2 classes.


In [None]:
# Copy the sampled dataset to a second directory for future reference.
cwe121_1000_ref_dataset = Dataset(cwe121_1000_ref_dataset_path)
cwe121_1000_ref_dataset.queue_operation(
    CopyDataset,
    {"to_path": cwe121_1000_dataset_path, "force": True},
)
cwe121_1000_ref_dataset.process()

# Open the copy that was just written.
cwe121_1000_dataset = Dataset(cwe121_1000_dataset_path)

## 2. Apply joern

In this step, the code will be transformed into a graph stored in a Neo4j database.

## 3. Markup AST

## 4. Preview data