In [2]:
"""
preprocess the extracted inputs and bitmaps
    1. remove duplicate (input, bitmap) rows, since the raw data may contain duplicates;
    2. get unique bitmaps (paths) using SQL
    3. for each path count how many inputs fall into the path
    4. transform byte array to numpy ndarray
    5. store the numpy ndarray to disk as npz file
"""
import preprocess as prep
import numpy as np
import sqlite3 as sql
import os

# Database location and table names, re-exported from the preprocess module
# so the cells below can use short names. The trailing comments show the
# values expected from preprocess; confirm against that module if in doubt.
DB_PATH = prep.DB_PATH       # "./data/afl.db"
RAW_TBL = prep.RAW_TBL       # "data"     # raw extracted rows, may contain duplicates
PRUNED_TBL = prep.PRUNED_TBL # 'pruned'   # duplicate rows removed
PATH_TBL = prep.PATH_TBL     # 'path'     # unique bitmaps (paths) with the count of inputs whose bitmap equals each path

# 1. REMOVE DUPLICATES and COUNT INPUT for EACH PATH

In [4]:
# Snapshot of the table names present in the database *before* pruning.
ALL_TABLES = set()

# NOTE: sqlite3's `with connection:` block only commits/rolls back; it does
# not close the connection. Use a single connection and close it explicitly.
conn = sql.connect(DB_PATH)
try:
    cur = conn.cursor()
    cur.execute("select tbl_name from sqlite_master where type='table'")
    ALL_TABLES = set(row[0] for row in cur.fetchall())

    # THIS MAY RUN FOR SEVERAL MINUTES
    # Only runs while the raw table still exists, so re-running the cell after
    # a successful pass is a no-op.
    if RAW_TBL in ALL_TABLES:
        try:
            # executescript() performs no implicit transaction control, so the
            # script is wrapped in an explicit BEGIN/COMMIT: either every step
            # succeeds or the database is left untouched. The raw table is
            # dropped last, once both derived tables have been populated.
            conn.executescript(
                f"""
                BEGIN;
                DROP TABLE IF EXISTS {PRUNED_TBL};
                CREATE TABLE {PRUNED_TBL} (
                    input blob,
                    bitmap blob
                );
                DROP TABLE IF EXISTS {PATH_TBL};
                CREATE TABLE {PATH_TBL} (
                    bitmap blob,
                    freq integer
                );
                INSERT INTO {PRUNED_TBL} SELECT DISTINCT input, bitmap FROM {RAW_TBL};
                INSERT INTO {PATH_TBL} SELECT bitmap, count(*) FROM {PRUNED_TBL} GROUP BY bitmap;
                DROP TABLE {RAW_TBL};
                COMMIT;
                """
            )
        except Exception:
            # Undo any partially applied step so the raw table survives.
            conn.rollback()
            raise
finally:
    conn.close()

# 2. EXPORT TO NUMPY FORMAT

In [3]:
def save_to_np_file(x_file, y_file, path_file):
    """Export the pruned inputs, their bitmaps and the unique paths as .npz archives.

    Reads PRUNED_TBL and PATH_TBL from DB_PATH via ``prep.read_db`` (presumably
    returning pandas DataFrames with ``input``/``bitmap`` and ``bitmap``/``freq``
    columns -- confirm against the preprocess module), decodes each blob into a
    numpy array and keeps only the bitmap columns (edges) that are non-zero in
    at least one path.

    Parameters
    ----------
    x_file : path-like
        Destination of the input matrix, stored under key ``x`` (bytes decoded
        as ``np.byte``).
    y_file : path-like
        Destination of the per-input bitmaps restricted to hit edges: key
        ``y_orig`` holds the uint8 values, key ``y_compress`` the boolean
        "value > 0" flags.
    path_file : path-like
        Destination of the unique paths restricted to hit edges (key ``path``)
        and the number of inputs per path (key ``input_count``).

    Notes
    -----
    All arrays are built before anything is written, and ``y_file`` is written
    last, so the presence of ``y_file`` implies the other two archives were
    written successfully.
    ``np.array`` over the decoded buffers assumes every blob in a column has
    the same length -- TODO confirm for the input column.
    """
    df_inputs = prep.read_db(DB_PATH, PRUNED_TBL)
    df_paths = prep.read_db(DB_PATH, PATH_TBL)

    mat_bitmap_orig = np.array(
        [np.frombuffer(buf, dtype=np.uint8) for buf in df_inputs.bitmap]
    )
    se_inputs = df_inputs.input
    del df_inputs  # only the input column is still needed

    # Edge mask: a column is kept if any unique path hits that edge.
    mat_path = np.array(
        [np.frombuffer(buf, dtype=np.uint8) for buf in df_paths.bitmap]
    )
    edge_hit = mat_path.sum(0) > 0
    mat_path_compress = mat_path[:, edge_hit]
    mat_bitmap_compress = mat_bitmap_orig[:, edge_hit]
    # Boolean-mask indexing copies, so the full-width matrices can be released.
    del mat_bitmap_orig, mat_path
    mat_flag_bitmap_compress = mat_bitmap_compress > 0

    mat_input = np.array([np.frombuffer(buf, dtype=np.byte) for buf in se_inputs])

    # Write phase: y_file goes last so a failure above or during the first two
    # writes never leaves a y_file behind without its companions.
    np.savez_compressed(x_file, x=mat_input)
    np.savez_compressed(
        path_file, path=mat_path_compress, input_count=df_paths.freq.values
    )
    np.savez_compressed(
        y_file, y_orig=mat_bitmap_compress, y_compress=mat_flag_bitmap_compress
    )


# Output locations of the exported archives, as defined by the preprocess module.
X_FILE_NAME = prep.X_FILE_NAME
Y_FILE_NAME = prep.Y_FILE_NAME
PATH_FILE_NAME = prep.PATH_FILE_NAME

# Skip the (expensive) export only when every output is already on disk;
# checking a single file would hide a missing or half-finished export.
# NOTE(review): np.savez_compressed appends ".npz" when the name lacks it, so
# these names are assumed to already end in ".npz" -- confirm in preprocess.
if not all(
    os.path.exists(file_name)
    for file_name in (X_FILE_NAME, Y_FILE_NAME, PATH_FILE_NAME)
):
    save_to_np_file(X_FILE_NAME, Y_FILE_NAME, PATH_FILE_NAME)
