# D. Dataset Creator (experimental)

A notebook to create datasets containing a subset of features.

In [None]:
# Enable these lines if live changes are made to the codebase
%load_ext autoreload
%autoreload 2

In [None]:
# Disable tensorflow logging.
# The environment variable has to be set *before* TensorFlow is imported,
# otherwise the messages emitted while the library loads are not silenced.
import logging
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # any {'0', '1', '2', '3'}

import tensorflow as tf

# Also silence the Python-side "tensorflow" logger.
logging.getLogger('tensorflow').setLevel(logging.FATAL)

In [None]:
# Specific instruction to run the notebooks from a sub-folder: make the parent
# folder importable.
import sys

# Guarded so that re-running the cell does not keep appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
from bugfinder.settings import LOGGER
from bugfinder.dataset import CWEClassificationDataset as Dataset
from bugfinder.models.dnn_classifier import DNNClassifierTraining
from bugfinder.models.linear_classifier import LinearClassifierTraining
from bugfinder.dataset.processing.dataset_ops import CopyDataset, RightFixer
from bugfinder.features.any_hop.all_flows import FeatureExtractor as AnyHopAllFlowsExtractor
from bugfinder.features.any_hop.single_flow import FeatureExtractor as AnyHopSingleFlowExtractor
from bugfinder.features.single_hop.raw import FeatureExtractor as SingleHopRawExtractor
from bugfinder.features.pca import FeatureExtractor as PCA

from os.path import join, exists, basename, dirname
from shutil import rmtree, copytree

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from bugfinder.dataset.processing import DatasetProcessing, DatasetProcessingCategory
from bugfinder.settings import LOGGER
from bugfinder.utils.statistics import has_better_metrics
from os.path import join, exists
from shutil import rmtree, copytree

import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from pprint import pprint
from copy import deepcopy
import numpy as np
import pandas as pd
import xlsxwriter
from xlsxwriter.utility import xl_rowcol_to_cell
import sklearn.decomposition
from tqdm.notebook import tqdm
import math
import pickle
import random

In [None]:
# Keep the project logger's records away from its parent loggers and let it
# emit messages of level INFO and above.
LOGGER.propagate = False
LOGGER.setLevel(logging.INFO)

In [None]:
# NOTE(review): `output_path` is assigned again ("/mnt/data/aiwe") by the
# "Global parameters" cell right below, so this value only matters if that
# cell is skipped — confirm which folder is the intended one.
output_path = "/mnt/data/ai-bugfinder/aiwe"

In [None]:
# Global parameters
output_path = "/mnt/data/aiwe"
feature_file = "/mnt/data/ai-bugfinder/aiwe-all.csv"

# NOTE(review): later cells still reference `dataset_path`, `tensorboard_path`
# and `dataset`; their definitions are kept disabled here because loading the
# dataset needs a local path that has to be confirmed first.
# dataset_path = "../data/ds-rev-pca"
# tensorboard_path = "/home/pnd/tb-test"
# dataset = Dataset(dataset_path)

# Model hyper-parameters referenced by the training cells further down.
model_config = [10, 10, 10]
batch_size = 250
training_epochs = 5

# Full table as stored on disk: the feature columns followed by the two
# trailing result columns.
df = pd.read_csv(feature_file)

# Names of the feature columns only (result columns removed).
orig_cols = list(df.columns[:-2])

In [None]:
# Write the "result" column on its own, without the row index.
with open(f"{output_path}/output.csv", "w") as csv_file:
    df.loc[:, "result"].to_csv(csv_file, index=False)

In [None]:
# Feature table: everything except the last two columns of `df`.
cols = df.columns
feat_df = df.drop(columns=cols[-2:])

In [None]:
# Write the complete feature table, without the row index.
default_csv_path = f"{output_path}/default.csv"
with open(default_csv_path, "w") as csv_file:
    feat_df.to_csv(csv_file, index=False)

In [None]:
# Export reduced tables keeping only the columns whose standard deviation
# reaches a threshold (one CSV per threshold).
# NOTE(review): the filter runs on `df`, which still contains the two trailing
# result columns, whereas default.csv is written from `feat_df` — confirm
# whether the result columns are meant to be part of these files.
column_std = df.std()

for std_limit in [1e-2, 1e-1]:
    # Select by label rather than by position: positions in the std vector
    # only match `df.columns` when `std()` returns one value per column.
    std_cols = list(column_std.index[column_std >= std_limit])

    df_significant = df[std_cols]

    with open(f"{output_path}/sign{str(std_limit).replace('.', '')}.csv", "w") as csv_file:
        df_significant.to_csv(csv_file, index=False)

In [None]:
def get_bags(bag_count, seed=None, min_features=50):
    """Split the feature columns of ``df`` into bags and write each bag to CSV.

    The feature columns (every column of the notebook-level ``df`` except the
    two trailing result columns) are optionally shuffled, cut into
    ``bag_count`` consecutive slices, and each non-empty slice is written to
    ``{output_path}/bag<idx>of<bag_count>with<size>features<seed>random.csv``.
    For every bag, reduced variants keeping only the columns whose standard
    deviation reaches 1e-2 / 1e-1 are written next to it, provided they keep
    at least ``min_features`` columns.

    Args:
        bag_count: Number of slices the feature columns are cut into.
        seed: Seed used to shuffle the columns before slicing. ``None``
            (default) keeps the original column order; ``0`` is a valid seed.
        min_features: Minimum number of columns a reduced variant must keep
            in order to be written.

    Returns:
        The list of non-empty bag DataFrames, in bag order.
    """
    feature_cols = list(df.columns[:-2])

    # Slice size; the "+ 1" guarantees that bag_count slices cover every
    # column, at the price of short or empty slices at the tail.
    item_count = round(len(feature_cols) / bag_count) + 1
    print(f"Creating {bag_count} bags containing {item_count} features...")

    if seed is not None:
        # A dedicated generator yields the same ordering as seeding the global
        # one, without disturbing the random state of the rest of the notebook.
        random.Random(seed).shuffle(feature_cols)

    column_bags = [
        feature_cols[start:start + item_count]
        for start in range(0, bag_count * item_count, item_count)
    ]

    print(f"Creating list of features...")
    # Empty slices can only appear at the tail; dropping them keeps the
    # numbering of the remaining bags unchanged.
    bag_dfs = [df[columns] for columns in column_bags if columns]

    for bag_idx, bag_df in enumerate(bag_dfs):
        filename = f"bag{bag_idx:02d}of{bag_count}with{item_count}features{seed}random"
        bag_df.to_csv(f"{output_path}/{filename}.csv", index=False)

        bag_std = bag_df.std()
        for std_limit in [1e-2, 1e-1]:
            # Label-based selection of the columns that vary enough.
            std_cols = list(bag_std.index[bag_std >= std_limit])

            if len(std_cols) < min_features:
                continue

            bag_df[std_cols].to_csv(
                f"{output_path}/{filename}{std_limit}.csv", index=False
            )

    return bag_dfs
        

In [None]:
# Draw four reproducible shuffling seeds, then build every bag layout once in
# the original column order and once per seed.
random.seed(101)
bags = [10, 20, 30, 40, 50]
seeds = [random.randint(10, 99) for _ in range(4)]
print(f"Seeds: {seeds}")

for bag_count in bags:
    for seed in [None, *seeds]:
        get_bags(bag_count, seed)

In [None]:
# Number of principal components requested for every PCA fit below.
pca_components = 50


def get_pca_weights(X, n_comp):
    """Fit a PCA with ``n_comp`` components on ``X``.

    Returns:
        A ``(pca, mu)`` pair: the fitted ``sklearn.decomposition.PCA`` object
        and the column-wise mean of ``X``.
    """
    column_means = np.mean(X, axis=0)
    fitted_pca = sklearn.decomposition.PCA(n_components=n_comp).fit(X)
    return fitted_pca, column_means
    
# NOTE(review): relies on a notebook-level `df_list` (one DataFrame per bag)
# being available — confirm which cell is expected to provide it.
print("Calculating PCA operations...")
pca_compute_list = [
    get_pca_weights(bag_df, pca_components) for bag_df in df_list
]

# Project every bag with the PCA that was fitted on it.
print("Computing PCA...")
df_list_pca = [
    fitted_pca.transform(bag_df)
    for (fitted_pca, _), bag_df in zip(pca_compute_list, df_list)
]

# print("Reversing PCA...")
# df_list_pca_rev = [
#     np.dot(
#         df_list_pca[idx][:,:pca_components], 
#         pca_compute_list[idx][0].components_[:pca_components,:]
#     ) + pca_compute_list[idx][1]
#     for idx in range(len(df_list))
# ]

In [None]:
def train(input_data, output_data, model_config, model_dir, batch_size, training_epochs, max_training_items=None):
    """Train a binary DNN classifier and return its F1 score on a held-out split.

    The data is split 67% / 33% (fixed random state), the model is trained and
    evaluated ``training_epochs`` times, and the score is computed from the
    precision and recall of the last evaluation.

    Args:
        input_data: 2-D feature matrix (array or DataFrame), one row per sample.
        output_data: Labels aligned with the rows of ``input_data``.
        model_config: Hidden layer sizes, passed as ``hidden_units``.
        model_dir: Directory handed to the estimator as ``model_dir``.
        batch_size: Batch size of the training and evaluation input functions.
        training_epochs: Number of train/evaluate rounds; must be at least 1.
        max_training_items: Passed as ``steps`` to ``model.train`` for every
            round (``None`` leaves the number of steps unbounded).

    Returns:
        The F1 score of the last evaluation, or ``0.0`` when precision and
        recall are both zero.

    Raises:
        ValueError: If ``training_epochs`` is lower than 1.
    """
    if training_epochs < 1:
        raise ValueError("training_epochs must be at least 1, got %r" % (training_epochs,))

    # Renaming input columns to avoid forbidden characters
    columns = [
        f"feat{feature_nb:03d}" for feature_nb in range(input_data.shape[1])
    ]
    # Go through a plain array so that an incoming DataFrame is relabelled
    # instead of being re-indexed on the new names (which would give NaNs
    # only); its row index, if any, is preserved.
    input_data = pd.DataFrame(
        np.asarray(input_data),
        index=getattr(input_data, "index", None),
        columns=columns,
    )

    # Splitting into training set and test set
    input_train, input_test, output_train, output_test = train_test_split(
        input_data, output_data, test_size=0.33, random_state=101
    )

    train_fn = tf.estimator.inputs.pandas_input_fn(
        x=input_train,
        y=output_train,
        shuffle=True,
        batch_size=batch_size,
    )
    test_fn = tf.estimator.inputs.pandas_input_fn(
        x=input_test, y=output_test, shuffle=False, batch_size=batch_size
    )

    # Creating the model
    model = tf.estimator.DNNClassifier(
        hidden_units=model_config,
        feature_columns=[
            tf.feature_column.numeric_column(col) for col in columns
        ],
        n_classes=2,
        model_dir=model_dir,
    )

    LOGGER.debug(
        "Training %s on %d samples and testing on %d samples..." % (
            model.__class__.__name__, input_train.shape[0], input_test.shape[0]
        )
    )

    # Train the model and evaluate for the given number of epochs; only the
    # metrics of the last evaluation are used for the score.
    for epoch_num in range(training_epochs):
        LOGGER.debug("Training dataset for epoch %d/%d..." % (epoch_num + 1, training_epochs))
        model.train(input_fn=train_fn, steps=max_training_items)
        preds = model.evaluate(input_fn=test_fn)

    precision, recall = preds["precision"], preds["recall"]
    if precision + recall == 0:
        # F1 is undefined here; report the worst score instead of NaN.
        return 0.0

    return 2 * precision * recall / (precision + recall)

In [None]:
# Train one model per PCA-reduced bag and collect the resulting scores.
# NOTE(review): relies on notebook-level `dataset_path`, `dataset`,
# `model_config`, `batch_size` and `training_epochs` being defined by an
# earlier cell — confirm.
LOGGER.info("Training %s..." % dataset_path)
scores = []
dataset_count = len(df_list_pca)

for df_idx, pca_features in enumerate(df_list_pca):
    print(f"Training with dataset {df_idx+1}/{dataset_count}")

    scores.append(
        train(
            pca_features,
            dataset.features["result"],
            model_config,
            None,
            batch_size,
            training_epochs * 3,
        )
    )

print(scores)
print("Job done!")

In [None]:
# Display the scores collected by the training loop.
scores

In [None]:
def init_model(dataset_path, input_data, model_config, model_dir):
    """Build a binary ``DNNClassifier`` sized after ``input_data``.

    Args:
        dataset_path: Path of the dataset to load.
        input_data: 2-D matrix; only its number of columns is used, to name
            the feature columns ``feat000``, ``feat001``, ...
        model_config: Hidden layer sizes, passed as ``hidden_units``.
        model_dir: Directory handed to the estimator as ``model_dir``.

    Returns:
        A ``(model, columns)`` pair: the estimator and the list of feature
        column names it was built with.
    """
    LOGGER.info(f"Loading dataset at {dataset_path}")
    # NOTE(review): the loaded dataset is not used afterwards — confirm
    # whether loading it here is still needed.
    loaded_dataset = Dataset(dataset_path)

    feature_count = input_data.shape[1]
    columns = [f"feat{feature_nb:03d}" for feature_nb in range(feature_count)]
    LOGGER.info(f"Found {len(columns)} columns")

    feature_columns = list(map(tf.feature_column.numeric_column, columns))
    model = tf.estimator.DNNClassifier(
        hidden_units=model_config,
        feature_columns=feature_columns,
        n_classes=2,
        model_dir=model_dir,
    )

    return model, columns

In [None]:
# Reload trained model
# NOTE(review): relies on notebook-level `dataset_path`, `pcaX`,
# `model_config` and `tensorboard_path` being defined by an earlier cell.
chosen_model, model_cols = init_model(
    dataset_path=dataset_path,
    input_data=pcaX,
    model_config=model_config,
    model_dir=join(tensorboard_path, basename(dataset_path)),
)
LOGGER.info("Model loaded!")

In [None]:
# List the names of the variables held by the reloaded model.
chosen_model.get_variable_names()

In [None]:
# Build, for every layer that owns a "kernel" variable, a table with one row
# per neuron and one column per input, plus a trailing "bias" column.
chosen_model_vars = [var for var in chosen_model.get_variable_names() if basename(var) == "kernel"]

# Hidden layers first, shorter names before longer ones so that
# "hiddenlayer_10" comes after "hiddenlayer_9" (a plain lexicographic sort
# puts it before "hiddenlayer_2"), then every other layer.
# NOTE(review): assumes the estimator names its layers "hiddenlayer_<n>" and
# "logits" — confirm for the TensorFlow version in use.
layers = sorted(
    {basename(dirname(var)) for var in chosen_model_vars},
    key=lambda name: (not name.startswith("hiddenlayer_"), len(name), name),
)

weights = dict()
# Input names of the first layer; replaced by each layer's neuron names so
# that they label the inputs of the following layer.
cols = model_cols

for layer in layers:
    # Transposed so that each row holds the incoming weights of one neuron.
    kernel = chosen_model.get_variable_value(f"dnn/{layer}/kernel").transpose()

    layer_name = f"dnn/{layer}/bias"
    bias = chosen_model.get_variable_value(layer_name)
    print(f"Layer size for {layer_name}: {bias.shape}")

    layer_weights = np.zeros((kernel.shape[0], kernel.shape[1]+1))
    layer_weights[:, :-1] = kernel
    layer_weights[:, -1] = bias

    weights[layer] = pd.DataFrame(layer_weights, columns=list(cols)+["bias"])

    cols = [f"{layer}_n{index}" for index in range(layer_weights.shape[0])]

In [None]:
# Draw 25 samples, project them with the PCA and run them through the model.
# NOTE(review): relies on a notebook-level `pca` object being defined by an
# earlier cell — confirm.
randX = df.sample(n=25, random_state=101)
LOGGER.info(f"Testing with matrix {randX.shape}...")

pca_randX = pca.transform(randX)
LOGGER.info(f"Matrix size after PCA: {pca_randX.shape}")
pca_randX = pd.DataFrame(pca_randX, columns=model_cols)

input_fn = tf.estimator.inputs.pandas_input_fn(x=pca_randX, shuffle=False)
predictions = [prediction for prediction in chosen_model.predict(input_fn)]
print(predictions[0])

# Both tables are kept as plain arrays from here on.
randX, pca_randX = randX.to_numpy(), pca_randX.to_numpy()
LOGGER.info("Predictions generated")

In [None]:
# NOTE(review): this folder is spelled "ai_bugfinder" while the feature file
# is read from ".../ai-bugfinder/..." — confirm the intended location.
workbook_path = "/mnt/data/ai_bugfinder/model-weights.xlsx"

LOGGER.info(f"Starting writing {basename(workbook_path)}...")
workbook = xlsxwriter.Workbook(workbook_path)

# Top-left cell of the data area on every sheet.
start_row, start_col = 3, 1

# "Sample data" sheet: three tables stacked vertically, separated by
# `start_row` rows, each row labelled "item<n>" in column 0.
LOGGER.info("Writing sample data...")
worksheet = workbook.add_worksheet("Sample data")

# Table 1: the raw samples.
row_idx = start_row
for row in randX:
    worksheet.write(row_idx, 0, f"item{row_idx-start_row}")
    for col_idx, item in enumerate(row, start=start_col):
        worksheet.write(row_idx, col_idx, item)
    row_idx += 1

# Table 2: the same samples after the PCA projection.
row_idx += start_row
for row in pca_randX:
    worksheet.write(row_idx, 0, f"item{row_idx-2*start_row-len(randX)}")
    for col_idx, item in enumerate(row, start=start_col):
        worksheet.write(row_idx, col_idx, item)
    row_idx += 1

# Table 3: the predictions — class probabilities, then logits and logistic.
row_idx += start_row
for row in predictions:
    worksheet.write(row_idx, 0, f"item{row_idx-3*start_row-len(randX)*2}")

    probabilities = row["probabilities"]
    for col_idx, item in enumerate(probabilities, start=start_col):
        worksheet.write(row_idx, col_idx, item)

    # The two remaining values go right after the last probability.
    next_col = start_col + len(probabilities)
    worksheet.write(row_idx, next_col, row["logits"])
    worksheet.write(row_idx, next_col + 1, row["logistic"])

    row_idx += 1


# "PCA weights" sheet: row labels in column 0, column headers in the row
# right above the data, data starting at (start_row, start_col).
# NOTE(review): relies on notebook-level `dataset` and `pca_weights` being
# defined by an earlier cell — confirm.
LOGGER.info(f"Writing PCA weights...")
worksheet = workbook.add_worksheet("PCA weights")
cols = dataset.features.columns[:-2]

# Every row but the last one is labelled as a component; the last one is
# labelled "mu".
last_row_nb = pca_weights.shape[0] - 1
for x in range(last_row_nb):
    worksheet.write(start_row + x, 0, f"pca{x}")

worksheet.write(start_row + last_row_nb, 0, "mu")

for y, col_name in enumerate(cols):
    worksheet.write(start_row - 1, start_col + y, col_name)

row_idx = start_row

for row in pca_weights:
    col_idx = start_col

    for item in row:
        worksheet.write(row_idx, col_idx, item)
        col_idx += 1

    # Colour scale applied per row, over the cells that were just written.
    start = xl_rowcol_to_cell(row_idx, start_col)
    end = xl_rowcol_to_cell(row_idx, col_idx-1)

    worksheet.conditional_format(f"{start}:{end}", {
        "type": "3_color_scale",
        "min_color": "blue",
        "mid_color": "white",
        "max_color": "red"
    })
    row_idx += 1

# "DNN weights" sheet: one table per layer, stacked vertically. Each table
# has a header row with the layer's column names followed by one row of
# values per row of the layer's weight table.
LOGGER.info(f"Writing DNN weights...")
worksheet = workbook.add_worksheet("DNN weights")

row_idx = start_row

for layer, weight_df in weights.items():
    col_idx = start_col
        
    # For every table after the first one: step back to the rows of the
    # previous table and label them (column 0) with this table's column names
    # minus the trailing one, then return to where this table starts.
    # NOTE(review): the rows of the last table never receive a label this way.
    if row_idx != start_row:
        # Undo the gap added after the previous table and move to the row
        # right above its first value row.
        row_idx -= len(weight_df.columns) + start_row - 1
        
        for col in weight_df.columns[:-1]:
            row_idx += 1
            worksheet.write(row_idx, 0, col)
            
        # Back at the last row of the previous table: re-apply the gap.
        row_idx += start_row
    
    # Header row of the current table.
    for col in weight_df.columns:
        worksheet.write(row_idx, col_idx, col)
        col_idx += 1
        
        
    # One sheet row per row of the weight table, each with its own colour scale.
    for index, item in weight_df.iterrows():
        row_idx += 1
        col_idx = start_col
        
        for item_val in item.values:
            worksheet.write(row_idx, col_idx, item_val)
            col_idx += 1
            
        start = xl_rowcol_to_cell(row_idx, 1)
        end = xl_rowcol_to_cell(row_idx, col_idx-1)
        worksheet.conditional_format(f"{start}:{end}", {
            "type": "3_color_scale",
            "min_color": "blue",
            "mid_color": "white",
            "max_color": "red"
        })
        
    # Leave a gap before the next table.
    row_idx += start_row
    

# Closing the workbook ends the export.
workbook.close()
LOGGER.info("Job done!")