# 05. Dimension reduction algorithms

https://scikit-learn.org/stable/modules/feature_selection.html

## 05.a. Imports, logging configuration and dataset preparation

In [None]:
# Enable these lines if live changes are made to the codebase
# %load_ext autoreload
# %autoreload 2

In [None]:
# Disable tensorflow logging
import os
import logging
# Restrict the Python-side "tensorflow" logger to FATAL records.
logging.getLogger("tensorflow").setLevel(logging.FATAL)
# Set the native log threshold variable to '3' ('0', '1' and '2' are the
# other accepted values).
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "3"})

In [None]:
# The notebooks run from a sub-folder: add the parent directory ("..") to the
# module search path so that imports located there can be resolved.
import sys
sys.path.append("..")

In [None]:
import logging
from bugfinder.settings import LOGGER
from bugfinder.dataset import CWEClassificationDataset as Dataset

In [None]:
from bugfinder.features.reduction.variance_threshold import FeatureSelector as VarianceThreshold
from bugfinder.features.reduction.univariate_select import FeatureSelector as UnivariateSelect
from bugfinder.features.reduction.select_from_model import FeatureSelector as SelectFromModel
from bugfinder.features.reduction.auto_encoder import FeatureSelector as AutoEncoder
from bugfinder.features.reduction.sequential_feature_selector import FeatureSelector as SequentialFeatureSelector
from bugfinder.features.reduction.pca import FeatureSelector as PCA
from bugfinder.features.reduction.recursive_feature_elimination import FeatureSelector as RecursiveFeatureElimination

In [None]:
# Set the LOGGER threshold to INFO.
# NOTE(review): assuming LOGGER is a standard `logging.Logger`, this emits INFO
# and more severe records (not only INFO) and hides DEBUG — confirm.
LOGGER.setLevel(logging.INFO)

In [None]:
# Dataset directories (DO NOT EDIT)
# One entry per line; the v31x/v32x entries are currently commented out.
cwe121_v__2_dataset_path = [
    "../data/cwe121_v112",
    "../data/cwe121_v122",
    "../data/cwe121_v212",
    "../data/cwe121_v222",
    # "../data/cwe121_v312",
    # "../data/cwe121_v322",
]
cwe121_v__3_dataset_path = [
    "../data/cwe121_v113",
    "../data/cwe121_v123",
    "../data/cwe121_v213",
    "../data/cwe121_v223",
    # "../data/cwe121_v313",
    # "../data/cwe121_v323",
]

## Low variance filter

Removes features whose values change very little across samples (i.e. features with low variance).

In [None]:
# Value handed to the VarianceThreshold operation as its "threshold" option.
threshold = 0.995

In [None]:
# Queue the low-variance filter with "dry_run" enabled and process it; the
# `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        VarianceThreshold,
        {"threshold": threshold, "dry_run": True},
    )
    dataset.process()

## Univariate feature selection


In [None]:
# Candidate scoring functions; the first entry is the one selected.
scoring_functions = [
    "chi2",
    "f_classif",
    "mutual_info_classif",
]
scoring_fn = scoring_functions[0]

# Candidate selection modes; the first entry is the one selected.
scoring_modes = [
    "k_best",
    "percentile",
    "fpr",
    "fdr",
    "fwe",
]
scoring_mode = scoring_modes[0]

# Forwarded as the "param" option of the UnivariateSelect operation.
scoring_param = 200

In [None]:
# Queue the univariate selection with "dry_run" enabled and process it; the
# `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        UnivariateSelect,
        {
            "function": scoring_fn,
            "mode": scoring_mode,
            "param": scoring_param,
            "dry_run": True,
        },
    )
    dataset.process()

## Select from model

In [None]:
# Estimator names accepted as the "model" option; order is unchanged.
# NOTE(review): the rows are grouped by what looks like the scikit-learn
# estimator family, judging by the names only — confirm against the selector.
estimators = [
    "LogisticRegression", "LogisticRegressionCV",
    "PassiveAggressive", "Perceptron",
    "Ridge", "RidgeCV", "SGD",
    "DecisionTree", "ExtraTree",
    "AdaBoost", "ExtraTrees", "GradientBoosting", "RandomForest",
    "SVC", "SVR", "NuSVC", "NuSVR", "OneClassSVM",
]
# The first entry is the one selected.
estimator = estimators[0]

In [None]:
# Queue the model-based selection with "dry_run" enabled and process it; the
# `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        SelectFromModel,
        {"model": estimator, "dry_run": True},
    )
    dataset.process()

## Recursive feature elimination: 

Removes the features that have the least impact on the sum of squares error.

In [None]:
# Options for the RecursiveFeatureElimination operation.
estimator = estimators[0]  # first entry of the `estimators` list
cross_validation = False  # forwarded as the "cross_validation" option
features_to_keep = 1000  # forwarded as the "features" option

In [None]:
# Queue the recursive feature elimination with "dry_run" enabled and process
# it; the `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        RecursiveFeatureElimination,
        {
            "model": estimator,
            "cross_validation": cross_validation,
            "features": features_to_keep,
            "dry_run": True,
        },
    )
    dataset.process()

## Sequential feature selection: 

Adds the features that have the most impact on the sum of squares error.

-> Sequential Feature selection

In [None]:
# Options for the SequentialFeatureSelector operation.
estimator = estimators[0]  # first entry of the `estimators` list
directions = ["forward", "backward"]  # candidate values for "direction"
direction = directions[0]  # first entry of `directions`
features_to_keep = 1000  # forwarded as the "features" option

In [None]:
# Queue the sequential feature selection with "dry_run" enabled and process
# it; the `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        SequentialFeatureSelector,
        {
            "model": estimator,
            "direction": direction,
            "features": features_to_keep,
            "dry_run": True,
        },
    )
    dataset.process()

## Auto encoders

Defines a neural network with as many input and output neurons as there are features; the hidden layers have fewer neurons, which is what performs the dimension reduction.

-> create a new model

In [None]:
# Options for the AutoEncoder operation.
dimension = 250  # forwarded as the "dimension" option
layers = "500,100,500"  # forwarded as the "layers" option (comma-separated string)
# NOTE(review): presumably the file path of the encoder model, i.e. the value
# meant for the "encoder_path" option — confirm.
model_path = "/tmp/encoder.mdl"

In [None]:
# Queue the auto-encoder reduction with "dry_run" enabled and process it; the
# `[:1]` slice restricts the run to the first dataset directory.
for dataset_path in cwe121_v__2_dataset_path[:1]:
    # Hand the path to the logger as an argument so the message is only
    # formatted when the record is actually emitted.
    LOGGER.info("Processing %s...", dataset_path)
    dataset = Dataset(dataset_path)
    dataset.queue_operation(
        AutoEncoder,
        {
            "dimension": dimension,
            "layers": layers,
            # Use the model file path; `features_to_keep` is the integer
            # feature count left over from the previous sections, not a path.
            "encoder_path": model_path,
            "dry_run": True,
        },
    )
    dataset.process()