In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from utility import Dataset, oh_encoder, t_encoder, l_encoder, scaler, get_best_features, cfs, rfe, sfs, pca, eval_metric
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

from sklearn.metrics import roc_curve, auc

# Column layout of the raw files: 41 traffic features followed by the class
# 'label' and a trailing 'score' field (43 names in total).
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label', 'score',
]

# String-valued categorical columns and 0/1 flag columns, listed explicitly.
nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']

# Everything else is treated as numeric, except the target, the score field and
# 'num_outbound_cmds'. The original order from `columns` is preserved.
_non_numeric = set(nominal_features) | set(binary_features) | {'label', 'score', 'num_outbound_cmds'}
numeric_features = [name for name in columns if name not in _non_numeric]

# Raw NSL-KDD splits. The files are read with header=None, so the columns are
# positional here; the names in `columns` are handed to `Dataset` later on.
starting_train_df = pd.read_csv('dataset/nsl-kdd/KDDTrain+.txt', header=None)
starting_test_df = pd.read_csv('dataset/nsl-kdd/KDDTest+.txt', header=None)

def _fit_and_report(models, train_df, test_df, X_train, X_test):
    """Fit each model on the selected features and print its test-set report.

    Labels are read from the ``'label'`` column of ``train_df`` / ``test_df``
    at call time, i.e. after the feature-selection helper has already run.

    Args:
        models: Mapping of display name -> estimator with ``fit``/``predict``.
        train_df, test_df: Frames that carry the ``'label'`` column.
        X_train, X_test: Feature matrices returned by a selection helper.
    """
    y_train = train_df['label']
    y_test = test_df['label']
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Pin `labels` to the fitted classes so every display name is paired
        # with the right class even if the test split lacks one, and stringify
        # the names because classification_report measures their length.
        class_labels = list(model.classes_)
        print(classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=[str(label) for label in class_labels],
        ))


def pipeline(train_df, test_df, random_state=42):
    """Run the full feature-selection x classifier sweep and print the reports.

    For every ``k`` from 1 to the number of non-label columns in ``train_df``:
      * select ``k`` features with ``get_best_features`` using chi2, f_classif
        and mutual_info_classif in turn,
      * select ``k`` features with ``rfe``,
      * project to ``k`` components with ``pca``,
    and after the loop select features once with ``cfs``. After each selection
    step, five classifiers are fitted and a ``classification_report`` on the
    test split is printed.

    The sweep trains 5 models for 5 selection methods at every ``k``, so it is
    expensive on wide (e.g. one-hot encoded) frames.

    Args:
        train_df: Training frame containing a ``'label'`` column.
        test_df: Test frame with the same columns as ``train_df``.
        random_state: Seed given to the estimators that accept one, so repeated
            runs print the same numbers. Pass ``None`` for unseeded behaviour.

    Returns:
        None. Results are printed only.
    """
    models = {
        'Logistic Regression': LogisticRegression(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'LinearSVC': LinearSVC(random_state=random_state),
        'GaussianNB': GaussianNB()
    }

    n_features = train_df.drop(columns=['label']).shape[1]

    for k in range(1, n_features + 1):
        for method in [chi2, f_classif, mutual_info_classif]:
            print(f"Selecting {k} best features using {method.__name__}...")
            X_train, X_test = get_best_features(train_df, test_df, method, k)
            _fit_and_report(models, train_df, test_df, X_train, X_test)

        # Announce each step before running it so the log reflects progress.
        print(f"Selecting {k} best features using RFE...")
        X_train, X_test = rfe(train_df, test_df, k)
        _fit_and_report(models, train_df, test_df, X_train, X_test)

        print(f"Selecting {k} best features using PCA...")
        X_train, X_test = pca(train_df, test_df, k)
        _fit_and_report(models, train_df, test_df, X_train, X_test)

    # CFS takes no k, so it runs once outside the loop.
    print(f"Selecting best features using CFS...")
    X_train, X_test = cfs(train_df, test_df)
    _fit_and_report(models, train_df, test_df, X_train, X_test)

1. Con 2 label

In [8]:
# Build the 2-label train/test frames (Dataset.get_label2) from the raw tables.
train_df, test_df = (
    Dataset(raw_df, columns).get_label2()
    for raw_df in (starting_train_df, starting_test_df)
)

1.1 Scalato

In [9]:
# Min-max scale the numeric columns of both 2-label splits via the project helper.
# train_df/test_df must still be the frames from the loading cell above: later
# cells rebind those names to encoded frames.
scaled_train_df, scaled_test_df = scaler(train_df, test_df, numeric_features, MinMaxScaler())

1.1.1 Solo one hot encoder

In [10]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the scaled frames.
train_df, test_df = oh_encoder(scaled_train_df, scaled_test_df, ['protocol_type', 'flag', 'service'])

1.1.1.1 f_classif \
1.1.1.2 chi2 \
1.1.1.3 mutual_info_classif \
1.1.1.4 cfs \
1.1.1.5 rfe \
1.1.1.6 pca 

In [None]:
# Sweep: 2 labels, MinMax-scaled, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.1.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the scaled frames.
train_df, test_df = l_encoder(scaled_train_df, scaled_test_df, ['service', 'protocol_type', 'flag'])

1.1.2.1 f_classif \
1.1.2.2 chi2 \
1.1.2.3 mutual_info_classif \
1.1.2.4 cfs \
1.1.2.5 rfe \
1.1.2.6 pca 

In [None]:
# Sweep: 2 labels, MinMax-scaled, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.1.3 Solo target encoder

In [None]:
# Target encoding (project helper `t_encoder`) of all three nominal columns of the scaled frames.
train_df, test_df = t_encoder(scaled_train_df, scaled_test_df, ['service', 'protocol_type', 'flag'])

1.1.3.1 f_classif \
1.1.3.2 chi2 \
1.1.3.3 mutual_info_classif \
1.1.3.4 cfs \
1.1.3.5 rfe \
1.1.3.6 pca 

In [None]:
# Sweep: 2 labels, MinMax-scaled, target encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.1.4 ohe + label

In [None]:
# Mixed encoding on the scaled frames: one-hot for protocol_type/flag first,
# then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(scaled_train_df, scaled_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

1.1.4.1 f_classif \
1.1.4.2 chi2 \
1.1.4.3 mutual_info_classif \
1.1.4.4 cfs \
1.1.4.5 rfe \
1.1.4.6 pca 

In [None]:
# Sweep: 2 labels, MinMax-scaled, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.1.5 ohe + target

In [None]:
# Mixed encoding on the scaled frames: target-encode service first,
# then one-hot protocol_type/flag on that result.
te_train_df, te_test_df = t_encoder(scaled_train_df, scaled_test_df, ['service'])
train_df, test_df = oh_encoder(te_train_df, te_test_df, ['protocol_type', 'flag'])

1.1.5.1 f_classif \
1.1.5.2 chi2 \
1.1.5.3 mutual_info_classif \
1.1.5.4 cfs \
1.1.5.5 rfe \
1.1.5.6 pca 

In [None]:
# Sweep: 2 labels, MinMax-scaled, target (service) + one-hot (protocol_type, flag).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.2 Non scalato

In [None]:
# Rebuild the 2-label frames from the raw tables for the unscaled experiments:
# train_df/test_df were rebound to encoded frames by the cells above.
unscaled_train_df, unscaled_test_df = (
    Dataset(raw_df, columns).get_label2()
    for raw_df in (starting_train_df, starting_test_df)
)

1.2.1 Solo one hot encoder

In [None]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the unscaled frames.
train_df, test_df = oh_encoder(unscaled_train_df, unscaled_test_df, ['protocol_type', 'flag', 'service'])

1.2.1.1 f_classif \
1.2.1.2 chi2 \
1.2.1.3 mutual_info_classif \
1.2.1.4 cfs \
1.2.1.5 rfe \
1.2.1.6 pca 

In [None]:
# Sweep: 2 labels, unscaled, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.2.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the unscaled frames.
train_df, test_df = l_encoder(unscaled_train_df, unscaled_test_df, ['service', 'protocol_type', 'flag'])

1.2.2.1 f_classif \
1.2.2.2 chi2 \
1.2.2.3 mutual_info_classif \
1.2.2.4 cfs \
1.2.2.5 rfe \
1.2.2.6 pca 

In [None]:
# Sweep: 2 labels, unscaled, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.2.3 Solo target encoder

In [None]:
# Target encoding (project helper `t_encoder`) of all three nominal columns of the unscaled frames.
train_df, test_df = t_encoder(unscaled_train_df, unscaled_test_df, ['service', 'protocol_type', 'flag'])

1.2.3.1 f_classif \
1.2.3.2 chi2 \
1.2.3.3 mutual_info_classif \
1.2.3.4 cfs \
1.2.3.5 rfe \
1.2.3.6 pca 

In [None]:
# Sweep: 2 labels, unscaled, target encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.2.4 ohe + label

In [None]:
# Mixed encoding on the unscaled frames: one-hot for protocol_type/flag first,
# then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(unscaled_train_df, unscaled_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

1.2.4.1 f_classif \
1.2.4.2 chi2 \
1.2.4.3 mutual_info_classif \
1.2.4.4 cfs \
1.2.4.5 rfe \
1.2.4.6 pca 

In [None]:
# Sweep: 2 labels, unscaled, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.2.5 ohe + target

In [None]:
# Mixed encoding on the unscaled frames: target-encode service first,
# then one-hot protocol_type/flag on that result.
te_train_df, te_test_df = t_encoder(unscaled_train_df, unscaled_test_df, ['service'])
train_df, test_df = oh_encoder(te_train_df, te_test_df, ['protocol_type', 'flag'])

1.2.5.1 f_classif \
1.2.5.2 chi2 \
1.2.5.3 mutual_info_classif \
1.2.5.4 cfs \
1.2.5.5 rfe \
1.2.5.6 pca 

In [None]:
# Sweep: 2 labels, unscaled, target (service) + one-hot (protocol_type, flag).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.3 Discretizzato

In [None]:
# Reload the 2-label frames so binning starts from unscaled, unencoded values.
train_df = Dataset(starting_train_df, columns).get_label2()
test_df = Dataset(starting_test_df, columns).get_label2()

# Five equal-width bins per column, emitted as ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Bin only the numeric columns, in place on copies of the frames:
#  * the nominal columns are still strings here and cannot be binned,
#  * keeping the DataFrame preserves column names, which the encoder cells
#    below look up ('protocol_type', 'flag', 'service'),
#  * writing into a copy keeps 'label' on its original rows, with no
#    index-aligned concat of a freshly indexed frame.
# Assumes every name in numeric_features is a column of these frames, as the
# scaler cells also assume.
discretized_train_df = train_df.copy()
discretized_test_df = test_df.copy()
discretized_train_df[numeric_features] = discretizer.fit_transform(train_df[numeric_features])
discretized_test_df[numeric_features] = discretizer.transform(test_df[numeric_features])

1.3.1 Solo one hot encoder

In [None]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the discretized frames.
train_df, test_df = oh_encoder(discretized_train_df, discretized_test_df, ['protocol_type', 'flag', 'service'])

1.3.1.1 f_classif \
1.3.1.2 chi2 \
1.3.1.3 mutual_info_classif \
1.3.1.4 cfs \
1.3.1.5 rfe \
1.3.1.6 pca

In [None]:
# Sweep: 2 labels, discretized, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.3.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the discretized frames.
train_df, test_df = l_encoder(discretized_train_df, discretized_test_df, ['service', 'protocol_type', 'flag'])

1.3.2.1 f_classif \
1.3.2.2 chi2 \
1.3.2.3 mutual_info_classif \
1.3.2.4 cfs \
1.3.2.5 rfe \
1.3.2.6 pca 

In [None]:
# Sweep: 2 labels, discretized, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.3.3 Solo target encoder

In [None]:
# Target encoding (project helper `t_encoder`) of all three nominal columns of the discretized frames.
train_df, test_df = t_encoder(discretized_train_df, discretized_test_df, ['service', 'protocol_type', 'flag'])

1.3.3.1 f_classif \
1.3.3.2 chi2 \
1.3.3.3 mutual_info_classif \
1.3.3.4 cfs \
1.3.3.5 rfe \
1.3.3.6 pca 

In [None]:
# Sweep: 2 labels, discretized, target encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.3.4 ohe + label

In [None]:
# Mixed encoding on the discretized frames: one-hot for protocol_type/flag first,
# then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(discretized_train_df, discretized_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

1.3.4.1 f_classif \
1.3.4.2 chi2 \
1.3.4.3 mutual_info_classif \
1.3.4.4 cfs \
1.3.4.5 rfe \
1.3.4.6 pca 

In [None]:
# Sweep: 2 labels, discretized, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

1.3.5 ohe + target

In [None]:
# Mixed encoding on the discretized frames: target-encode service first,
# then one-hot protocol_type/flag on that result.
te_train_df, te_test_df = t_encoder(discretized_train_df, discretized_test_df, ['service'])
train_df, test_df = oh_encoder(te_train_df, te_test_df, ['protocol_type', 'flag'])

1.3.5.1 f_classif \
1.3.5.2 chi2 \
1.3.5.3 mutual_info_classif \
1.3.5.4 cfs \
1.3.5.5 rfe \
1.3.5.6 pca 

In [None]:
# Sweep: 2 labels, discretized, target (service) + one-hot (protocol_type, flag).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2. Con 5 label

In [None]:
# Build the 5-label train/test frames (Dataset.get_label5) from the raw tables.
train_df, test_df = (
    Dataset(raw_df, columns).get_label5()
    for raw_df in (starting_train_df, starting_test_df)
)

2.1 Scalato

In [None]:
# Min-max scale the numeric columns of both 5-label splits via the project helper.
# This rebinds scaled_train_df/scaled_test_df, replacing the 2-label versions.
# train_df/test_df must still be the frames from the loading cell above.
scaled_train_df, scaled_test_df = scaler(train_df, test_df, numeric_features, MinMaxScaler())

2.1.1 Solo one hot encoder

In [None]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the scaled 5-label frames.
train_df, test_df = oh_encoder(scaled_train_df, scaled_test_df, ['protocol_type', 'flag', 'service'])

2.1.1.1 f_classif \
2.1.1.2 chi2 \
2.1.1.3 mutual_info_classif \
2.1.1.4 cfs \
2.1.1.5 rfe \
2.1.1.6 pca 

In [None]:
# Sweep: 5 labels, MinMax-scaled, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.1.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the scaled 5-label frames.
train_df, test_df = l_encoder(scaled_train_df, scaled_test_df, ['service', 'protocol_type', 'flag'])

2.1.2.1 f_classif \
2.1.2.2 chi2 \
2.1.2.3 mutual_info_classif \
2.1.2.4 cfs \
2.1.2.5 rfe \
2.1.2.6 pca 

In [None]:
# Sweep: 5 labels, MinMax-scaled, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.1.3 ohe + label

In [None]:
# Mixed encoding on the scaled 5-label frames: one-hot for protocol_type/flag
# first, then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(scaled_train_df, scaled_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

2.1.3.1 f_classif \
2.1.3.2 chi2 \
2.1.3.3 mutual_info_classif \
2.1.3.4 cfs \
2.1.3.5 rfe \
2.1.3.6 pca 

In [None]:
# Sweep: 5 labels, MinMax-scaled, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.2 Non scalato

In [None]:
# Rebuild the 5-label frames from the raw tables for the unscaled experiments:
# train_df/test_df were rebound to encoded frames by the cells above.
unscaled_train_df, unscaled_test_df = (
    Dataset(raw_df, columns).get_label5()
    for raw_df in (starting_train_df, starting_test_df)
)

2.2.1 Solo one hot encoder

In [None]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the unscaled 5-label frames.
train_df, test_df = oh_encoder(unscaled_train_df, unscaled_test_df, ['protocol_type', 'flag', 'service'])

2.2.1.1 f_classif \
2.2.1.2 chi2 \
2.2.1.3 mutual_info_classif \
2.2.1.4 cfs \
2.2.1.5 rfe \
2.2.1.6 pca 

In [None]:
# Sweep: 5 labels, unscaled, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.2.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the unscaled 5-label frames.
train_df, test_df = l_encoder(unscaled_train_df, unscaled_test_df, ['service', 'protocol_type', 'flag'])

2.2.2.1 f_classif \
2.2.2.2 chi2 \
2.2.2.3 mutual_info_classif \
2.2.2.4 cfs \
2.2.2.5 rfe \
2.2.2.6 pca 

In [None]:
# Sweep: 5 labels, unscaled, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.2.3 ohe + label

In [None]:
# Mixed encoding on the unscaled 5-label frames: one-hot for protocol_type/flag
# first, then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(unscaled_train_df, unscaled_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

2.2.3.1 f_classif \
2.2.3.2 chi2 \
2.2.3.3 mutual_info_classif \
2.2.3.4 cfs \
2.2.3.5 rfe \
2.2.3.6 pca 

In [None]:
# Sweep: 5 labels, unscaled, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.3 Discretizzato

In [None]:
# Reload the 5-label frames so binning starts from unscaled, unencoded values.
train_df = Dataset(starting_train_df, columns).get_label5()
test_df = Dataset(starting_test_df, columns).get_label5()

# Five equal-width bins per column, emitted as ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Bin only the numeric columns, in place on copies of the frames:
#  * the nominal columns are still strings here and cannot be binned,
#  * keeping the DataFrame preserves column names, which the encoder cells
#    below look up ('protocol_type', 'flag', 'service'),
#  * writing into a copy keeps 'label' on its original rows, with no
#    index-aligned concat of a freshly indexed frame.
# Assumes every name in numeric_features is a column of these frames, as the
# scaler cells also assume.
discretized_train_df = train_df.copy()
discretized_test_df = test_df.copy()
discretized_train_df[numeric_features] = discretizer.fit_transform(train_df[numeric_features])
discretized_test_df[numeric_features] = discretizer.transform(test_df[numeric_features])

2.3.1 Solo one hot encoder

In [None]:
# One-hot encoding (project helper `oh_encoder`) of all three nominal columns of the discretized 5-label frames.
train_df, test_df = oh_encoder(discretized_train_df, discretized_test_df, ['protocol_type', 'flag', 'service'])

2.3.1.1 f_classif \
2.3.1.2 chi2 \
2.3.1.3 mutual_info_classif \
2.3.1.4 cfs \
2.3.1.5 rfe \
2.3.1.6 pca

In [None]:
# Sweep: 5 labels, discretized, one-hot encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.3.2 Solo label encoder

In [None]:
# Label encoding (project helper `l_encoder`) of all three nominal columns of the discretized 5-label frames.
train_df, test_df = l_encoder(discretized_train_df, discretized_test_df, ['service', 'protocol_type', 'flag'])

2.3.2.1 f_classif \
2.3.2.2 chi2 \
2.3.2.3 mutual_info_classif \
2.3.2.4 cfs \
2.3.2.5 rfe \
2.3.2.6 pca 

In [None]:
# Sweep: 5 labels, discretized, label encoded.
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)

2.3.3 ohe + label

In [None]:
# Mixed encoding on the discretized 5-label frames: one-hot for
# protocol_type/flag first, then label-encode service on that result.
ohe_train_df, ohe_test_df = oh_encoder(discretized_train_df, discretized_test_df, ['protocol_type', 'flag'])
train_df, test_df = l_encoder(ohe_train_df, ohe_test_df, ['service'])

2.3.3.1 f_classif \
2.3.3.2 chi2 \
2.3.3.3 mutual_info_classif \
2.3.3.4 cfs \
2.3.3.5 rfe \
2.3.3.6 pca

In [None]:
# Sweep: 5 labels, discretized, one-hot (protocol_type, flag) + label (service).
# train_df/test_df are rebound by every encoding cell; run the cell directly above first.
pipeline(train_df, test_df)