In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from utility import Dataset

# Names applied to the header-less NSL-KDD files; the final two entries are
# the 'label' and 'score' columns rather than input features.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label', 'score',
]

# Categorical (string-valued) features that need encoding.
nominal_features = ['protocol_type', 'service', 'flag']
# 0/1 flags; they are left out of the numeric group below.
binary_features = [
    'land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login',
]
# Everything that is neither nominal, binary, a target column, nor
# 'num_outbound_cmds'; order follows `columns`.
numeric_features = [
    name
    for name in columns
    if name not in {*nominal_features, *binary_features, 'label', 'score', 'num_outbound_cmds'}
]

# Read the raw NSL-KDD splits. The files have no header row, so pandas
# assigns integer column labels here; `columns` is handed to Dataset below.
train_raw = pd.read_csv('dataset/nsl-kdd/KDDTrain+.txt', header=None)
test_raw = pd.read_csv('dataset/nsl-kdd/KDDTest+.txt', header=None)

# Dataset comes from the project's `utility` module.
# NOTE(review): get_label5() presumably returns the frame with the label
# mapped to five classes -- confirm against utility.py.
train_df = Dataset(train_raw, columns).get_label5()
test_df = Dataset(test_raw, columns).get_label5()

# Encoding of categorical variables
- One-Hot Encoding converts categorical variables into binary vectors where each category is represented by a column, and a value of 1 indicates the presence of that category. While this method is effective for non-ordinal features, it can lead to a significant increase in the number of features, especially for high-cardinality variables (for instance, the feature "service" has 70 possible values)
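- Label Encoding assigns an integer to each category in a categorical variable. Each unique category is mapped to a specific number, so the number of features stays unchanged, which makes it more compact than One-Hot Encoding. The drawback is that the integers impose an artificial ordering on the categories, which some models may interpret as meaningful.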

In [21]:
def oh_encoder(train_df, test_df, nominal_features):
    """One-hot encode the nominal columns of the train and test frames.

    The encoder is fitted on ``train_df`` only and then applied to
    ``test_df``, so both outputs share the same indicator columns.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing every column listed in ``nominal_features``.
    nominal_features : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the inputs with the nominal columns removed and one
        ``"<feature>_<category>"`` indicator column appended per category
        seen in the training data.

    Raises
    ------
    ValueError
        Propagated from ``OneHotEncoder.transform`` when ``test_df`` contains
        a category that was not present in ``train_df`` (default
        ``handle_unknown='error'``).
    """
    enc = OneHotEncoder()
    train_encoded = enc.fit_transform(train_df[nominal_features]).toarray()
    test_encoded = enc.transform(test_df[nominal_features]).toarray()

    # enc.categories_[i] holds the categories learnt for nominal_features[i].
    new_columns = [
        f"{feature}_{str(cat)}"
        for feature, categories in zip(nominal_features, enc.categories_)
        for cat in categories
    ]

    # Reuse the source index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex on the encoded part would misalign rows (and pad with
    # NaN) whenever the input frame is not indexed 0..n-1.
    train_dummies = pd.DataFrame(train_encoded, columns=new_columns, index=train_df.index)
    test_dummies = pd.DataFrame(test_encoded, columns=new_columns, index=test_df.index)

    train_ohe = pd.concat([train_df.drop(nominal_features, axis=1), train_dummies], axis=1)
    test_ohe = pd.concat([test_df.drop(nominal_features, axis=1), test_dummies], axis=1)

    return train_ohe, test_ohe

def l_encoder(train_df, test_df, nominal_features):
    """Label-encode the nominal columns of the train and test frames.

    Each feature gets an encoder fitted on the training column only; the same
    mapping is then applied to the test column. The input frames are not
    modified: encoded copies are returned, so the original string-valued
    columns stay available (e.g. for a later call to ``oh_encoder``).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing every column listed in ``nominal_features``.
    nominal_features : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the inputs with each nominal column replaced by integer codes.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when a test column holds a
        value that did not occur in the corresponding training column.
    """
    # Work on copies so the caller's frames keep their original values.
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    for feature in nominal_features:
        # A fresh encoder per feature keeps the mappings independent.
        enc = LabelEncoder()
        train_encoded[feature] = enc.fit_transform(train_df[feature])
        test_encoded[feature] = enc.transform(test_df[feature])

    return train_encoded, test_encoded

# Use the shared nominal_features list instead of repeating its contents, so
# the encoded columns cannot drift from the feature groups defined above.
train_new, test_new = oh_encoder(train_df, test_df, nominal_features)
train_new.head()


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Scaling

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of each numeric column from the training split only, then
# apply that same scaling to both splits so no test statistics leak in.
scaler = MinMaxScaler()
scaler.fit(train_new[numeric_features])

scaled_train, scaled_test = train_new.copy(), test_new.copy()
scaled_train[numeric_features] = scaler.transform(train_new[numeric_features])
scaled_test[numeric_features] = scaler.transform(test_new[numeric_features])

scaled_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,3.558064e-07,0.0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.057999e-07,0.0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.681203e-07,6.223962e-06,0,0.0,0.0,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.442067e-07,3.20626e-07,0,0.0,0.0,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Feature Selection 
Several feature selection methods have been tried.

SelectKBest is a scikit-learn selection routine that keeps the top k features according to a statistical test. Several scoring functions are available:
- Mutual Information (mutual\_info\_classif): estimate mutual information for a discrete target variable, capturing non-linear relationships
- Chi-Square (chi2): compute chi-squared stats between each non-negative feature and class
- ANOVA F-Value (f\_classif): compute the ANOVA F-value for the provided sample

In [23]:
def get_best_features(train_data, test_data, score_func, k):
    """Keep the ``k`` best-scoring features according to ``score_func``.

    The selector is fitted on the training frame (all columns except
    ``'label'`` as inputs, ``'label'`` as target) and then applied to both
    frames. The names of the retained columns are printed.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)`` as returned by
        ``SelectKBest.transform``.
    """
    train_features = train_data.drop(columns=['label'])
    target = train_data['label']

    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(train_features, target)

    train_selected = selector.transform(train_features)
    test_selected = selector.transform(test_data.drop(columns=['label']))

    # Boolean mask over the training columns marking the retained features.
    kept_mask = selector.get_support()
    print(train_features.columns[kept_mask])
    return train_selected, test_selected

k = 30
# Run the same selection with each scoring function in turn; after the loop
# train_reduced / test_reduced hold the mutual-information result.
for score_func in (f_classif, chi2, mutual_info_classif):
    train_reduced, test_reduced = get_best_features(scaled_train, scaled_test, score_func, k)
    print(train_reduced.shape)

Index(['hot', 'logged_in', 'root_shell', 'is_guest_login', 'count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'protocol_type_icmp', 'protocol_type_tcp',
       'service_eco_i', 'service_http', 'service_private', 'flag_RSTR',
       'flag_S0', 'flag_SF'],
      dtype='object')
(125973, 30)
Index(['logged_in', 'root_shell', 'is_guest_login', 'count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'ds

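Correlation-based Feature Selection (CFS) selects features based on how well they correlate with the target variable while minimizing redundancy among the features. It aims to find a subset of features that are both highly relevant to the target and largely uncorrelated with each other. The implementation below is a simplified variant: it only removes redundancy, dropping one feature from every pair whose absolute correlation exceeds 0.9, and does not score relevance to the target.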

In [None]:
def cfs(train_data, test_data, threshold=0.9):
    """Drop features that are highly correlated with an earlier feature.

    This is a redundancy filter only: the pairwise absolute correlation
    matrix of the training features is computed, and a column is removed when
    its correlation with any column positioned before it exceeds
    ``threshold``. The target column is not used for scoring. The same columns
    are removed from the test frame. The retained column names are printed.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a ``'label'`` column plus the feature columns.
    threshold : float, default 0.9
        Absolute-correlation cut-off; a feature is dropped when the value is
        strictly greater than this.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test feature frames (without ``'label'``) restricted to the
        retained columns.
    """
    X_train = train_data.drop(['label'], axis=1)
    X_test = test_data.drop(['label'], axis=1)

    corr_matrix = X_train.corr().abs()
    # Keep only the strict upper triangle so each pair is examined once and a
    # column is compared solely against the columns that precede it.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN entries (masked cells, constant columns) compare as False here.
    is_redundant = (upper > threshold).any(axis=0)
    to_drop = is_redundant[is_redundant].index.tolist()

    X_train = X_train.drop(to_drop, axis=1)
    X_test = X_test.drop(to_drop, axis=1)
    print(X_train.columns)
    return X_train, X_test

k = 30  # not consumed by cfs(); the number of kept features follows from the correlation filter
train_reduced, test_reduced = cfs(train_data=scaled_train, test_data=scaled_test)
print(train_reduced.shape)

In Recursive Feature Elimination (RFE), an estimator is first trained on the initial set of features. Then the least important feature (or features, if a step > 1 is used) is pruned from the current set of features. This procedure is repeated recursively on the pruned set until the desired number of features is reached.

In [10]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def rfe(train_data, test_data, k, random_state=None):
    """Select ``k`` features by recursive feature elimination with a random forest.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a ``'label'`` column plus the feature columns.
    k : int
        Number of features to keep.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. With ``None`` the forest
        is unseeded and the selected subset can change between runs.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)`` as returned by
        ``RFE.transform``. The names of the retained columns are printed.
    """
    X = train_data.drop(['label'], axis=1)
    y = train_data['label']
    X_test = test_data.drop(['label'], axis=1)

    model = RandomForestClassifier(random_state=random_state)
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(model, n_features_to_select=k).fit(X, y)

    X_train_selected = selector.transform(X)
    X_test_selected = selector.transform(X_test)
    print(X.columns[selector.get_support()])
    return X_train_selected, X_test_selected


k = 10
# Seed the forest so the selected subset is reproducible under Restart & Run All.
train_reduced, test_reduced = rfe(scaled_train, scaled_test, k, random_state=123)
print(train_reduced.shape)


Index(['protocol_type', 'service', 'flag', 'count', 'same_srv_rate',
       'diff_srv_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_serror_rate'],
      dtype='object')
(125973, 10)


Sequential Feature Selection (SFS) is another iterative method that either adds (forward selection) or removes (backward selection) features based on the model’s performance. It builds up the feature set step-by-step to find the combination that maximizes a cross-validated score

In [47]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

def sfs(train_data, test_data, k, random_state=None):
    """Select ``k`` features by forward sequential selection with a random forest.

    Candidate subsets are scored by accuracy under 5-fold stratified
    cross-validation (shuffled, seeded with 123).

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a ``'label'`` column plus the feature columns.
    k : int
        Number of features to select.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. With ``None`` the forest
        is unseeded and the selected subset can change between runs even
        though the CV splits are fixed.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)`` as returned by
        ``SequentialFeatureSelector.transform``. The names of the retained
        columns are printed.
    """
    X = train_data.drop(['label'], axis=1)
    y = train_data['label']
    X_test = test_data.drop(['label'], axis=1)

    model = RandomForestClassifier(random_state=random_state)
    # Named `selector` so the local does not shadow this function's own name.
    selector = SequentialFeatureSelector(
        model,
        cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
        scoring='accuracy',
        direction='forward',
        n_features_to_select=k,
    ).fit(X, y)

    X_train_selected = selector.transform(X)
    X_test_selected = selector.transform(X_test)
    print(X.columns[selector.get_support()])
    return X_train_selected, X_test_selected


# Set k in this cell instead of relying on whatever value another cell left
# behind; 30 matches the feature count of the recorded run.
k = 30
train_reduced, test_reduced = sfs(scaled_train, scaled_test, k, random_state=123)
print(train_reduced.shape)

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'wrong_fragment', 'hot', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login',
       'srv_count', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_rerror_rate'],
      dtype='object')
(125973, 30)


Principal Component Analysis (PCA) is a dimensionality reduction technique that transforms the features into a smaller set of uncorrelated components while retaining as much of the variance of the original data as possible. Although it does not select features directly, PCA is useful for reducing noise and computational complexity, especially in high-dimensional datasets.

In [24]:
from sklearn.decomposition import PCA

def pca(train_data, test_data, k):
    """Project the features onto the first ``k`` principal components.

    The decomposition is fitted on the training features (every column except
    ``'label'``) and the same projection is applied to the test features.
    Diagnostic attributes of the fitted model are printed.

    Returns
    -------
    tuple
        ``(X_train_pca, X_test_pca)``, the projected train and test arrays.
    """
    train_features = train_data.drop(columns=['label'])
    test_features = test_data.drop(columns=['label'])

    reducer = PCA(n_components=k)
    X_train_pca = reducer.fit_transform(train_features)
    X_test_pca = reducer.transform(test_features)

    print(X_train_pca.shape)
    # Fitted-model diagnostics, printed in a fixed order.
    for attribute in ('components_', 'explained_variance_ratio_',
                      'explained_variance_', 'singular_values_'):
        print(getattr(reducer, attribute))
    print(reducer.singular_values_.sum())

    return X_train_pca, X_test_pca

k = 10  # number of principal components to keep
train_reduced, test_reduced = pca(train_data=scaled_train, test_data=scaled_test, k=k)
print(train_reduced.shape)

(125973, 10)
[[ 1.97239503e-04  8.74210130e-07  4.01557696e-07 ... -3.28222212e-05
  -3.29683995e-01  1.63419185e-03]
 [ 1.54281505e-02  5.11690574e-05  3.97206242e-05 ... -1.61307442e-04
  -1.64290313e-01 -4.21078364e-04]
 [ 1.73229208e-03 -1.61375042e-05 -6.04738304e-06 ... -3.56023482e-04
   1.88934025e-01  1.03135283e-03]
 ...
 [ 4.03490360e-02  8.88548644e-05  1.10158497e-04 ...  6.45131001e-04
  -4.25828878e-02  1.41110399e-02]
 [-1.28611309e-02 -1.68730407e-04 -4.13224447e-05 ... -1.25405259e-03
   7.63766085e-02 -3.70843352e-03]
 [ 1.01648253e-01  9.47433680e-05  2.88094806e-04 ...  1.42498069e-03
  -1.85103583e-01  2.21159493e-02]]
[0.41252402 0.15651982 0.10942724 0.05073837 0.04106383 0.02822428
 0.02268929 0.01979969 0.01497984 0.01378681]
[1.74893431 0.66358048 0.46392705 0.21511009 0.17409395 0.11965947
 0.09619337 0.08394264 0.06350844 0.05845049]
[469.37911462 289.12378026 241.74742763 164.61423976 148.09106357
 122.77517341 110.08029591 102.83201099  89.44431379  85.80