Lavoro sui dataset iniziali con tutte le etichette

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

import sys
sys.path.insert(1, '')
from utility import Dataset

# Column names assigned to the NSL-KDD files, which are read below with header=None.
columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
           'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
           'dst_host_srv_rerror_rate', 'label', 'score']

# Feature groups by type. numeric_features is every column that is neither nominal nor binary,
# excluding the 'label' and 'score' columns and 'num_outbound_cmds'.
nominal_features = ['protocol_type', 'service', 'flag']
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
numeric_features = [feature for feature in columns if feature not in nominal_features + binary_features + ['label', 'score', 'num_outbound_cmds']]

# Load the raw train and test splits (paths are relative to the working directory).
train_df = pd.read_csv(f'dataset/nsl-kdd/KDDTrain+.txt', header=None)
test_df = pd.read_csv(f'dataset/nsl-kdd/KDDTest+.txt', header=None)

# Wrap each frame in the project's Dataset helper and keep whatever get_label5() returns.
# NOTE(review): presumably a DataFrame with labels grouped into 5 classes — confirm against utility.Dataset.
train_df = Dataset(train_df, columns).get_label5()
test_df = Dataset(test_df, columns).get_label5()

# Encoding of categorical variables
One-hot encoder per 'protocol_type' (3 valori) e 'flag' (11 valori) \
Target encoder per 'service' (70 valori) \
ATTENZIONE: se ci dovesse essere overfitting, sostituire il Target encoder con un Frequency encoder, \
oppure applicare smoothing o cross-validation (ChatGPT spiega come si fa) \
Nota: la cella seguente applica per ora il LabelEncoder a tutte le feature nominali e al label.

In [7]:
def oh_encoder(train_df, test_df, nominal_features):
    """One-hot encode ``nominal_features``, fitting on train and applying to test.

    Args:
        train_df: Training DataFrame; the encoder is fitted on its nominal columns.
        test_df: Test DataFrame; encoded with the categories learned from train.
        nominal_features: List of column names to replace with indicator columns.

    Returns:
        A ``(train_ohe, test_ohe)`` tuple of new DataFrames in which the nominal
        columns are dropped and one ``"<feature>_<category>"`` column per
        training category is appended. The inputs are not modified.
    """
    # Categories that appear only in the test split are encoded as all zeros
    # instead of raising during transform.
    enc = OneHotEncoder(handle_unknown='ignore')
    train_encoded = enc.fit_transform(train_df[nominal_features]).toarray()
    test_encoded = enc.transform(test_df[nominal_features]).toarray()

    new_columns = [
        f"{feature}_{cat}"
        for feature, categories in zip(nominal_features, enc.categories_)
        for cat in categories
    ]

    # Reuse the source index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex would misalign rows (and add NaN rows) whenever the
    # input frame is not indexed 0..n-1.
    train_dummies = pd.DataFrame(train_encoded, columns=new_columns, index=train_df.index)
    test_dummies = pd.DataFrame(test_encoded, columns=new_columns, index=test_df.index)

    train_ohe = pd.concat([train_df.drop(nominal_features, axis=1), train_dummies], axis=1)
    test_ohe = pd.concat([test_df.drop(nominal_features, axis=1), test_dummies], axis=1)

    return train_ohe, test_ohe

def l_encoder(train_df, test_df, nominal_features):
    """Label-encode the given columns, fitting on train and applying to test.

    Args:
        train_df: Training DataFrame; each listed column is used to fit an encoder.
        test_df: Test DataFrame; encoded with the mapping learned from train.
            A value absent from the training column makes ``transform`` raise.
        nominal_features: List of column names to encode as integer codes.

    Returns:
        A ``(train_encoded, test_encoded)`` tuple of new DataFrames. The inputs
        are left untouched so callers can keep using the raw frames.
    """
    # Work on copies: assigning into the arguments would silently rewrite the
    # caller's DataFrames (the returned and original objects would be the same).
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    for feature in nominal_features:
        # A fresh encoder per column keeps each mapping independent.
        enc = LabelEncoder()
        train_encoded[feature] = enc.fit_transform(train_df[feature])
        test_encoded[feature] = enc.transform(test_df[feature])

    return train_encoded, test_encoded

# Label-encode the three nominal features and the class label, then display the encoded training frame.
train_new, test_new = l_encoder(train_df, test_df, ['protocol_type', 'service', 'flag', 'label'])
print(train_new)


        duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0              0              1       20     9        491          0     0   
1              0              2       44     9        146          0     0   
2              0              1       49     5          0          0     0   
3              0              1       24     9        232       8153     0   
4              0              1       24     9        199        420     0   
...          ...            ...      ...   ...        ...        ...   ...   
125968         0              1       49     5          0          0     0   
125969         8              2       49     9        105        145     0   
125970         0              1       54     9       2231        384     0   
125971         0              1       30     5          0          0     0   
125972         0              1       20     9        151          0     0   

        wrong_fragment  urgent  hot  ...  dst_host_srv_count  \

# Scaling

In [8]:
# For continuous features: if they follow (or are close to) a normal distribution,
# StandardScaler is often a good choice. If there are outliers or the data is not
# normally distributed, consider RobustScaler or MinMaxScaler. (CHECK THE OUTLIERS)
# For encoded categorical features (e.g. one-hot encoded): in general there is
# no need to apply a scaler, because the values are already binary (0 and 1).
# However, with an encoding such as target encoding, you may want to apply
# a scaler like MinMaxScaler to bring the encoded values into a uniform range.

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled below; every other column is left as is.
features_to_scale = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count', 
                     'serror_rate', 'rerror_rate', 'same_srv_rate', 'diff_srv_rate', 
                     'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
                     'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                     'dst_host_serror_rate', 'dst_host_rerror_rate']

# def scaler(train_df, test_df, numeric_features, scaler = MinMaxScaler()):
#     train_scaled = scaler.fit_transform(train_df[numeric_features])
#     test_scaled = scaler.transform(test_df[numeric_features])

#     train_ss = train_df.drop(numeric_features, axis=1)
#     train_ss = pd.concat([train_ss, pd.DataFrame(train_scaled, columns=numeric_features)], axis=1)

#     test_ss = test_df.drop(numeric_features, axis=1)
#     test_ss = pd.concat([test_ss, pd.DataFrame(test_scaled, columns=numeric_features)], axis=1)

#     return train_ss, test_ss

# Fit the min-max scaler on the training columns only, then apply that same fitted
# scaling to the test columns. Both frames are updated in place.
ss = MinMaxScaler()
train_new[features_to_scale] = ss.fit_transform(train_new[features_to_scale])
test_new[features_to_scale] = ss.transform(test_new[features_to_scale])

# Discretization

In [None]:
# from sklearn.preprocessing import KBinsDiscretizer

# discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
# X_discretized = discretizer.fit_transform(X_train)

# Feature Selection 

1. Attribute Ratio 

The Attribute Ratio (AR) approach is used for feature selection. It was described by Hee-su Chae and Sang Hyun Choi in "Feature Selection for Efficient Intrusion Detection using Attribute Ratio" and "Feature Selection for Intrusion Detection using NSL-KDD".

This approach is also used for nominal variables as they were encoded as binary variables above.

Because a binary feature can have Frequency(0) = 0 for some class, the ratio can come out as 'null'; those 'null' values are replaced with 1000.0 (a magic number). For the NSL-KDD dataset this only affects the 'protocol_type_tcp' one-hot variable.

In section 4, we explain the NSL-KDD data, which has three attribute types. We use the attribute average and frequency for each class to calculate the AR for numeric and binary attributes. AR can be calculated as:

$$AR(i) = \max_{j} \, CR(j) \tag{7}$$

The Class Ratio (CR) is the ratio of each class $j$ for attribute $i$. CR is calculated by one of two methods, according to the type of the attribute. For numeric attributes:

$$CR(j) = \frac{AVG(C(j))}{AVG(total)} \tag{8}$$

For binary attributes:

$$CR(j) = \frac{Frequency(1)}{Frequency(0)} \tag{9}$$

After calculating AR(i), the features are ranked in descending order of AR. Table 4 shows the rank of each feature with its calculated AR.

In [None]:
# Compute the attribute ratio for every feature (5-class split).
# NOTE(review): Dataset is called here without the `columns` argument that the loading cell passes,
# and train_df already holds the result of get_label5() — confirm this second wrapping is intended.
train_5_df = Dataset(train_df).get_label5()

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, mapping 0/0 to 0.0 and x/0 to signed infinity."""
    if denominator == 0:
        if numerator == 0:
            return 0.0
        return float('inf') if numerator > 0 else float('-inf')
    return float(numerator / denominator)


def attributeRatio(feature, dataset, *, nominal=None, binary=None):
    """Compute the Attribute Ratio (AR) of ``feature``: the largest class ratio.

    For a binary feature the class ratio is ``count(1) / count(0)`` within the
    class; for any other non-nominal feature it is ``class mean / overall mean``.

    Args:
        feature: Name of the column to score.
        dataset: DataFrame containing ``feature`` and a ``'label'`` column.
        nominal: Optional list of nominal column names; defaults to the
            module-level ``nominal_features``.
        binary: Optional list of binary column names; defaults to the
            module-level ``binary_features``.

    Returns:
        ``0`` for nominal features; otherwise the maximum class ratio as a
        float (``0.0`` when there are no classes). A zero denominator yields
        ``0.0`` when the numerator is also zero and infinity otherwise, so the
        result is never NaN because of a division by zero.
    """
    nominal = nominal_features if nominal is None else nominal
    binary = binary_features if binary is None else binary

    if feature in nominal:
        return 0

    # One grouped pass instead of re-filtering the frame for every label.
    per_class = dataset.groupby('label')[feature]

    if feature in binary:
        ones = per_class.sum()
        zeros = per_class.count() - ones
        ratios = [_safe_ratio(ones[label], zeros[label]) for label in ones.index]
    else:
        total_mean = dataset[feature].mean()
        ratios = [_safe_ratio(class_mean, total_mean) for class_mean in per_class.mean()]

    return max(ratios, default=0.0)

# Score every column except the target columns and 'num_outbound_cmds'.
attribute_ratios = {
    name: attributeRatio(name, train_5_df)
    for name in train_5_df.columns
    if name not in ('label', 'score', 'num_outbound_cmds')
}

# Rank the features from the highest attribute ratio to the lowest.
sorted_attribute_ratios = sorted(attribute_ratios.items(), key=lambda x: x[1], reverse=True)

# Emit one LaTeX table row per feature: rank, feature name, attribute ratio.
for rank, (feature, ratio) in enumerate(sorted_attribute_ratios, start=1):
    print(f"{rank} & {feature} & {ratio:.2f} \\\\")


2. Filter methods

https://medium.com/@sariq16/correlation-based-feature-selection-in-a-data-science-project-3ca08d2af5c6

seleziona le k feature più rilevanti secondo il chi2 (hanno alta correlazione con il label e bassa con le altre feature), ANOVA F-test, Mutual Information.

In [None]:
def get_best_features(train_data, test_data, score_func, k):
    """Keep the ``k`` best-scoring features according to ``score_func``.

    The selector is fitted on the training features (all columns except
    ``'label'``) against the training labels, then applied to both splits.
    The names of the retained columns are printed.

    Returns:
        ``(X_train_selected, X_test_selected)`` as returned by the selector's
        ``transform``.
    """
    features_train = train_data.drop(columns=['label'])
    target = train_data['label']

    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(features_train, target)

    reduced_train = selector.transform(features_train)
    reduced_test = selector.transform(test_data.drop(columns=['label']))

    kept_mask = selector.get_support()
    print(features_train.columns[kept_mask])
    return reduced_train, reduced_test

def cfs(train_data, test_data, threshold=0.9):
    """Drop features that are highly correlated with an earlier feature.

    Absolute pairwise correlations are computed on the training features
    (all columns except ``'label'``). A column is dropped when its correlation
    with any column that precedes it exceeds ``threshold``; the same columns
    are then dropped from the test features. The retained column names are
    printed.

    Args:
        train_data: Training DataFrame including a ``'label'`` column.
        test_data: Test DataFrame including a ``'label'`` column.
        threshold: Absolute-correlation cutoff above which a column is
            dropped. Defaults to 0.9.

    Returns:
        ``(X_train, X_test)`` DataFrames without ``'label'`` and without the
        dropped columns.
    """
    X_train = train_data.drop(['label'], axis=1)
    X_test = test_data.drop(['label'], axis=1)

    corr_matrix = X_train.corr().abs()
    # Keep only the strict upper triangle so each pair is examined once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    X_train = X_train.drop(to_drop, axis=1)
    X_test = X_test.drop(to_drop, axis=1)
    print(X_train.columns)
    return X_train, X_test

k = 30

# Run each univariate filter in turn and report the reduced training shape.
for score_func in (f_classif, chi2, mutual_info_classif):
    train_reduced, test_reduced = get_best_features(train_new, test_new, score_func, k)
    print(train_reduced.shape)

# Correlation-based filter; its result is what train_reduced / test_reduced hold afterwards.
train_reduced, test_reduced = cfs(train_new, test_new)
print(train_reduced.shape)

3. Wrapper Methods

Questi metodi iterano attraverso le combinazioni di feature e valutano la loro importanza usando un modello predittivo.

Recursive Feature Elimination (RFE): Utilizza un modello per selezionare feature rilevanti, eliminando quelle meno importanti ad ogni iterazione. Funziona bene con modelli come SVM, alberi decisionali o regressioni logistiche.

In [10]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def rfe(train_data, test_data, k):
    """Select ``k`` features with recursive feature elimination on a random forest.

    The selector is fitted on the training features (all columns except
    ``'label'``) against the training labels and then applied to both splits.
    The names of the retained columns are printed.

    Returns:
        ``(X_train_selected, X_test_selected)`` as returned by the selector's
        ``transform``.
    """
    features_train = train_data.drop(columns=['label'])
    target = train_data['label']
    features_test = test_data.drop(columns=['label'])

    estimator = RandomForestClassifier()
    selector = RFE(estimator, n_features_to_select=k)
    selector.fit(features_train, target)

    reduced_train = selector.transform(features_train)
    reduced_test = selector.transform(features_test)
    print(features_train.columns[selector.get_support()])
    return reduced_train, reduced_test
# Keep the 10 features chosen by RFE and report the shape of the reduced training data.
k = 10
train_reduced, test_reduced = rfe(train_new, test_new, k)
print(train_reduced.shape)


Index(['protocol_type', 'service', 'flag', 'count', 'same_srv_rate',
       'diff_srv_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_serror_rate'],
      dtype='object')
(125973, 10)


Sequential Feature Selection (SFS): Aggiunge o rimuove feature in modo sequenziale, valutando il modello in ogni passaggio.

In [47]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

def sfs(train_data, test_data, k):
    """Select ``k`` features by forward sequential selection on a random forest.

    Candidates are scored by accuracy with a shuffled, stratified 5-fold
    split (``random_state=123``). The selector is fitted on the training
    features (all columns except ``'label'``) and applied to both splits.
    The names of the retained columns are printed.

    Returns:
        ``(X_train_selected, X_test_selected)`` as returned by the selector's
        ``transform``.
    """
    features_train = train_data.drop(columns=['label'])
    target = train_data['label']
    features_test = test_data.drop(columns=['label'])

    estimator = RandomForestClassifier()
    folds = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
    selector = SequentialFeatureSelector(
        estimator,
        cv=folds,
        scoring='accuracy',
        direction='forward',
        n_features_to_select=k,
    )
    selector.fit(features_train, target)

    reduced_train = selector.transform(features_train)
    reduced_test = selector.transform(features_test)
    print(features_train.columns[selector.get_support()])
    return reduced_train, reduced_test

# Run forward selection with the module-level k (assigned in an earlier cell)
# and report the shape of the reduced training data.
train_reduced, test_reduced = sfs(train_new, test_new, k)
print(train_reduced.shape)

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'wrong_fragment', 'hot', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login',
       'srv_count', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_rerror_rate'],
      dtype='object')
(125973, 30)


In [None]:
# calcola information gain 
# def get_information_gain(dataset, feature):
#     entropy = 0
#     for label in dataset['label'].unique():
#         p = dataset[dataset['label'] == label][feature].sum() / dataset[feature].sum()
#         entropy += p * np.log2(p)
#     return entropy

# information_gains = {}
# for feature in train_5_df.columns:
#     if feature not in ['label', 'score', 'num_outbound_cmds']:
#         information_gains[feature] = get_information_gain(train_5_df, feature)

# # ordina le feature in base all'information gain
# sorted_information_gains = sorted(information_gains.items(), key=lambda x: x[1], reverse=True)
# # stampa una tabella con le feature ordinate. la prima colonna è il rank, la seconda la feature e la terza l'information gain
# for i, (feature, gain) in enumerate(sorted_information_gains):
#     print(f"{i+1} & {feature} & {gain:.2f} \\\\")


# # calcola gain ratio
# def get_gain_ratio(dataset, feature):
#     split_info = 0
#     for label in dataset['label'].unique():
#         p = dataset[dataset['label'] == label][feature].sum() / dataset[feature].sum()
#         split_info += p * np.log2(p)
#     split_info = -split_info
#     return information_gains[feature] / split_info

# gain_ratios = {}
# for feature in train_5_df.columns:
#     if feature not in ['label', 'score', 'num_outbound_cmds']:
#         gain_ratios[feature] = get_gain_ratio(train_5_df, feature)

# # ordina le feature in base al gain ratio
# sorted_gain_ratios = sorted(gain_ratios.items(), key=lambda x: x[1], reverse=True)
# # stampa una tabella con le feature ordinate. la prima colonna è il rank, la seconda la feature e la terza il gain ratio
# for i, (feature, ratio) in enumerate(sorted_gain_ratios):
#     print(f"{i+1} & {feature} & {ratio:.2f} \\\\")



4. PCA

In [None]:
from sklearn.decomposition import PCA

def pca(train_data, test_data, k):
    """Project both splits onto ``k`` principal components fitted on train.

    The ``'label'`` column is removed from both frames before fitting. The
    projected training shape, the components, the explained variance (ratio
    and absolute), the singular values and their sum are printed.

    Returns:
        ``(X_train_pca, X_test_pca)`` arrays of projected features.
    """
    features_train = train_data.drop(columns=['label'])
    features_test = test_data.drop(columns=['label'])

    reducer = PCA(n_components=k)
    projected_train = reducer.fit_transform(features_train)
    projected_test = reducer.transform(features_test)

    diagnostics = (
        projected_train.shape,
        reducer.components_,
        reducer.explained_variance_ratio_,
        reducer.explained_variance_,
        reducer.singular_values_,
        reducer.singular_values_.sum(),
    )
    for item in diagnostics:
        print(item)

    return projected_train, projected_test