## Multi-Label Classifier and Features Importance with XGboost

In [1]:
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import pandas as pd
import numpy as np

In [3]:
CICIDS_data = pd.read_csv('../Wednesday-workingHours.pcap_ISCX.csv')

In [4]:
# dropping duplications
def find_duplicated_columns(df):
    grouped_columns = []
    seen_columns = set()
    for col in df.columns:
        if col not in seen_columns:
            duplicates = [col] + [c for c in df.columns if c != col and df[col].equals(df[c])]
            if len(duplicates)>1:
                grouped_columns.extend(duplicates[1:])
            seen_columns.update(duplicates)
    return grouped_columns

In [5]:
# data preproccessing 
CICIDS_data.columns = [col.strip() for col in CICIDS_data.columns]
CICIDS_data.drop(CICIDS_data.columns[CICIDS_data.eq(0).all()], axis=1, inplace=True)
CICIDS_data.drop(find_duplicated_columns(CICIDS_data), axis=1, inplace=True)

In [6]:
# splitting my data into features (X) and target (y) ignoring the columns with inf values
X = CICIDS_data.drop(['Label','Flow Bytes/s', 'Flow Packets/s'],axis=1).copy()
y = CICIDS_data['Label'].copy()

In [7]:
# filling nan values with -1
X.fillna(-1, inplace=True)

In [8]:
# scaling my features
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# encoding my categorical labels
onehot_encoder = OneHotEncoder(sparse_output=False)
y_encoded = onehot_encoder.fit_transform(y.values.reshape(-1, 1))

In [10]:
# splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [11]:
# building the model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [12]:
# creating multi-class model based on my xgboost model
multi_target_xgb = MultiOutputClassifier(xgb_model, n_jobs=-1)

In [13]:
# training the model on the training set
multi_target_xgb.fit(X_train, y_train)

In [14]:
# predicting the test set classes
y_pred = multi_target_xgb.predict(X_test)

In [15]:
# evaluating model performance
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=onehot_encoder.categories_[0]))

Classification Report:
                  precision    recall  f1-score   support

          BENIGN       1.00      1.00      1.00     88170
   DoS GoldenEye       1.00      1.00      1.00      2017
        DoS Hulk       1.00      1.00      1.00     46147
DoS Slowhttptest       1.00      0.99      0.99      1090
   DoS slowloris       0.99      0.99      0.99      1114
      Heartbleed       1.00      1.00      1.00         3

       micro avg       1.00      1.00      1.00    138541
       macro avg       1.00      1.00      1.00    138541
    weighted avg       1.00      1.00      1.00    138541
     samples avg       1.00      1.00      1.00    138541



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# decoding the classes names
y_pred_decoded = onehot_encoder.categories_[0][np.argmax(y_pred, axis=1)]
y_test_decoded = onehot_encoder.categories_[0][np.argmax(y_test, axis=1)]

In [17]:
# accuracy for each class
for i, label in enumerate(onehot_encoder.categories_[0]):
    accuracy = accuracy_score(y_test_decoded == label, y_pred_decoded == label)
    print(f"Accuracy for {label}: {accuracy:.2f}")

Accuracy for BENIGN: 1.00
Accuracy for DoS GoldenEye: 1.00
Accuracy for DoS Hulk: 1.00
Accuracy for DoS Slowhttptest: 1.00
Accuracy for DoS slowloris: 1.00
Accuracy for Heartbleed: 1.00


In [18]:
# n top features with or without score
def get_important_features(X, model, n_features, with_score = False):
    all_features = X.columns.to_list()
    feature_importances = []
    for idx, estimator in enumerate(multi_target_xgb.estimators_):
        importances = estimator.feature_importances_
        feature_importances.append(importances)
    average_importances = np.mean(feature_importances, axis=0)
    indices_of_largest = np.argsort(average_importances)[-n_features:][::-1]
    if with_score:
        n_important_features = [(all_features[i],average_importances[i]) for i in indices_of_largest]
    else:
        n_important_features = [all_features[i] for i in indices_of_largest]
    return n_important_features

In [19]:
n_important_features = get_important_features(X, multi_target_xgb, 10, True)

In [20]:
# top 10 features calculated with xgboost features importance 
n_important_features

[('Bwd Packet Length Std', 0.19318448),
 ('Packet Length Mean', 0.11843445),
 ('Active Std', 0.110743515),
 ('Bwd Packet Length Max', 0.07032435),
 ('Bwd IAT Std', 0.05684802),
 ('Active Min', 0.04791169),
 ('Total Backward Packets', 0.046306435),
 ('Packet Length Std', 0.039558824),
 ('Destination Port', 0.03697658),
 ('Bwd Packets/s', 0.029356735)]