<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_DDoS_attacks_using_XGB_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load and filter the dataset
df = pd.read_csv('/content/drive/MyDrive/data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Separate features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df.columns else pd.Series([])

# Encode target labels into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Check for valid features and labels
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (173777, 46)
Target shape: (173777,)


# 5 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=2)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.037645
1     Header_Length  1.138591
34              Min  1.048405
38         Tot size  1.043659
41         Magnitue  1.035201
36              AVG  1.032079
33          Tot sum  1.028595
2     Protocol Type  0.982687
35              Max  0.976891
26              TCP  0.667943
15        syn_count  0.616002
30             ICMP  0.534726
8   syn_flag_number  0.528574
4              Rate  0.525554
5             Srate  0.525462
0     flow_duration  0.480493
18        rst_count  0.432804
27              UDP  0.432019
16        fin_count  0.400263
17        urg_count  0.387378
14        ack_count  0.382086
7   fin_flag_number  0.367408
9   rst_flag_number  0.358779
10  psh_flag_number  0.355179
11  ack_flag_number  0.353204
37              Std  0.227711
43       Covariance  0.222270
42           Radius  0.221354
44         Variance  0.147691
3          Duration  0.060852
19             HTTP  0.029799
20            HTTPS  0.016401
40        

In [None]:
rfe_selector = RFE(estimator=XGBClassifier(eval_metric='mlogloss'), n_features_to_select=3)
rfe_selector.fit_transform(X, y)

# printing the ranks
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
27              UDP      2
39              IAT      3
34              Min      4
30             ICMP      5
17        urg_count      6
15        syn_count      7
44         Variance      8
1     Header_Length      9
18        rst_count     10
33          Tot sum     11
43       Covariance     12
37              Std     13
41         Magnitue     14
38         Tot size     15
26              TCP     16
19             HTTP     17
45           Weight     18
2     Protocol Type     19
11  ack_flag_number     20
4              Rate     21
35              Max     22
14        ack_count     23
20            HTTPS     24
42           Radius     25
0     flow_duration     26
16        fin_count     27
9   rst_flag_number     28
31              IPv     29
3          Duration     30
40           Number     31
36              AVG     32
21              DNS     33
23             SMTP     34
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['syn_flag_number', 'Header_Length', 'psh_flag_number', 'IAT', 'fin_flag_number']


In [None]:
no_of_redundent_features = 5-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: []
top_features_combined =  ['syn_flag_number', 'Header_Length', 'psh_flag_number', 'IAT', 'fin_flag_number']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9982572095623294
Precision: 0.9982512644929352
Recall: 0.9982572095623294
F1 Score: 0.9982539056204952
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       513
           1       0.85      0.82      0.84        56
           2       1.00      1.00      1.00     12772
           3       0.97      0.96      0.96       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.42      0.42      0.42        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       0.98      0.98      0.98       548

    accuracy                           1.00     60822
   macro avg       0.93      0.93      0.93     60822
weighted avg       1.00      1.00      1.00     6

# 10 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=5)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.037636
1     Header_Length  1.139532
34              Min  1.047411
38         Tot size  1.042684
41         Magnitue  1.036512
33          Tot sum  1.030449
36              AVG  1.029815
35              Max  0.978178
2     Protocol Type  0.977986
26              TCP  0.667274
15        syn_count  0.613907
30             ICMP  0.533262
8   syn_flag_number  0.530390
5             Srate  0.525770
4              Rate  0.525511
0     flow_duration  0.479255
27              UDP  0.436314
18        rst_count  0.434267
16        fin_count  0.401572
17        urg_count  0.392326
14        ack_count  0.379281
7   fin_flag_number  0.363786
10  psh_flag_number  0.360051
9   rst_flag_number  0.357129
11  ack_flag_number  0.352174
37              Std  0.226539
43       Covariance  0.223701
42           Radius  0.222929
44         Variance  0.146826
3          Duration  0.056623
19             HTTP  0.030074
20            HTTPS  0.018144
31        

In [None]:
rfe_selector = RFE(estimator=XGBClassifier(eval_metric='mlogloss'), n_features_to_select=5)
rfe_selector.fit_transform(X, y)

# printing the ranks
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
27              UDP      1
39              IAT      1
34              Min      2
30             ICMP      3
17        urg_count      4
15        syn_count      5
44         Variance      6
1     Header_Length      7
18        rst_count      8
33          Tot sum      9
43       Covariance     10
37              Std     11
41         Magnitue     12
38         Tot size     13
26              TCP     14
19             HTTP     15
45           Weight     16
2     Protocol Type     17
11  ack_flag_number     18
4              Rate     19
35              Max     20
14        ack_count     21
20            HTTPS     22
42           Radius     23
0     flow_duration     24
16        fin_count     25
9   rst_flag_number     26
31              IPv     27
3          Duration     28
40           Number     29
36              AVG     30
21              DNS     31
23             SMTP     32
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['UDP', 'syn_flag_number', 'Magnitue', 'Tot size', 'psh_flag_number', 'fin_flag_number', 'Header_Length', 'IAT', 'Min']


In [None]:
no_of_redundent_features = 10-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  ICMP
Extra features from RFE: ['ICMP']
top_features_combined =  ['UDP', 'syn_flag_number', 'Magnitue', 'Tot size', 'psh_flag_number', 'fin_flag_number', 'Header_Length', 'IAT', 'Min', 'ICMP']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9994738745848541
Precision: 0.9995005620141906
Recall: 0.9994738745848541
F1 Score: 0.9994841514936077
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       513
           1       0.91      0.86      0.88        56
           2       1.00      1.00      1.00     12772
           3       1.00      1.00      1.00       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.62      0.75      0.68        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       0.99      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.96      0.97      0.96     60822
weighted avg       1.00      1.00      1.00     6

# 15 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=7)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.037641
1     Header_Length  1.136766
34              Min  1.050435
38         Tot size  1.042562
41         Magnitue  1.038226
36              AVG  1.032098
33          Tot sum  1.027270
2     Protocol Type  0.981133
35              Max  0.978302
26              TCP  0.667992
15        syn_count  0.615501
30             ICMP  0.533271
8   syn_flag_number  0.527674
4              Rate  0.525725
5             Srate  0.525546
0     flow_duration  0.478363
27              UDP  0.436284
18        rst_count  0.434036
16        fin_count  0.401273
17        urg_count  0.387513
14        ack_count  0.380493
7   fin_flag_number  0.363018
9   rst_flag_number  0.359308
10  psh_flag_number  0.356355
11  ack_flag_number  0.352058
42           Radius  0.225459
37              Std  0.225199
43       Covariance  0.224380
44         Variance  0.150294
3          Duration  0.058356
19             HTTP  0.031141
20            HTTPS  0.017731
40        

In [None]:
rfe_selector = RFE(estimator=XGBClassifier(eval_metric='mlogloss'), n_features_to_select=8)
rfe_selector.fit_transform(X, y)

# printing the ranks
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
17        urg_count      1
27              UDP      1
30             ICMP      1
39              IAT      1
34              Min      1
15        syn_count      2
44         Variance      3
1     Header_Length      4
18        rst_count      5
33          Tot sum      6
43       Covariance      7
37              Std      8
41         Magnitue      9
38         Tot size     10
26              TCP     11
19             HTTP     12
45           Weight     13
2     Protocol Type     14
11  ack_flag_number     15
4              Rate     16
35              Max     17
14        ack_count     18
20            HTTPS     19
42           Radius     20
0     flow_duration     21
16        fin_count     22
9   rst_flag_number     23
31              IPv     24
3          Duration     25
40           Number     26
36              AVG     27
21              DNS     28
23             SMTP     29
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['UDP', 'Tot sum', 'syn_flag_number', 'Magnitue', 'Tot size', 'psh_flag_number', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'IAT', 'Min']


In [None]:
no_of_redundent_features = 15-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  syn_count
selected feature:  Variance
Extra features from RFE: ['syn_count', 'Variance']
top_features_combined =  ['UDP', 'Tot sum', 'syn_flag_number', 'Magnitue', 'Tot size', 'psh_flag_number', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'IAT', 'Min', 'syn_count', 'Variance']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997040544539805
Precision: 0.9997299523553076
Recall: 0.9997040544539805
F1 Score: 0.9997104768568654
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       513
           1       1.00      0.95      0.97        56
           2       1.00      1.00      1.00     12772
           3       1.00      1.00      1.00       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.77      1.00      0.87        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      0.99      0.99     60822
weighted avg       1.00      1.00      1.00     6

# 20 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=10)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.037642
1     Header_Length  1.139460
34              Min  1.046864
38         Tot size  1.042665
41         Magnitue  1.036904
36              AVG  1.032282
33          Tot sum  1.026865
35              Max  0.977885
2     Protocol Type  0.977443
26              TCP  0.667115
15        syn_count  0.615432
30             ICMP  0.531554
8   syn_flag_number  0.530867
4              Rate  0.525505
5             Srate  0.525368
0     flow_duration  0.479652
27              UDP  0.434700
18        rst_count  0.434198
16        fin_count  0.399572
17        urg_count  0.386209
14        ack_count  0.380977
7   fin_flag_number  0.365935
9   rst_flag_number  0.358086
10  psh_flag_number  0.356838
11  ack_flag_number  0.352890
37              Std  0.227178
43       Covariance  0.226716
42           Radius  0.223394
44         Variance  0.150464
3          Duration  0.056429
19             HTTP  0.029399
20            HTTPS  0.015852
31        

In [None]:
rfe_selector = RFE(estimator=XGBClassifier(eval_metric='mlogloss'), n_features_to_select=10)
rfe_selector.fit_transform(X, y)

# printing the ranks
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
15        syn_count      1
10  psh_flag_number      1
8   syn_flag_number      1
27              UDP      1
30             ICMP      1
17        urg_count      1
34              Min      1
39              IAT      1
44         Variance      1
1     Header_Length      2
18        rst_count      3
33          Tot sum      4
43       Covariance      5
37              Std      6
41         Magnitue      7
38         Tot size      8
26              TCP      9
19             HTTP     10
45           Weight     11
2     Protocol Type     12
11  ack_flag_number     13
4              Rate     14
35              Max     15
14        ack_count     16
20            HTTPS     17
42           Radius     18
0     flow_duration     19
16        fin_count     20
9   rst_flag_number     21
31              IPv     22
3          Duration     23
40           Number     24
36              AVG     25
21              DNS     26
23             SMTP     27
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['Tot sum', 'Protocol Type', 'Magnitue', 'Tot size', 'IAT', 'psh_flag_number', 'UDP', 'Max', 'Variance', 'TCP', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'syn_flag_number', 'syn_count', 'Min']


In [None]:
no_of_redundent_features = 20-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  rst_count
selected feature:  Covariance
Extra features from RFE: ['rst_count', 'Covariance']
top_features_combined =  ['Tot sum', 'Protocol Type', 'Magnitue', 'Tot size', 'IAT', 'psh_flag_number', 'UDP', 'Max', 'Variance', 'TCP', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'syn_flag_number', 'syn_count', 'Min', 'rst_count', 'Covariance']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997369372924271
Precision: 0.9997457516692028
Recall: 0.9997369372924271
F1 Score: 0.9997387926665342
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       513
           1       0.98      0.95      0.96        56
           2       1.00      1.00      1.00     12772
           3       1.00      1.00      1.00       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.86      1.00      0.92        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      1.00      1.00       548

    accuracy                           1.00     60822
   macro avg       0.99      0.99      0.99     60822
weighted avg       1.00      1.00      1.00     6

# 25 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=12)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.037658
1     Header_Length  1.140325
34              Min  1.046195
38         Tot size  1.042299
41         Magnitue  1.037969
36              AVG  1.031729
33          Tot sum  1.025190
2     Protocol Type  0.978749
35              Max  0.975940
26              TCP  0.666572
15        syn_count  0.616285
30             ICMP  0.531580
8   syn_flag_number  0.529195
5             Srate  0.526111
4              Rate  0.525778
0     flow_duration  0.477116
27              UDP  0.436043
18        rst_count  0.435994
16        fin_count  0.398984
17        urg_count  0.388940
14        ack_count  0.383663
7   fin_flag_number  0.367765
10  psh_flag_number  0.358153
9   rst_flag_number  0.357659
11  ack_flag_number  0.350077
37              Std  0.226813
43       Covariance  0.226098
42           Radius  0.224591
44         Variance  0.149210
3          Duration  0.057877
19             HTTP  0.032451
20            HTTPS  0.016084
32        

In [None]:
rfe_selector = RFE(estimator=XGBClassifier(eval_metric='mlogloss'), n_features_to_select=13)
rfe_selector.fit_transform(X, y)

# printing the ranks
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
1     Header_Length      1
7   fin_flag_number      1
10  psh_flag_number      1
8   syn_flag_number      1
15        syn_count      1
30             ICMP      1
18        rst_count      1
17        urg_count      1
27              UDP      1
39              IAT      1
34              Min      1
33          Tot sum      1
44         Variance      1
43       Covariance      2
37              Std      3
41         Magnitue      4
38         Tot size      5
26              TCP      6
19             HTTP      7
45           Weight      8
2     Protocol Type      9
11  ack_flag_number     10
4              Rate     11
35              Max     12
14        ack_count     13
20            HTTPS     14
42           Radius     15
0     flow_duration     16
16        fin_count     17
9   rst_flag_number     18
31              IPv     19
3          Duration     20
40           Number     21
36              AVG     22
21              DNS     23
23             SMTP     24
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['Tot sum', 'Protocol Type', 'Magnitue', 'Tot size', 'IAT', 'psh_flag_number', 'UDP', 'Max', 'Variance', 'TCP', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'rst_count', 'syn_flag_number', 'syn_count', 'Min']


In [None]:
no_of_redundent_features = 25-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  Covariance
selected feature:  Std
selected feature:  HTTP
selected feature:  Weight
selected feature:  ack_flag_number
selected feature:  Rate
Extra features from RFE: ['Covariance', 'Std', 'HTTP', 'Weight', 'ack_flag_number', 'Rate']
top_features_combined =  ['Tot sum', 'Protocol Type', 'Magnitue', 'Tot size', 'IAT', 'psh_flag_number', 'UDP', 'Max', 'Variance', 'TCP', 'AVG', 'fin_flag_number', 'ICMP', 'Header_Length', 'urg_count', 'rst_count', 'syn_flag_number', 'syn_count', 'Min', 'Covariance', 'Std', 'HTTP', 'Weight', 'ack_flag_number', 'Rate']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997369372924271
Precision: 0.9997422416123637
Recall: 0.9997369372924271
F1 Score: 0.99973814884181
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       513
           1       0.98      0.96      0.97        56
           2       1.00      1.00      1.00     12772
           3       1.00      1.00      1.00       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.89      1.00      0.94        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      1.00      1.00       548

    accuracy                           1.00     60822
   macro avg       0.99      1.00      0.99     60822
weighted avg       1.00      1.00      1.00     608