<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_cyber_attacks_using_XGB_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime; the dataset is later read from
# a path under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset (no filtering is applied here)
df = pd.read_csv('/content/drive/MyDrive/data.csv')

# Step 2: Separate features and target
# Fail fast when the target column is missing: silently falling back to an
# empty target only postpones the failure to the first fit() call, where it
# shows up as a confusing sample-count mismatch.
if 'label' not in df.columns:
    raise KeyError("Expected a 'label' column in the dataset but it was not found")
X = df.drop(columns="label")
y = df["label"]

# Encode target labels into numeric values.
# label_encoder is kept so the integer classes can be mapped back to the
# original labels via label_encoder.classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Check for valid features and labels
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (238687, 46)
Target shape: (238687,)


# 5 features (SelectKBest k=2 + RFE n=3)

In [None]:
# Filter stage of the hybrid selection: score every feature by its mutual
# information with the encoded label and keep the 2 highest-scoring ones.
# mutual_info_classif draws random noise internally, so the seed is pinned to
# keep the scores (and therefore the selected set) reproducible between runs.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=2,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # highest score first
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.601156
38         Tot size  1.337866
1     Header_Length  1.326675
41         Magnitue  1.318371
36              AVG  1.313078
33          Tot sum  1.306669
34              Min  1.296478
35              Max  1.272257
2     Protocol Type  1.161106
26              TCP  0.656225
15        syn_count  0.652480
0     flow_duration  0.637555
5             Srate  0.632753
4              Rate  0.632677
18        rst_count  0.510322
8   syn_flag_number  0.496163
27              UDP  0.495042
17        urg_count  0.459172
30             ICMP  0.444478
37              Std  0.382657
43       Covariance  0.381131
42           Radius  0.379571
16        fin_count  0.349149
14        ack_count  0.331337
11  ack_flag_number  0.327544
44         Variance  0.300313
7   fin_flag_number  0.294076
10  psh_flag_number  0.285729
9   rst_flag_number  0.283657
3          Duration  0.210094
40           Number  0.177794
45           Weight  0.176016
20        

In [None]:
# Wrapper stage of the hybrid selection: RFE repeatedly refits the XGBoost
# estimator and eliminates features until 3 remain.
# random_state matches the final classifier so both stages are seeded alike.
rfe_selector = RFE(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    n_features_to_select=3,
)
# Only ranking_ and the support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
rfe_selector.fit(X, y)

# printing the ranks
# NOTE: the 'Score' column holds RFE ranks, not scores: 1 marks a selected
# feature and larger values mark features that were eliminated.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # best rank first
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
30             ICMP      1
10  psh_flag_number      1
7   fin_flag_number      1
8   syn_flag_number      2
39              IAT      3
27              UDP      4
34              Min      5
45           Weight      6
9   rst_flag_number      7
2     Protocol Type      8
4              Rate      9
18        rst_count     10
40           Number     11
20            HTTPS     12
16        fin_count     13
24              SSH     14
19             HTTP     15
33          Tot sum     16
44         Variance     17
11  ack_flag_number     18
1     Header_Length     19
38         Tot size     20
15        syn_count     21
17        urg_count     22
14        ack_count     23
0     flow_duration     24
35              Max     25
43       Covariance     26
36              AVG     27
41         Magnitue     28
26              TCP     29
42           Radius     30
29              ARP     31
3          Duration     32
37              Std     33
31              IPv     34
2

In [None]:
# Union of the SelectKBest and RFE selections.
# The result is ordered by X.columns rather than by set iteration: the
# iteration order of a set of strings changes between Python sessions (hash
# randomisation), which would make the column order of X_selected vary from
# run to run.
selected_by_either = set(skb_selected_feature_names) | set(rfe_selected_feature_names)
top_features_combined = [column for column in X.columns if column in selected_by_either]
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['ICMP', 'psh_flag_number', 'fin_flag_number', 'IAT', 'Tot size']


In [None]:
# Number of features still missing to reach the 5-feature budget. The two
# selectors can overlap, so their union may be smaller than the budget.
# Clamped at 0 so an oversized union can never turn into "take everything".
no_of_redundent_features = max(0, 5 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    # Check the quota before appending so that a quota of 0 adds nothing.
    if len(extra_features_from_rfe) >= no_of_redundent_features:
        break
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place so the list printed by the
# previous cell is not mutated behind the reader's back.
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: []
top_features_combined =  ['ICMP', 'psh_flag_number', 'fin_flag_number', 'IAT', 'Tot size']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
# NOTE(review): the split is not stratified, and the report below shows
# classes with only a handful of test samples; consider stratify=y if every
# class has enough rows to be split.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
# zero_division=0 makes the handling of classes that are never predicted
# explicit: they score 0 (the same value the default produced) without
# emitting an UndefinedMetricWarning for every call.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9830143282938916
Precision: 0.9818127523679997
Recall: 0.9830143282938916
F1 Score: 0.9815473019462784
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.74      0.92      0.82      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       0.99      0.98      0.99       539
           5       0.61      0.70      0.66        61
           6       1.00      1.00      1.00     12879
           7       1.00      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      0.99      1.00      7309
          11       0.72      0.45      0.55        47
          12       1.00      1.00      1.00      6251
          13       0.99      1.00      1.00      8058
          14       1.00      1.00      1.00      9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 10 features (SelectKBest k=5 + RFE n=5)

In [None]:
# Filter stage of the hybrid selection: score every feature by its mutual
# information with the encoded label and keep the 5 highest-scoring ones.
# mutual_info_classif draws random noise internally, so the seed is pinned to
# keep the scores (and therefore the selected set) reproducible between runs.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=5,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # highest score first
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.601669
38         Tot size  1.338297
1     Header_Length  1.327945
41         Magnitue  1.318590
36              AVG  1.312474
33          Tot sum  1.305038
34              Min  1.296882
35              Max  1.271419
2     Protocol Type  1.161066
26              TCP  0.657788
15        syn_count  0.646165
0     flow_duration  0.637556
4              Rate  0.632753
5             Srate  0.632408
18        rst_count  0.507936
8   syn_flag_number  0.498120
27              UDP  0.494075
17        urg_count  0.461779
30             ICMP  0.444127
37              Std  0.383823
43       Covariance  0.379916
42           Radius  0.378057
16        fin_count  0.350504
14        ack_count  0.332451
11  ack_flag_number  0.326396
7   fin_flag_number  0.295364
44         Variance  0.295290
10  psh_flag_number  0.286201
9   rst_flag_number  0.284471
3          Duration  0.207787
40           Number  0.177851
45           Weight  0.174944
20        

In [None]:
# Wrapper stage of the hybrid selection: RFE repeatedly refits the XGBoost
# estimator and eliminates features until 5 remain.
# random_state matches the final classifier so both stages are seeded alike.
rfe_selector = RFE(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    n_features_to_select=5,
)
# Only ranking_ and the support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
rfe_selector.fit(X, y)

# printing the ranks
# NOTE: the 'Score' column holds RFE ranks, not scores: 1 marks a selected
# feature and larger values mark features that were eliminated.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # best rank first
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
30             ICMP      1
10  psh_flag_number      1
8   syn_flag_number      1
39              IAT      1
7   fin_flag_number      1
27              UDP      2
34              Min      3
45           Weight      4
9   rst_flag_number      5
2     Protocol Type      6
4              Rate      7
18        rst_count      8
40           Number      9
20            HTTPS     10
16        fin_count     11
24              SSH     12
19             HTTP     13
33          Tot sum     14
44         Variance     15
11  ack_flag_number     16
1     Header_Length     17
38         Tot size     18
15        syn_count     19
17        urg_count     20
14        ack_count     21
0     flow_duration     22
35              Max     23
43       Covariance     24
36              AVG     25
41         Magnitue     26
26              TCP     27
42           Radius     28
29              ARP     29
3          Duration     30
37              Std     31
31              IPv     32
2

In [None]:
# Union of the SelectKBest and RFE selections.
# The result is ordered by X.columns rather than by set iteration: the
# iteration order of a set of strings changes between Python sessions (hash
# randomisation), which would make the column order of X_selected vary from
# run to run.
selected_by_either = set(skb_selected_feature_names) | set(rfe_selected_feature_names)
top_features_combined = [column for column in X.columns if column in selected_by_either]
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['AVG', 'psh_flag_number', 'Header_Length', 'Magnitue', 'IAT', 'Tot size', 'fin_flag_number', 'syn_flag_number', 'ICMP']


In [None]:
# Number of features still missing to reach the 10-feature budget. The two
# selectors can overlap, so their union may be smaller than the budget.
# Clamped at 0 so an oversized union can never turn into "take everything".
no_of_redundent_features = max(0, 10 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    # Check the quota before appending so that a quota of 0 adds nothing.
    if len(extra_features_from_rfe) >= no_of_redundent_features:
        break
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place so the list printed by the
# previous cell is not mutated behind the reader's back.
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  UDP
Extra features from RFE: ['UDP']
top_features_combined =  ['AVG', 'psh_flag_number', 'Header_Length', 'Magnitue', 'IAT', 'Tot size', 'fin_flag_number', 'syn_flag_number', 'ICMP', 'UDP']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
# NOTE(review): the split is not stratified, and the report below shows
# classes with only a handful of test samples; consider stratify=y if every
# class has enough rows to be split.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
# zero_division=0 makes the handling of classes that are never predicted
# explicit: they score 0 (the same value the default produced) without
# emitting an UndefinedMetricWarning for every call.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9872996492740092
Precision: 0.9860600939656112
Recall: 0.9872996492740092
F1 Score: 0.9863875929987911
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.79      0.91      0.85      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       1.00      1.00      1.00       539
           5       0.78      0.85      0.81        61
           6       1.00      1.00      1.00     12879
           7       1.00      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.83      0.74      0.79        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 15 features (SelectKBest k=7 + RFE n=8)

In [None]:
# Filter stage of the hybrid selection: score every feature by its mutual
# information with the encoded label and keep the 7 highest-scoring ones.
# mutual_info_classif draws random noise internally, so the seed is pinned to
# keep the scores (and therefore the selected set) reproducible between runs.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=7,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # highest score first
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.601410
38         Tot size  1.338830
1     Header_Length  1.327250
41         Magnitue  1.315900
36              AVG  1.313518
33          Tot sum  1.306942
34              Min  1.297045
35              Max  1.270041
2     Protocol Type  1.162497
26              TCP  0.657042
15        syn_count  0.648497
0     flow_duration  0.634860
5             Srate  0.632531
4              Rate  0.632317
18        rst_count  0.506954
27              UDP  0.496407
8   syn_flag_number  0.496167
17        urg_count  0.460945
30             ICMP  0.445222
37              Std  0.383741
42           Radius  0.382481
43       Covariance  0.379983
16        fin_count  0.349797
14        ack_count  0.331705
11  ack_flag_number  0.325948
44         Variance  0.298982
7   fin_flag_number  0.296520
10  psh_flag_number  0.287234
9   rst_flag_number  0.285494
3          Duration  0.207286
40           Number  0.180050
45           Weight  0.175908
20        

In [None]:
# Wrapper stage of the hybrid selection: RFE repeatedly refits the XGBoost
# estimator and eliminates features until 8 remain.
# random_state matches the final classifier so both stages are seeded alike.
rfe_selector = RFE(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    n_features_to_select=8,
)
# Only ranking_ and the support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
rfe_selector.fit(X, y)

# printing the ranks
# NOTE: the 'Score' column holds RFE ranks, not scores: 1 marks a selected
# feature and larger values mark features that were eliminated.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # best rank first
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
45           Weight      1
30             ICMP      1
27              UDP      1
10  psh_flag_number      1
8   syn_flag_number      1
7   fin_flag_number      1
34              Min      1
39              IAT      1
9   rst_flag_number      2
2     Protocol Type      3
4              Rate      4
18        rst_count      5
40           Number      6
20            HTTPS      7
16        fin_count      8
24              SSH      9
19             HTTP     10
33          Tot sum     11
44         Variance     12
11  ack_flag_number     13
1     Header_Length     14
38         Tot size     15
15        syn_count     16
17        urg_count     17
14        ack_count     18
0     flow_duration     19
35              Max     20
43       Covariance     21
36              AVG     22
41         Magnitue     23
26              TCP     24
42           Radius     25
29              ARP     26
3          Duration     27
37              Std     28
31              IPv     29
2

In [None]:
# Union of the SelectKBest and RFE selections.
# The result is ordered by X.columns rather than by set iteration: the
# iteration order of a set of strings changes between Python sessions (hash
# randomisation), which would make the column order of X_selected vary from
# run to run.
selected_by_either = set(skb_selected_feature_names) | set(rfe_selected_feature_names)
top_features_combined = [column for column in X.columns if column in selected_by_either]
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['AVG', 'Weight', 'Tot sum', 'psh_flag_number', 'Header_Length', 'UDP', 'Magnitue', 'IAT', 'Tot size', 'Min', 'fin_flag_number', 'syn_flag_number', 'ICMP']


In [None]:
# Number of features still missing to reach the 15-feature budget. The two
# selectors can overlap, so their union may be smaller than the budget.
# Clamped at 0 so an oversized union can never turn into "take everything".
no_of_redundent_features = max(0, 15 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    # Check the quota before appending so that a quota of 0 adds nothing.
    if len(extra_features_from_rfe) >= no_of_redundent_features:
        break
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place so the list printed by the
# previous cell is not mutated behind the reader's back.
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  rst_flag_number
selected feature:  Protocol Type
Extra features from RFE: ['rst_flag_number', 'Protocol Type']
top_features_combined =  ['AVG', 'Weight', 'Tot sum', 'psh_flag_number', 'Header_Length', 'UDP', 'Magnitue', 'IAT', 'Tot size', 'Min', 'fin_flag_number', 'syn_flag_number', 'ICMP', 'rst_flag_number', 'Protocol Type']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
# NOTE(review): the split is not stratified, and the report below shows
# classes with only a handful of test samples; consider stratify=y if every
# class has enough rows to be split.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
# zero_division=0 makes the handling of classes that are never predicted
# explicit: they score 0 (the same value the default produced) without
# emitting an UndefinedMetricWarning for every call.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9884368154558839
Precision: 0.9872215504479657
Recall: 0.9884368154558839
F1 Score: 0.9874526296239975
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.80      0.92      0.86      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       1.00      1.00      1.00       539
           5       0.83      0.89      0.86        61
           6       1.00      1.00      1.00     12879
           7       1.00      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.86      0.81      0.84        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 20 features (SelectKBest k=10 + RFE n=10)

In [None]:
# Filter stage of the hybrid selection: score every feature by its mutual
# information with the encoded label and keep the 10 highest-scoring ones.
# mutual_info_classif draws random noise internally, so the seed is pinned to
# keep the scores (and therefore the selected set) reproducible between runs.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=10,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # highest score first
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.601516
38         Tot size  1.338850
1     Header_Length  1.326070
41         Magnitue  1.318450
36              AVG  1.315383
33          Tot sum  1.303433
34              Min  1.296095
35              Max  1.275103
2     Protocol Type  1.162732
26              TCP  0.656486
15        syn_count  0.648404
0     flow_duration  0.635293
5             Srate  0.632808
4              Rate  0.632420
18        rst_count  0.511767
8   syn_flag_number  0.497604
27              UDP  0.494290
17        urg_count  0.461282
30             ICMP  0.442946
37              Std  0.382819
43       Covariance  0.381570
42           Radius  0.381349
16        fin_count  0.350787
14        ack_count  0.333453
11  ack_flag_number  0.323853
44         Variance  0.297885
7   fin_flag_number  0.293281
10  psh_flag_number  0.287816
9   rst_flag_number  0.286176
3          Duration  0.212210
40           Number  0.177980
45           Weight  0.176595
20        

In [None]:
# Wrapper stage of the hybrid selection: RFE repeatedly refits the XGBoost
# estimator and eliminates features until 10 remain.
# random_state matches the final classifier so both stages are seeded alike.
rfe_selector = RFE(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    n_features_to_select=10,
)
# Only ranking_ and the support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
rfe_selector.fit(X, y)

# printing the ranks
# NOTE: the 'Score' column holds RFE ranks, not scores: 1 marks a selected
# feature and larger values mark features that were eliminated.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # best rank first
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
45           Weight      1
30             ICMP      1
27              UDP      1
10  psh_flag_number      1
9   rst_flag_number      1
8   syn_flag_number      1
7   fin_flag_number      1
34              Min      1
39              IAT      1
2     Protocol Type      1
4              Rate      2
18        rst_count      3
40           Number      4
20            HTTPS      5
16        fin_count      6
24              SSH      7
19             HTTP      8
33          Tot sum      9
44         Variance     10
11  ack_flag_number     11
1     Header_Length     12
38         Tot size     13
15        syn_count     14
17        urg_count     15
14        ack_count     16
0     flow_duration     17
35              Max     18
43       Covariance     19
36              AVG     20
41         Magnitue     21
26              TCP     22
42           Radius     23
29              ARP     24
3          Duration     25
37              Std     26
31              IPv     27
2

In [None]:
# Union of the SelectKBest and RFE selections.
# The result is ordered by X.columns rather than by set iteration: the
# iteration order of a set of strings changes between Python sessions (hash
# randomisation), which would make the column order of X_selected vary from
# run to run.
selected_by_either = set(skb_selected_feature_names) | set(rfe_selected_feature_names)
top_features_combined = [column for column in X.columns if column in selected_by_either]
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['AVG', 'Weight', 'psh_flag_number', 'Magnitue', 'Tot size', 'rst_flag_number', 'Max', 'UDP', 'TCP', 'Tot sum', 'Header_Length', 'Protocol Type', 'IAT', 'Min', 'fin_flag_number', 'syn_flag_number', 'ICMP']


In [None]:
# Number of features still missing to reach the 20-feature budget. The two
# selectors can overlap, so their union may be smaller than the budget.
# Clamped at 0 so an oversized union can never turn into "take everything".
no_of_redundent_features = max(0, 20 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    # Check the quota before appending so that a quota of 0 adds nothing.
    if len(extra_features_from_rfe) >= no_of_redundent_features:
        break
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place so the list printed by the
# previous cell is not mutated behind the reader's back.
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  Rate
selected feature:  rst_count
selected feature:  Number
Extra features from RFE: ['Rate', 'rst_count', 'Number']
top_features_combined =  ['AVG', 'Weight', 'psh_flag_number', 'Magnitue', 'Tot size', 'rst_flag_number', 'Max', 'UDP', 'TCP', 'Tot sum', 'Header_Length', 'Protocol Type', 'IAT', 'Min', 'fin_flag_number', 'syn_flag_number', 'ICMP', 'Rate', 'rst_count', 'Number']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
# NOTE(review): the split is not stratified, and the report below shows
# classes with only a handful of test samples; consider stratify=y if every
# class has enough rows to be split.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
# zero_division=0 makes the handling of classes that are never predicted
# explicit: they score 0 (the same value the default produced) without
# emitting an UndefinedMetricWarning for every call.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9901964304952059
Precision: 0.989055226285607
Recall: 0.9901964304952059
F1 Score: 0.9893459031576205
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.84      0.94      0.89      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       1.00      1.00      1.00       539
           5       0.91      0.95      0.93        61
           6       1.00      1.00      1.00     12879
           7       1.00      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.94      0.96      0.95        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      97

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 25 features (SelectKBest k=12 + RFE n=13)

In [None]:
# Filter stage of the hybrid selection: score every feature by its mutual
# information with the encoded label and keep the 12 highest-scoring ones.
# mutual_info_classif draws random noise internally, so the seed is pinned to
# keep the scores (and therefore the selected set) reproducible between runs.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=12,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # highest score first
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names = ', skb_selected_feature_names)

            Feature     Score
39              IAT  2.601328
38         Tot size  1.339335
1     Header_Length  1.327400
41         Magnitue  1.318020
36              AVG  1.314042
33          Tot sum  1.305144
34              Min  1.295671
35              Max  1.271420
2     Protocol Type  1.162947
26              TCP  0.656476
15        syn_count  0.648856
0     flow_duration  0.634361
4              Rate  0.633166
5             Srate  0.632632
18        rst_count  0.509843
27              UDP  0.495907
8   syn_flag_number  0.495547
17        urg_count  0.462133
30             ICMP  0.442603
43       Covariance  0.382473
37              Std  0.381508
42           Radius  0.379551
16        fin_count  0.350280
14        ack_count  0.330329
11  ack_flag_number  0.326109
7   fin_flag_number  0.296841
44         Variance  0.296346
10  psh_flag_number  0.286394
9   rst_flag_number  0.286292
3          Duration  0.211540
45           Weight  0.177165
40           Number  0.176849
20        

In [None]:
# Wrapper stage of the hybrid selection: RFE repeatedly refits the XGBoost
# estimator and eliminates features until 13 remain.
# random_state matches the final classifier so both stages are seeded alike.
rfe_selector = RFE(
    estimator=XGBClassifier(eval_metric='mlogloss', random_state=42),
    n_features_to_select=13,
)
# Only ranking_ and the support mask are used below, so fit() is enough;
# the reduced matrix returned by fit_transform() was being discarded.
rfe_selector.fit(X, y)

# printing the ranks
# NOTE: the 'Score' column holds RFE ranks, not scores: 1 marks a selected
# feature and larger values mark features that were eliminated.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # best rank first
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
45           Weight      1
30             ICMP      1
18        rst_count      1
27              UDP      1
10  psh_flag_number      1
9   rst_flag_number      1
8   syn_flag_number      1
34              Min      1
39              IAT      1
4              Rate      1
40           Number      1
2     Protocol Type      1
7   fin_flag_number      1
20            HTTPS      2
16        fin_count      3
24              SSH      4
19             HTTP      5
33          Tot sum      6
44         Variance      7
11  ack_flag_number      8
1     Header_Length      9
38         Tot size     10
15        syn_count     11
17        urg_count     12
14        ack_count     13
0     flow_duration     14
35              Max     15
43       Covariance     16
36              AVG     17
41         Magnitue     18
26              TCP     19
42           Radius     20
29              ARP     21
3          Duration     22
37              Std     23
31              IPv     24
2

In [None]:
# Union of the SelectKBest and RFE selections.
# The result is ordered by X.columns rather than by set iteration: the
# iteration order of a set of strings changes between Python sessions (hash
# randomisation), which would make the column order of X_selected vary from
# run to run.
selected_by_either = set(skb_selected_feature_names) | set(rfe_selected_feature_names)
top_features_combined = [column for column in X.columns if column in selected_by_either]
print('top_features_combined = ', top_features_combined)

top_features_combined =  ['AVG', 'Weight', 'Number', 'psh_flag_number', 'Magnitue', 'syn_count', 'Tot size', 'flow_duration', 'rst_flag_number', 'Max', 'UDP', 'TCP', 'Tot sum', 'rst_count', 'Header_Length', 'Protocol Type', 'IAT', 'Min', 'Rate', 'fin_flag_number', 'syn_flag_number', 'ICMP']


In [None]:
# Number of features still missing to reach the 25-feature budget. The two
# selectors can overlap, so their union may be smaller than the budget.
# Clamped at 0 so an oversized union can never turn into "take everything".
no_of_redundent_features = max(0, 25 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    # Check the quota before appending so that a quota of 0 adds nothing.
    if len(extra_features_from_rfe) >= no_of_redundent_features:
        break
    if feature not in top_features_combined:
        print("selected feature: ", feature)
        extra_features_from_rfe.append(feature)

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place so the list printed by the
# previous cell is not mutated behind the reader's back.
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

selected feature:  HTTPS
selected feature:  fin_count
selected feature:  SSH
Extra features from RFE: ['HTTPS', 'fin_count', 'SSH']
top_features_combined =  ['AVG', 'Weight', 'Number', 'psh_flag_number', 'Magnitue', 'syn_count', 'Tot size', 'flow_duration', 'rst_flag_number', 'Max', 'UDP', 'TCP', 'Tot sum', 'rst_count', 'Header_Length', 'Protocol Type', 'IAT', 'Min', 'Rate', 'fin_flag_number', 'syn_flag_number', 'ICMP', 'HTTPS', 'fin_count', 'SSH']


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Step 6: Train-Test Split
# NOTE(review): the split is not stratified, and the report below shows
# classes with only a handful of test samples; consider stratify=y if every
# class has enough rows to be split.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

# Metrics
# zero_division=0 makes the handling of classes that are never predicted
# explicit: they score 0 (the same value the default produced) without
# emitting an UndefinedMetricWarning for every call.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.990723117989969
Precision: 0.9898401549296735
Recall: 0.990723117989969
F1 Score: 0.9900424091136766
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.85      0.94      0.90      1951
           2       0.00      0.00      0.00        11
           3       1.00      0.12      0.22         8
           4       1.00      1.00      1.00       539
           5       0.87      0.97      0.91        61
           6       1.00      1.00      1.00     12879
           7       1.00      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.96      0.91      0.93        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      979

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
