<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_cyber_attacks_using_DT_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score

# Step 1: Load the dataset
# NOTE(review): absolute Google Drive path — presumably only resolves in a
# Colab session with Drive mounted; confirm before running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/data.csv')

# Step 2: Separate features and target
# Fail fast when the target column is absent. Silently ignoring a missing
# "label" column would leave `y` empty and only surface later as an opaque
# sample-count mismatch inside scikit-learn.
if "label" not in df.columns:
    raise KeyError(
        "Expected a 'label' column in the dataset; found columns: "
        + ", ".join(map(str, df.columns))
    )
X = df.drop(columns="label")
y = df["label"]

# Sanity check: X and y must report the same number of rows
print(X.shape)
print(y.shape)


(238687, 46)
(238687,)


# 5 features (union of SelectKBest k=2 and RFE n=3, topped up to 5 from the RFE ranking)

In [None]:
# Filter stage of the hybrid selector: keep the k features with the highest
# mutual information with the label.
# mutual_info_classif adds small random noise to continuous features, so it is
# seeded here; unseeded, the scores (and potentially the top-k cut) change on
# every run.
# NOTE(review): the selector is fit on the full X/y, before the train/test
# split performed in a later cell — the held-out rows influence selection.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=2,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the transformed matrix is not needed.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Highest mutual information first
print(skb_feature_scores)

# Get the selected feature names (boolean support mask applied to the columns)
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601322
38         Tot size  1.339165
1     Header_Length  1.328270
41         Magnitue  1.318521
36              AVG  1.312937
33          Tot sum  1.305131
34              Min  1.297491
35              Max  1.271629
2     Protocol Type  1.162063
26              TCP  0.655991
15        syn_count  0.649332
0     flow_duration  0.637012
4              Rate  0.632721
5             Srate  0.632675
18        rst_count  0.509768
27              UDP  0.495410
8   syn_flag_number  0.494935
17        urg_count  0.459471
30             ICMP  0.444954
37              Std  0.383616
43       Covariance  0.379718
42           Radius  0.378409
16        fin_count  0.350628
14        ack_count  0.331605
11  ack_flag_number  0.327742
44         Variance  0.298962
7   fin_flag_number  0.294792
10  psh_flag_number  0.287650
9   rst_flag_number  0.286323
3          Duration  0.209442
40           Number  0.177538
45           Weight  0.177153
20        

In [None]:
# Wrapper stage of the hybrid selector: recursive feature elimination driven
# by a decision tree.
# The tree is seeded so the elimination order is reproducible; an unseeded
# tree may break ties between equally good splits differently on each run.
# NOTE(review): RFE is fit on the full X/y, before the train/test split
# performed in a later cell — the held-out rows influence selection.
rfe_selector = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=3,
)
# Only ranking_ and the support mask are used below, so fit() is enough.
rfe_selector.fit(X, y)

# Printing the rankings
# The 'Score' column holds RFE ranks, not scores: 1 = selected, larger = eliminated earlier.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Best rank first
print(rfe_feature_scores)

# Get the selected feature names (the rank-1 features)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
# Label corrected: this prints the selected names, not the ranking table.
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
41         Magnitue      1
34              Min      1
39              IAT      1
7   fin_flag_number      2
10  psh_flag_number      3
8   syn_flag_number      4
33          Tot sum      5
2     Protocol Type      6
1     Header_Length      7
30             ICMP      8
18        rst_count      9
42           Radius     10
5             Srate     11
16        fin_count     12
0     flow_duration     13
36              AVG     14
44         Variance     15
3          Duration     16
15        syn_count     17
4              Rate     18
38         Tot size     19
20            HTTPS     20
17        urg_count     21
43       Covariance     22
40           Number     23
35              Max     24
37              Std     25
14        ack_count     26
27              UDP     27
26              TCP     28
19             HTTP     29
24              SSH     30
11  ack_flag_number     31
21              DNS     32
12  ece_flag_number     33
9   rst_flag_number     34
4

In [None]:
# Union of the SelectKBest and RFE selections, de-duplicated while keeping
# first-seen order (SelectKBest names first, then RFE names).
# A set union would yield the same members but in an order that depends on
# string hash randomisation, so the column order of the selected frame could
# change between kernel sessions.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['Tot size', 'Magnitue', 'Min', 'IAT']


In [None]:
# Size of the hybrid feature set this section evaluates.
target_feature_count = 5
# Slots still open after the SelectKBest/RFE union: features picked by both
# selectors leave the union short of the target.
# (Name kept as-is because it is a notebook-level variable.)
no_of_redundent_features = target_feature_count - len(top_features_combined)

# Top up from the RFE ranking: walk features from best to worst rank and keep
# those not already in the combined list.
remaining_by_rfe_rank = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
]
# max(..., 0) covers a union that already meets or exceeds the target. The
# previous `len(...) == n` stop test never fired for a negative n, which
# would have appended every remaining feature.
extra_features_from_rfe = remaining_by_rfe_rank[:max(no_of_redundent_features, 0)]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection
X_selected = X[top_features_combined]

Extra features from RFE: ['fin_flag_number']
top_features_combined =  ['Tot size', 'Magnitue', 'Min', 'IAT', 'fin_flag_number']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Hold out 35% of the rows for testing (fixed seed for a repeatable split)
# NOTE(review): the split is not stratified; the classification report shows
# classes with very small support, so their share of the test set may be
# uneven — consider whether stratify=y is appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.35,
    random_state=42,
)

# Step 7: Fit a seeded decision tree on the selected features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Score the held-out predictions
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages: each class contributes in proportion to its row count
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Headline metrics, one per line
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix over the test labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9885445469889037
Precision: 0.9885641697685623
recall: 0.9885445469889037
F1 Score: 0.98853627012635
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.38      0.75      0.50         4
          BenignTraffic       0.83      0.83      0.83      1951
       BrowserHijacking       0.45      0.45      0.45        11
       CommandInjection       0.46      0.75      0.57         8
 DDoS-ACK_Fragmentation       0.99      0.99      0.99       539
        DDoS-HTTP_Flood       1.00      1.00      1.00        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.99      1.00      1.00       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       1.00      1.00      1.00        47
DDoS-SynonymousIP

# 10 features (union of SelectKBest k=5 and RFE n=5, topped up to 10 from the RFE ranking)

In [None]:
# Filter stage of the hybrid selector: keep the k features with the highest
# mutual information with the label.
# mutual_info_classif adds small random noise to continuous features, so it is
# seeded here; unseeded, the scores (and potentially the top-k cut) change on
# every run.
# NOTE(review): the selector is fit on the full X/y, before the train/test
# split performed in a later cell — the held-out rows influence selection.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=5,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the transformed matrix is not needed.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Highest mutual information first
print(skb_feature_scores)

# Get the selected feature names (boolean support mask applied to the columns)
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601628
38         Tot size  1.337908
1     Header_Length  1.327343
41         Magnitue  1.317207
36              AVG  1.313202
33          Tot sum  1.304404
34              Min  1.295658
35              Max  1.271906
2     Protocol Type  1.161920
26              TCP  0.655807
15        syn_count  0.648392
0     flow_duration  0.635887
4              Rate  0.632920
5             Srate  0.632310
18        rst_count  0.510547
8   syn_flag_number  0.498839
27              UDP  0.495140
17        urg_count  0.460220
30             ICMP  0.444864
37              Std  0.381785
43       Covariance  0.381591
42           Radius  0.381197
16        fin_count  0.348113
14        ack_count  0.331316
11  ack_flag_number  0.326006
44         Variance  0.299271
7   fin_flag_number  0.294596
10  psh_flag_number  0.287502
9   rst_flag_number  0.286421
3          Duration  0.211714
40           Number  0.178600
45           Weight  0.176833
20        

In [None]:
# Wrapper stage of the hybrid selector: recursive feature elimination driven
# by a decision tree.
# The tree is seeded so the elimination order is reproducible; an unseeded
# tree may break ties between equally good splits differently on each run.
# NOTE(review): RFE is fit on the full X/y, before the train/test split
# performed in a later cell — the held-out rows influence selection.
rfe_selector = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=5,
)
# Only ranking_ and the support mask are used below, so fit() is enough.
rfe_selector.fit(X, y)

# Printing the rankings
# The 'Score' column holds RFE ranks, not scores: 1 = selected, larger = eliminated earlier.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Best rank first
print(rfe_feature_scores)

# Get the selected feature names (the rank-1 features)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
# Label corrected: this prints the selected names, not the ranking table.
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
41         Magnitue      1
7   fin_flag_number      1
39              IAT      1
10  psh_flag_number      1
34              Min      1
8   syn_flag_number      2
33          Tot sum      3
2     Protocol Type      4
30             ICMP      5
1     Header_Length      6
18        rst_count      7
37              Std      8
5             Srate      9
16        fin_count     10
0     flow_duration     11
36              AVG     12
44         Variance     13
15        syn_count     14
3          Duration     15
4              Rate     16
17        urg_count     17
38         Tot size     18
20            HTTPS     19
45           Weight     20
35              Max     21
43       Covariance     22
42           Radius     23
14        ack_count     24
27              UDP     25
19             HTTP     26
40           Number     27
11  ack_flag_number     28
26              TCP     29
24              SSH     30
9   rst_flag_number     31
21              DNS     32
2

In [None]:
# Union of the SelectKBest and RFE selections, de-duplicated while keeping
# first-seen order (SelectKBest names first, then RFE names).
# A set union would yield the same members but in an order that depends on
# string hash randomisation, so the column order of the selected frame could
# change between kernel sessions.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['psh_flag_number', 'Magnitue', 'Header_Length', 'Tot size', 'AVG', 'Min', 'fin_flag_number', 'IAT']


In [None]:
# Size of the hybrid feature set this section evaluates.
target_feature_count = 10
# Slots still open after the SelectKBest/RFE union: features picked by both
# selectors leave the union short of the target.
# (Name kept as-is because it is a notebook-level variable.)
no_of_redundent_features = target_feature_count - len(top_features_combined)

# Top up from the RFE ranking: walk features from best to worst rank and keep
# those not already in the combined list.
remaining_by_rfe_rank = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
]
# max(..., 0) covers a union that already meets or exceeds the target. The
# previous `len(...) == n` stop test never fired for a negative n, which
# would have appended every remaining feature.
extra_features_from_rfe = remaining_by_rfe_rank[:max(no_of_redundent_features, 0)]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection
X_selected = X[top_features_combined]

Extra features from RFE: ['syn_flag_number', 'Tot sum']
top_features_combined =  ['psh_flag_number', 'Magnitue', 'Header_Length', 'Tot size', 'AVG', 'Min', 'fin_flag_number', 'IAT', 'syn_flag_number', 'Tot sum']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Hold out 35% of the rows for testing (fixed seed for a repeatable split)
# NOTE(review): the split is not stratified; the classification report shows
# classes with very small support, so their share of the test set may be
# uneven — consider whether stratify=y is appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.35,
    random_state=42,
)

# Step 7: Fit a seeded decision tree on the selected features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Score the held-out predictions
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages: each class contributes in proportion to its row count
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Headline metrics, one per line
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix over the test labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9895979219784298
Precision: 0.9895867882899307
recall: 0.9895979219784298
F1 Score: 0.989576284709778
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.33      0.75      0.46         4
          BenignTraffic       0.85      0.85      0.85      1951
       BrowserHijacking       0.29      0.36      0.32        11
       CommandInjection       0.67      0.75      0.71         8
 DDoS-ACK_Fragmentation       1.00      1.00      1.00       539
        DDoS-HTTP_Flood       0.95      0.95      0.95        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.98      0.96      0.97        47
DDoS-SynonymousI

# 15 features (union of SelectKBest k=7 and RFE n=8, topped up to 15 from the RFE ranking)

In [None]:
# Filter stage of the hybrid selector: keep the k features with the highest
# mutual information with the label.
# mutual_info_classif adds small random noise to continuous features, so it is
# seeded here; unseeded, the scores (and potentially the top-k cut) change on
# every run.
# NOTE(review): the selector is fit on the full X/y, before the train/test
# split performed in a later cell — the held-out rows influence selection.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=7,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the transformed matrix is not needed.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Highest mutual information first
print(skb_feature_scores)

# Get the selected feature names (boolean support mask applied to the columns)
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601479
38         Tot size  1.339122
1     Header_Length  1.328512
41         Magnitue  1.318386
36              AVG  1.311911
33          Tot sum  1.306456
34              Min  1.297834
35              Max  1.272753
2     Protocol Type  1.161421
26              TCP  0.657034
15        syn_count  0.646882
0     flow_duration  0.635569
5             Srate  0.632849
4              Rate  0.632484
18        rst_count  0.510785
27              UDP  0.495144
8   syn_flag_number  0.495035
17        urg_count  0.458713
30             ICMP  0.443482
43       Covariance  0.384368
37              Std  0.382555
42           Radius  0.380508
16        fin_count  0.348475
14        ack_count  0.332532
11  ack_flag_number  0.328494
44         Variance  0.296680
7   fin_flag_number  0.290670
10  psh_flag_number  0.287425
9   rst_flag_number  0.286872
3          Duration  0.210629
40           Number  0.178430
45           Weight  0.178133
20        

In [None]:
# Wrapper stage of the hybrid selector: recursive feature elimination driven
# by a decision tree.
# The tree is seeded so the elimination order is reproducible; an unseeded
# tree may break ties between equally good splits differently on each run.
# NOTE(review): RFE is fit on the full X/y, before the train/test split
# performed in a later cell — the held-out rows influence selection.
rfe_selector = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=8,
)
# Only ranking_ and the support mask are used below, so fit() is enough.
rfe_selector.fit(X, y)

# Printing the rankings
# The 'Score' column holds RFE ranks, not scores: 1 = selected, larger = eliminated earlier.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Best rank first
print(rfe_feature_scores)

# Get the selected feature names (the rank-1 features)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
# Label corrected: this prints the selected names, not the ranking table.
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
2     Protocol Type      1
34              Min      1
39              IAT      1
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
41         Magnitue      1
33          Tot sum      1
30             ICMP      2
1     Header_Length      3
18        rst_count      4
42           Radius      5
5             Srate      6
16        fin_count      7
0     flow_duration      8
36              AVG      9
3          Duration     10
44         Variance     11
4              Rate     12
17        urg_count     13
15        syn_count     14
20            HTTPS     15
40           Number     16
38         Tot size     17
35              Max     18
43       Covariance     19
37              Std     20
14        ack_count     21
19             HTTP     22
11  ack_flag_number     23
24              SSH     24
45           Weight     25
27              UDP     26
9   rst_flag_number     27
26              TCP     28
21              DNS     29
6

In [None]:
# Union of the SelectKBest and RFE selections, de-duplicated while keeping
# first-seen order (SelectKBest names first, then RFE names).
# A set union would yield the same members but in an order that depends on
# string hash randomisation, so the column order of the selected frame could
# change between kernel sessions.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['psh_flag_number', 'syn_flag_number', 'Magnitue', 'Tot sum', 'Header_Length', 'Tot size', 'fin_flag_number', 'Min', 'Protocol Type', 'IAT', 'AVG']


In [None]:
# Size of the hybrid feature set this section evaluates.
target_feature_count = 15
# Slots still open after the SelectKBest/RFE union: features picked by both
# selectors leave the union short of the target.
# (Name kept as-is because it is a notebook-level variable.)
no_of_redundent_features = target_feature_count - len(top_features_combined)

# Top up from the RFE ranking: walk features from best to worst rank and keep
# those not already in the combined list.
remaining_by_rfe_rank = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
]
# max(..., 0) covers a union that already meets or exceeds the target. The
# previous `len(...) == n` stop test never fired for a negative n, which
# would have appended every remaining feature.
extra_features_from_rfe = remaining_by_rfe_rank[:max(no_of_redundent_features, 0)]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection
X_selected = X[top_features_combined]

Extra features from RFE: ['ICMP', 'rst_count', 'Radius', 'Srate']
top_features_combined =  ['psh_flag_number', 'syn_flag_number', 'Magnitue', 'Tot sum', 'Header_Length', 'Tot size', 'fin_flag_number', 'Min', 'Protocol Type', 'IAT', 'AVG', 'ICMP', 'rst_count', 'Radius', 'Srate']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Hold out 35% of the rows for testing (fixed seed for a repeatable split)
# NOTE(review): the split is not stratified; the classification report shows
# classes with very small support, so their share of the test set may be
# uneven — consider whether stratify=y is appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.35,
    random_state=42,
)

# Step 7: Fit a seeded decision tree on the selected features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Score the held-out predictions
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages: each class contributes in proportion to its row count
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Headline metrics, one per line
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix over the test labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9903281023688967
Precision: 0.9904193898956226
recall: 0.9903281023688967
F1 Score: 0.9903381638138199
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.50      0.75      0.60         4
          BenignTraffic       0.88      0.88      0.88      1951
       BrowserHijacking       0.29      0.36      0.32        11
       CommandInjection       0.38      0.75      0.50         8
 DDoS-ACK_Fragmentation       1.00      0.99      1.00       539
        DDoS-HTTP_Flood       1.00      0.97      0.98        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.91      0.91      0.91        47
DDoS-Synonymous

# 20 features (union of SelectKBest k=10 and RFE n=10, topped up to 20 from the RFE ranking)

In [None]:
# Filter stage of the hybrid selector: keep the k features with the highest
# mutual information with the label.
# mutual_info_classif adds small random noise to continuous features, so it is
# seeded here; unseeded, the scores (and potentially the top-k cut) change on
# every run.
# NOTE(review): the selector is fit on the full X/y, before the train/test
# split performed in a later cell — the held-out rows influence selection.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=10,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the transformed matrix is not needed.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Highest mutual information first
print(skb_feature_scores)

# Get the selected feature names (boolean support mask applied to the columns)
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601394
38         Tot size  1.338069
1     Header_Length  1.328046
41         Magnitue  1.318713
36              AVG  1.312136
33          Tot sum  1.305532
34              Min  1.294817
35              Max  1.271868
2     Protocol Type  1.163586
26              TCP  0.656315
15        syn_count  0.650816
0     flow_duration  0.633839
4              Rate  0.633066
5             Srate  0.632606
18        rst_count  0.510642
27              UDP  0.497878
8   syn_flag_number  0.496007
17        urg_count  0.457962
30             ICMP  0.441603
37              Std  0.384040
43       Covariance  0.381566
42           Radius  0.380192
16        fin_count  0.349013
14        ack_count  0.332472
11  ack_flag_number  0.328318
44         Variance  0.298829
7   fin_flag_number  0.295374
9   rst_flag_number  0.286071
10  psh_flag_number  0.284725
3          Duration  0.207426
40           Number  0.178216
45           Weight  0.177464
20        

In [None]:
# Wrapper stage of the hybrid selector: recursive feature elimination driven
# by a decision tree.
# The tree is seeded so the elimination order is reproducible; an unseeded
# tree may break ties between equally good splits differently on each run.
# NOTE(review): RFE is fit on the full X/y, before the train/test split
# performed in a later cell — the held-out rows influence selection.
rfe_selector = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=10,
)
# Only ranking_ and the support mask are used below, so fit() is enough.
rfe_selector.fit(X, y)

# Printing the rankings
# The 'Score' column holds RFE ranks, not scores: 1 = selected, larger = eliminated earlier.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Best rank first
print(rfe_feature_scores)

# Get the selected feature names (the rank-1 features)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
# Label corrected: this prints the selected names, not the ranking table.
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
8   syn_flag_number      1
1     Header_Length      1
2     Protocol Type      1
34              Min      1
33          Tot sum      1
41         Magnitue      1
7   fin_flag_number      1
39              IAT      1
10  psh_flag_number      1
30             ICMP      1
18        rst_count      2
37              Std      3
4              Rate      4
16        fin_count      5
0     flow_duration      6
36              AVG      7
3          Duration      8
44         Variance      9
5             Srate     10
38         Tot size     11
17        urg_count     12
20            HTTPS     13
15        syn_count     14
45           Weight     15
43       Covariance     16
35              Max     17
42           Radius     18
14        ack_count     19
40           Number     20
19             HTTP     21
11  ack_flag_number     22
24              SSH     23
27              UDP     24
26              TCP     25
21              DNS     26
31              IPv     27
1

In [None]:
# Union of the SelectKBest and RFE selections, de-duplicated while keeping
# first-seen order (SelectKBest names first, then RFE names).
# A set union would yield the same members but in an order that depends on
# string hash randomisation, so the column order of the selected frame could
# change between kernel sessions.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['syn_flag_number', 'Tot sum', 'Max', 'Header_Length', 'Tot size', 'ICMP', 'Min', 'IAT', 'AVG', 'psh_flag_number', 'Magnitue', 'TCP', 'Protocol Type', 'fin_flag_number']


In [None]:
# Size of the hybrid feature set this section evaluates.
target_feature_count = 20
# Slots still open after the SelectKBest/RFE union: features picked by both
# selectors leave the union short of the target.
# (Name kept as-is because it is a notebook-level variable.)
no_of_redundent_features = target_feature_count - len(top_features_combined)

# Top up from the RFE ranking: walk features from best to worst rank and keep
# those not already in the combined list.
remaining_by_rfe_rank = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
]
# max(..., 0) covers a union that already meets or exceeds the target. The
# previous `len(...) == n` stop test never fired for a negative n, which
# would have appended every remaining feature.
extra_features_from_rfe = remaining_by_rfe_rank[:max(no_of_redundent_features, 0)]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection
X_selected = X[top_features_combined]

Extra features from RFE: ['rst_count', 'Std', 'Rate', 'fin_count', 'flow_duration', 'Duration']
top_features_combined =  ['syn_flag_number', 'Tot sum', 'Max', 'Header_Length', 'Tot size', 'ICMP', 'Min', 'IAT', 'AVG', 'psh_flag_number', 'Magnitue', 'TCP', 'Protocol Type', 'fin_flag_number', 'rst_count', 'Std', 'Rate', 'fin_count', 'flow_duration', 'Duration']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Hold out 35% of the rows for testing (fixed seed for a repeatable split)
# NOTE(review): the split is not stratified; the classification report shows
# classes with very small support, so their share of the test set may be
# uneven — consider whether stratify=y is appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.35,
    random_state=42,
)

# Step 7: Fit a seeded decision tree on the selected features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Score the held-out predictions
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages: each class contributes in proportion to its row count
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Headline metrics, one per line
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix over the test labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9908428196933242
Precision: 0.9908284093301288
recall: 0.9908428196933242
F1 Score: 0.9908093175007887
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       1.00      0.75      0.86         4
          BenignTraffic       0.89      0.90      0.89      1951
       BrowserHijacking       0.38      0.45      0.42        11
       CommandInjection       0.46      0.75      0.57         8
 DDoS-ACK_Fragmentation       0.99      0.99      0.99       539
        DDoS-HTTP_Flood       0.89      0.93      0.91        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.87      0.85      0.86        47
DDoS-Synonymous

# 25 features (union of SelectKBest k=12 and RFE n=13, topped up to 25 from the RFE ranking)

In [None]:
# Filter stage of the hybrid selector: keep the k features with the highest
# mutual information with the label.
# mutual_info_classif adds small random noise to continuous features, so it is
# seeded here; unseeded, the scores (and potentially the top-k cut) change on
# every run.
# NOTE(review): the selector is fit on the full X/y, before the train/test
# split performed in a later cell — the held-out rows influence selection.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k=12,
)
# Only the fitted scores and support mask are used below, so fit() is enough;
# the transformed matrix is not needed.
skb_selector.fit(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Highest mutual information first
print(skb_feature_scores)

# Get the selected feature names (boolean support mask applied to the columns)
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601664
38         Tot size  1.338479
1     Header_Length  1.327841
41         Magnitue  1.315829
36              AVG  1.314332
33          Tot sum  1.304010
34              Min  1.297200
35              Max  1.272419
2     Protocol Type  1.163004
26              TCP  0.657242
15        syn_count  0.646062
0     flow_duration  0.635118
4              Rate  0.632724
5             Srate  0.632365
18        rst_count  0.509950
8   syn_flag_number  0.497308
27              UDP  0.496596
17        urg_count  0.462067
30             ICMP  0.444330
37              Std  0.384845
42           Radius  0.381160
43       Covariance  0.380710
16        fin_count  0.353389
14        ack_count  0.333822
11  ack_flag_number  0.328058
44         Variance  0.299818
7   fin_flag_number  0.295874
10  psh_flag_number  0.286131
9   rst_flag_number  0.285632
3          Duration  0.207454
45           Weight  0.180535
40           Number  0.177221
20        

In [None]:
# Wrapper stage of the hybrid selector: recursive feature elimination driven
# by a decision tree.
# The tree is seeded so the elimination order is reproducible; an unseeded
# tree may break ties between equally good splits differently on each run.
# NOTE(review): RFE is fit on the full X/y, before the train/test split
# performed in a later cell — the held-out rows influence selection.
rfe_selector = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=13,
)
# Only ranking_ and the support mask are used below, so fit() is enough.
rfe_selector.fit(X, y)

# Printing the rankings
# The 'Score' column holds RFE ranks, not scores: 1 = selected, larger = eliminated earlier.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Best rank first
print(rfe_feature_scores)

# Get the selected feature names (the rank-1 features)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
# Label corrected: this prints the selected names, not the ranking table.
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
41         Magnitue      1
33          Tot sum      1
34              Min      1
18        rst_count      1
39              IAT      1
10  psh_flag_number      1
42           Radius      1
8   syn_flag_number      1
30             ICMP      1
4              Rate      1
2     Protocol Type      1
1     Header_Length      1
7   fin_flag_number      1
16        fin_count      2
0     flow_duration      3
36              AVG      4
3          Duration      5
44         Variance      6
5             Srate      7
17        urg_count      8
20            HTTPS      9
38         Tot size     10
15        syn_count     11
43       Covariance     12
45           Weight     13
37              Std     14
35              Max     15
14        ack_count     16
19             HTTP     17
11  ack_flag_number     18
40           Number     19
26              TCP     20
27              UDP     21
9   rst_flag_number     22
24              SSH     23
21              DNS     24
6

In [None]:
# Union of the SelectKBest and RFE selections, de-duplicated while keeping
# first-seen order (SelectKBest names first, then RFE names).
# A set union would yield the same members but in an order that depends on
# string hash randomisation, so the column order of the selected frame could
# change between kernel sessions.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['syn_flag_number', 'flow_duration', 'Tot sum', 'Max', 'Header_Length', 'Tot size', 'ICMP', 'Min', 'Rate', 'IAT', 'AVG', 'psh_flag_number', 'Magnitue', 'TCP', 'syn_count', 'rst_count', 'Radius', 'Protocol Type', 'fin_flag_number']


In [None]:
# Size of the hybrid feature set this section evaluates.
target_feature_count = 25
# Slots still open after the SelectKBest/RFE union: features picked by both
# selectors leave the union short of the target.
# (Name kept as-is because it is a notebook-level variable.)
no_of_redundent_features = target_feature_count - len(top_features_combined)

# Top up from the RFE ranking: walk features from best to worst rank and keep
# those not already in the combined list.
remaining_by_rfe_rank = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
]
# max(..., 0) covers a union that already meets or exceeds the target. The
# previous `len(...) == n` stop test never fired for a negative n, which
# would have appended every remaining feature.
extra_features_from_rfe = remaining_by_rfe_rank[:max(no_of_redundent_features, 0)]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection
X_selected = X[top_features_combined]

Extra features from RFE: ['fin_count', 'Duration', 'Variance', 'Srate', 'urg_count', 'HTTPS']
top_features_combined =  ['syn_flag_number', 'flow_duration', 'Tot sum', 'Max', 'Header_Length', 'Tot size', 'ICMP', 'Min', 'Rate', 'IAT', 'AVG', 'psh_flag_number', 'Magnitue', 'TCP', 'syn_count', 'rst_count', 'Radius', 'Protocol Type', 'fin_flag_number', 'fin_count', 'Duration', 'Variance', 'Srate', 'urg_count', 'HTTPS']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Hold out 35% of the rows for testing (fixed seed for a repeatable split)
# NOTE(review): the split is not stratified; the classification report shows
# classes with very small support, so their share of the test set may be
# uneven — consider whether stratify=y is appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.35,
    random_state=42,
)

# Step 7: Fit a seeded decision tree on the selected features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Score the held-out predictions
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages: each class contributes in proportion to its row count
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Headline metrics, one per line
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix over the test labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9909625213966795
Precision: 0.9911917312668765
recall: 0.9909625213966795
F1 Score: 0.9910268198731476
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.25      0.75      0.38         4
          BenignTraffic       0.90      0.89      0.90      1951
       BrowserHijacking       0.29      0.36      0.32        11
       CommandInjection       0.30      0.75      0.43         8
 DDoS-ACK_Fragmentation       0.99      0.99      0.99       539
        DDoS-HTTP_Flood       0.89      0.90      0.89        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.89      0.87      0.88        47
DDoS-Synonymous