<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_DDoS_attacks_using_DT_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Step 1: Load the dataset and keep only the rows whose label mentions "DDoS"
# (case-insensitive; rows with a missing label are dropped by na=False).
# NOTE(review): the path assumes Google Drive is mounted at /content/drive — confirm.
df = pd.read_csv('/content/drive/MyDrive/data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Separate features and target.
# The filter above already raises KeyError when 'label' is absent, so no
# "column missing" fallback is needed here.
X = df_ddos.drop(columns="label")
y = df_ddos["label"]

# Sanity check: X and y must have the same number of rows
print(X.shape)
print(y.shape)


(173777, 46)
(173777,)


# 5 features (SelectKBest top 2 + RFE top 3, topped up from the RFE ranking)

In [None]:
# Filter step: score every feature by mutual information with the label and keep the top 2.
# mutual_info_classif is a randomized estimator, so its seed is pinned; without it
# the scores drift slightly from run to run.
skb_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=2)
skb_selector.fit(X, y)  # only the fitted scores/support are used; no transformed matrix is needed

# Score of every feature, most informative first
skb_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(skb_feature_scores)

# Names of the features SelectKBest kept
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037647
1     Header_Length  1.138363
34              Min  1.045500
38         Tot size  1.042804
41         Magnitue  1.035977
36              AVG  1.031996
33          Tot sum  1.027246
2     Protocol Type  0.978484
35              Max  0.976839
26              TCP  0.666641
15        syn_count  0.614459
30             ICMP  0.532388
8   syn_flag_number  0.529793
5             Srate  0.525842
4              Rate  0.525692
0     flow_duration  0.478218
27              UDP  0.437050
18        rst_count  0.433486
16        fin_count  0.402268
17        urg_count  0.388190
14        ack_count  0.379898
7   fin_flag_number  0.368997
9   rst_flag_number  0.356286
10  psh_flag_number  0.354386
11  ack_flag_number  0.350179
42           Radius  0.224117
43       Covariance  0.223821
37              Std  0.223142
44         Variance  0.147410
3          Duration  0.059456
19             HTTP  0.031022
20            HTTPS  0.017183
40        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, keeping 3 features.
# The tree is seeded so the ranking is reproducible between runs.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=3)
rfe_selector.fit(X, y)  # only ranking_/support are used; no transformed matrix is needed

# 'Score' holds the RFE ranking: 1 = selected, larger values were eliminated earlier
rfe_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
    .sort_values(by='Score', ascending=True)
)
print(rfe_feature_scores)

# Names of the features RFE kept (label fixed: this prints the names, not the scores)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
34              Min      1
39              IAT      1
17        urg_count      2
8   syn_flag_number      3
2     Protocol Type      4
1     Header_Length      5
43       Covariance      6
15        syn_count      7
33          Tot sum      8
5             Srate      9
10  psh_flag_number     10
44         Variance     11
37              Std     12
26              TCP     13
35              Max     14
31              IPv     15
45           Weight     16
41         Magnitue     17
40           Number     18
38         Tot size     19
18        rst_count     20
36              AVG     21
42           Radius     22
30             ICMP     23
20            HTTPS     24
23             SMTP     25
29              ARP     26
32              LLC     27
28             DHCP     28
27              UDP     29
16        fin_count     30
25              IRC     31
24              SSH     32
0     flow_duration     33
22           Telnet     34
2

In [None]:
# Union of both selections, de-duplicated in a deterministic order
# (SelectKBest picks first, then RFE picks). A plain set() iterates in a
# hash-dependent order, which would reshuffle the training columns between kernel restarts.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['IAT', 'Min', 'Header_Length', 'fin_flag_number']


In [None]:
TARGET_FEATURE_COUNT = 5  # size of the final hybrid feature set for this section

# Number of features still missing after merging the two selections.
# Clamped at 0 so an oversized union can never pull in extra features.
no_of_redundent_features = max(TARGET_FEATURE_COUNT - len(top_features_combined), 0)

# Top up with the best-ranked RFE features that are not selected yet.
# rfe_feature_scores is sorted by ascending ranking where it is built, so the
# slice keeps the best remaining candidates.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place, so the union list is not mutated
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['urg_count']
top_features_combined =  ['IAT', 'Min', 'Header_Length', 'fin_flag_number', 'urg_count']


In [None]:
# NOTE(review): SelectKBest and RFE were fitted on all rows of X before this split,
# so the test rows influenced feature selection; the scores below are likely optimistic.

# Step 6: Train and Test Split
# Stratified so each attack class keeps its proportion in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.35, random_state=42, stratify=y
)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averages: each class contributes in proportion to its support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.999523198842524
Precision: 0.9995251342845797
recall: 0.999523198842524
F1 Score: 0.9995237435841956
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.99      0.99      0.99       513
        DDoS-HTTP_Flood       0.93      0.95      0.94        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       0.99      1.00      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       1.00      0.96      0.98        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       1.00      0.99      1.00       548

               a

# 10 features (SelectKBest top 5 + RFE top 5, topped up from the RFE ranking)

In [None]:
# Filter step: score every feature by mutual information with the label and keep the top 5.
# mutual_info_classif is a randomized estimator, so its seed is pinned; without it
# the scores drift slightly from run to run.
skb_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=5)
skb_selector.fit(X, y)  # only the fitted scores/support are used; no transformed matrix is needed

# Score of every feature, most informative first
skb_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(skb_feature_scores)

# Names of the features SelectKBest kept
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037634
1     Header_Length  1.137411
34              Min  1.047119
38         Tot size  1.041460
41         Magnitue  1.036601
36              AVG  1.034455
33          Tot sum  1.028215
2     Protocol Type  0.978345
35              Max  0.978297
26              TCP  0.668366
15        syn_count  0.613551
30             ICMP  0.531062
8   syn_flag_number  0.530544
5             Srate  0.525851
4              Rate  0.525554
0     flow_duration  0.479761
27              UDP  0.437234
18        rst_count  0.436736
16        fin_count  0.398516
17        urg_count  0.387160
14        ack_count  0.380435
7   fin_flag_number  0.364235
10  psh_flag_number  0.357997
9   rst_flag_number  0.357021
11  ack_flag_number  0.350206
42           Radius  0.225649
37              Std  0.225517
43       Covariance  0.224537
44         Variance  0.145623
3          Duration  0.056478
19             HTTP  0.029937
20            HTTPS  0.018491
31        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, keeping 5 features.
# The tree is seeded so the ranking is reproducible between runs.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=5)
rfe_selector.fit(X, y)  # only ranking_/support are used; no transformed matrix is needed

# 'Score' holds the RFE ranking: 1 = selected, larger values were eliminated earlier
rfe_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
    .sort_values(by='Score', ascending=True)
)
print(rfe_feature_scores)

# Names of the features RFE kept (label fixed: this prints the names, not the scores)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
7   fin_flag_number      1
8   syn_flag_number      1
17        urg_count      1
34              Min      1
39              IAT      1
2     Protocol Type      2
1     Header_Length      3
43       Covariance      4
15        syn_count      5
41         Magnitue      6
5             Srate      7
44         Variance      8
10  psh_flag_number      9
26              TCP     10
31              IPv     11
38         Tot size     12
42           Radius     13
45           Weight     14
23             SMTP     15
35              Max     16
40           Number     17
25              IRC     18
24              SSH     19
37              Std     20
36              AVG     21
33          Tot sum     22
32              LLC     23
30             ICMP     24
29              ARP     25
28             DHCP     26
14        ack_count     27
27              UDP     28
12  ece_flag_number     29
4              Rate     30
3          Duration     31
22           Telnet     32
1

In [None]:
# Union of both selections, de-duplicated in a deterministic order
# (SelectKBest picks first, then RFE picks). A plain set() iterates in a
# hash-dependent order, which would reshuffle the training columns between kernel restarts.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['urg_count', 'Tot size', 'IAT', 'Header_Length', 'Magnitue', 'syn_flag_number', 'Min', 'fin_flag_number']


In [None]:
TARGET_FEATURE_COUNT = 10  # size of the final hybrid feature set for this section

# Number of features still missing after merging the two selections.
# Clamped at 0 so an oversized union can never pull in extra features.
no_of_redundent_features = max(TARGET_FEATURE_COUNT - len(top_features_combined), 0)

# Top up with the best-ranked RFE features that are not selected yet.
# rfe_feature_scores is sorted by ascending ranking where it is built, so the
# slice keeps the best remaining candidates.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place, so the union list is not mutated
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['Protocol Type', 'Covariance']
top_features_combined =  ['urg_count', 'Tot size', 'IAT', 'Header_Length', 'Magnitue', 'syn_flag_number', 'Min', 'fin_flag_number', 'Protocol Type', 'Covariance']


In [None]:
# NOTE(review): SelectKBest and RFE were fitted on all rows of X before this split,
# so the test rows influenced feature selection; the scores below are likely optimistic.

# Step 6: Train and Test Split
# Stratified so each attack class keeps its proportion in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.35, random_state=42, stratify=y
)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averages: each class contributes in proportion to its support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9996547301963106
Precision: 0.999655956930136
recall: 0.9996547301963106
F1 Score: 0.9996548783308299
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       1.00      0.99      1.00       513
        DDoS-HTTP_Flood       0.95      0.96      0.96        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       1.00      0.96      0.98        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       1.00      1.00      1.00       548

               

# 15 features (SelectKBest top 7 + RFE top 8, topped up from the RFE ranking)

In [None]:
# Filter step: score every feature by mutual information with the label and keep the top 7.
# mutual_info_classif is a randomized estimator, so its seed is pinned; without it
# the scores drift slightly from run to run.
skb_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=7)
skb_selector.fit(X, y)  # only the fitted scores/support are used; no transformed matrix is needed

# Score of every feature, most informative first
skb_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(skb_feature_scores)

# Names of the features SelectKBest kept
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037630
1     Header_Length  1.137304
34              Min  1.048571
38         Tot size  1.044214
41         Magnitue  1.038361
36              AVG  1.032891
33          Tot sum  1.027389
2     Protocol Type  0.982559
35              Max  0.979525
26              TCP  0.668652
15        syn_count  0.614586
30             ICMP  0.533099
8   syn_flag_number  0.531098
5             Srate  0.525552
4              Rate  0.525154
0     flow_duration  0.478752
18        rst_count  0.435107
27              UDP  0.433430
16        fin_count  0.400184
17        urg_count  0.388765
14        ack_count  0.380219
7   fin_flag_number  0.365233
9   rst_flag_number  0.359946
10  psh_flag_number  0.353901
11  ack_flag_number  0.351308
37              Std  0.226841
42           Radius  0.226471
43       Covariance  0.222457
44         Variance  0.147577
3          Duration  0.056694
19             HTTP  0.035218
20            HTTPS  0.018361
31        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, keeping 8 features.
# The tree is seeded so the ranking is reproducible between runs.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=8)
rfe_selector.fit(X, y)  # only ranking_/support are used; no transformed matrix is needed

# 'Score' holds the RFE ranking: 1 = selected, larger values were eliminated earlier
rfe_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
    .sort_values(by='Score', ascending=True)
)
print(rfe_feature_scores)

# Names of the features RFE kept (label fixed: this prints the names, not the scores)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
1     Header_Length      1
2     Protocol Type      1
7   fin_flag_number      1
8   syn_flag_number      1
17        urg_count      1
43       Covariance      1
39              IAT      1
34              Min      1
15        syn_count      2
5             Srate      3
41         Magnitue      4
44         Variance      5
10  psh_flag_number      6
37              Std      7
32              LLC      8
26              TCP      9
45           Weight     10
38         Tot size     11
40           Number     12
18        rst_count     13
35              Max     14
36              AVG     15
42           Radius     16
33          Tot sum     17
19             HTTP     18
29              ARP     19
20            HTTPS     20
30             ICMP     21
24              SSH     22
31              IPv     23
28             DHCP     24
27              UDP     25
25              IRC     26
16        fin_count     27
23             SMTP     28
22           Telnet     29
2

In [None]:
# Union of both selections, de-duplicated in a deterministic order
# (SelectKBest picks first, then RFE picks). A plain set() iterates in a
# hash-dependent order, which would reshuffle the training columns between kernel restarts.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['Tot sum', 'urg_count', 'Tot size', 'IAT', 'Header_Length', 'AVG', 'Magnitue', 'syn_flag_number', 'Protocol Type', 'Min', 'fin_flag_number', 'Covariance']


In [None]:
TARGET_FEATURE_COUNT = 15  # size of the final hybrid feature set for this section

# Number of features still missing after merging the two selections.
# Clamped at 0 so an oversized union can never pull in extra features.
no_of_redundent_features = max(TARGET_FEATURE_COUNT - len(top_features_combined), 0)

# Top up with the best-ranked RFE features that are not selected yet.
# rfe_feature_scores is sorted by ascending ranking where it is built, so the
# slice keeps the best remaining candidates.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place, so the union list is not mutated
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_count', 'Srate', 'Variance']
top_features_combined =  ['Tot sum', 'urg_count', 'Tot size', 'IAT', 'Header_Length', 'AVG', 'Magnitue', 'syn_flag_number', 'Protocol Type', 'Min', 'fin_flag_number', 'Covariance', 'syn_count', 'Srate', 'Variance']


In [None]:
# NOTE(review): SelectKBest and RFE were fitted on all rows of X before this split,
# so the test rows influenced feature selection; the scores below are likely optimistic.

# Step 6: Train and Test Split
# Stratified so each attack class keeps its proportion in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.35, random_state=42, stratify=y
)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averages: each class contributes in proportion to its support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997369372924271
Precision: 0.9997370459535441
recall: 0.9997369372924271
F1 Score: 0.9997363944731077
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       1.00      0.99      1.00       513
        DDoS-HTTP_Flood       1.00      0.96      0.98        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       1.00      0.96      0.98        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       1.00      1.00      1.00       548

              

# 20 features (SelectKBest top 10 + RFE top 10, topped up from the RFE ranking)

In [None]:
# Filter step: score every feature by mutual information with the label and keep the top 10.
# mutual_info_classif is a randomized estimator, so its seed is pinned; without it
# the scores drift slightly from run to run.
skb_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=10)
skb_selector.fit(X, y)  # only the fitted scores/support are used; no transformed matrix is needed

# Score of every feature, most informative first
skb_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(skb_feature_scores)

# Names of the features SelectKBest kept
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037664
1     Header_Length  1.137340
34              Min  1.048680
38         Tot size  1.043018
41         Magnitue  1.037793
36              AVG  1.031683
33          Tot sum  1.025981
2     Protocol Type  0.980056
35              Max  0.976731
26              TCP  0.667259
15        syn_count  0.614802
30             ICMP  0.535216
8   syn_flag_number  0.527492
5             Srate  0.525589
4              Rate  0.525555
0     flow_duration  0.479942
27              UDP  0.435627
18        rst_count  0.435147
16        fin_count  0.400280
17        urg_count  0.388483
14        ack_count  0.381591
7   fin_flag_number  0.366441
10  psh_flag_number  0.357306
9   rst_flag_number  0.356124
11  ack_flag_number  0.352687
43       Covariance  0.227884
37              Std  0.226517
42           Radius  0.222190
44         Variance  0.145729
3          Duration  0.057578
19             HTTP  0.029865
20            HTTPS  0.017298
32        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, keeping 10 features.
# The tree is seeded so the ranking is reproducible between runs.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=10)
rfe_selector.fit(X, y)  # only ranking_/support are used; no transformed matrix is needed

# 'Score' holds the RFE ranking: 1 = selected, larger values were eliminated earlier
rfe_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
    .sort_values(by='Score', ascending=True)
)
print(rfe_feature_scores)

# Names of the features RFE kept (label fixed: this prints the names, not the scores)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
1     Header_Length      1
2     Protocol Type      1
7   fin_flag_number      1
5             Srate      1
15        syn_count      1
8   syn_flag_number      1
17        urg_count      1
34              Min      1
39              IAT      1
43       Covariance      1
41         Magnitue      2
10  psh_flag_number      3
44         Variance      4
26              TCP      5
32              LLC      6
37              Std      7
33          Tot sum      8
35              Max      9
42           Radius     10
45           Weight     11
40           Number     12
38         Tot size     13
22           Telnet     14
36              AVG     15
24              SSH     16
23             SMTP     17
30             ICMP     18
31              IPv     19
28             DHCP     20
27              UDP     21
9   rst_flag_number     22
29              ARP     23
25              IRC     24
12  ece_flag_number     25
11  ack_flag_number     26
3          Duration     27
2

In [None]:
# Union of both selections, de-duplicated in a deterministic order
# (SelectKBest picks first, then RFE picks). A plain set() iterates in a
# hash-dependent order, which would reshuffle the training columns between kernel restarts.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['Tot sum', 'Srate', 'urg_count', 'Header_Length', 'IAT', 'syn_count', 'Magnitue', 'fin_flag_number', 'Tot size', 'Max', 'AVG', 'syn_flag_number', 'Protocol Type', 'Min', 'TCP', 'Covariance']


In [None]:
TARGET_FEATURE_COUNT = 20  # size of the final hybrid feature set for this section

# Number of features still missing after merging the two selections.
# Clamped at 0 so an oversized union can never pull in extra features.
no_of_redundent_features = max(TARGET_FEATURE_COUNT - len(top_features_combined), 0)

# Top up with the best-ranked RFE features that are not selected yet.
# rfe_feature_scores is sorted by ascending ranking where it is built, so the
# slice keeps the best remaining candidates.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place, so the union list is not mutated
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['psh_flag_number', 'Variance', 'LLC', 'Std']
top_features_combined =  ['Tot sum', 'Srate', 'urg_count', 'Header_Length', 'IAT', 'syn_count', 'Magnitue', 'fin_flag_number', 'Tot size', 'Max', 'AVG', 'syn_flag_number', 'Protocol Type', 'Min', 'TCP', 'Covariance', 'psh_flag_number', 'Variance', 'LLC', 'Std']


In [None]:
# NOTE(review): SelectKBest and RFE were fitted on all rows of X before this split,
# so the test rows influenced feature selection; the scores below are likely optimistic.

# Step 6: Train and Test Split
# Stratified so each attack class keeps its proportion in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.35, random_state=42, stratify=y
)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averages: each class contributes in proportion to its support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997369372924271
Precision: 0.9997371312754393
recall: 0.9997369372924271
F1 Score: 0.9997364230026176
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       1.00      0.99      1.00       513
        DDoS-HTTP_Flood       1.00      0.96      0.98        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       1.00      0.96      0.98        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       1.00      1.00      1.00       548

              

# 25 features (SelectKBest top 12 + RFE top 13, topped up from the RFE ranking)

In [None]:
# Filter step: score every feature by mutual information with the label and keep the top 12.
# mutual_info_classif is a randomized estimator, so its seed is pinned; without it
# the scores drift slightly from run to run.
skb_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=12)
skb_selector.fit(X, y)  # only the fitted scores/support are used; no transformed matrix is needed

# Score of every feature, most informative first
skb_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(skb_feature_scores)

# Names of the features SelectKBest kept
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037662
1     Header_Length  1.137618
34              Min  1.049438
38         Tot size  1.041804
41         Magnitue  1.038412
36              AVG  1.033570
33          Tot sum  1.028024
2     Protocol Type  0.979754
35              Max  0.977203
26              TCP  0.670450
15        syn_count  0.614686
30             ICMP  0.531330
8   syn_flag_number  0.529426
5             Srate  0.525983
4              Rate  0.525865
0     flow_duration  0.478780
18        rst_count  0.437113
27              UDP  0.434696
16        fin_count  0.400475
17        urg_count  0.389648
14        ack_count  0.380277
7   fin_flag_number  0.366550
10  psh_flag_number  0.355844
9   rst_flag_number  0.353587
11  ack_flag_number  0.353049
37              Std  0.225384
43       Covariance  0.224760
42           Radius  0.221587
44         Variance  0.149823
3          Duration  0.057696
19             HTTP  0.031552
20            HTTPS  0.018465
40        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, keeping 13 features.
# The tree is seeded so the ranking is reproducible between runs.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=13)
rfe_selector.fit(X, y)  # only ranking_/support are used; no transformed matrix is needed

# 'Score' holds the RFE ranking: 1 = selected, larger values were eliminated earlier
rfe_feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
    .sort_values(by='Score', ascending=True)
)
print(rfe_feature_scores)

# Names of the features RFE kept (label fixed: this prints the names, not the scores)
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
1     Header_Length      1
2     Protocol Type      1
7   fin_flag_number      1
5             Srate      1
10  psh_flag_number      1
15        syn_count      1
8   syn_flag_number      1
17        urg_count      1
41         Magnitue      1
43       Covariance      1
39              IAT      1
44         Variance      1
34              Min      1
38         Tot size      2
26              TCP      3
29              ARP      4
42           Radius      5
45           Weight      6
33          Tot sum      7
36              AVG      8
37              Std      9
18        rst_count     10
35              Max     11
40           Number     12
20            HTTPS     13
16        fin_count     14
30             ICMP     15
4              Rate     16
32              LLC     17
31              IPv     18
28             DHCP     19
27              UDP     20
25              IRC     21
24              SSH     22
23             SMTP     23
22           Telnet     24
2

In [None]:
# Union of both selections, de-duplicated in a deterministic order
# (SelectKBest picks first, then RFE picks). A plain set() iterates in a
# hash-dependent order, which would reshuffle the training columns between kernel restarts.
top_features_combined = list(
    dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names])
)
print(top_features_combined)

['Tot sum', 'Srate', 'urg_count', 'Header_Length', 'IAT', 'syn_count', 'Variance', 'Magnitue', 'fin_flag_number', 'Tot size', 'Max', 'AVG', 'syn_flag_number', 'Protocol Type', 'psh_flag_number', 'Min', 'TCP', 'Covariance', 'ICMP']


In [None]:
TARGET_FEATURE_COUNT = 25  # size of the final hybrid feature set for this section

# Number of features still missing after merging the two selections.
# Clamped at 0 so an oversized union can never pull in extra features.
no_of_redundent_features = max(TARGET_FEATURE_COUNT - len(top_features_combined), 0)

# Top up with the best-ranked RFE features that are not selected yet.
# rfe_feature_scores is sorted by ascending ranking where it is built, so the
# slice keeps the best remaining candidates.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

# Build a new list instead of extending in place, so the union list is not mutated
top_features_combined = top_features_combined + extra_features_from_rfe
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['ARP', 'Radius', 'Weight', 'Std', 'rst_count', 'Number']
top_features_combined =  ['Tot sum', 'Srate', 'urg_count', 'Header_Length', 'IAT', 'syn_count', 'Variance', 'Magnitue', 'fin_flag_number', 'Tot size', 'Max', 'AVG', 'syn_flag_number', 'Protocol Type', 'psh_flag_number', 'Min', 'TCP', 'Covariance', 'ICMP', 'ARP', 'Radius', 'Weight', 'Std', 'rst_count', 'Number']


In [None]:
# NOTE(review): SelectKBest and RFE were fitted on all rows of X before this split,
# so the test rows influenced feature selection; the scores below are likely optimistic.

# Step 6: Train and Test Split
# Stratified so each attack class keeps its proportion in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.35, random_state=42, stratify=y
)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averages: each class contributes in proportion to its support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997204958732038
Precision: 0.9997221747990637
recall: 0.9997204958732038
F1 Score: 0.9997206970419888
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       1.00      0.99      1.00       513
        DDoS-HTTP_Flood       1.00      0.96      0.98        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.92      0.96      0.94        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       1.00      1.00      1.00       548

              