<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_DDoS_attacks_using_KNN_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score,recall_score

# Step 1: Load the dataset
df = pd.read_csv('/content/drive/MyDrive/data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Separate features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df_ddos.columns else pd.Series([])

# Check if X and y are valid
print(X.shape)
print(y.shape)


(173777, 46)
(173777,)


# 5 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=2)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037636
1     Header_Length  1.138718
34              Min  1.046777
38         Tot size  1.042425
41         Magnitue  1.035973
36              AVG  1.033329
33          Tot sum  1.027840
2     Protocol Type  0.979074
35              Max  0.976849
26              TCP  0.667849
15        syn_count  0.615374
30             ICMP  0.531681
8   syn_flag_number  0.527392
4              Rate  0.526063
5             Srate  0.525546
0     flow_duration  0.480275
18        rst_count  0.434953
27              UDP  0.434442
16        fin_count  0.400376
17        urg_count  0.388093
14        ack_count  0.384152
7   fin_flag_number  0.365684
9   rst_flag_number  0.357436
10  psh_flag_number  0.356591
11  ack_flag_number  0.351088
37              Std  0.227486
42           Radius  0.226519
43       Covariance  0.223460
44         Variance  0.146925
3          Duration  0.057591
19             HTTP  0.032313
20            HTTPS  0.015069
31        

In [None]:
rfe_selector = RFE(estimator= DecisionTreeClassifier( ), n_features_to_select=3)
rfe_selector.fit_transform(X, y)

# Printing the rankings
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print(' rfe_feature_scores = ', rfe_selected_feature_names)

            Feature  Score
34              Min      1
39              IAT      1
7   fin_flag_number      1
17        urg_count      2
8   syn_flag_number      3
2     Protocol Type      4
1     Header_Length      5
43       Covariance      6
15        syn_count      7
41         Magnitue      8
5             Srate      9
10  psh_flag_number     10
3          Duration     11
26              TCP     12
44         Variance     13
33          Tot sum     14
4              Rate     15
35              Max     16
38         Tot size     17
14        ack_count     18
36              AVG     19
37              Std     20
45           Weight     21
18        rst_count     22
40           Number     23
31              IPv     24
12  ece_flag_number     25
19             HTTP     26
30             ICMP     27
32              LLC     28
6             Drate     29
42           Radius     30
29              ARP     31
16        fin_count     32
28             DHCP     33
9   rst_flag_number     34
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['fin_flag_number', 'IAT', 'Min', 'Header_Length']


In [None]:
no_of_redundent_features = 5-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['urg_count']
top_features_combined =  ['fin_flag_number', 'IAT', 'Min', 'Header_Length', 'urg_count']


In [None]:

from sklearn.neighbors import KNeighborsClassifier
# Step 4: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9978790569201933
Precision: 0.9978743472471425
Recall: 0.9978790569201933
F1 Score: 0.9978687930157063
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.96      0.98      0.97       513
        DDoS-HTTP_Flood       1.00      1.00      1.00        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      0.99      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.78      0.58      0.67        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       0.95      0.97      0.96       548

              

# 10 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=5)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037683
1     Header_Length  1.139858
34              Min  1.047125
38         Tot size  1.042247
41         Magnitue  1.035964
36              AVG  1.031720
33          Tot sum  1.027136
2     Protocol Type  0.979628
35              Max  0.978436
26              TCP  0.665388
15        syn_count  0.614862
30             ICMP  0.532555
8   syn_flag_number  0.528071
5             Srate  0.526149
4              Rate  0.526069
0     flow_duration  0.478570
18        rst_count  0.435492
27              UDP  0.434814
16        fin_count  0.398691
17        urg_count  0.385482
14        ack_count  0.380898
7   fin_flag_number  0.364084
10  psh_flag_number  0.357613
9   rst_flag_number  0.353702
11  ack_flag_number  0.350241
37              Std  0.228546
43       Covariance  0.224870
42           Radius  0.224483
44         Variance  0.149167
3          Duration  0.058148
19             HTTP  0.027599
20            HTTPS  0.017378
40        

In [None]:
rfe_selector = RFE(estimator= DecisionTreeClassifier( ), n_features_to_select=5)
rfe_selector.fit_transform(X, y)

# Printing the rankings
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print(' rfe_feature_scores = ', rfe_selected_feature_names)

            Feature  Score
34              Min      1
7   fin_flag_number      1
39              IAT      1
8   syn_flag_number      1
17        urg_count      1
2     Protocol Type      2
1     Header_Length      3
43       Covariance      4
15        syn_count      5
33          Tot sum      6
5             Srate      7
10  psh_flag_number      8
44         Variance      9
32              LLC     10
4              Rate     11
3          Duration     12
26              TCP     13
45           Weight     14
36              AVG     15
27              UDP     16
16        fin_count     17
29              ARP     18
30             ICMP     19
18        rst_count     20
35              Max     21
6             Drate     22
9   rst_flag_number     23
31              IPv     24
38         Tot size     25
37              Std     26
12  ece_flag_number     27
41         Magnitue     28
13  cwr_flag_number     29
14        ack_count     30
42           Radius     31
40           Number     32
1

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['fin_flag_number', 'urg_count', 'Min', 'syn_flag_number', 'IAT', 'Tot size', 'Magnitue', 'Header_Length']


In [None]:
no_of_redundent_features = 10-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['Protocol Type', 'Covariance']
top_features_combined =  ['fin_flag_number', 'urg_count', 'Min', 'syn_flag_number', 'IAT', 'Tot size', 'Magnitue', 'Header_Length', 'Protocol Type', 'Covariance']


In [None]:

# Step 4: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.997056985959028
Precision: 0.9970593487762511
Recall: 0.997056985959028
F1 Score: 0.9970569992470373
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.96      0.97      0.97       513
        DDoS-HTTP_Flood       0.93      0.95      0.94        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       1.00      0.99      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.65      0.62      0.64        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       0.96      0.96      0.96       548

               a

# 15 features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=7)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037652
1     Header_Length  1.137934
34              Min  1.048026
38         Tot size  1.044745
41         Magnitue  1.035967
36              AVG  1.034179
33          Tot sum  1.028124
2     Protocol Type  0.980208
35              Max  0.976370
26              TCP  0.666196
15        syn_count  0.614073
30             ICMP  0.531187
8   syn_flag_number  0.530340
5             Srate  0.525975
4              Rate  0.525182
0     flow_duration  0.476466
27              UDP  0.438114
18        rst_count  0.432681
16        fin_count  0.401135
17        urg_count  0.390731
14        ack_count  0.378383
7   fin_flag_number  0.365787
10  psh_flag_number  0.358981
9   rst_flag_number  0.355061
11  ack_flag_number  0.348495
37              Std  0.227459
42           Radius  0.223860
43       Covariance  0.222197
44         Variance  0.148402
3          Duration  0.059637
19             HTTP  0.031041
20            HTTPS  0.019383
31        

In [None]:
rfe_selector = RFE(estimator= DecisionTreeClassifier( ), n_features_to_select=8)
rfe_selector.fit_transform(X, y)

# Printing the rankings
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print(' rfe_feature_scores = ', rfe_selected_feature_names)

            Feature  Score
39              IAT      1
17        urg_count      1
34              Min      1
8   syn_flag_number      1
7   fin_flag_number      1
43       Covariance      1
2     Protocol Type      1
1     Header_Length      1
15        syn_count      2
33          Tot sum      3
5             Srate      4
44         Variance      5
10  psh_flag_number      6
26              TCP      7
3          Duration      8
4              Rate      9
31              IPv     10
37              Std     11
27              UDP     12
9   rst_flag_number     13
45           Weight     14
16        fin_count     15
41         Magnitue     16
6             Drate     17
18        rst_count     18
28             DHCP     19
29              ARP     20
30             ICMP     21
38         Tot size     22
32              LLC     23
11  ack_flag_number     24
40           Number     25
12  ece_flag_number     26
14        ack_count     27
19             HTTP     28
42           Radius     29
3

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['fin_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Min', 'syn_flag_number', 'Tot sum', 'IAT', 'Tot size', 'Magnitue', 'AVG', 'Header_Length']


In [None]:
no_of_redundent_features = 15-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_count', 'Srate', 'Variance']
top_features_combined =  ['fin_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Min', 'syn_flag_number', 'Tot sum', 'IAT', 'Tot size', 'Magnitue', 'AVG', 'Header_Length', 'syn_count', 'Srate', 'Variance']


In [None]:

# Step 4: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9959554108710664
Precision: 0.9959563887930214
Recall: 0.9959554108710664
F1 Score: 0.9959544539677264
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.95      0.97      0.96       513
        DDoS-HTTP_Flood       0.93      0.95      0.94        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       0.99      0.98      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.61      0.58      0.60        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       0.97      0.97      0.97       548

              

# 20 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=10)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037655
1     Header_Length  1.137929
34              Min  1.048203
38         Tot size  1.044245
41         Magnitue  1.037380
36              AVG  1.031326
33          Tot sum  1.027901
2     Protocol Type  0.978123
35              Max  0.975682
26              TCP  0.669025
15        syn_count  0.615950
30             ICMP  0.535459
8   syn_flag_number  0.527885
4              Rate  0.525591
5             Srate  0.525553
0     flow_duration  0.478085
27              UDP  0.434508
18        rst_count  0.434390
16        fin_count  0.399241
17        urg_count  0.386392
14        ack_count  0.378569
7   fin_flag_number  0.364018
9   rst_flag_number  0.360002
10  psh_flag_number  0.356205
11  ack_flag_number  0.351531
37              Std  0.225244
42           Radius  0.223977
43       Covariance  0.223168
44         Variance  0.147499
3          Duration  0.058450
19             HTTP  0.029502
20            HTTPS  0.014902
40        

In [None]:
rfe_selector = RFE(estimator= DecisionTreeClassifier( ), n_features_to_select=10)
rfe_selector.fit_transform(X, y)

# Printing the rankings
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print(' rfe_feature_scores = ', rfe_selected_feature_names)

            Feature  Score
39              IAT      1
15        syn_count      1
38         Tot size      1
17        urg_count      1
8   syn_flag_number      1
7   fin_flag_number      1
34              Min      1
2     Protocol Type      1
1     Header_Length      1
43       Covariance      1
4              Rate      2
10  psh_flag_number      3
26              TCP      4
44         Variance      5
31              IPv      6
16        fin_count      7
5             Srate      8
41         Magnitue      9
35              Max     10
45           Weight     11
33          Tot sum     12
30             ICMP     13
37              Std     14
3          Duration     15
12  ece_flag_number     16
18        rst_count     17
6             Drate     18
42           Radius     19
40           Number     20
36              AVG     21
13  cwr_flag_number     22
29              ARP     23
27              UDP     24
14        ack_count     25
9   rst_flag_number     26
32              LLC     27
1

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['fin_flag_number', 'Tot sum', 'Max', 'TCP', 'Magnitue', 'AVG', 'syn_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Min', 'syn_count', 'IAT', 'Tot size', 'Header_Length']


In [None]:
no_of_redundent_features = 20-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['Rate', 'psh_flag_number', 'Variance', 'IPv', 'fin_count']
top_features_combined =  ['fin_flag_number', 'Tot sum', 'Max', 'TCP', 'Magnitue', 'AVG', 'syn_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Min', 'syn_count', 'IAT', 'Tot size', 'Header_Length', 'Rate', 'psh_flag_number', 'Variance', 'IPv', 'fin_count']


In [None]:

# Step 4: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9959554108710664
Precision: 0.9959563887930214
Recall: 0.9959554108710664
F1 Score: 0.9959544539677264
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.95      0.97      0.96       513
        DDoS-HTTP_Flood       0.93      0.95      0.94        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       0.99      0.98      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.61      0.58      0.60        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       1.00      1.00      1.00      9762
 DDoS-UDP_Fragmentation       0.97      0.97      0.97       548

              

# **25 Features**

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=12)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.037642
1     Header_Length  1.138465
34              Min  1.048045
38         Tot size  1.041559
41         Magnitue  1.037611
36              AVG  1.033072
33          Tot sum  1.028993
2     Protocol Type  0.981286
35              Max  0.978332
26              TCP  0.664879
15        syn_count  0.616445
30             ICMP  0.531165
8   syn_flag_number  0.528811
4              Rate  0.526021
5             Srate  0.525105
0     flow_duration  0.477022
27              UDP  0.437237
18        rst_count  0.434859
16        fin_count  0.399585
17        urg_count  0.390177
14        ack_count  0.378515
7   fin_flag_number  0.363562
10  psh_flag_number  0.358671
9   rst_flag_number  0.354765
11  ack_flag_number  0.351912
37              Std  0.226369
43       Covariance  0.224975
42           Radius  0.223993
44         Variance  0.151464
3          Duration  0.056953
19             HTTP  0.031642
20            HTTPS  0.016052
32        

In [None]:
rfe_selector = RFE(estimator= DecisionTreeClassifier( ), n_features_to_select=13)
rfe_selector.fit_transform(X, y)

# Printing the rankings
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print(' rfe_feature_scores = ', rfe_selected_feature_names)

            Feature  Score
39              IAT      1
17        urg_count      1
15        syn_count      1
44         Variance      1
34              Min      1
38         Tot size      1
10  psh_flag_number      1
8   syn_flag_number      1
43       Covariance      1
5             Srate      1
2     Protocol Type      1
1     Header_Length      1
7   fin_flag_number      1
3          Duration      2
26              TCP      3
42           Radius      4
36              AVG      5
45           Weight      6
16        fin_count      7
4              Rate      8
14        ack_count      9
37              Std     10
35              Max     11
6             Drate     12
29              ARP     13
33          Tot sum     14
12  ece_flag_number     15
31              IPv     16
40           Number     17
41         Magnitue     18
13  cwr_flag_number     19
32              LLC     20
28             DHCP     21
11  ack_flag_number     22
9   rst_flag_number     23
18        rst_count     24
2

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['fin_flag_number', 'Tot sum', 'ICMP', 'TCP', 'Max', 'Magnitue', 'AVG', 'syn_flag_number', 'psh_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Variance', 'Min', 'syn_count', 'Srate', 'IAT', 'Tot size', 'Header_Length']


In [None]:
no_of_redundent_features = 25-(len(top_features_combined))

# Find the next 3 best-ranked features NOT in SelectKBest
extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['Duration', 'Radius', 'Weight', 'fin_count', 'Rate', 'ack_count']
top_features_combined =  ['fin_flag_number', 'Tot sum', 'ICMP', 'TCP', 'Max', 'Magnitue', 'AVG', 'syn_flag_number', 'psh_flag_number', 'Covariance', 'Protocol Type', 'urg_count', 'Variance', 'Min', 'syn_count', 'Srate', 'IAT', 'Tot size', 'Header_Length', 'Duration', 'Radius', 'Weight', 'fin_count', 'Rate', 'ack_count']


In [None]:

# Step 4: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9956759067442702
Precision: 0.9956783160849333
Recall: 0.9956759067442702
F1 Score: 0.9956751833314686
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.95      0.97      0.96       513
        DDoS-HTTP_Flood       0.93      0.95      0.94        56
        DDoS-ICMP_Flood       1.00      1.00      1.00     12772
DDoS-ICMP_Fragmentation       0.99      0.98      0.99       835
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7477
       DDoS-RSTFINFlood       1.00      1.00      1.00      7196
         DDoS-SYN_Flood       1.00      1.00      1.00      7166
         DDoS-SlowLoris       0.61      0.58      0.60        24
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6311
         DDoS-TCP_Flood       1.00      1.00      1.00      8162
         DDoS-UDP_Flood       0.99      1.00      1.00      9762
 DDoS-UDP_Fragmentation       0.97      0.96      0.96       548

              