<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_cyber_attacks_using_KNN_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# by path in the next cell. The google.colab module is provided by the Colab
# runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score,recall_score

# Step 1: Load the dataset from the mounted Google Drive.
df = pd.read_csv('/content/drive/MyDrive/data.csv')

# Step 2: Separate features and target.
# Fail fast when the target column is absent: silently continuing with an
# empty `y` (and an `X` that still contains every column) would only surface
# later as a confusing sample-count error during fitting.
if 'label' not in df.columns:
    raise KeyError(
        "Expected a 'label' column in the dataset; found columns: "
        + ", ".join(map(str, df.columns))
    )
X = df.drop(columns="label")
y = df["label"]

# Sanity check: X is (n_samples, n_features) and y is (n_samples,).
print(X.shape)
print(y.shape)


(238687, 46)
(238687,)


# 5 features

In [None]:
# Filter step: score every feature by its mutual information with the label
# and keep the top k.
# mutual_info_classif adds a small amount of random noise to continuous
# features, so scores vary slightly between runs unless it is seeded; the seed
# is pinned to keep the scores and the selected set reproducible.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=2,
)
# Only the fitted selector is needed; the transformed matrix was never used.
skb_selector.fit(X, y)

# Print the per-feature scores, most informative first.
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)
print(skb_feature_scores)

# Names of the k features retained by the selector.
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601478
38         Tot size  1.337739
1     Header_Length  1.326407
41         Magnitue  1.318772
36              AVG  1.312611
33          Tot sum  1.304776
34              Min  1.298815
35              Max  1.272458
2     Protocol Type  1.162507
26              TCP  0.654012
15        syn_count  0.648266
0     flow_duration  0.637336
5             Srate  0.632963
4              Rate  0.632455
18        rst_count  0.510328
8   syn_flag_number  0.498034
27              UDP  0.493474
17        urg_count  0.462270
30             ICMP  0.439984
37              Std  0.383198
43       Covariance  0.382293
42           Radius  0.380481
16        fin_count  0.347933
14        ack_count  0.332644
11  ack_flag_number  0.326908
44         Variance  0.299656
7   fin_flag_number  0.295323
9   rst_flag_number  0.284024
10  psh_flag_number  0.283991
3          Duration  0.211574
40           Number  0.180829
45           Weight  0.175109
20        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, pruning
# features until n_features_to_select remain.
# The tree is seeded so that equally good splits are resolved the same way on
# every run, which keeps the resulting ranking reproducible.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=3)
# Only the fitted selector is needed; the transformed matrix was never used.
rfe_selector.fit(X, y)

# Print the rankings. NOTE: the 'Score' column holds RFE ranks (1 = selected,
# larger = eliminated earlier), so ascending order lists the best features first.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)
print(rfe_feature_scores)

# Names of the features retained by RFE (label fixed: these are the selected
# names, not the scores table).
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
39              IAT      1
34              Min      1
41         Magnitue      1
7   fin_flag_number      2
10  psh_flag_number      3
8   syn_flag_number      4
33          Tot sum      5
2     Protocol Type      6
30             ICMP      7
1     Header_Length      8
18        rst_count      9
42           Radius     10
5             Srate     11
16        fin_count     12
0     flow_duration     13
36              AVG     14
15        syn_count     15
44         Variance     16
3          Duration     17
4              Rate     18
17        urg_count     19
38         Tot size     20
40           Number     21
20            HTTPS     22
35              Max     23
37              Std     24
43       Covariance     25
14        ack_count     26
45           Weight     27
27              UDP     28
19             HTTP     29
24              SSH     30
26              TCP     31
11  ack_flag_number     32
9   rst_flag_number     33
21              DNS     34
6

In [None]:
# Hybrid selection: union of the SelectKBest and RFE picks.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# therefore the column order of X_selected built from it) is identical on every
# run; a plain set union iterates in an arbitrary, hash-dependent order.
top_features_combined = list(dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names]))
print(top_features_combined)

['Header_Length', 'Magnitue', 'Min', 'IAT', 'Tot size', 'AVG']


In [None]:
# Number of features still missing to reach the target of 5. Clamped at zero:
# the union may already hold 5 or more features, in which case it is kept as-is.
no_of_redundent_features = max(0, 5 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
# rfe_feature_scores is sorted by rank, so the first candidates are the best.
# Slicing (instead of an equality check inside a loop) means a shortfall of
# zero adds nothing, rather than never meeting the stop condition and
# appending every remaining feature. It also makes re-running this cell safe.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection.
X_selected = X[top_features_combined]

Extra features from RFE: ['fin_flag_number', 'psh_flag_number', 'syn_flag_number', 'Tot sum', 'Protocol Type', 'ICMP', 'rst_count', 'Radius', 'Srate', 'fin_count', 'flow_duration', 'syn_count', 'Variance', 'Duration', 'Rate', 'urg_count', 'Number', 'HTTPS', 'Max', 'Std', 'Covariance', 'ack_count', 'Weight', 'UDP', 'HTTP', 'SSH', 'TCP', 'ack_flag_number', 'rst_flag_number', 'DNS', 'Drate', 'ece_flag_number', 'IPv', 'DHCP', 'cwr_flag_number', 'ARP', 'LLC', 'IRC', 'SMTP', 'Telnet']
top_features_combined =  ['Header_Length', 'Magnitue', 'Min', 'IAT', 'Tot size', 'AVG', 'fin_flag_number', 'psh_flag_number', 'syn_flag_number', 'Tot sum', 'Protocol Type', 'ICMP', 'rst_count', 'Radius', 'Srate', 'fin_count', 'flow_duration', 'syn_count', 'Variance', 'Duration', 'Rate', 'urg_count', 'Number', 'HTTPS', 'Max', 'Std', 'Covariance', 'ack_count', 'Weight', 'UDP', 'HTTP', 'SSH', 'TCP', 'ack_flag_number', 'rst_flag_number', 'DNS', 'Drate', 'ece_flag_number', 'IPv', 'DHCP', 'cwr_flag_number', 'ARP', 'L

In [None]:

from sklearn.neighbors import KNeighborsClassifier

# Step 4: Train and Test Split (35% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
# NOTE(review): features are passed to KNN unscaled; if the selected columns
# have very different ranges, distances will be dominated by the largest ones.
# Consider standardising before fitting -- TODO confirm against the data.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics. Some rare classes may receive no predictions at all, which makes
# their precision/F1 undefined. zero_division=0 scores those as 0 explicitly
# (the same value the default yields) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9822123268814115
Precision: 0.9816663770415087
Recall: 0.9822123268814115
F1 Score: 0.9810553961963036


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.22      0.50      0.31         4
          BenignTraffic       0.73      0.93      0.82      1951
       BrowserHijacking       0.00      0.00      0.00        11
       CommandInjection       0.62      0.62      0.62         8
 DDoS-ACK_Fragmentation       0.94      0.96      0.95       539
        DDoS-HTTP_Flood       0.96      0.89      0.92        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.96      0.98      0.97       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       0.99      0.99      0.99      7230
         DDoS-SYN_Flood       1.00      0.99      1.00      7309
         DDoS-SlowLoris       0.69      0.43      0.53        47
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6251
         DDoS-TCP_Flood       1.00      1.00      1.00      8058


# 10 features

In [None]:
# Filter step: score every feature by its mutual information with the label
# and keep the top k.
# mutual_info_classif adds a small amount of random noise to continuous
# features, so scores vary slightly between runs unless it is seeded; the seed
# is pinned to keep the scores and the selected set reproducible.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=5,
)
# Only the fitted selector is needed; the transformed matrix was never used.
skb_selector.fit(X, y)

# Print the per-feature scores, most informative first.
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)
print(skb_feature_scores)

# Names of the k features retained by the selector.
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601473
38         Tot size  1.337934
1     Header_Length  1.327710
41         Magnitue  1.318071
36              AVG  1.312015
33          Tot sum  1.305782
34              Min  1.297050
35              Max  1.272283
2     Protocol Type  1.162467
26              TCP  0.655537
15        syn_count  0.649357
0     flow_duration  0.636118
4              Rate  0.632770
5             Srate  0.632701
18        rst_count  0.511235
8   syn_flag_number  0.495702
27              UDP  0.495266
17        urg_count  0.460430
30             ICMP  0.444715
37              Std  0.382921
42           Radius  0.382133
43       Covariance  0.381825
16        fin_count  0.347460
14        ack_count  0.331654
11  ack_flag_number  0.324013
44         Variance  0.299108
7   fin_flag_number  0.295511
10  psh_flag_number  0.287651
9   rst_flag_number  0.287155
3          Duration  0.209518
40           Number  0.178862
45           Weight  0.176798
20        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, pruning
# features until n_features_to_select remain.
# The tree is seeded so that equally good splits are resolved the same way on
# every run, which keeps the resulting ranking reproducible.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=5)
# Only the fitted selector is needed; the transformed matrix was never used.
rfe_selector.fit(X, y)

# Print the rankings. NOTE: the 'Score' column holds RFE ranks (1 = selected,
# larger = eliminated earlier), so ascending order lists the best features first.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)
print(rfe_feature_scores)

# Names of the features retained by RFE (label fixed: these are the selected
# names, not the scores table).
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
34              Min      1
7   fin_flag_number      1
10  psh_flag_number      1
41         Magnitue      1
39              IAT      1
8   syn_flag_number      2
33          Tot sum      3
2     Protocol Type      4
1     Header_Length      5
30             ICMP      6
18        rst_count      7
42           Radius      8
4              Rate      9
16        fin_count     10
0     flow_duration     11
36              AVG     12
15        syn_count     13
44         Variance     14
3          Duration     15
5             Srate     16
17        urg_count     17
38         Tot size     18
20            HTTPS     19
35              Max     20
40           Number     21
43       Covariance     22
14        ack_count     23
27              UDP     24
37              Std     25
45           Weight     26
19             HTTP     27
11  ack_flag_number     28
24              SSH     29
26              TCP     30
9   rst_flag_number     31
21              DNS     32
2

In [None]:
# Hybrid selection: union of the SelectKBest and RFE picks.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# therefore the column order of X_selected built from it) is identical on every
# run; a plain set union iterates in an arbitrary, hash-dependent order.
top_features_combined = list(dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names]))
print(top_features_combined)

['Magnitue', 'Min', 'fin_flag_number', 'Tot size', 'Header_Length', 'IAT', 'psh_flag_number', 'AVG']


In [None]:
# Number of features still missing to reach the target of 10. Clamped at zero:
# the union may already hold 10 or more features, in which case it is kept as-is.
no_of_redundent_features = max(0, 10 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
# rfe_feature_scores is sorted by rank, so the first candidates are the best.
# Slicing (instead of an equality check inside a loop) means a shortfall of
# zero adds nothing, rather than never meeting the stop condition and
# appending every remaining feature. It also makes re-running this cell safe.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection.
X_selected = X[top_features_combined]

Extra features from RFE: ['syn_flag_number', 'Tot sum']
top_features_combined =  ['Magnitue', 'Min', 'fin_flag_number', 'Tot size', 'Header_Length', 'IAT', 'psh_flag_number', 'AVG', 'syn_flag_number', 'Tot sum']


In [None]:

# Step 4: Train and Test Split (35% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
# NOTE(review): features are passed to KNN unscaled; if the selected columns
# have very different ranges, distances will be dominated by the largest ones.
# Consider standardising before fitting -- TODO confirm against the data.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics. Some rare classes may receive no predictions at all, which makes
# their precision/F1 undefined. zero_division=0 scores those as 0 explicitly
# (the same value the default yields) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.98697645467495
Precision: 0.986852777112305
Recall: 0.98697645467495
F1 Score: 0.9863097009629476


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.29      1.00      0.44         4
          BenignTraffic       0.77      0.94      0.84      1951
       BrowserHijacking       0.00      0.00      0.00        11
       CommandInjection       0.56      0.62      0.59         8
 DDoS-ACK_Fragmentation       0.97      0.98      0.97       539
        DDoS-HTTP_Flood       1.00      1.00      1.00        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.98      0.98      0.98       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.94      0.62      0.74        47
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6251
         DDoS-TCP_Flood       1.00      1.00      1.00      8058


# 15 features

In [None]:
# Filter step: score every feature by its mutual information with the label
# and keep the top k.
# mutual_info_classif adds a small amount of random noise to continuous
# features, so scores vary slightly between runs unless it is seeded; the seed
# is pinned to keep the scores and the selected set reproducible.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=7,
)
# Only the fitted selector is needed; the transformed matrix was never used.
skb_selector.fit(X, y)

# Print the per-feature scores, most informative first.
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)
print(skb_feature_scores)

# Names of the k features retained by the selector.
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601411
38         Tot size  1.337880
1     Header_Length  1.326567
41         Magnitue  1.318801
36              AVG  1.314731
33          Tot sum  1.304261
34              Min  1.297114
35              Max  1.273074
2     Protocol Type  1.162718
26              TCP  0.656027
15        syn_count  0.648717
0     flow_duration  0.634438
4              Rate  0.633025
5             Srate  0.632282
18        rst_count  0.509047
8   syn_flag_number  0.497058
27              UDP  0.495788
17        urg_count  0.460852
30             ICMP  0.443985
37              Std  0.382452
42           Radius  0.379909
43       Covariance  0.378266
16        fin_count  0.350968
14        ack_count  0.331701
11  ack_flag_number  0.324635
44         Variance  0.298060
7   fin_flag_number  0.296381
9   rst_flag_number  0.285224
10  psh_flag_number  0.285136
3          Duration  0.211901
45           Weight  0.178026
40           Number  0.177194
20        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, pruning
# features until n_features_to_select remain.
# The tree is seeded so that equally good splits are resolved the same way on
# every run, which keeps the resulting ranking reproducible.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=8)
# Only the fitted selector is needed; the transformed matrix was never used.
rfe_selector.fit(X, y)

# Print the rankings. NOTE: the 'Score' column holds RFE ranks (1 = selected,
# larger = eliminated earlier), so ascending order lists the best features first.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)
print(rfe_feature_scores)

# Names of the features retained by RFE (label fixed: these are the selected
# names, not the scores table).
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
2     Protocol Type      1
34              Min      1
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
41         Magnitue      1
33          Tot sum      1
39              IAT      1
1     Header_Length      2
30             ICMP      3
18        rst_count      4
37              Std      5
4              Rate      6
16        fin_count      7
0     flow_duration      8
36              AVG      9
44         Variance     10
3          Duration     11
15        syn_count     12
5             Srate     13
17        urg_count     14
38         Tot size     15
20            HTTPS     16
43       Covariance     17
35              Max     18
40           Number     19
42           Radius     20
14        ack_count     21
19             HTTP     22
45           Weight     23
27              UDP     24
11  ack_flag_number     25
24              SSH     26
9   rst_flag_number     27
26              TCP     28
21              DNS     29
6

In [None]:
# Hybrid selection: union of the SelectKBest and RFE picks.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# therefore the column order of X_selected built from it) is identical on every
# run; a plain set union iterates in an arbitrary, hash-dependent order.
top_features_combined = list(dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names]))
print(top_features_combined)

['Magnitue', 'Min', 'fin_flag_number', 'Tot sum', 'Tot size', 'Header_Length', 'Protocol Type', 'IAT', 'syn_flag_number', 'psh_flag_number', 'AVG']


In [None]:
# Number of features still missing to reach the target of 15. Clamped at zero:
# the union may already hold 15 or more features, in which case it is kept as-is.
no_of_redundent_features = max(0, 15 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
# rfe_feature_scores is sorted by rank, so the first candidates are the best.
# Slicing (instead of an equality check inside a loop) means a shortfall of
# zero adds nothing, rather than never meeting the stop condition and
# appending every remaining feature. It also makes re-running this cell safe.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection.
X_selected = X[top_features_combined]

Extra features from RFE: ['ICMP', 'rst_count', 'Std', 'Rate']
top_features_combined =  ['Magnitue', 'Min', 'fin_flag_number', 'Tot sum', 'Tot size', 'Header_Length', 'Protocol Type', 'IAT', 'syn_flag_number', 'psh_flag_number', 'AVG', 'ICMP', 'rst_count', 'Std', 'Rate']


In [None]:

# Step 4: Train and Test Split (35% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
# NOTE(review): features are passed to KNN unscaled; if the selected columns
# have very different ranges, distances will be dominated by the largest ones.
# Consider standardising before fitting -- TODO confirm against the data.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics. Some rare classes may receive no predictions at all, which makes
# their precision/F1 undefined. zero_division=0 scores those as 0 explicitly
# (the same value the default yields) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9852766904873056
Precision: 0.9850432956447885
Recall: 0.9852766904873056
F1 Score: 0.9845637793781169


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.29      1.00      0.44         4
          BenignTraffic       0.77      0.93      0.84      1951
       BrowserHijacking       0.00      0.00      0.00        11
       CommandInjection       0.45      0.62      0.53         8
 DDoS-ACK_Fragmentation       0.96      0.95      0.95       539
        DDoS-HTTP_Flood       1.00      0.95      0.97        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.98      0.98      0.98       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.84      0.55      0.67        47
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6251
         DDoS-TCP_Flood       1.00      1.00      1.00      8058


# 20 features

In [None]:
# Filter step: score every feature by its mutual information with the label
# and keep the top k.
# mutual_info_classif adds a small amount of random noise to continuous
# features, so scores vary slightly between runs unless it is seeded; the seed
# is pinned to keep the scores and the selected set reproducible.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=10,
)
# Only the fitted selector is needed; the transformed matrix was never used.
skb_selector.fit(X, y)

# Print the per-feature scores, most informative first.
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)
print(skb_feature_scores)

# Names of the k features retained by the selector.
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601677
38         Tot size  1.339809
1     Header_Length  1.326893
41         Magnitue  1.317881
36              AVG  1.312906
33          Tot sum  1.305667
34              Min  1.297067
35              Max  1.273413
2     Protocol Type  1.162664
26              TCP  0.655932
15        syn_count  0.649593
0     flow_duration  0.636320
5             Srate  0.633108
4              Rate  0.632402
18        rst_count  0.511761
8   syn_flag_number  0.496840
27              UDP  0.494538
17        urg_count  0.461748
30             ICMP  0.441979
43       Covariance  0.383886
37              Std  0.382566
42           Radius  0.381489
16        fin_count  0.345445
14        ack_count  0.332525
11  ack_flag_number  0.325689
44         Variance  0.298754
7   fin_flag_number  0.295363
9   rst_flag_number  0.285834
10  psh_flag_number  0.285699
3          Duration  0.208429
45           Weight  0.181070
40           Number  0.179106
20        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, pruning
# features until n_features_to_select remain.
# The tree is seeded so that equally good splits are resolved the same way on
# every run, which keeps the resulting ranking reproducible.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=10)
# Only the fitted selector is needed; the transformed matrix was never used.
rfe_selector.fit(X, y)

# Print the rankings. NOTE: the 'Score' column holds RFE ranks (1 = selected,
# larger = eliminated earlier), so ascending order lists the best features first.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)
print(rfe_feature_scores)

# Names of the features retained by RFE (label fixed: these are the selected
# names, not the scores table).
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
33          Tot sum      1
1     Header_Length      1
2     Protocol Type      1
30             ICMP      1
39              IAT      1
7   fin_flag_number      1
8   syn_flag_number      1
34              Min      1
10  psh_flag_number      1
41         Magnitue      1
18        rst_count      2
42           Radius      3
4              Rate      4
16        fin_count      5
0     flow_duration      6
36              AVG      7
3          Duration      8
44         Variance      9
5             Srate     10
17        urg_count     11
38         Tot size     12
15        syn_count     13
20            HTTPS     14
35              Max     15
43       Covariance     16
40           Number     17
37              Std     18
14        ack_count     19
27              UDP     20
19             HTTP     21
26              TCP     22
11  ack_flag_number     23
24              SSH     24
9   rst_flag_number     25
45           Weight     26
21              DNS     27
6

In [None]:
# Hybrid selection: union of the SelectKBest and RFE picks.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# therefore the column order of X_selected built from it) is identical on every
# run; a plain set union iterates in an arbitrary, hash-dependent order.
top_features_combined = list(dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names]))
print(top_features_combined)

['Magnitue', 'fin_flag_number', 'Tot sum', 'psh_flag_number', 'Min', 'ICMP', 'Tot size', 'Header_Length', 'Max', 'TCP', 'Protocol Type', 'IAT', 'syn_flag_number', 'AVG']


In [None]:
# Number of features still missing to reach the target of 20. Clamped at zero:
# the union may already hold 20 or more features, in which case it is kept as-is.
no_of_redundent_features = max(0, 20 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
# rfe_feature_scores is sorted by rank, so the first candidates are the best.
# Slicing (instead of an equality check inside a loop) means a shortfall of
# zero adds nothing, rather than never meeting the stop condition and
# appending every remaining feature. It also makes re-running this cell safe.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection.
X_selected = X[top_features_combined]

Extra features from RFE: ['rst_count', 'Radius', 'Rate', 'fin_count', 'flow_duration', 'Duration']
top_features_combined =  ['Magnitue', 'fin_flag_number', 'Tot sum', 'psh_flag_number', 'Min', 'ICMP', 'Tot size', 'Header_Length', 'Max', 'TCP', 'Protocol Type', 'IAT', 'syn_flag_number', 'AVG', 'rst_count', 'Radius', 'Rate', 'fin_count', 'flow_duration', 'Duration']


In [None]:

# Step 4: Train and Test Split (35% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
# NOTE(review): features are passed to KNN unscaled; if the selected columns
# have very different ranges, distances will be dominated by the largest ones.
# Consider standardising before fitting -- TODO confirm against the data.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics. Some rare classes may receive no predictions at all, which makes
# their precision/F1 undefined. zero_division=0 scores those as 0 explicitly
# (the same value the default yields) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9855041237236806
Precision: 0.9852830769357647
Recall: 0.9855041237236806
F1 Score: 0.9847829371394159


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.36      1.00      0.53         4
          BenignTraffic       0.77      0.94      0.85      1951
       BrowserHijacking       0.00      0.00      0.00        11
       CommandInjection       0.42      0.62      0.50         8
 DDoS-ACK_Fragmentation       0.96      0.95      0.96       539
        DDoS-HTTP_Flood       1.00      0.95      0.97        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.98      0.98      0.98       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.84      0.55      0.67        47
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6251
         DDoS-TCP_Flood       1.00      1.00      1.00      8058


# 25 features

In [None]:
# Filter step: score every feature by its mutual information with the label
# and keep the top k.
# mutual_info_classif adds a small amount of random noise to continuous
# features, so scores vary slightly between runs unless it is seeded; the seed
# is pinned to keep the scores and the selected set reproducible.
skb_selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=12,
)
# Only the fitted selector is needed; the transformed matrix was never used.
skb_selector.fit(X, y)

# Print the per-feature scores, most informative first.
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)
print(skb_feature_scores)

# Names of the k features retained by the selector.
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print("skb_selected_feature_names = ", skb_selected_feature_names)

            Feature     Score
39              IAT  2.601186
38         Tot size  1.340401
1     Header_Length  1.327574
41         Magnitue  1.316188
36              AVG  1.312840
33          Tot sum  1.306688
34              Min  1.297364
35              Max  1.272401
2     Protocol Type  1.162532
26              TCP  0.653577
15        syn_count  0.646147
0     flow_duration  0.637630
4              Rate  0.632844
5             Srate  0.632789
18        rst_count  0.512035
27              UDP  0.497377
8   syn_flag_number  0.496887
17        urg_count  0.461296
30             ICMP  0.445490
43       Covariance  0.383683
37              Std  0.382010
42           Radius  0.378177
16        fin_count  0.349839
14        ack_count  0.334031
11  ack_flag_number  0.326779
44         Variance  0.297408
7   fin_flag_number  0.292592
10  psh_flag_number  0.287253
9   rst_flag_number  0.286387
3          Duration  0.210616
40           Number  0.178830
45           Weight  0.178355
20        

In [None]:
# Wrapper step: recursive feature elimination around a decision tree, pruning
# features until n_features_to_select remain.
# The tree is seeded so that equally good splits are resolved the same way on
# every run, which keeps the resulting ranking reproducible.
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=13)
# Only the fitted selector is needed; the transformed matrix was never used.
rfe_selector.fit(X, y)

# Print the rankings. NOTE: the 'Score' column holds RFE ranks (1 = selected,
# larger = eliminated earlier), so ascending order lists the best features first.
rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)
print(rfe_feature_scores)

# Names of the features retained by RFE (label fixed: these are the selected
# names, not the scores table).
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names = ', rfe_selected_feature_names)

            Feature  Score
30             ICMP      1
1     Header_Length      1
2     Protocol Type      1
34              Min      1
4              Rate      1
37              Std      1
18        rst_count      1
7   fin_flag_number      1
8   syn_flag_number      1
10  psh_flag_number      1
41         Magnitue      1
33          Tot sum      1
39              IAT      1
16        fin_count      2
0     flow_duration      3
36              AVG      4
44         Variance      5
3          Duration      6
15        syn_count      7
5             Srate      8
17        urg_count      9
38         Tot size     10
20            HTTPS     11
43       Covariance     12
40           Number     13
42           Radius     14
35              Max     15
14        ack_count     16
19             HTTP     17
45           Weight     18
27              UDP     19
24              SSH     20
26              TCP     21
9   rst_flag_number     22
11  ack_flag_number     23
21              DNS     24
6

In [None]:
# Hybrid selection: union of the SelectKBest and RFE picks.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# therefore the column order of X_selected built from it) is identical on every
# run; a plain set union iterates in an arbitrary, hash-dependent order.
top_features_combined = list(dict.fromkeys([*skb_selected_feature_names, *rfe_selected_feature_names]))
print(top_features_combined)

['Magnitue', 'syn_count', 'Std', 'Tot sum', 'fin_flag_number', 'flow_duration', 'psh_flag_number', 'AVG', 'Rate', 'ICMP', 'Tot size', 'Header_Length', 'Max', 'TCP', 'Protocol Type', 'IAT', 'syn_flag_number', 'rst_count', 'Min']


In [None]:
# Number of features still missing to reach the target of 25. Clamped at zero:
# the union may already hold 25 or more features, in which case it is kept as-is.
no_of_redundent_features = max(0, 25 - len(top_features_combined))

# Top up with the best-ranked RFE features that are not already selected.
# rfe_feature_scores is sorted by rank, so the first candidates are the best.
# Slicing (instead of an equality check inside a loop) means a shortfall of
# zero adds nothing, rather than never meeting the stop condition and
# appending every remaining feature. It also makes re-running this cell safe.
extra_features_from_rfe = [
    feature
    for feature in rfe_feature_scores['Feature']
    if feature not in top_features_combined
][:no_of_redundent_features]

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print('top_features_combined = ', top_features_combined)

# Restrict the feature matrix to the final hybrid selection.
X_selected = X[top_features_combined]

Extra features from RFE: ['fin_count', 'Variance', 'Duration', 'Srate', 'urg_count', 'HTTPS']
top_features_combined =  ['Magnitue', 'syn_count', 'Std', 'Tot sum', 'fin_flag_number', 'flow_duration', 'psh_flag_number', 'AVG', 'Rate', 'ICMP', 'Tot size', 'Header_Length', 'Max', 'TCP', 'Protocol Type', 'IAT', 'syn_flag_number', 'rst_count', 'Min', 'fin_count', 'Variance', 'Duration', 'Srate', 'urg_count', 'HTTPS']


In [None]:

# Step 4: Train and Test Split (35% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 5: Train K-Nearest Neighbors Classifier
# NOTE(review): features are passed to KNN unscaled; if the selected columns
# have very different ranges, distances will be dominated by the largest ones.
# Consider standardising before fitting -- TODO confirm against the data.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors as needed
knn_model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn_model.predict(X_test)

# Metrics. Some rare classes may receive no predictions at all, which makes
# their precision/F1 undefined. zero_division=0 scores those as 0 explicitly
# (the same value the default yields) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9851450186136149
Precision: 0.9848902369728629
Recall: 0.9851450186136149
F1 Score: 0.9844286444721514


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.36      1.00      0.53         4
          BenignTraffic       0.77      0.94      0.84      1951
       BrowserHijacking       0.00      0.00      0.00        11
       CommandInjection       0.45      0.62      0.53         8
 DDoS-ACK_Fragmentation       0.96      0.95      0.96       539
        DDoS-HTTP_Flood       1.00      0.93      0.97        61
        DDoS-ICMP_Flood       1.00      1.00      1.00     12879
DDoS-ICMP_Fragmentation       0.98      0.97      0.97       816
      DDoS-PSHACK_Flood       1.00      1.00      1.00      7396
       DDoS-RSTFINFlood       1.00      1.00      1.00      7230
         DDoS-SYN_Flood       1.00      1.00      1.00      7309
         DDoS-SlowLoris       0.83      0.53      0.65        47
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      6251
         DDoS-TCP_Flood       1.00      1.00      1.00      8058
