<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_cyber_attacks_using_RF_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
df = pd.read_csv('/content/drive/MyDrive/data.csv')

# Step 2: Separate features and target
X = df.drop("label", axis=1, errors='ignore')
y = df["label"] if 'label' in df.columns else pd.Series([])

# Encode target labels into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Check for valid features and labels
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape: (238687, 46)
Target shape: (238687,)


# 5 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=2)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.601526
38         Tot size  1.338491
1     Header_Length  1.327253
41         Magnitue  1.316873
36              AVG  1.312922
33          Tot sum  1.303143
34              Min  1.297709
35              Max  1.271486
2     Protocol Type  1.162077
26              TCP  0.655380
15        syn_count  0.647037
0     flow_duration  0.634361
5             Srate  0.632944
4              Rate  0.632361
18        rst_count  0.510583
8   syn_flag_number  0.498558
27              UDP  0.495764
17        urg_count  0.460873
30             ICMP  0.443863
37              Std  0.383906
43       Covariance  0.382039
42           Radius  0.380824
16        fin_count  0.347048
14        ack_count  0.331640
11  ack_flag_number  0.327203
44         Variance  0.297658
7   fin_flag_number  0.291331
9   rst_flag_number  0.285713
10  psh_flag_number  0.285226
3          Duration  0.212666
45           Weight  0.178388
40           Number  0.177406
20        

In [None]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=3)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
34              Min      1
39              IAT      1
41         Magnitue      1
15        syn_count      2
38         Tot size      3
36              AVG      4
7   fin_flag_number      5
10  psh_flag_number      6
1     Header_Length      7
2     Protocol Type      8
8   syn_flag_number      9
33          Tot sum     10
17        urg_count     11
14        ack_count     12
0     flow_duration     13
35              Max     14
18        rst_count     15
16        fin_count     16
5             Srate     17
27              UDP     18
11  ack_flag_number     19
4              Rate     20
30             ICMP     21
44         Variance     22
26              TCP     23
9   rst_flag_number     24
42           Radius     25
43       Covariance     26
37              Std     27
40           Number     28
3          Duration     29
45           Weight     30
20            HTTPS     31
19             HTTP     32
21              DNS     33
24              SSH     34
3

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Tot size', 'IAT', 'Min', 'Magnitue']


In [None]:
no_of_redundent_features = 5-(len(top_features_combined))


extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_count']
['Tot size', 'IAT', 'Min', 'Magnitue', 'syn_count']


In [None]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9910343424186926
Precision: 0.9906822805370231
Recall: 0.9910343424186926
F1 Score: 0.9904485910818993
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.81      0.96      0.88      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       0.99      0.99      0.99       539
           5       0.92      1.00      0.96        61
           6       1.00      1.00      1.00     12879
           7       0.99      1.00      1.00       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.96      0.96      0.96        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      9

# 10 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=5)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.601493
38         Tot size  1.337445
1     Header_Length  1.326649
41         Magnitue  1.318354
36              AVG  1.311915
33          Tot sum  1.305875
34              Min  1.295828
35              Max  1.272968
2     Protocol Type  1.162741
26              TCP  0.656333
15        syn_count  0.646946
0     flow_duration  0.636168
4              Rate  0.632633
5             Srate  0.632600
18        rst_count  0.509842
8   syn_flag_number  0.498780
27              UDP  0.492435
17        urg_count  0.461447
30             ICMP  0.445349
43       Covariance  0.381943
37              Std  0.379853
42           Radius  0.379124
16        fin_count  0.347370
14        ack_count  0.332304
11  ack_flag_number  0.326482
44         Variance  0.299211
7   fin_flag_number  0.293657
10  psh_flag_number  0.286020
9   rst_flag_number  0.285511
3          Duration  0.211844
40           Number  0.176741
45           Weight  0.176223
20        

In [None]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
34              Min      1
38         Tot size      1
39              IAT      1
15        syn_count      1
41         Magnitue      1
36              AVG      2
7   fin_flag_number      3
10  psh_flag_number      4
1     Header_Length      5
2     Protocol Type      6
18        rst_count      7
35              Max      8
8   syn_flag_number      9
14        ack_count     10
33          Tot sum     11
0     flow_duration     12
5             Srate     13
16        fin_count     14
17        urg_count     15
30             ICMP     16
27              UDP     17
4              Rate     18
11  ack_flag_number     19
44         Variance     20
9   rst_flag_number     21
26              TCP     22
37              Std     23
42           Radius     24
43       Covariance     25
3          Duration     26
45           Weight     27
40           Number     28
20            HTTPS     29
19             HTTP     30
24              SSH     31
21              DNS     32
3

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Tot size', 'IAT', 'Magnitue', 'Header_Length', 'syn_count', 'AVG', 'Min']


In [None]:
no_of_redundent_features = 10-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['fin_flag_number', 'psh_flag_number', 'Protocol Type']
['Tot size', 'IAT', 'Magnitue', 'Header_Length', 'syn_count', 'AVG', 'Min', 'fin_flag_number', 'psh_flag_number', 'Protocol Type']


In [None]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9915610299134556
Precision: 0.9911607372622913
Recall: 0.9915610299134556
F1 Score: 0.9909040444856537
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.83      0.97      0.90      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       0.99      0.98      0.99       539
           5       0.83      0.97      0.89        61
           6       1.00      1.00      1.00     12879
           7       0.98      1.00      0.99       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.85      0.87      0.86        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      9

# 15 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=7)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.601205
38         Tot size  1.338093
1     Header_Length  1.326020
41         Magnitue  1.318289
36              AVG  1.311659
33          Tot sum  1.304061
34              Min  1.298065
35              Max  1.271573
2     Protocol Type  1.162413
26              TCP  0.656795
15        syn_count  0.650762
0     flow_duration  0.636389
4              Rate  0.632693
5             Srate  0.631931
18        rst_count  0.510628
8   syn_flag_number  0.496774
27              UDP  0.493230
17        urg_count  0.462407
30             ICMP  0.442824
37              Std  0.382852
42           Radius  0.380368
43       Covariance  0.379815
16        fin_count  0.349282
14        ack_count  0.333188
11  ack_flag_number  0.326725
44         Variance  0.298468
7   fin_flag_number  0.295421
9   rst_flag_number  0.286672
10  psh_flag_number  0.285321
3          Duration  0.210859
40           Number  0.178582
45           Weight  0.178282
20        

In [None]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=8)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
15        syn_count      1
34              Min      1
36              AVG      1
38         Tot size      1
39              IAT      1
7   fin_flag_number      1
10  psh_flag_number      1
41         Magnitue      1
1     Header_Length      2
8   syn_flag_number      3
2     Protocol Type      4
14        ack_count      5
33          Tot sum      6
0     flow_duration      7
35              Max      8
17        urg_count      9
16        fin_count     10
18        rst_count     11
5             Srate     12
30             ICMP     13
27              UDP     14
4              Rate     15
9   rst_flag_number     16
11  ack_flag_number     17
26              TCP     18
44         Variance     19
43       Covariance     20
37              Std     21
42           Radius     22
3          Duration     23
40           Number     24
45           Weight     25
20            HTTPS     26
19             HTTP     27
24              SSH     28
21              DNS     29
3

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Tot size', 'psh_flag_number', 'IAT', 'fin_flag_number', 'Magnitue', 'Tot sum', 'Header_Length', 'syn_count', 'AVG', 'Min']


In [None]:
no_of_redundent_features = 15-(len(top_features_combined))


extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_flag_number', 'Protocol Type', 'ack_count', 'flow_duration', 'Max']
['Tot size', 'psh_flag_number', 'IAT', 'fin_flag_number', 'Magnitue', 'Tot sum', 'Header_Length', 'syn_count', 'AVG', 'Min', 'syn_flag_number', 'Protocol Type', 'ack_count', 'flow_duration', 'Max']


In [None]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9908667600339953
Precision: 0.9901811541513873
Recall: 0.9908667600339953
F1 Score: 0.9900699622765582
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.83      0.98      0.90      1951
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         8
           4       0.99      0.98      0.99       539
           5       0.79      0.92      0.85        61
           6       1.00      1.00      1.00     12879
           7       0.99      0.99      0.99       816
           8       1.00      1.00      1.00      7396
           9       1.00      1.00      1.00      7230
          10       1.00      1.00      1.00      7309
          11       0.84      0.68      0.75        47
          12       1.00      1.00      1.00      6251
          13       1.00      1.00      1.00      8058
          14       1.00      1.00      1.00      9

# 20 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=10)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.601509
38         Tot size  1.338497
1     Header_Length  1.326666
41         Magnitue  1.317534
36              AVG  1.310313
33          Tot sum  1.305601
34              Min  1.296997
35              Max  1.272325
2     Protocol Type  1.162193
26              TCP  0.657358
15        syn_count  0.649527
0     flow_duration  0.636757
5             Srate  0.632720
4              Rate  0.632604
18        rst_count  0.510121
8   syn_flag_number  0.497959
27              UDP  0.493908
17        urg_count  0.462178
30             ICMP  0.443475
37              Std  0.382872
42           Radius  0.382124
43       Covariance  0.381727
16        fin_count  0.348705
14        ack_count  0.329308
11  ack_flag_number  0.325570
44         Variance  0.299317
7   fin_flag_number  0.295535
10  psh_flag_number  0.285762
9   rst_flag_number  0.283838
3          Duration  0.211113
40           Number  0.179155
45           Weight  0.175017
20        

In [None]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'UDP', 'urg_count', 'Header_Length', 'Tot sum', 'AVG', 'syn_count', 'Magnitue', 'TCP', 'Max']


In [None]:
no_of_redundent_features = 20-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['ack_count', 'psh_flag_number', 'syn_flag_number', 'fin_count', 'rst_count', 'ack_flag_number']
['Min', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'UDP', 'urg_count', 'Header_Length', 'Tot sum', 'AVG', 'syn_count', 'Magnitue', 'TCP', 'Max', 'ack_count', 'psh_flag_number', 'syn_flag_number', 'fin_count', 'rst_count', 'ack_flag_number']


In [None]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9991779290388346
Precision: 0.9991900197575078
Recall: 0.9991779290388346
F1 Score: 0.9991814742902781
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       513
           1       0.97      1.00      0.98        56
           2       1.00      1.00      1.00     12772
           3       0.98      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.85      0.96      0.90        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      0.99      0.99     60822
weighted avg       1.00      1.00      1.00     6

# 25 Features

In [None]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=12)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037667
1     Header_Length  1.137331
34              Min  1.047731
38         Tot size  1.042350
41         Magnitue  1.038449
36              AVG  1.032249
33          Tot sum  1.029861
2     Protocol Type  0.979617
35              Max  0.977488
26              TCP  0.669488
15        syn_count  0.615645
30             ICMP  0.533193
8   syn_flag_number  0.530077
5             Srate  0.525655
4              Rate  0.525389
0     flow_duration  0.477055
27              UDP  0.434234
18        rst_count  0.433090
16        fin_count  0.400893
17        urg_count  0.388131
14        ack_count  0.380211
7   fin_flag_number  0.365576
10  psh_flag_number  0.358072
9   rst_flag_number  0.356337
11  ack_flag_number  0.350323
43       Covariance  0.226241
37              Std  0.225009
42           Radius  0.223542
44         Variance  0.149737
3          Duration  0.057544
19             HTTP  0.032423
20            HTTPS  0.016839
40        

In [None]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=13)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
15        syn_count      1
17        urg_count      1
33          Tot sum      1
14        ack_count      1
41         Magnitue      1
38         Tot size      1
10  psh_flag_number      1
8   syn_flag_number      1
39              IAT      1
36              AVG      1
34              Min      1
1     Header_Length      1
7   fin_flag_number      1
27              UDP      2
18        rst_count      3
16        fin_count      4
2     Protocol Type      5
35              Max      6
9   rst_flag_number      7
11  ack_flag_number      8
0     flow_duration      9
30             ICMP     10
5             Srate     11
4              Rate     12
26              TCP     13
37              Std     14
44         Variance     15
42           Radius     16
43       Covariance     17
3          Duration     18
19             HTTP     19
45           Weight     20
20            HTTPS     21
40           Number     22
32              LLC     23
29              ARP     24
3

In [None]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'ICMP', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'urg_count', 'Header_Length', 'psh_flag_number', 'Tot sum', 'AVG', 'syn_flag_number', 'syn_count', 'Magnitue', 'ack_count', 'TCP', 'Max']


In [None]:
no_of_redundent_features = 25-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['UDP', 'rst_count', 'fin_count', 'rst_flag_number', 'ack_flag_number', 'flow_duration', 'Srate', 'Rate']
['Min', 'ICMP', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'urg_count', 'Header_Length', 'psh_flag_number', 'Tot sum', 'AVG', 'syn_flag_number', 'syn_count', 'Magnitue', 'ack_count', 'TCP', 'Max', 'UDP', 'rst_count', 'fin_count', 'rst_flag_number', 'ack_flag_number', 'flow_duration', 'Srate', 'Rate']


In [None]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9992930189733977
Precision: 0.9993092840129629
Recall: 0.9992930189733977
F1 Score: 0.9992973340413825
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       513
           1       0.98      1.00      0.99        56
           2       1.00      1.00      1.00     12772
           3       0.99      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.83      1.00      0.91        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      1.00      0.99     60822
weighted avg       1.00      1.00      1.00     6