<a href="https://colab.research.google.com/github/ushasri999/Multiclass-classification-of-DDoS-attacks-in-IoT-network-using-hybrid-feature-selection-algorithm/blob/main/MC_of_DDoS_attacks_using_RF_%26_HFSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
df = pd.read_csv('/content/drive/MyDrive/data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Separate features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df_ddos.columns else pd.Series([])

# Encode target labels into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Check for valid features and labels
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape: (173777, 46)
Target shape: (173777,)


# 5 Features

In [5]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=2)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037655
1     Header_Length  1.137673
34              Min  1.048091
38         Tot size  1.040580
41         Magnitue  1.035853
36              AVG  1.030769
33          Tot sum  1.027818
2     Protocol Type  0.977589
35              Max  0.976750
26              TCP  0.666537
15        syn_count  0.615892
30             ICMP  0.532460
8   syn_flag_number  0.529614
4              Rate  0.526137
5             Srate  0.525697
0     flow_duration  0.478410
27              UDP  0.435094
18        rst_count  0.433737
16        fin_count  0.404035
17        urg_count  0.387752
14        ack_count  0.380753
7   fin_flag_number  0.365498
10  psh_flag_number  0.355848
9   rst_flag_number  0.355565
11  ack_flag_number  0.349758
42           Radius  0.226610
37              Std  0.226518
43       Covariance  0.223591
44         Variance  0.150267
3          Duration  0.055064
19             HTTP  0.030318
20            HTTPS  0.016695
32        

In [7]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=3)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
34              Min      1
39              IAT      1
41         Magnitue      1
15        syn_count      2
7   fin_flag_number      3
36              AVG      4
17        urg_count      5
38         Tot size      6
1     Header_Length      7
14        ack_count      8
10  psh_flag_number      9
33          Tot sum     10
8   syn_flag_number     11
2     Protocol Type     12
27              UDP     13
16        fin_count     14
11  ack_flag_number     15
18        rst_count     16
0     flow_duration     17
35              Max     18
30             ICMP     19
4              Rate     20
9   rst_flag_number     21
26              TCP     22
5             Srate     23
42           Radius     24
44         Variance     25
37              Std     26
43       Covariance     27
3          Duration     28
19             HTTP     29
40           Number     30
45           Weight     31
20            HTTPS     32
29              ARP     33
32              LLC     34
3

In [8]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'Magnitue', 'IAT', 'Header_Length']


In [9]:
no_of_redundent_features = 5-(len(top_features_combined))


extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_count']
['Min', 'Magnitue', 'IAT', 'Header_Length', 'syn_count']


In [10]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.999572523100194
Precision: 0.9995868334491375
Recall: 0.999572523100194
F1 Score: 0.9995767771147368
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       513
           1       0.95      1.00      0.97        56
           2       1.00      1.00      1.00     12772
           3       1.00      1.00      1.00       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.82      0.96      0.88        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      1.00       548

    accuracy                           1.00     60822
   macro avg       0.98      1.00      0.99     60822
weighted avg       1.00      1.00      1.00     608

# 10 Features

In [11]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=5)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037637
1     Header_Length  1.137538
34              Min  1.045685
38         Tot size  1.044083
41         Magnitue  1.038732
36              AVG  1.031253
33          Tot sum  1.027172
2     Protocol Type  0.978991
35              Max  0.976818
26              TCP  0.666643
15        syn_count  0.613916
30             ICMP  0.535843
8   syn_flag_number  0.527543
4              Rate  0.526116
5             Srate  0.525484
0     flow_duration  0.479123
18        rst_count  0.434747
27              UDP  0.432365
16        fin_count  0.404588
17        urg_count  0.387327
14        ack_count  0.380186
7   fin_flag_number  0.364685
10  psh_flag_number  0.358402
9   rst_flag_number  0.356558
11  ack_flag_number  0.350159
37              Std  0.226226
43       Covariance  0.224749
42           Radius  0.221926
44         Variance  0.150547
3          Duration  0.057520
19             HTTP  0.032015
20            HTTPS  0.016778
40        

In [12]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
15        syn_count      1
34              Min      1
41         Magnitue      1
7   fin_flag_number      1
39              IAT      1
38         Tot size      2
10  psh_flag_number      3
36              AVG      4
27              UDP      5
1     Header_Length      6
14        ack_count      7
8   syn_flag_number      8
2     Protocol Type      9
17        urg_count     10
33          Tot sum     11
16        fin_count     12
35              Max     13
11  ack_flag_number     14
0     flow_duration     15
18        rst_count     16
30             ICMP     17
5             Srate     18
9   rst_flag_number     19
26              TCP     20
4              Rate     21
37              Std     22
43       Covariance     23
44         Variance     24
42           Radius     25
3          Duration     26
19             HTTP     27
45           Weight     28
20            HTTPS     29
40           Number     30
31              IPv     31
32              LLC     32
2

In [13]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'IAT', 'fin_flag_number', 'syn_count', 'Tot size', 'Magnitue', 'Header_Length']


In [14]:
no_of_redundent_features = 10-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['psh_flag_number', 'AVG', 'UDP']
['Min', 'IAT', 'fin_flag_number', 'syn_count', 'Tot size', 'Magnitue', 'Header_Length', 'psh_flag_number', 'AVG', 'UDP']


In [15]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9992436947157278
Precision: 0.9992582603259019
Recall: 0.9992436947157278
F1 Score: 0.9992486987396392
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       513
           1       0.92      0.98      0.95        56
           2       1.00      1.00      1.00     12772
           3       0.99      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.81      0.92      0.86        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       0.99      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      0.99      0.98     60822
weighted avg       1.00      1.00      1.00     6

# 15 Features

In [16]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=7)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037639
1     Header_Length  1.141106
34              Min  1.047393
38         Tot size  1.044076
41         Magnitue  1.034924
36              AVG  1.031075
33          Tot sum  1.029023
2     Protocol Type  0.978941
35              Max  0.977176
26              TCP  0.667349
15        syn_count  0.614043
30             ICMP  0.534656
8   syn_flag_number  0.530088
5             Srate  0.526210
4              Rate  0.525888
0     flow_duration  0.477293
27              UDP  0.434623
18        rst_count  0.433292
16        fin_count  0.399610
17        urg_count  0.387811
14        ack_count  0.381246
7   fin_flag_number  0.364537
9   rst_flag_number  0.357917
10  psh_flag_number  0.354913
11  ack_flag_number  0.349575
37              Std  0.227330
42           Radius  0.224622
43       Covariance  0.223007
44         Variance  0.149418
3          Duration  0.055324
19             HTTP  0.030285
20            HTTPS  0.014197
32        

In [17]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=8)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
15        syn_count      1
1     Header_Length      1
41         Magnitue      1
34              Min      1
10  psh_flag_number      1
39              IAT      1
38         Tot size      1
7   fin_flag_number      1
8   syn_flag_number      2
36              AVG      3
14        ack_count      4
33          Tot sum      5
17        urg_count      6
27              UDP      7
16        fin_count      8
18        rst_count      9
2     Protocol Type     10
11  ack_flag_number     11
0     flow_duration     12
9   rst_flag_number     13
35              Max     14
30             ICMP     15
5             Srate     16
4              Rate     17
26              TCP     18
44         Variance     19
37              Std     20
42           Radius     21
43       Covariance     22
3          Duration     23
19             HTTP     24
45           Weight     25
20            HTTPS     26
40           Number     27
31              IPv     28
32              LLC     29
2

In [18]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'Tot sum', 'AVG', 'IAT', 'fin_flag_number', 'syn_count', 'Tot size', 'psh_flag_number', 'Magnitue', 'Header_Length']


In [19]:
no_of_redundent_features = 15-(len(top_features_combined))


extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:  # Stop once we have 3 features
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['syn_flag_number', 'ack_count', 'urg_count', 'UDP', 'fin_count']
['Min', 'Tot sum', 'AVG', 'IAT', 'fin_flag_number', 'syn_count', 'Tot size', 'psh_flag_number', 'Magnitue', 'Header_Length', 'syn_flag_number', 'ack_count', 'urg_count', 'UDP', 'fin_count']


In [20]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9991614876196113
Precision: 0.9991925368045159
Recall: 0.9991614876196113
F1 Score: 0.9991720549094997
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       513
           1       0.92      0.96      0.94        56
           2       1.00      1.00      1.00     12772
           3       0.99      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.73      0.92      0.81        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       0.99      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.97      0.99      0.98     60822
weighted avg       1.00      1.00      1.00     6

# 20 Features

In [21]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=10)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037654
1     Header_Length  1.140839
34              Min  1.049792
38         Tot size  1.043792
41         Magnitue  1.036765
36              AVG  1.031844
33          Tot sum  1.028628
2     Protocol Type  0.978248
35              Max  0.977770
26              TCP  0.666819
15        syn_count  0.610603
30             ICMP  0.530515
8   syn_flag_number  0.529094
4              Rate  0.525599
5             Srate  0.525481
0     flow_duration  0.476558
18        rst_count  0.434453
27              UDP  0.433974
16        fin_count  0.401336
17        urg_count  0.389136
14        ack_count  0.381653
7   fin_flag_number  0.363826
9   rst_flag_number  0.357010
10  psh_flag_number  0.356748
11  ack_flag_number  0.353305
37              Std  0.227218
42           Radius  0.224236
43       Covariance  0.224231
44         Variance  0.147551
3          Duration  0.060308
19             HTTP  0.030518
20            HTTPS  0.018487
32        

In [22]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
1     Header_Length      1
36              AVG      1
27              UDP      1
38         Tot size      1
39              IAT      1
17        urg_count      1
7   fin_flag_number      1
34              Min      1
15        syn_count      1
41         Magnitue      1
14        ack_count      2
10  psh_flag_number      3
8   syn_flag_number      4
33          Tot sum      5
16        fin_count      6
18        rst_count      7
2     Protocol Type      8
11  ack_flag_number      9
35              Max     10
30             ICMP     11
0     flow_duration     12
5             Srate     13
4              Rate     14
26              TCP     15
9   rst_flag_number     16
37              Std     17
44         Variance     18
42           Radius     19
43       Covariance     20
19             HTTP     21
3          Duration     22
40           Number     23
20            HTTPS     24
45           Weight     25
31              IPv     26
32              LLC     27
2

In [23]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'UDP', 'urg_count', 'Header_Length', 'Tot sum', 'AVG', 'syn_count', 'Magnitue', 'TCP', 'Max']


In [24]:
no_of_redundent_features = 20-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['ack_count', 'psh_flag_number', 'syn_flag_number', 'fin_count', 'rst_count', 'ack_flag_number']
['Min', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'UDP', 'urg_count', 'Header_Length', 'Tot sum', 'AVG', 'syn_count', 'Magnitue', 'TCP', 'Max', 'ack_count', 'psh_flag_number', 'syn_flag_number', 'fin_count', 'rst_count', 'ack_flag_number']


In [25]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9991779290388346
Precision: 0.9991900197575078
Recall: 0.9991779290388346
F1 Score: 0.9991814742902781
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       513
           1       0.97      1.00      0.98        56
           2       1.00      1.00      1.00     12772
           3       0.98      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.85      0.96      0.90        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      0.99      0.99     60822
weighted avg       1.00      1.00      1.00     6

# 25 Features

In [26]:
skb_selector = SelectKBest(score_func=mutual_info_classif, k=12)
skb_selector.fit_transform(X, y)

# printing the scores
skb_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': skb_selector.scores_})
skb_feature_scores = skb_feature_scores.sort_values(by='Score', ascending=False)  # Sort by importance
print(skb_feature_scores)

# Get the selected feature names
skb_selected_feature_names = X.columns[skb_selector.get_support()]
print('skb_selected_feature_names: ',skb_selected_feature_names.to_list())

            Feature     Score
39              IAT  2.037667
1     Header_Length  1.137331
34              Min  1.047731
38         Tot size  1.042350
41         Magnitue  1.038449
36              AVG  1.032249
33          Tot sum  1.029861
2     Protocol Type  0.979617
35              Max  0.977488
26              TCP  0.669488
15        syn_count  0.615645
30             ICMP  0.533193
8   syn_flag_number  0.530077
5             Srate  0.525655
4              Rate  0.525389
0     flow_duration  0.477055
27              UDP  0.434234
18        rst_count  0.433090
16        fin_count  0.400893
17        urg_count  0.388131
14        ack_count  0.380211
7   fin_flag_number  0.365576
10  psh_flag_number  0.358072
9   rst_flag_number  0.356337
11  ack_flag_number  0.350323
43       Covariance  0.226241
37              Std  0.225009
42           Radius  0.223542
44         Variance  0.149737
3          Duration  0.057544
19             HTTP  0.032423
20            HTTPS  0.016839
40        

In [27]:
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=13)
rfe_selector.fit_transform(X, y)

rfe_feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': rfe_selector.ranking_})
rfe_feature_scores = rfe_feature_scores.sort_values(by='Score', ascending=True)  # Sort by importance (ascending)
print(rfe_feature_scores)

# Get the selected feature names
rfe_selected_feature_names = X.columns[rfe_selector.get_support()]
print('rfe_selected_feature_names: ',rfe_selected_feature_names.to_list())

            Feature  Score
15        syn_count      1
17        urg_count      1
33          Tot sum      1
14        ack_count      1
41         Magnitue      1
38         Tot size      1
10  psh_flag_number      1
8   syn_flag_number      1
39              IAT      1
36              AVG      1
34              Min      1
1     Header_Length      1
7   fin_flag_number      1
27              UDP      2
18        rst_count      3
16        fin_count      4
2     Protocol Type      5
35              Max      6
9   rst_flag_number      7
11  ack_flag_number      8
0     flow_duration      9
30             ICMP     10
5             Srate     11
4              Rate     12
26              TCP     13
37              Std     14
44         Variance     15
42           Radius     16
43       Covariance     17
3          Duration     18
19             HTTP     19
45           Weight     20
20            HTTPS     21
40           Number     22
32              LLC     23
29              ARP     24
3

In [28]:
top_features_combined = list(set(skb_selected_feature_names) | set(rfe_selected_feature_names))
print(top_features_combined)

['Min', 'ICMP', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'urg_count', 'Header_Length', 'psh_flag_number', 'Tot sum', 'AVG', 'syn_flag_number', 'syn_count', 'Magnitue', 'ack_count', 'TCP', 'Max']


In [29]:
no_of_redundent_features = 25-(len(top_features_combined))

extra_features_from_rfe = []

for feature in rfe_feature_scores['Feature']:
    if feature not in top_features_combined:
        extra_features_from_rfe.append(feature)
    if len(extra_features_from_rfe) == no_of_redundent_features:
        break

print("Extra features from RFE:", extra_features_from_rfe)

top_features_combined.extend(extra_features_from_rfe)
print(top_features_combined)

X_selected = X[top_features_combined]

Extra features from RFE: ['UDP', 'rst_count', 'fin_count', 'rst_flag_number', 'ack_flag_number', 'flow_duration', 'Srate', 'Rate']
['Min', 'ICMP', 'Protocol Type', 'IAT', 'fin_flag_number', 'Tot size', 'urg_count', 'Header_Length', 'psh_flag_number', 'Tot sum', 'AVG', 'syn_flag_number', 'syn_count', 'Magnitue', 'ack_count', 'TCP', 'Max', 'UDP', 'rst_count', 'fin_count', 'rst_flag_number', 'ack_flag_number', 'flow_duration', 'Srate', 'Rate']


In [30]:
# Step 6: Train and Test Split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.35, random_state=42)

# Step 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Step 9: Cross-validation (optional)
cv_scores = cross_val_score(rf_model, X_selected, y, cv=5)
print("Cross-validation Accuracy:", np.mean(cv_scores))

Accuracy: 0.9992930189733977
Precision: 0.9993092840129629
Recall: 0.9992930189733977
F1 Score: 0.9992973340413825
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       513
           1       0.98      1.00      0.99        56
           2       1.00      1.00      1.00     12772
           3       0.99      1.00      0.99       835
           4       1.00      1.00      1.00      7477
           5       1.00      1.00      1.00      7196
           6       1.00      1.00      1.00      7166
           7       0.83      1.00      0.91        24
           8       1.00      1.00      1.00      6311
           9       1.00      1.00      1.00      8162
          10       1.00      1.00      1.00      9762
          11       1.00      0.99      0.99       548

    accuracy                           1.00     60822
   macro avg       0.98      1.00      0.99     60822
weighted avg       1.00      1.00      1.00     6