In [3]:
'''
From previous observations, Random Forest has the highest accuracy,
so we will use it for our final model and improve it further.
'''

'\nFrom previous observations, Random Forest has the highest accuracy,\nso we will use it for our final model and improve it further.\n'

In [4]:
import numpy as np
import pandas as pd
import os
import csv
import time
import warnings
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
import joblib

# Silence every warning for the rest of the session so the tuning log stays readable.
# NOTE(review): this is a blanket filter; it also hides pandas / scikit-learn warnings.
warnings.filterwarnings('ignore')

# Wall-clock reference; the elapsed time printed at the end is measured from here.
start_time = time.time()

# Top 20 features + Label
cols = ["Bwd Packet Length Std", "Flow Bytes/s", "Total Length of Fwd Packets", "Fwd Packet Length Std",
        "Flow IAT Std", "Flow IAT Min", "Fwd IAT Total", "Flow Duration", "Bwd Packet Length Max",
        "Flow IAT Max", "Flow IAT Mean", "Total Length of Bwd Packets", "Fwd Packet Length Min",
        "Bwd Packet Length Mean", "Flow Packets/s", "Fwd Packet Length Mean", "Total Backward Packets",
        "Fwd Packet Length Max", "Total Fwd Packets", "Bwd Packet Length Min", 'Label']

data_file = "merged.csv"

# Only the selected columns are read, which keeps the frame as small as possible.
df = pd.read_csv(data_file, usecols=cols)

# Missing values are imputed with 0. Infinite values are mapped to NaN first so they get
# the same treatment: the estimator rejects non-finite input, and the per-second rate
# columns are the likely place for +/-inf to appear (presumably from zero-length flows --
# TODO confirm against the data).
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Binary target: 1 for rows labelled exactly "BENIGN", 0 for every other label.
# NOTE(review): a missing label was filled with 0 above, so it also ends up in class 0.
df['Label'] = (df['Label'] == "BENIGN").astype("int64")


# Feature and label split
X = df.drop(columns="Label")
y = df["Label"]

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import randint

# Fixed seed => the same three stratified folds are produced every time cv is used.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Search space sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': randint(200, 300),  # scipy's upper bound is exclusive: 200..299
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')

# Metrics for evaluation
# They are handed to the search itself, so every candidate is scored on all three
# metrics while it is being tuned and no separate evaluation pass is needed afterwards.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring=scoring,
    refit='accuracy',   # candidates are still ranked (and the winner refit) by accuracy
    cv=cv,
    random_state=42,
    n_jobs=1,           # candidates run one at a time; the forest itself uses n_jobs=-1
    verbose=2
)

print("\nTuning Random Forest...")
search.fit(X, y)

# With refit enabled, best_estimator_ is the winning configuration refit on all of X, y.
best_model = search.best_estimator_
print("\nBest Hyperparameters:", search.best_params_)

print("\nEvaluating Tuned Random Forest...")
# Read the winner's per-fold scores from the search results instead of re-fitting it on
# the same folds with cross_validate: same folds, same parameters, same seed, but
# without three additional full fits.
# NOTE(review): these folds were also used to choose the hyperparameters, so the numbers
# are optimistic; an untouched hold-out set is needed for an unbiased estimate.
best_idx = search.best_index_
results = {
    f"test_{metric}": np.array([
        search.cv_results_[f"split{fold}_test_{metric}"][best_idx]
        for fold in range(search.n_splits_)
    ])
    for metric in scoring
}

mean_acc = np.mean(results["test_accuracy"])
mean_prec = np.mean(results["test_precision"])
mean_rec = np.mean(results["test_recall"])
# Elapsed wall-clock seconds since start_time was taken.
duration = round(time.time() - start_time, 2)

print(f"Final Accuracy: {mean_acc:.4f}, Precision: {mean_prec:.4f}, Recall: {mean_rec:.4f}, Time: {duration}s")

# Create the output folder up front: open() and joblib.dump() do not create missing
# directories, and failing here would throw away the whole tuning run.
os.makedirs("Result", exist_ok=True)

# Save results to CSV
# Append mode keeps rows from earlier runs. No header row is written; the column order
# is: model name, accuracy, precision, recall, elapsed seconds.
with open("Result/final_random_forest_results.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Tuned Random Forest", round(mean_acc, 4), round(mean_prec, 4), round(mean_rec, 4), duration])

# Save the trained model
joblib.dump(best_model, "Result/Network_Anomaly_Detection_rf.pkl")

# Store the feature names in training column order alongside the model, so the same
# columns can be supplied in the same order when the model is loaded elsewhere.
feature_list = X.columns.tolist()
joblib.dump(feature_list, "Result/features_used.pkl")




Tuning Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=271; total time= 8.2min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=271; total time= 7.2min
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=271; total time= 7.2min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=282; total time= 6.4min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=282; total time= 6.7min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=282; total time= 7.1min
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=299; total time=155.7min
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_est

['Result/features_used.pkl']