In [9]:
'''
Model Build & Evaluation using algorithms like:
Random Forest Classifier
Logistic Regression
SVM
'''

'\nModel Build & Evaluation using algorithms like:\nRandom Forest Classifier\nLogistic Regression\nSVM\n'

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import time
import warnings
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
import joblib

warnings.filterwarnings("ignore")
start_time = time.time()

# Create result folder
def create_folder(f_name):
    try:
        if not os.path.exists(f_name):
            os.makedirs(f_name)
    except OSError:
        print("Couldn't make folder")

create_folder("Result")

# Top 20 features + Label
cols = ["Bwd Packet Length Std", "Flow Bytes/s", "Total Length of Fwd Packets", "Fwd Packet Length Std",
        "Flow IAT Std", "Flow IAT Min", "Fwd IAT Total", "Flow Duration", "Bwd Packet Length Max",
        "Flow IAT Max", "Flow IAT Mean", "Total Length of Bwd Packets", "Fwd Packet Length Min",
        "Bwd Packet Length Mean", "Flow Packets/s", "Fwd Packet Length Mean", "Total Backward Packets",
        "Fwd Packet Length Max", "Total Fwd Packets", "Bwd Packet Length Min", 'Label']

data_file = "merged.csv"

df = pd.read_csv(data_file, usecols=cols)
df = df.fillna(0)
df['Label'] = df['Label'].apply(lambda x: 1 if x == "BENIGN" else 0)


# Feature and label split
X = df.drop("Label", axis=1)
y = df["Label"]

# Models to evaluate
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42),
    "Linear SVM": LinearSVC(max_iter=1000, random_state=42)
}

# Metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

# Write header to CSV
results_path = "Result/results.csv"
with open(results_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Mean Accuracy", "Mean Precision", "Mean Recall", "Time Taken (s)"])

# Evaluate each model
for model_name, model in models.items():
    print(f"\nEvaluating: {model_name}")
    start = time.time()
    results = cross_validate(model, X, y, cv=5, scoring=scoring, n_jobs=-1)
    end = time.time()

    mean_acc = np.mean(results["test_accuracy"])
    mean_prec = np.mean(results["test_precision"])
    mean_rec = np.mean(results["test_recall"])
    duration = round(end - start, 2)

    print(f"Accuracy: {mean_acc:.4f}, Precision: {mean_prec:.4f}, Recall: {mean_rec:.4f}, Time: {duration}s")

    # Save to CSV
    with open(results_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([model_name, round(mean_acc, 4), round(mean_prec, 4), round(mean_rec, 4), duration])

    # Save trained model
    model.fit(X, y)
    joblib.dump(model, f"Result/{model_name.replace(' ', '_')}.pkl")

print("\nOperation completed in", round(time.time() - start_time, 2), "seconds.")



Evaluating: Random Forest
Accuracy: 0.9449, Precision: 0.9156, Recall: 0.9058, Time: 3942.87s

Evaluating: Logistic Regression
Accuracy: 0.8516, Precision: 0.7237, Recall: 0.6665, Time: 21224.7s

Evaluating: Linear SVM
Accuracy: 0.8636, Precision: 0.7210, Recall: 0.6713, Time: 734.76s

Operation completed in 29885.53 seconds.
