In [1]:
import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from memory_profiler import memory_usage

In [2]:
# Load the dataset
df = pd.read_csv("/Users/promisea/SAMKNN/NF-BoT-IoT/NF-BoT-IoT.csv")

# Preprocessing
X = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'], axis=1)
y = df['Label']


# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

In [3]:
# Evaluate classifiers
results = {}
start_overall = time.perf_counter()  # Start measuring total execution time
for name, clf in classifiers.items():
    start_time = time.perf_counter()  # Real time measurement starts
    mem_usage = max(memory_usage((clf.fit, (X_train, y_train)), interval=1))  # Less frequent checks to reduce overhead
    train_time = time.perf_counter() - start_time  # Real time measurement ends
    
    start_time = time.perf_counter()  # Real time measurement starts
    y_pred = clf.predict(X_test)
    test_time = time.perf_counter() - start_time  # Real time measurement ends
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    
    results[name] = {
        "Accuracy": accuracy,
        "F1 Score": f1,
        "Training Time": train_time,
        "Testing Time": test_time,
        "Memory Usage": mem_usage
    }

In [4]:
# Display results
print("The results of the NF BoT IoT dataset..." )
for classifier, metrics in results.items():
    print(f"{classifier}:")
    for metric, value in metrics.items():
        print(f"    {metric}: {value:.4f}")

The results of the NF BoT IoT dataset...
Random Forest:
    Accuracy: 0.9897
    F1 Score: 0.8840
    Training Time: 22.7601
    Testing Time: 0.2988
    Memory Usage: 522.1406
KNN:
    Accuracy: 0.9916
    F1 Score: 0.8989
    Training Time: 0.9417
    Testing Time: 21.7985
    Memory Usage: 312.0156
SVM:
    Accuracy: 0.9886
    F1 Score: 0.8378
    Training Time: 1137.5603
    Testing Time: 41.3507
    Memory Usage: 785.7344
Decision Tree:
    Accuracy: 0.9905
    F1 Score: 0.8954
    Training Time: 1.5218
    Testing Time: 0.0066
    Memory Usage: 260.6875
Logistic Regression:
    Accuracy: 0.9874
    F1 Score: 0.8107
    Training Time: 3.5792
    Testing Time: 0.0048
    Memory Usage: 291.3438


In [5]:
# Load the dataset
df = pd.read_csv("/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT.csv")

# Preprocessing
X = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'], axis=1)
y = df['Label']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

In [6]:
# Evaluate classifiers
results = {}
start_overall = time.perf_counter()  # Start measuring total execution time
for name, clf in classifiers.items():
    start_time = time.perf_counter()  # Real time measurement starts
    mem_usage = max(memory_usage((clf.fit, (X_train, y_train)), interval=1))  # Less frequent checks to reduce overhead
    train_time = time.perf_counter() - start_time  # Real time measurement ends
    
    start_time = time.perf_counter()  # Real time measurement starts
    y_pred = clf.predict(X_test)
    test_time = time.perf_counter() - start_time  # Real time measurement ends
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    
    results[name] = {
        "Accuracy": accuracy,
        "F1 Score": f1,
        "Training Time": train_time,
        "Testing Time": test_time,
        "Memory Usage": mem_usage
    }

In [7]:
# Display results
print("The results of the ToN IoT dataset..." )
for classifier, metrics in results.items():
    print(f"{classifier}:")
    for metric, value in metrics.items():
        print(f"    {metric}: {value:.4f}")

The results of the ToN IoT dataset...
Random Forest:
    Accuracy: 0.9995
    F1 Score: 0.9993
    Training Time: 118.7442
    Testing Time: 1.3307
    Memory Usage: 824.2812
KNN:
    Accuracy: 0.9957
    F1 Score: 0.9932
    Training Time: 1.8806
    Testing Time: 81.1632
    Memory Usage: 356.3906
SVM:
    Accuracy: 0.9840
    F1 Score: 0.9738
    Training Time: 1955.1664
    Testing Time: 306.1988
    Memory Usage: 1030.6562
Decision Tree:
    Accuracy: 0.9994
    F1 Score: 0.9990
    Training Time: 5.7509
    Testing Time: 0.0218
    Memory Usage: 278.1250
Logistic Regression:
    Accuracy: 0.9712
    F1 Score: 0.9531
    Training Time: 2.7380
    Testing Time: 0.0049
    Memory Usage: 305.7031
