In [12]:
import os
import pandas as pd

DATASET_PATH = r"DATASET"

csv_files = [
    os.path.join(DATASET_PATH, f)
    for f in os.listdir(DATASET_PATH)
    if f.endswith(".csv")
]

df = pd.DataFrame()

for file in csv_files:
    print("Loading:", file)

    for chunk in pd.read_csv(file, chunksize=200000):
        df = pd.concat([df, chunk], ignore_index=True)

print("Final shape:", df.shape)


Loading: DATASET\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: DATASET\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: DATASET\Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: DATASET\Monday-WorkingHours.pcap_ISCX.csv
Loading: DATASET\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: DATASET\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: DATASET\Tuesday-WorkingHours.pcap_ISCX.csv
Loading: DATASET\Wednesday-workingHours.pcap_ISCX.csv
Final shape: (2830743, 79)


In [13]:
df.head()


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0    Destination Port             int64  
 1    Flow Duration                int64  
 2    Total Fwd Packets            int64  
 3    Total Backward Packets       int64  
 4   Total Length of Fwd Packets   int64  
 5    Total Length of Bwd Packets  int64  
 6    Fwd Packet Length Max        int64  
 7    Fwd Packet Length Min        int64  
 8    Fwd Packet Length Mean       float64
 9    Fwd Packet Length Std        float64
 10  Bwd Packet Length Max         int64  
 11   Bwd Packet Length Min        int64  
 12   Bwd Packet Length Mean       float64
 13   Bwd Packet Length Std        float64
 14  Flow Bytes/s                  float64
 15   Flow Packets/s               float64
 16   Flow IAT Mean                float64
 17   Flow IAT Std                 float64
 18   Flow IAT Max         

In [15]:
df.columns = df.columns.str.strip()


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

In [17]:
df=df.drop(columns=['Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate'])

In [18]:
def mergeanomalies(label):
    if label in [
        "DDoS",
        "DoS Hulk",
        "DoS GoldenEye",
        "DoS Slowhttptest",
        "DoS slowloris"
    ]:
        return "1"
    elif label=="PortScan":
        return "2"

    elif label in [
        "FTP-Patator",
        "SSH-Patator",
        "Web Attack – Brute Force"
    ]:
        return "3"

    elif label=="BENIGN":
        return "0"
    else:
        return "4"


In [19]:
df["Label"] = df["Label"].apply(mergeanomalies)


In [20]:
print(df["Label"].value_counts())


Label
0    2273097
1     380688
2     158930
3      13835
4       4193
Name: count, dtype: int64


In [21]:
df.to_csv("processed_dataset.csv", index=False)

## Unsupervised Anomaly Detection (BENIGN-only training)

This section implements the Isolation Forest pipeline on `processed_dataset.csv` with training data restricted to BENIGN traffic only.

In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

In [26]:
# Step 1: Load dataset
csv_path = "processed_dataset.csv"
df_model = pd.read_csv(csv_path)

print("Shape:", df_model.shape)
print("First 10 columns:", list(df_model.columns[:10]))

if "Label" not in df_model.columns:
    raise ValueError("Label column not found in processed_dataset.csv")

# Ensure labels are numeric (handles both '0' and 0)
df_model["Label"] = pd.to_numeric(df_model["Label"], errors="coerce")
if df_model["Label"].isna().any():
    raise ValueError("Label contains non-numeric values after conversion.")

print("Label distribution:")
print(df_model["Label"].value_counts().sort_index())

Shape: (2830743, 73)
First 10 columns: ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std']
Label distribution:
Label
0    2273097
1     380688
2     158930
3      13835
4       4193
Name: count, dtype: int64


In [29]:
# Steps 2-4: Separate label/features, keep BENIGN only, then split
# 0 = BENIGN, non-zero = attack

y = df_model["Label"].astype(int)
X = df_model.drop(columns=["Label"]).copy()

# Convert all features to numeric and handle invalid values
X = X.apply(pd.to_numeric, errors="coerce")
X = X.replace([np.inf, -np.inf], np.nan)

X_benign = X[y == 0].copy()
print("Total rows:", len(df_model))
print("BENIGN rows:", len(X_benign))

if len(X_benign) == 0:
    raise ValueError("No BENIGN rows found. Check label encoding.")

X_train, X_test_benign = train_test_split(
    X_benign,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

print("X_train shape:", X_train.shape)
print("X_test_benign shape:", X_test_benign.shape)

# Sanity check: make sure training split is BENIGN-only
if not np.all((y.loc[X_train.index] == 0).values):
    raise ValueError("Attack samples found in training data!")
print("Training data check passed: BENIGN-only.")

# Impute missing values using train-benign statistics only
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_test_benign_imputed = imputer.transform(X_test_benign)
X_full_imputed = imputer.transform(X)

# Step 5: Feature scaling (required for Autoencoder; also used consistently here)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_benign_scaled = scaler.transform(X_test_benign_imputed)

# Step 6: Isolation Forest training
iso = IsolationForest(
    n_estimators=200,
    contamination=0.2,
    random_state=42,
    n_jobs=-1,
)
iso.fit(X_train_scaled)

# Step 7: Evaluate on full dataset
X_scaled_full = scaler.transform(X_full_imputed)
y_pred_iso = iso.predict(X_scaled_full)  # 1 normal, -1 anomaly
y_pred_iso = np.where(y_pred_iso == 1, 0, 1)  # 0 benign, 1 anomaly

y_true = np.where(y.values == 0, 0, 1)

# Step 8: Metrics
cm = confusion_matrix(y_true, y_pred_iso)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred_iso, digits=4))

acc = accuracy_score(y_true, y_pred_iso)
prec = precision_score(y_true, y_pred_iso, zero_division=0)
rec = recall_score(y_true, y_pred_iso, zero_division=0)
f1 = f1_score(y_true, y_pred_iso, zero_division=0)
roc_auc = roc_auc_score(y_true, y_pred_iso)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")

# Optional: false positive rate on benign holdout only
benign_holdout_pred = np.where(iso.predict(X_test_benign_scaled) == 1, 0, 1)
fpr_benign_holdout = benign_holdout_pred.mean()
print(f"False Positive Rate on BENIGN holdout: {fpr_benign_holdout:.4f}")

Total rows: 2830743
BENIGN rows: 2273097
X_train shape: (1818477, 72)
X_test_benign shape: (454620, 72)
Training data check passed: BENIGN-only.
Confusion Matrix:
 [[1818562  454535]
 [ 297221  260425]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8595    0.8000    0.8287   2273097
           1     0.3643    0.4670    0.4093    557646

    accuracy                         0.7344   2830743
   macro avg     0.6119    0.6335    0.6190   2830743
weighted avg     0.7620    0.7344    0.7461   2830743

Accuracy : 0.7344
Precision: 0.3643
Recall   : 0.4670
F1-score : 0.4093
ROC-AUC  : 0.6335
False Positive Rate on BENIGN holdout: 0.1998
