# Three-Class Low-Rate DDoS Full Analysis (Fixed)
This notebook:
1. Splits data by label
2. Cleans data
3. Prepares BENIGN, Slowloris, Slowhttptest subsets
4. Defines test scenarios, augmentation methods, and models
5. Evaluates each model (MLP, RandomForest, LogisticRegression, XGBoost)
   - Prints class counts before/after augmentation
   - Shows confusion matrix and classification report
   - Computes average recall of attack classes


In [None]:
import warnings
warnings.filterwarnings('ignore')
import os, glob
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score


In [None]:
def augment_cvae(X, y, n_samples):
    """Placeholder for CVAE-based augmentation.

    No synthetic samples are generated yet: ``X`` and ``y`` are handed back
    untouched and ``n_samples`` is ignored.
    """
    return (X, y)
def augment_gan(X, y, n_samples):
    """Placeholder for GAN-based augmentation.

    No synthetic samples are generated yet: ``X`` and ``y`` are handed back
    untouched and ``n_samples`` is ignored.
    """
    return (X, y)


In [None]:
def split_csvs_by_label(paths, out_dir='split_by_label'):
    """Write one CSV per distinct 'Label' value found across ``paths``.

    Column names are stripped of surrounding whitespace before grouping.
    A label that occurs in several input files ends up with the rows of
    *all* of them: the first occurrence (re)creates the output file with a
    header, later occurrences are appended without one.

    Args:
        paths: Iterable of CSV file paths to read.
        out_dir: Directory that receives ``<label>.csv`` files; created if
            missing.

    Returns:
        Sorted list of the file-safe label names that were written.
    """
    os.makedirs(out_dir, exist_ok=True)
    written = set()
    for path in paths:
        frame = pd.read_csv(path)
        frame.columns = frame.columns.str.strip()
        for lbl, grp in frame.groupby('Label'):
            safe = str(lbl).replace('/', '_').replace(' ', '_')
            first = safe not in written
            # Overwrite on first sight (keeps re-runs idempotent), append
            # afterwards so rows from earlier files are not lost.
            # NOTE(review): appending assumes every input file has the same
            # column order - confirm for the dataset in use.
            grp.to_csv(
                os.path.join(out_dir, f'{safe}.csv'),
                mode='w' if first else 'a',
                header=first,
                index=False,
            )
            written.add(safe)
    return sorted(written)


# Sorted so that the file order (and everything derived from it) is
# deterministic; glob itself returns paths in arbitrary order.
csv_files = sorted(glob.glob('CIC-IDS-2017/*.csv'))
split_csvs_by_label(csv_files)
print('Split by label done')


In [None]:
# Load every raw CSV into a single frame and clean it.
if not csv_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No CSV files matched 'CIC-IDS-2017/*.csv'; nothing to clean")
df_all = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
df_all.columns = df_all.columns.str.strip()
# dropna() does not treat +/-inf as missing, so convert infinities to NaN
# first; otherwise those rows survive "cleaning" and inflate the class sizes
# that the later cells derive from df.
df = df_all.replace([np.inf, -np.inf], np.nan).drop_duplicates().dropna()
print('Cleaned shape:', df.shape)
display(df['Label'].value_counts())


In [None]:
# Carve the three classes studied in this notebook out of the cleaned frame.
benign, slowloris, slowhttptest = (
    df.loc[df['Label'].eq(class_name)]
    for class_name in ('BENIGN', 'DoS slowloris', 'DoS Slowhttptest')
)
print(
    'Samples: BENIGN=', len(benign),
    'Slowloris=', len(slowloris),
    'Slowhttptest=', len(slowhttptest),
)


In [None]:
def sample_and_split(n_b, n_sl, n_sht):
    """Draw a three-class sample and return a stratified, scaled 70/30 split.

    Rows are drawn (seed 42) from the module-level ``benign``, ``slowloris``
    and ``slowhttptest`` frames. A class is sampled with replacement only
    when more rows are requested than it contains. Rows holding +/-inf or
    NaN are dropped after sampling, so the final class sizes can be slightly
    smaller than requested.

    Args:
        n_b: Number of BENIGN rows to draw.
        n_sl: Number of 'DoS slowloris' rows to draw.
        n_sht: Number of 'DoS Slowhttptest' rows to draw.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` where the feature parts are
        standardised arrays and the labels are encoded as 0 = BENIGN,
        1 = DoS slowloris, 2 = DoS Slowhttptest.
    """
    def draw(frame, n):
        # Oversample only when the request exceeds what is available.
        return frame.sample(n=n, random_state=42, replace=n > len(frame))

    df_s = pd.concat(
        [draw(benign, n_b), draw(slowloris, n_sl), draw(slowhttptest, n_sht)],
        ignore_index=True,
    )
    df_s = df_s.replace([np.inf, -np.inf], np.nan).dropna()
    X = df_s.drop(columns=['Label'])
    y = df_s['Label'].map({'BENIGN': 0, 'DoS slowloris': 1, 'DoS Slowhttptest': 2})
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    # Fit the scaler on the training rows only: fitting on the full sample
    # would leak the test set's mean/variance into training.
    scaler = StandardScaler().fit(X_tr)
    return [scaler.transform(X_tr), scaler.transform(X_te), y_tr, y_te]


In [None]:
# Requested per-class sample sizes for each scenario.
tests = {
    # All three classes requested at the slowloris count.
    'Test1': {'benign': len(slowloris), 'slowloris': len(slowloris), 'slowhttptest': len(slowloris)},
    # Every available row of each class.
    'Test2': {'benign': len(benign), 'slowloris': len(slowloris), 'slowhttptest': len(slowhttptest)},
    # Half of the attack rows, with BENIGN matched to the halved slowloris count.
    'Test3': {'benign': len(slowloris)//2, 'slowloris': len(slowloris)//2, 'slowhttptest': len(slowhttptest)//2},
    # BENIGN at twice the slowloris count, all attack rows.
    'Test4': {'benign': len(slowloris)*2, 'slowloris': len(slowloris), 'slowhttptest': len(slowhttptest)},
}
# Augmentation strategies applied to the training split ('None' = no augmentation).
methods = ['None', 'SMOTE', 'ADASYN', 'CVAE', 'GAN']
# Classifiers under evaluation, all seeded for reproducibility.
models = {
    'MLP': MLPClassifier(random_state=42, max_iter=500, early_stopping=True),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    # 'mlogloss' is the multi-class log loss ('logloss' is the binary one and
    # this is a three-class task). The deprecated `use_label_encoder` flag is
    # no longer passed; labels are already encoded as 0..2.
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
}


In [None]:
# Evaluate MLP for three-class classification (fixed printing)
# For every test scenario and augmentation method: report class counts
# before/after augmentation, fit the model on the (augmented) training split
# and report confusion matrix, classification report and mean attack recall.
from sklearn.metrics import classification_report, confusion_matrix

for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slowloris'], sz['slowhttptest'])
    # The un-augmented training counts are the same for every method.
    cnt = np.bincount(y_tr, minlength=3)
    for m in methods:
        print(f"Test={t}, Aug={m} BEFORE counts: BENIGN={cnt[0]}, Slowloris={cnt[1]}, Slowhttptest={cnt[2]}")
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            try:
                X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
            except (ValueError, RuntimeError) as err:
                # ADASYN can refuse to resample (e.g. nothing to generate for
                # near-balanced classes). Fall back to the original data, but
                # say so - otherwise these results would pass as "ADASYN".
                print(f"Test={t}, Aug={m} could not resample, using original training data: {err}")
                X_aug, y_aug = X_tr, y_tr
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            X_aug, y_aug = X_tr, y_tr
        cnt2 = np.bincount(y_aug, minlength=3)
        print(f"Test={t}, Aug={m} AFTER counts: BENIGN={cnt2[0]}, Slowloris={cnt2[1]}, Slowhttptest={cnt2[2]}")
        clf = models['MLP']
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        cm = confusion_matrix(y_te, y_pred)
        print("Confusion Matrix:\n", cm)
        print("Classification Report:\n", classification_report(y_te, y_pred, digits=4))
        # Unweighted mean of the recalls of the two attack classes (1 and 2).
        attack_recall = recall_score(y_te, y_pred, labels=[1, 2], average='macro')
        print(f"Attack Recall avg: {attack_recall:.4f}\n")
    print('-'*60)

In [None]:
# Evaluate RandomForest for three-class classification
# For every test scenario and augmentation method: report class counts
# before/after augmentation, fit the model on the (augmented) training split
# and report confusion matrix, classification report and mean attack recall.
from sklearn.metrics import classification_report, confusion_matrix

for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slowloris'], sz['slowhttptest'])
    # The un-augmented training counts are the same for every method.
    cnt = np.bincount(y_tr, minlength=3)
    for m in methods:
        print(f"Test={t}, Aug={m} BEFORE counts: BENIGN={cnt[0]}, Slowloris={cnt[1]}, Slowhttptest={cnt[2]}")
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            try:
                X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
            except (ValueError, RuntimeError) as err:
                # ADASYN can refuse to resample (e.g. nothing to generate for
                # near-balanced classes). Fall back to the original data, but
                # say so - otherwise these results would pass as "ADASYN".
                print(f"Test={t}, Aug={m} could not resample, using original training data: {err}")
                X_aug, y_aug = X_tr, y_tr
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            X_aug, y_aug = X_tr, y_tr
        cnt2 = np.bincount(y_aug, minlength=3)
        print(f"Test={t}, Aug={m} AFTER counts: BENIGN={cnt2[0]}, Slowloris={cnt2[1]}, Slowhttptest={cnt2[2]}")
        clf = models['RandomForest']
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        cm = confusion_matrix(y_te, y_pred)
        print("Confusion Matrix:\n", cm)
        print("Classification Report:\n", classification_report(y_te, y_pred, digits=4))
        # Unweighted mean of the recalls of the two attack classes (1 and 2).
        attack_recall = recall_score(y_te, y_pred, labels=[1, 2], average='macro')
        print(f"Attack Recall avg: {attack_recall:.4f}\n")
    print('-'*60)

In [None]:
# Evaluate LogisticRegression for three-class classification
# For every test scenario and augmentation method: report class counts
# before/after augmentation, fit the model on the (augmented) training split
# and report confusion matrix, classification report and mean attack recall.
from sklearn.metrics import classification_report, confusion_matrix

for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slowloris'], sz['slowhttptest'])
    # The un-augmented training counts are the same for every method.
    cnt = np.bincount(y_tr, minlength=3)
    for m in methods:
        print(f"Test={t}, Aug={m} BEFORE counts: BENIGN={cnt[0]}, Slowloris={cnt[1]}, Slowhttptest={cnt[2]}")
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            try:
                X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
            except (ValueError, RuntimeError) as err:
                # ADASYN can refuse to resample (e.g. nothing to generate for
                # near-balanced classes). Fall back to the original data, but
                # say so - otherwise these results would pass as "ADASYN".
                print(f"Test={t}, Aug={m} could not resample, using original training data: {err}")
                X_aug, y_aug = X_tr, y_tr
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            X_aug, y_aug = X_tr, y_tr
        cnt2 = np.bincount(y_aug, minlength=3)
        print(f"Test={t}, Aug={m} AFTER counts: BENIGN={cnt2[0]}, Slowloris={cnt2[1]}, Slowhttptest={cnt2[2]}")
        clf = models['LogisticRegression']
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        cm = confusion_matrix(y_te, y_pred)
        print("Confusion Matrix:\n", cm)
        print("Classification Report:\n", classification_report(y_te, y_pred, digits=4))
        # Unweighted mean of the recalls of the two attack classes (1 and 2).
        attack_recall = recall_score(y_te, y_pred, labels=[1, 2], average='macro')
        print(f"Attack Recall avg: {attack_recall:.4f}\n")
    print('-'*60)

In [None]:
# Evaluate XGBoost for three-class classification
# For every test scenario and augmentation method: report class counts
# before/after augmentation, fit the model on the (augmented) training split
# and report confusion matrix, classification report and mean attack recall.
from sklearn.metrics import classification_report, confusion_matrix

for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slowloris'], sz['slowhttptest'])
    # The un-augmented training counts are the same for every method.
    cnt = np.bincount(y_tr, minlength=3)
    for m in methods:
        print(f"Test={t}, Aug={m} BEFORE counts: BENIGN={cnt[0]}, Slowloris={cnt[1]}, Slowhttptest={cnt[2]}")
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            try:
                X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
            except (ValueError, RuntimeError) as err:
                # ADASYN can refuse to resample (e.g. nothing to generate for
                # near-balanced classes). Fall back to the original data, but
                # say so - otherwise these results would pass as "ADASYN".
                print(f"Test={t}, Aug={m} could not resample, using original training data: {err}")
                X_aug, y_aug = X_tr, y_tr
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            X_aug, y_aug = X_tr, y_tr
        cnt2 = np.bincount(y_aug, minlength=3)
        print(f"Test={t}, Aug={m} AFTER counts: BENIGN={cnt2[0]}, Slowloris={cnt2[1]}, Slowhttptest={cnt2[2]}")
        clf = models['XGBoost']
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        cm = confusion_matrix(y_te, y_pred)
        print("Confusion Matrix:\n", cm)
        print("Classification Report:\n", classification_report(y_te, y_pred, digits=4))
        # Unweighted mean of the recalls of the two attack classes (1 and 2).
        attack_recall = recall_score(y_te, y_pred, labels=[1, 2], average='macro')
        print(f"Attack Recall avg: {attack_recall:.4f}\n")
    print('-'*60)