# Step 5: Data Augmentation Evaluation by Model

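Evaluate five classifiers (MLP, SVM, Random Forest, Logistic Regression, XGBoost) across four test scenarios, training each one under five augmentation settings: None (no augmentation), SMOTE, ADASYN, CVAE, and GAN. Note that the CVAE and GAN augmenters defined in this notebook are currently placeholders that return the training data unchanged.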

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder augmentation functions
def augment_cvae(X, y, n_samples):
    """Placeholder for CVAE-based augmentation.

    Currently a pass-through: no synthetic samples are generated and
    ``n_samples`` is ignored.

    Args:
        X: Training features.
        y: Training labels.
        n_samples: Requested number of samples (unused for now).

    Returns:
        The ``(X, y)`` pair exactly as it was passed in.
    """
    return X, y
def augment_gan(X, y, n_samples):
    """Placeholder for GAN-based augmentation.

    Currently a pass-through: no synthetic samples are generated and
    ``n_samples`` is ignored.

    Args:
        X: Training features.
        y: Training labels.
        n_samples: Requested number of samples (unused for now).

    Returns:
        The ``(X, y)`` pair exactly as it was passed in.
    """
    return X, y

# Load data splits
# Read the two per-label CSV files (benign first, Slowloris second) and
# report how many rows each one contains.
benign, slow = map(pd.read_csv, (
    'split_by_label/BenignTraffic.csv',
    'split_by_label/DDoS-SlowLoris.csv',
))
print(f'Benign: {len(benign)}, Slowloris: {len(slow)}')

# Sampling and splitting function
def sample_and_split(n_b, n_s):
    """Draw a benign/Slowloris sample and return a standardized 70/30 split.

    Rows are drawn from the module-level ``benign`` and ``slow`` frames with a
    fixed seed, concatenated, split with stratification on the binary target,
    and then standardized. The scaler is fitted on the training part only, so
    no statistics from the held-out rows leak into the training features.

    Args:
        n_b: Number of rows to draw from ``benign``.
        n_s: Number of rows to draw from ``slow``.

    Returns:
        ``[X_train, X_test, y_train, y_test]``. The feature matrices are the
        standardized features (every column except ``label``); the targets are
        1 where ``label`` equals ``'DDoS-SlowLoris'`` and 0 otherwise.
    """
    # Sample with replacement only when more rows are requested than exist.
    # NOTE(review): in that case duplicated rows can land on both sides of
    # the split, which inflates test metrics; confirm this is acceptable.
    df = pd.concat([
        benign.sample(n=n_b, random_state=42, replace=n_b > len(benign)),
        slow.sample(n=n_s, random_state=42, replace=n_s > len(slow)),
    ], ignore_index=True)
    X = df.drop(columns=['label'])
    y = (df['label'] == 'DDoS-SlowLoris').astype(int)

    # Split first: the same seed and stratification select the same rows as
    # splitting after scaling would, but the scaler never sees the test rows.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)
    scaler = StandardScaler().fit(X_tr)
    return [scaler.transform(X_tr), scaler.transform(X_te), y_tr, y_te]

# Row counts requested per scenario; each entry feeds sample_and_split.
# presumably 23426 is the number of available Slowloris rows and 1098195 the
# number of benign rows — TODO confirm against the printed dataset sizes.
_BASE_N = 23426
tests = {
    'Test1': dict(benign=_BASE_N, slow=_BASE_N),
    'Test2': dict(benign=1098195, slow=_BASE_N),
    'Test3': dict(benign=_BASE_N // 2, slow=_BASE_N // 2),
    'Test4': dict(benign=_BASE_N * 2, slow=_BASE_N),
}
# Augmentation settings evaluated for every scenario, in reporting order.
methods = ['None', 'SMOTE', 'ADASYN', 'CVAE', 'GAN']


## MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Evaluate an MLP on every test scenario under each augmentation setting.
# Each augmenter takes (X, y, n) and returns the training set to fit on; any
# method name missing from the table (i.e. 'None') uses the split untouched.
mlp_augmenters = {
    'SMOTE': lambda X, y, n: SMOTE(random_state=42).fit_resample(X, y),
    'ADASYN': lambda X, y, n: ADASYN(random_state=42).fit_resample(X, y),
    'CVAE': augment_cvae,
    'GAN': augment_gan,
}
mlp_rows = []
for scenario, sizes in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sizes['benign'], sizes['slow'])
    for method in methods:
        augment = mlp_augmenters.get(method)
        if augment is None:
            X_aug, y_aug = X_tr, y_tr
        else:
            X_aug, y_aug = augment(X_tr, y_tr, sizes['benign'])

        clf = MLPClassifier(random_state=42, max_iter=500, early_stopping=True)
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        # Column 1 holds the score for the positive (Slowloris = 1) class.
        y_proba = clf.predict_proba(X_te)[:, 1]

        row = {
            'Test': scenario,
            'Augmentation': method,
            'Model': 'MLP',
            'Accuracy': clf.score(X_te, y_te),
        }
        for label, metric in (('Precision', precision_score),
                              ('Recall', recall_score),
                              ('F1-Score', f1_score)):
            row[label] = metric(y_te, y_pred, zero_division=0)
        row['ROC-AUC'] = roc_auc_score(y_te, y_proba)
        mlp_rows.append(row)

results_mlp = pd.DataFrame(mlp_rows)
display(results_mlp)

## Support Vector Machine (Linear Kernel)

In [None]:
from sklearn.svm import SVC

# Evaluate a linear-kernel SVM on every test scenario under each
# augmentation setting and collect one metrics row per (scenario, method).
#
# The classifier is fitted WITHOUT probability=True: that option makes fit()
# run an additional internal 5-fold cross-validation only to calibrate
# predict_proba. The only consumer here is ROC-AUC, which depends on the
# ranking of the scores, so the raw decision_function values are used instead.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples; the larger scenarios may still be very slow — consider LinearSVC
# if runtime is a problem.
results_svm = []
for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slow'])
    for m in methods:
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            # 'None': train on the split as-is.
            X_aug, y_aug = X_tr, y_tr
        clf = SVC(kernel='linear', random_state=42)
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        # Signed distance to the separating hyperplane; larger values mean
        # "more likely class 1", which is all roc_auc_score needs.
        y_score = clf.decision_function(X_te)
        results_svm.append({
            'Test': t, 'Augmentation': m, 'Model': 'SVM',
            'Accuracy': clf.score(X_te, y_te),
            'Precision': precision_score(y_te, y_pred, zero_division=0),
            'Recall': recall_score(y_te, y_pred, zero_division=0),
            'F1-Score': f1_score(y_te, y_pred, zero_division=0),
            'ROC-AUC': roc_auc_score(y_te, y_score)
        })
results_svm = pd.DataFrame(results_svm)
display(results_svm)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest on every test scenario under each augmentation
# setting and collect one metrics row per (scenario, method) pair.
rf_rows = []
for scenario, sizes in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sizes['benign'], sizes['slow'])
    for method in methods:
        # Pick the training set for this augmentation setting.
        if method == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'ADASYN':
            X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sizes['benign'])
        elif method == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sizes['benign'])
        else:
            # 'None': train on the split as-is.
            X_aug, y_aug = X_tr, y_tr

        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        # Column 1 holds the score for the positive (Slowloris = 1) class.
        y_proba = clf.predict_proba(X_te)[:, 1]

        row = {
            'Test': scenario,
            'Augmentation': method,
            'Model': 'Random Forest',
            'Accuracy': clf.score(X_te, y_te),
        }
        for label, metric in (('Precision', precision_score),
                              ('Recall', recall_score),
                              ('F1-Score', f1_score)):
            row[label] = metric(y_te, y_pred, zero_division=0)
        row['ROC-AUC'] = roc_auc_score(y_te, y_proba)
        rf_rows.append(row)

results_rf = pd.DataFrame(rf_rows)
display(results_rf)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Evaluate logistic regression on every test scenario under each augmentation
# setting and collect one metrics row per (scenario, method) pair.
lr_rows = []
for scenario, sizes in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sizes['benign'], sizes['slow'])
    for method in methods:
        # Pick the training set for this augmentation setting.
        if method == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'ADASYN':
            X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sizes['benign'])
        elif method == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sizes['benign'])
        else:
            # 'None': train on the split as-is.
            X_aug, y_aug = X_tr, y_tr

        clf = LogisticRegression(max_iter=500, random_state=42)
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        # Column 1 holds the score for the positive (Slowloris = 1) class.
        y_proba = clf.predict_proba(X_te)[:, 1]

        row = {
            'Test': scenario,
            'Augmentation': method,
            'Model': 'Logistic Regression',
            'Accuracy': clf.score(X_te, y_te),
        }
        for label, metric in (('Precision', precision_score),
                              ('Recall', recall_score),
                              ('F1-Score', f1_score)):
            row[label] = metric(y_te, y_pred, zero_division=0)
        row['ROC-AUC'] = roc_auc_score(y_te, y_proba)
        lr_rows.append(row)

results_lr = pd.DataFrame(lr_rows)
display(results_lr)

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Evaluate XGBoost on every test scenario under each augmentation setting
# and collect one metrics row per (scenario, method) pair.
xgb_rows = []
for scenario, sizes in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sizes['benign'], sizes['slow'])
    for method in methods:
        # Pick the training set for this augmentation setting.
        if method == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'ADASYN':
            X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
        elif method == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sizes['benign'])
        elif method == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sizes['benign'])
        else:
            # 'None': train on the split as-is.
            X_aug, y_aug = X_tr, y_tr

        clf = XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=42)
        clf.fit(X_aug, y_aug)
        y_pred = clf.predict(X_te)
        # Column 1 holds the score for the positive (Slowloris = 1) class.
        y_proba = clf.predict_proba(X_te)[:, 1]

        row = {
            'Test': scenario,
            'Augmentation': method,
            'Model': 'XGBoost',
            'Accuracy': clf.score(X_te, y_te),
        }
        for label, metric in (('Precision', precision_score),
                              ('Recall', recall_score),
                              ('F1-Score', f1_score)):
            row[label] = metric(y_te, y_pred, zero_division=0)
        row['ROC-AUC'] = roc_auc_score(y_te, y_proba)
        xgb_rows.append(row)

results_xgb = pd.DataFrame(xgb_rows)
display(results_xgb)

## Combined Results and Visualization

In [None]:
# Stack the per-model result tables into one frame, show it, and persist it.
per_model_frames = [results_mlp, results_svm, results_rf, results_lr, results_xgb]
all_results = pd.concat(per_model_frames, ignore_index=True)
display(all_results)
all_results.to_csv('augmentation_evaluation_by_model.csv', index=False)

# One bar per (Test, Model). Each bar summarizes several rows, one per
# augmentation method, because 'Augmentation' is not used as a plot dimension.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=all_results,
    x='Test',
    y='F1-Score',
    hue='Model',
    palette='plasma',
    ax=ax,
)
ax.set_title('F1-Score Across Models and Tests')
plt.show()