# Step 5: Data Augmentation Evaluation for Slowloris vs. Benign

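Four experiments with different benign/Slowloris class balances, each using a stratified 70/30 train/test split. Every experiment evaluates five classifiers (MLP, SVM, Random Forest, Logistic Regression, XGBoost) under a no-augmentation baseline and four augmentation methods (SMOTE, ADASYN, CVAE, GAN). Note that CVAE and GAN are currently placeholders that return the training data unchanged.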


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder augmentation functions
def augment_cvae(X, y, n_samples):
    """Placeholder for CVAE-based augmentation.

    The generative model is not implemented yet, so the inputs are returned
    unchanged and ``n_samples`` is ignored. A warning is emitted so that
    results produced through this path are not mistaken for real CVAE
    augmentation.

    Args:
        X: Feature matrix to augment.
        y: Labels aligned with ``X``.
        n_samples: Requested sample count (currently unused).

    Returns:
        The tuple ``(X, y)``, exactly as passed in.
    """
    import warnings

    # TODO: CVAE implementation
    warnings.warn(
        "augment_cvae is a placeholder: data is returned unchanged, so 'CVAE' "
        "results are equivalent to no augmentation.",
        UserWarning,
        stacklevel=2,
    )
    return X, y

def augment_gan(X, y, n_samples):
    """Placeholder for GAN-based augmentation.

    The generative model is not implemented yet, so the inputs are returned
    unchanged and ``n_samples`` is ignored. A warning is emitted so that
    results produced through this path are not mistaken for real GAN
    augmentation.

    Args:
        X: Feature matrix to augment.
        y: Labels aligned with ``X``.
        n_samples: Requested sample count (currently unused).

    Returns:
        The tuple ``(X, y)``, exactly as passed in.
    """
    import warnings

    # TODO: GAN implementation
    warnings.warn(
        "augment_gan is a placeholder: data is returned unchanged, so 'GAN' "
        "results are equivalent to no augmentation.",
        UserWarning,
        stacklevel=2,
    )
    return X, y


## Load Data Splits

In [None]:
# Read the two per-label CSV files and report how many rows each one holds.
benign_csv, slowloris_csv = 'split_by_label/BenignTraffic.csv', 'split_by_label/DDoS-SlowLoris.csv'
benign = pd.read_csv(benign_csv)
slow = pd.read_csv(slowloris_csv)
print(f'Benign samples: {len(benign)}, Slowloris samples: {len(slow)}')

## Define Augmentation & Evaluation Functions

In [None]:
def augment_data(X, y, method, target_size=None):
    """Apply the augmentation strategy named by ``method`` to ``(X, y)``.

    Args:
        X: Feature matrix.
        y: Labels aligned with ``X``.
        method: One of 'SMOTE', 'ADASYN', 'CVAE' or 'GAN'. Any other value
            (for example 'None') leaves the data untouched.
        target_size: Forwarded as ``n_samples`` to the CVAE and GAN
            augmenters; not used by SMOTE or ADASYN.

    Returns:
        A ``(X, y)`` pair: the resampled data for a recognised method,
        otherwise the inputs exactly as given.
    """
    # Handlers are wrapped in lambdas so only the selected augmenter is built.
    strategies = (
        ('SMOTE', lambda: SMOTE(random_state=42).fit_resample(X, y)),
        ('ADASYN', lambda: ADASYN(random_state=42).fit_resample(X, y)),
        ('CVAE', lambda: augment_cvae(X, y, n_samples=target_size)),
        ('GAN', lambda: augment_gan(X, y, n_samples=target_size)),
    )
    for label, run in strategies:
        if method == label:
            return run()
    # Unrecognised method: no augmentation.
    return X, y

def evaluate_models(X_train, y_train, X_test, y_test):
    """Train a fixed set of five classifiers and score each on the test data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        A DataFrame with one row per classifier and the columns 'Model',
        'Accuracy', 'Precision', 'Recall', 'F1-Score' and 'ROC-AUC'.
    """
    classifiers = {
        'MLP': MLPClassifier(random_state=42, max_iter=500),
        'SVM': SVC(probability=True, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=500, random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    def _score(label, clf):
        """Fit one classifier and return its metric row."""
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC-AUC;
        # assumes binary labels where that column is the positive class.
        positive_score = clf.predict_proba(X_test)[:, 1]
        return {
            'Model': label,
            'Accuracy': accuracy_score(y_test, predicted),
            'Precision': precision_score(y_test, predicted, zero_division=0),
            'Recall': recall_score(y_test, predicted, zero_division=0),
            'F1-Score': f1_score(y_test, predicted, zero_division=0),
            'ROC-AUC': roc_auc_score(y_test, positive_score)
        }

    rows = [_score(label, clf) for label, clf in classifiers.items()]
    return pd.DataFrame(rows)


## Run Experiments

In [None]:
# Number of rows drawn from each class for every experiment.
tests = {
    'Test1': {'benign': 23426,   'slow': 23426},
    'Test2': {'benign': 1098195, 'slow': 23426},
    'Test3': {'benign': 23426//2, 'slow': 23426//2},
    'Test4': {'benign': 23426*2,  'slow': 23426}
}
all_results = []
for test_name, sizes in tests.items():
    # Sample with replacement only when more rows are requested than exist.
    # NOTE(review): rows drawn with replacement are duplicated before the
    # train/test split, so copies of one row can end up on both sides of the
    # split -- confirm this is acceptable for the affected tests.
    replace_b = sizes['benign'] > len(benign)
    replace_s = sizes['slow']   > len(slow)
    df_b = benign.sample(n=sizes['benign'], random_state=42, replace=replace_b)
    df_s = slow.sample(n=sizes['slow'], random_state=42, replace=replace_s)
    df = pd.concat([df_b, df_s], ignore_index=True)
    X = df.drop(columns=['label'])
    y = (df['label']=='DDoS-SlowLoris').astype(int)

    # Split BEFORE scaling so the held-out rows never influence the scaler.
    X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)
    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows (avoids train/test leakage).
    scaler = StandardScaler().fit(X_tr_raw)
    X_tr = scaler.transform(X_tr_raw)
    X_te = scaler.transform(X_te_raw)

    # 'None' is the no-augmentation baseline; augmentation is applied to the
    # training portion only, and the test set stays untouched.
    for method in ['None','SMOTE','ADASYN','CVAE','GAN']:
        X_aug, y_aug = augment_data(X_tr, y_tr, method, target_size=sizes['benign'])
        df_res = evaluate_models(X_aug, y_aug, X_te, y_te)
        df_res['Test'] = test_name
        df_res['Augmentation'] = method
        all_results.append(df_res)
# Concatenate once at the end rather than growing a frame inside the loop.
results_df = pd.concat(all_results, ignore_index=True)
print('### Combined Results ###')
display(results_df)
results_df.to_csv('augmentation_evaluation_results_v2.csv', index=False)


## Visualization

In [None]:
# Grouped bar chart of F1-Score per test, one bar colour per augmentation method.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=results_df,
    x='Test',
    y='F1-Score',
    hue='Augmentation',
    palette='plasma',
    ax=ax,
)
ax.set_title('F1-Score by Test and Augmentation Method')
plt.show()

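**Conclusion:** The four tests above use the corrected sample sizes and sampling logic. Because the CVAE and GAN augmenters are still placeholders that return the training data unchanged, their scores are expected to match the no-augmentation baseline and should not be read as real augmentation results.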