# CIC-IDS-2017 Low-Rate DDoS Augmentation and Model Evaluation
This notebook performs:
1. Splitting the dataset by attack label into separate CSVs.
2. Basic data cleaning and inspection.
3. Designing Test scenarios with different benign vs low-rate attack ratios.
4. Evaluating MLP, Random Forest, Logistic Regression, and XGBoost under augmentation methods: None, SMOTE, ADASYN, CVAE, GAN.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os, glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


In [2]:
def augment_cvae(X, y, n_samples):
    # Placeholder for CVAE-based augmentation
    return X, y

def augment_gan(X, y, n_samples):
    # Placeholder for GAN-based augmentation
    return X, y


In [4]:
# Step 1: Split by label into separate CSVs 

# Collect CSV files
csv_files = glob.glob('CIC-IDS-2017/*.csv')

os.makedirs('split_by_label', exist_ok=True)

for file in csv_files:
    df = pd.read_csv(file)
    # Strip whitespace from all column names
    df.columns = df.columns.str.strip()
    # Verify 'Label' column exists
    if 'Label' not in df.columns:
        raise KeyError(f"'Label' column not found in {file}. Available columns: {df.columns.tolist()}")
    # Split and save by label
    for lbl, group in df.groupby('Label'):
        safe_lbl = lbl.replace('/', '_').replace(' ', '_')
        group.to_csv(f'split_by_label/{safe_lbl}.csv', index=False)

print('Split files created in split_by_label/')


Split files created in split_by_label/


In [7]:
# Step 2: Data cleaning and inspection (fixed)

# Collect all CSV files from the dataset folder
csv_files = glob.glob('CIC-IDS-2017/*.csv')

# Load and concatenate all CSVs into a single DataFrame
df_all = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Strip whitespace from column names
df_all.columns = df_all.columns.str.strip()

# Drop duplicates and missing values
df_clean = df_all.drop_duplicates().dropna()

print('Original dataset shape:', df_all.shape)
print('Cleaned dataset shape:', df_clean.shape)

# Ensure 'Label' column exists
if 'Label' not in df_clean.columns:
    raise KeyError(f"'Label' column not found. Available columns: {df_clean.columns.tolist()}")

# Display distribution of attack labels
display(df_clean['Label'].value_counts())

Original dataset shape: (2830743, 79)
Cleaned dataset shape: (2522009, 79)


Label
BENIGN                        2096134
DoS Hulk                       172846
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [8]:
# Prepare BENIGN and low-rate attack sets (slowloris + slowhttptest)
benign = df_clean[df_clean['Label'] == 'BENIGN']
slow = df_clean[df_clean['Label'].isin(['DoS slowloris', 'DoS Slowhttptest'])]
len(benign), len(slow)


(2096134, 10613)

In [9]:
def sample_and_split(n_b, n_s):
    replace_b = n_b > len(benign)
    replace_s = n_s > len(slow)
    df = pd.concat([
        benign.sample(n=n_b, random_state=42, replace=replace_b),
        slow.sample(n=n_s, random_state=42, replace=replace_s)
    ], ignore_index=True)
    X = df.drop(columns=['Label'])
    y = (df['Label'].isin(['DoS slowloris', 'DoS Slowhttptest'])).astype(int)
    Xs = StandardScaler().fit_transform(X)
    return train_test_split(Xs, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Step 3: Define Test scenarios
combined_slow_count = len(slow)
tests = {
    'Test1': {'benign': combined_slow_count, 'slow': combined_slow_count},
    'Test2': {'benign': len(benign), 'slow': combined_slow_count},
    'Test3': {'benign': combined_slow_count // 2, 'slow': combined_slow_count // 2},
    'Test4': {'benign': combined_slow_count * 2, 'slow': combined_slow_count}
}
methods = ['None', 'SMOTE', 'ADASYN', 'CVAE', 'GAN']
models = {
    'MLP': MLPClassifier(random_state=42, max_iter=500, early_stopping=True),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}


In [None]:
# Step 4: Evaluate models under different augmentation methods
results = []
for t, sz in tests.items():
    X_tr, X_te, y_tr, y_te = sample_and_split(sz['benign'], sz['slow'])
    for m in methods:
        if m == 'SMOTE':
            X_aug, y_aug = SMOTE(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'ADASYN':
            X_aug, y_aug = ADASYN(random_state=42).fit_resample(X_tr, y_tr)
        elif m == 'CVAE':
            X_aug, y_aug = augment_cvae(X_tr, y_tr, sz['benign'])
        elif m == 'GAN':
            X_aug, y_aug = augment_gan(X_tr, y_tr, sz['benign'])
        else:
            X_aug, y_aug = X_tr, y_tr
        for name, model in models.items():
            clf = model
            clf.fit(X_aug, y_aug)
            y_pred = clf.predict(X_te)
            y_proba = clf.predict_proba(X_te)[:, 1]
            results.append({
                'Test': t,
                'Augmentation': m,
                'Model': name,
                'Accuracy': clf.score(X_te, y_te),
                'Precision': precision_score(y_te, y_pred, zero_division=0),
                'Recall': recall_score(y_te, y_pred, zero_division=0),
                'F1-Score': f1_score(y_te, y_pred, zero_division=0),
                'ROC-AUC': roc_auc_score(y_te, y_proba)
            })
df_results = pd.DataFrame(results)
display(df_results)
# Save results to CSV
df_results.to_csv('cic_ids_2017_results.csv', index=False)


## Conclusion
- This notebook compared four classification models under five augmentation methods across four Test scenarios.
- Results are saved to `cic_ids_2017_results.csv` for further analysis.
