## Load and clean data, define test scenarios

In [None]:
import os
import pandas as pd
import numpy as np
from feature_split import split_features, get_feature_indices

# 1) Read all CSV files
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary, filesystem-dependent order. The row order of the concatenated
# frame decides which rows the seeded DataFrame.sample() calls pick later,
# so an unsorted listing makes the experiments non-reproducible across machines.
csv_dir = 'CIC-IDS-2017'
csv_names = sorted(fname for fname in os.listdir(csv_dir) if fname.endswith('.csv'))
if not csv_names:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .csv files found in directory '{csv_dir}'")
df_list = [pd.read_csv(os.path.join(csv_dir, fname)) for fname in csv_names]
df = pd.concat(df_list, ignore_index=True)

# Strip whitespace from column names so that the 'Label' column can be
# addressed by its exact name below.
df.columns = df.columns.str.strip()

# 2) Basic cleaning: turn +/-inf into NaN, then drop every row containing NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# 3) Split dataset into per-label subsets (only these three labels are used)
df_benign       = df[df['Label'] == 'BENIGN']
df_slowloris    = df[df['Label'] == 'DoS slowloris']
df_slowhttptest = df[df['Label'] == 'DoS Slowhttptest']

# 4) Feature type split: continuous vs. discrete
# split_features / get_feature_indices come from the project-local
# feature_split module; the indices are positions within feature_df's columns
# (presumably -- confirm against feature_split).
feature_df = df.drop('Label', axis=1)
cont_cols, disc_cols = split_features(feature_df)
cont_idx, disc_idx   = get_feature_indices(feature_df, cont_cols, disc_cols)

# 5) Print check
print(f"Continuous features ({len(cont_cols)}):")
for c in cont_cols:
    print(f"  - {c}")

print(f"\nDiscrete features ({len(disc_cols)}):")
for c in disc_cols:
    print(f"  - {c}")

# 6) Define four test scenarios
# Each scenario maps a class key to the number of rows to draw for it.
n_benign       = len(df_benign)
n_slowloris    = len(df_slowloris)
n_slowhttptest = len(df_slowhttptest)

tests = {
    # Test1: every class sized to the slowloris subset
    'Test1': {
        'benign':       n_slowloris,
        'slowloris':    n_slowloris,
        'slowhttptest': n_slowloris,
    },
    # Test2: every class at its natural size
    'Test2': {
        'benign':       n_benign,
        'slowloris':    n_slowloris,
        'slowhttptest': n_slowhttptest,
    },
    # Test3: half-sized attack classes, benign at half the slowloris size
    'Test3': {
        'benign':       n_slowloris // 2,
        'slowloris':    n_slowloris // 2,
        'slowhttptest': n_slowhttptest // 2,
    },
    # Test4: benign at twice the slowloris size, attacks at natural size
    'Test4': {
        'benign':       n_slowloris * 2,
        'slowloris':    n_slowloris,
        'slowhttptest': n_slowhttptest,
    },
}

# 7) Print scenario check
for name, sizes in tests.items():
    print(f"{name}: BENIGN={sizes['benign']}, Slowloris={sizes['slowloris']}, Slowhttptest={sizes['slowhttptest']}")

## Define the augmentation function supporting the None/SMOTENC/CVAE/GAN methods

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTENC
from sklearn.impute import SimpleImputer
from sklearn.covariance import EmpiricalCovariance
from augment_module import augment_cvae, augment_gan

def apply_augmentation(
    X_train, y_train,
    method,
    cont_idx, disc_idx,
    imbalance_thresh: float = 0.10,
    outlier_thresh: float = 3.0,
    random_state=None
):
    """
    Optionally augment an imbalanced training set with synthetic samples.

    Supported methods: 'None','SMOTENC','CVAE','GAN'
      - Only augment when the smallest class's share of the samples is
        < imbalance_thresh (default 0.10)
      - SMOTENC: over-sample the minority class, treating the columns in
        disc_idx as categorical
      - CVAE/GAN: samples come from augment_cvae / augment_gan
      - Generated samples whose Mahalanobis distance to the training
        distribution is >= outlier_thresh are discarded (X_gen and y_gen are
        filtered together)
      - If nothing is generated, or nothing survives filtering, the input is
        returned unchanged

    Parameters
    ----------
    X_train, y_train : array-like
        Training features (2-D) and labels (1-D).
    method : str
        One of the supported method names; any other value disables
        augmentation.
    cont_idx : sequence of int
        Continuous column indices. Accepted for interface compatibility; this
        function does not use it.
    disc_idx : sequence of int
        Column indices passed to SMOTENC as ``categorical_features``.
    imbalance_thresh : float
        Minority-share threshold below which augmentation is applied.
    outlier_thresh : float
        Upper bound (exclusive) on the Mahalanobis distance of kept samples.
    random_state : int or None
        Seed for SMOTENC. When None, falls back to a module-level ``seed``
        variable if one exists (the previous implementation read that global
        implicitly); otherwise the sampler is unseeded.

    Returns
    -------
    (X_res, y_res)
        Either the untouched inputs, or the inputs with the accepted synthetic
        rows stacked underneath.
    """
    # 1) imbalance check
    classes, counts = np.unique(y_train, return_counts=True)
    if counts.size == 0:
        # Empty training set: nothing to measure, nothing to augment.
        return X_train, y_train
    if method == 'None' or counts.min() / counts.sum() >= imbalance_thresh:
        return X_train, y_train

    # 2) Augmentation branch
    if method == 'CVAE':
        X_gen, y_gen = augment_cvae(X_train, y_train)
    elif method == 'GAN':
        X_gen, y_gen = augment_gan(X_train, y_train)
    elif method == 'SMOTENC':
        if random_state is None:
            # Backward compatibility with callers that rely on the global seed.
            random_state = globals().get('seed')
        sampler = SMOTENC(
            categorical_features=disc_idx,
            sampling_strategy='minority',
            random_state=random_state
        )
        X_res_full, y_res_full = sampler.fit_resample(X_train, y_train)
        # Synthetic rows are appended after the originals. Slicing from
        # len(y_train) (rather than [-n_new:]) stays correct when no rows were
        # added: [-0:] would select the *whole* array.
        n_orig = len(y_train)
        X_gen = X_res_full[n_orig:]
        y_gen = y_res_full[n_orig:]
    else:
        return X_train, y_train

    # Normalise generator output so boolean-mask filtering below works even if
    # a generator returned lists.
    X_gen = np.asarray(X_gen)
    y_gen = np.asarray(y_gen)

    # If no new samples are generated, skip augmentation
    if X_gen.shape[0] == 0:
        return X_train, y_train

    # 3) Quality filtering: mean-impute X_train and X_gen (imputer fitted on
    #    the real data only), then measure each generated row's Mahalanobis
    #    distance to the training distribution.
    imp = SimpleImputer(strategy='mean')
    X_train_imp = imp.fit_transform(X_train)
    X_gen_imp   = imp.transform(X_gen)

    cov = EmpiricalCovariance().fit(X_train_imp)
    # EmpiricalCovariance.mahalanobis returns *squared* distances; take the
    # square root so outlier_thresh is compared against an actual distance.
    md = np.sqrt(cov.mahalanobis(X_gen_imp))
    mask = md < outlier_thresh
    X_gen = X_gen[mask]
    y_gen = y_gen[mask]

    # If no samples remain after filtering, skip augmentation
    if X_gen.shape[0] == 0:
        return X_train, y_train

    # 4) Merge original and augmented data
    X_res = np.vstack([X_train, X_gen])
    y_res = np.concatenate([y_train, y_gen])

    return X_res, y_res


In [None]:
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

# Disable TensorFlow / absl logs
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is usually only honoured if set before
# TensorFlow is first imported; if augment_module already imported it in an
# earlier cell this assignment may have no effect -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
logging.getLogger('tensorflow').setLevel(logging.ERROR)
logging.getLogger('absl').setLevel(logging.ERROR)

# Create 'results' folder
os.makedirs('results', exist_ok=True)

# Aggregated results for all seeds (one dict per output CSV row)
all_confusions = []
all_classreports = []

# List of seeds
seeds = [
    1674292617, 1362507843, 1523173961, 3675946018, 1730465472,
    2092905740, 3127331292, 233407511, 476689606, 638637055
]

# Epoch configuration
epoch_range = [1, 2, 3, 5, 10, 15, 20, 25, 30, 50, 100]
methods = ['None', 'SMOTENC', 'CVAE', 'GAN']

# Maps a label string to the matching key in the per-scenario sample_counts dict
count_key = {
    'BENIGN':           'BenignN',
    'DoS slowloris':    'SlowlorisN',
    'DoS Slowhttptest': 'SlowhttptestN'
}

# Use Test2 only
tests = {'Test2': tests['Test2']}


def _build_models(seed):
    """Return an ordered {name: unfitted estimator} dict for one seed.

    For every entry of ``epoch_range`` four estimators are created, named
    ``<family>-<ep>ep-s<seed>``. Only MLP and LogReg consume ``ep`` (as
    ``max_iter``).
    NOTE(review): the RF and XGB estimators do not depend on ``ep``, so they
    are configured identically under each epoch name -- confirm that the
    repeated training is intended.
    """
    built = {}
    for ep in epoch_range:
        built[f'MLP-{ep}ep-s{seed}'] = MLPClassifier(
            hidden_layer_sizes=(32,),
            max_iter=ep,
            tol=1e-3,
            random_state=seed,
            early_stopping=False
        )
        built[f'LogReg-{ep}ep-s{seed}'] = make_pipeline(
            StandardScaler(),
            LogisticRegression(
                max_iter=ep,
                tol=1e-3,
                class_weight='balanced',
                random_state=seed,
                n_jobs=-1
            )
        )
        built[f'RF-{ep}ep-s{seed}'] = RandomForestClassifier(
            n_estimators=100,
            random_state=seed,
            n_jobs=-1
        )
        built[f'XGB-{ep}ep-s{seed}'] = XGBClassifier(
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=seed,
            n_jobs=-1
        )
    return built


for seed in seeds:
    print(f"\n>>> Running with seed = {seed}")

    # Build model dictionary, including current seed in model name
    models = _build_models(seed)

    for test_name, sizes in tests.items():
        print(f"\n=== Scenario: {test_name} ===")

        # Benign traffic is capped at 50,000 rows; sampling switches to
        # with-replacement only when more rows are requested than exist.
        keep_benign = min(sizes['benign'], 50000)
        b  = df_benign.sample(n=keep_benign, random_state=seed, replace=(keep_benign > len(df_benign)))
        s1 = df_slowloris.sample(n=sizes['slowloris'], random_state=seed, replace=(sizes['slowloris'] > len(df_slowloris)))
        s2 = df_slowhttptest.sample(n=sizes['slowhttptest'], random_state=seed, replace=(sizes['slowhttptest'] > len(df_slowhttptest)))

        sample_counts = {
            'Scenario':       test_name,
            'BenignN':        len(b),
            'SlowlorisN':     len(s1),
            'SlowhttptestN':  len(s2)
        }
        print("Sample counts →", sample_counts)

        # Shuffle the combined rows, then separate features from labels
        data = pd.concat([b, s1, s2]).sample(frac=1, random_state=seed).reset_index(drop=True)
        X, y = data.drop('Label', axis=1).values, data['Label'].values

        # Stratified 70/30 split; augmentation is applied to the training part only
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y
        )

        le = LabelEncoder().fit(y_train)
        y_train_enc, y_test_enc = le.transform(y_train), le.transform(y_test)

        for method in methods:
            X_res, y_res = apply_augmentation(X_train, y_train_enc, method, cont_idx, disc_idx)
            # Imputer is fitted on the (possibly augmented) training data and
            # then applied to the untouched test split.
            imputer = SimpleImputer(strategy='mean')
            X_res = imputer.fit_transform(X_res)
            X_test_imp = imputer.transform(X_test)

            for model_name, clf in models.items():
                # fit() retrains from scratch, so reusing the estimator
                # object across augmentation methods is safe.
                clf.fit(X_res, y_res)
                y_pred_enc = clf.predict(X_test_imp)
                y_pred = le.inverse_transform(y_pred_enc)

                # One output row per actual class, one Pred_<class> column per
                # predicted class.
                cm = confusion_matrix(y_test, y_pred, labels=le.classes_)
                for actual, cm_row in zip(le.classes_, cm):
                    row = {
                        'Seed': seed,
                        'Scenario': test_name,
                        'Augmentation': method,
                        'Model': model_name,
                        'ActualClass': actual
                    }
                    for pred, n_cases in zip(le.classes_, cm_row):
                        row[f'Pred_{pred}'] = n_cases
                    all_confusions.append(row)

                # labels= pins the reported classes to the encoder's classes;
                # zero_division=0 yields the same 0.0 score as the default but
                # without an UndefinedMetricWarning for every under-trained
                # model that never predicts some class.
                cr_dict = classification_report(
                    y_test, y_pred,
                    labels=le.classes_,
                    output_dict=True,
                    zero_division=0
                )
                cr_df = pd.DataFrame(cr_dict).transpose()
                for cls in le.classes_:
                    metrics = cr_df.loc[cls]
                    row = {
                        'Seed': seed,
                        'Scenario': test_name,
                        'Augmentation': method,
                        'Model': model_name,
                        'Class': cls,
                        'Precision': metrics['precision'],
                        'Recall': metrics['recall'],
                        'F1-Score': metrics['f1-score'],
                        'Support': metrics['support'],
                        'TotalSamples': sample_counts[count_key[cls]]
                    }
                    all_classreports.append(row)

# Save results to CSV files
pd.DataFrame(all_confusions).to_csv('results/confusion_matrices_all_seeds.csv', index=False)
pd.DataFrame(all_classreports).to_csv('results/classification_reports_all_seeds.csv', index=False)

print("\n All seed results saved to:")
print(" - results/confusion_matrices_all_seeds.csv")
print(" - results/classification_reports_all_seeds.csv")

In [None]:
import pandas as pd
import re

# keep_default_na=False keeps the literal string "None" (the no-augmentation
# method name) from being parsed as NaN.
report_df = pd.read_csv('results/classification_reports_all_seeds.csv', keep_default_na=False)

# Keep only three classes
target_classes = ['BENIGN', 'DoS slowloris', 'DoS Slowhttptest']
# .copy() detaches the filtered frame from the original so that adding the
# 'ModelConfig' column afterwards does not raise SettingWithCopyWarning.
report_df = report_df[report_df['Class'].isin(target_classes)].copy()

# Replace the per-seed suffix of a model name with a seed-independent tag
def strip_seed(model_name):
    """Return *model_name* with a trailing ``-s<digits>`` replaced by ``-avg``.

    Names that do not end in such a suffix are returned unchanged.
    """
    seed_suffix = re.compile(r'-s\d+$')
    return seed_suffix.sub('-avg', model_name)

# Tag every row with its seed-independent model configuration name
report_df['ModelConfig'] = [strip_seed(model) for model in report_df['Model']]

# Average the per-class metrics over seeds:
# one row per Scenario + Augmentation + Model + Class
group_keys = ['Scenario', 'Augmentation', 'ModelConfig', 'Class']
metric_cols = ['Precision', 'Recall', 'F1-Score', 'Support', 'TotalSamples']
per_class_mean = report_df.groupby(group_keys)[metric_cols].mean()

# Flatten the group index and expose the config name under the usual
# 'Model' column header
avg_report = per_class_mean.reset_index()
avg_report = avg_report.rename(columns={'ModelConfig': 'Model'})

print("Per-Class Average Report Across Seeds (with 'None'):")
display(avg_report)

# Macro-F1: unweighted mean of the per-class F1 scores for each
# Model + Augmentation pair.
# NOTE(review): 'Scenario' is not part of this grouping, so rows from several
# scenarios would be averaged together if more than one were present.
f1_by_config = avg_report.groupby(['Model', 'Augmentation'])[['F1-Score']].mean()
macro_f1 = f1_by_config.rename(columns={'F1-Score': 'Macro-F1'}).reset_index()

print("Macro-Averaged F1-Score Across Classes:")
display(macro_f1)

# Persist both summary tables
avg_report.to_csv("results/classification_report_perclass_avg.csv", index=False)
macro_f1.to_csv("results/classification_macro_f1_avg.csv", index=False)