In [1]:
from __future__ import annotations

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from src.config import DEFAULT_CONFIG_PATH, load_config
from src.models.pso_lightgbm import PSOLightGBMTuner
from src.utils.logger import setup_logger
from src.utils.paths import data_path

In [None]:
# Location of the raw CSV and the name of its original multi-class label column.
DATASET_PATH = data_path("raw", "Dataset(Over Sampled).csv")
RAW_LABEL_COL = "label"
# Raw class id that is kept as "normal"; every other id is collapsed to "attack".
NORMAL_CLASS_ID = 9

# Load the project configuration and request GPU training on platform 0 /
# device 0. How these flags are consumed downstream is not visible in this
# notebook.
config = load_config(DEFAULT_CONFIG_PATH)
config.training.use_gpu = True
config.training.gpu_platform_id = 0
config.training.gpu_device_id = 0
logger = setup_logger(log_dir=config.paths.logs_dir)

df = pd.read_csv(DATASET_PATH)
# Force integer class ids so the comparison below is an exact integer match.
df[RAW_LABEL_COL] = df[RAW_LABEL_COL].astype(int)
label_col = config.data.label_column

# Collapse multi-class labels so NORMAL_CLASS_ID -> normal and everything else
# -> attack. The mask is computed before the target column is written, and the
# assignment is vectorised instead of calling a Python lambda once per row.
is_normal = df[RAW_LABEL_COL] == NORMAL_CLASS_ID
df[label_col] = "attack"
df.loc[is_normal, label_col] = "normal"

print(f"Loaded {len(df):,} rows from {DATASET_PATH}")
df[label_col].value_counts()

Loaded 314,055 rows from C:\Users\z-pc\Desktop\lightnet-botnet-detector\data\raw\Dataset(Over Sampled).csv


attack_label
attack    304834
normal      9221
Name: count, dtype: int64

In [3]:
# Build a class-balanced subset: every label value is randomly undersampled to
# the size of the rarest class, then the combined rows are shuffled.
random_state = config.training.random_state
class_counts = df[label_col].value_counts()
min_class_size = class_counts.min()

# One draw of `min_class_size` rows per label value. The same seed is reused
# for each per-class draw and again for the final shuffle.
balanced_parts = []
for class_name, class_rows in df.groupby(label_col):
    balanced_parts.append(
        class_rows.sample(n=min_class_size, random_state=random_state)
    )

stacked_df = pd.concat(balanced_parts)
shuffled_df = stacked_df.sample(frac=1.0, random_state=random_state)
# Drop the original row labels so the subset has a clean 0..n-1 index.
balanced_df = shuffled_df.reset_index(drop=True)

print(f"Balanced subset size: {len(balanced_df):,}")
balanced_df[label_col].value_counts()

Balanced subset size: 18,442


attack_label
attack    9221
normal    9221
Name: count, dtype: int64

In [4]:
# Every column except the binary target and the raw multi-class label is used
# as a model input; column order follows the DataFrame.
label_like_cols = {label_col, RAW_LABEL_COL}
feature_cols = [col for col in balanced_df.columns if col not in label_like_cols]
X = balanced_df[feature_cols]
y = balanced_df[label_col]

# NOTE(review): the source file is named "Dataset(Over Sampled).csv". If it
# contains duplicated or synthetic rows, a random split can place near-identical
# rows in both train and test and inflate the test score -- confirm how the
# file was produced.

# Step 1: hold out the test set, stratified on the binary label.
holdout_split = train_test_split(
    X,
    y,
    test_size=config.training.test_size,
    stratify=y,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = holdout_split

# Step 2: split the remaining rows into a training subset and a validation
# set, again stratified and with the same seed.
validation_split = train_test_split(
    X_train,
    y_train,
    test_size=config.training.val_size,
    stratify=y_train,
    random_state=random_state,
)
X_train_sub, X_val, y_train_sub, y_val = validation_split

print(
    f"Splits -> train: {len(X_train_sub):,}, val: {len(X_val):,}, test: {len(X_test):,}"
)

Splits -> train: 13,277, val: 1,476, test: 3,689


In [5]:
# Run the hyperparameter search on the training subset. The class name suggests
# particle-swarm optimisation; the search itself lives in src.models.pso_lightgbm.
tuner = PSOLightGBMTuner(config)
pso_params = tuner.fit(X_train_sub, y_train_sub)

# Hand-picked values that replace the tuned ones for these four keys.
# NOTE(review): these are applied after the search, so any score the tuner
# logged during `fit` was measured without them. The remaining tuned values
# (e.g. max_depth, min_data_in_leaf) were not searched jointly with these
# overrides -- confirm this combination is intended.
mlp_inspired_overrides = {
    # Map hidden units -> tree leaves, reuse learning rate + regularization
    "num_leaves": 200,
    "learning_rate": 0.004174,
    "reg_alpha": 1e-4,
    "reg_lambda": 1e-4,
}

# Merge into a new dict rather than calling `.update()` on the object returned
# by the tuner: that object may be shared with the tuner's internal state, and
# keeping `pso_params` untouched preserves the pure search result for later
# inspection. Key order and final values match an in-place update.
best_params = {**pso_params, **mlp_inspired_overrides}
tuner.best_params_ = best_params

# Final fit on train + validation rows combined; the test split stays unseen.
model = tuner.train_best_model(
    pd.concat([X_train_sub, X_val]),
    pd.concat([y_train_sub, y_val]),
)

print("Best hyperparameters with MLP overrides")
best_params

2025-11-19 12:59:44 | INFO | psolgbm | Iteration 1/25 | best f1_macro = 0.9999
2025-11-19 12:59:57 | INFO | psolgbm | Iteration 2/25 | best f1_macro = 0.9999
2025-11-19 12:59:57 | INFO | psolgbm | Iteration 2/25 | best f1_macro = 0.9999
2025-11-19 13:00:12 | INFO | psolgbm | Iteration 3/25 | best f1_macro = 0.9999
2025-11-19 13:00:12 | INFO | psolgbm | Iteration 3/25 | best f1_macro = 0.9999
2025-11-19 13:00:25 | INFO | psolgbm | Iteration 4/25 | best f1_macro = 0.9999
2025-11-19 13:00:25 | INFO | psolgbm | Iteration 4/25 | best f1_macro = 0.9999
2025-11-19 13:00:38 | INFO | psolgbm | Iteration 5/25 | best f1_macro = 0.9999
2025-11-19 13:00:38 | INFO | psolgbm | Iteration 5/25 | best f1_macro = 0.9999
2025-11-19 13:00:50 | INFO | psolgbm | Iteration 6/25 | best f1_macro = 0.9999
2025-11-19 13:00:50 | INFO | psolgbm | Iteration 6/25 | best f1_macro = 0.9999
2025-11-19 13:01:02 | INFO | psolgbm | Iteration 7/25 | best f1_macro = 0.9999
2025-11-19 13:01:02 | INFO | psolgbm | Iteration 7/2

[LightGBM] [Info] Number of positive: 7377, number of negative: 7376
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2230
[LightGBM] [Info] Number of data points in the train set: 14753, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best hyperparameters with MLP overrides
Best hyperparameters with MLP overrides


{'num_leaves': 200,
 'max_depth': 10,
 'learning_rate': 0.004174,
 'feature_fraction': 0.8349069973412552,
 'bagging_fraction': 0.7185759594361654,
 'min_data_in_leaf': 429,
 'objective': 'binary',
 'n_estimators': 800,
 'subsample': 0.7185759594361654,
 'colsample_bytree': 0.8349069973412552,
 'random_state': 42,
 'n_jobs': -1,
 'class_weight': 'balanced',
 'reg_alpha': 0.0001,
 'reg_lambda': 0.0001}

In [6]:
# Score the final model on the held-out test split.
test_preds = model.predict(X_test)

# `output_dict=True` returns the per-class and averaged metrics as nested
# dicts, so individual numbers can be read out and the whole report tabulated.
report = classification_report(
    y_test,
    test_preds,
    output_dict=True,
)

# NOTE(review): this test split comes from the class-balanced subset, so the
# figures describe balanced data, not the original class mix of the raw file.
print(f"Test macro F1: {report['macro avg']['f1-score']:.4f}")
pd.DataFrame(report).transpose()

Test macro F1: 0.9984


Unnamed: 0,precision,recall,f1-score,support
attack,0.996759,1.0,0.998377,1845.0
normal,1.0,0.996746,0.99837,1844.0
accuracy,0.998374,0.998374,0.998374,0.998374
macro avg,0.998379,0.998373,0.998374,3689.0
weighted avg,0.998379,0.998374,0.998374,3689.0
