# Fruty — Experiment Notebook

Goal: reproduce and iterate on the IDS (intrusion detection system) experiments on NSL-KDD end-to-end, and push detection performance as high as possible.

Notes: we distinguish two goals — (a) binary detection (normal vs. attack), where ~99% accuracy is realistic, and (b) multiclass attack classification (many imbalanced classes), which is harder. This notebook contains diagnostics, a fast filter followed by ABA (Artificial Butterfly Algorithm) feature selection, weighted LightGBM and CatBoost baselines, and stacking/ensemble scaffolding.

FAST_RUN: set to `True` to run quick, smaller experiments (useful for CI / local iteration); set to `False` for full runs.

In [None]:
# Cell 1: Setup
FAST_RUN = True
SEED = 1
import os
import sys
import json
import time
from pathlib import Path


def find_project_root(cwd):
    """Return the project root for a session started in *cwd*.

    If *cwd* is a directory named ``notebooks`` its parent is returned;
    otherwise *cwd* itself is returned.

    Parameters
    ----------
    cwd : str or Path
        The working directory of the session.

    Returns
    -------
    Path
        The directory to treat as the project root.
    """
    cwd = Path(cwd)
    # Path('.').name is always '' — the directory name must be read from a
    # concrete (absolute) path for the 'notebooks' check to ever succeed.
    return cwd.parent if cwd.name == 'notebooks' else cwd


ROOT = find_project_root(Path.cwd().resolve())
# The cells below use root-relative paths ('data/processed', 'results',
# 'models'), so run from the project root. This is a no-op when the notebook
# is already started there.
os.chdir(ROOT)
# adjust sys.path to import project modules
sys.path.insert(0, str(ROOT))
print('Root:', ROOT)
# imports
import numpy as np
import pandas as pd
from pprint import pprint
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's plotting style to 'whitegrid'.
sns.set(style='whitegrid')

In [None]:
# Cell 2: Helper to load processed arrays
from pathlib import Path
def load_processed(base_dir=None):
    """Load the preprocessed train/test arrays from ``data/processed``.

    Parameters
    ----------
    base_dir : str or Path, optional
        Directory that contains the ``data`` folder. Defaults to the current
        working directory, which matches the previous behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``, read from the ``'X'`` and
        ``'y'`` entries of ``train_processed.npz`` and ``test_processed.npz``.

    Raises
    ------
    FileNotFoundError
        If either archive does not exist.
    """
    base = Path('.') if base_dir is None else Path(base_dir)
    processed = base / 'data' / 'processed'
    tr = processed / 'train_processed.npz'
    te = processed / 'test_processed.npz'
    if not tr.exists() or not te.exists():
        raise FileNotFoundError('Run src/preprocessing.py to create data/processed/*.npz')
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # unpickling object arrays — only load archives produced by this project.
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; read the arrays inside `with` so the handle is closed afterwards.
    with np.load(tr, allow_pickle=True) as dt:
        X_train, y_train = dt['X'], dt['y']
    with np.load(te, allow_pickle=True) as de:
        X_test, y_test = de['X'], de['y']
    return X_train, y_train, X_test, y_test
# Smoke test: load the arrays once so later cells can use Xtr/ytr/Xte/yte,
# and report the shapes of the two feature matrices.
Xtr, ytr, Xte, yte = load_processed()
print('Loaded processed arrays: ', *(arr.shape for arr in (Xtr, Xte)))

In [None]:
# Cell 3: Diagnostics (per-feature variance, NaNs, label counts)
def diagnostics(X, y, top_n=10, out_dir='results'):
    """Print per-feature diagnostics and the label distribution, and save them.

    For every column of ``X`` the NaN-ignoring variance, the fraction of
    entries equal to zero and the fraction of NaN entries are computed. The
    table is sorted by variance (descending), its first ``top_n`` rows are
    shown, and the full table is written to ``<out_dir>/feature_diagnostics.csv``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D numeric feature matrix of shape ``(n_samples, n_features)``.
    y : array-like
        Labels; only used to print their frequency counts.
    top_n : int, optional
        Number of highest-variance rows to show.
    out_dir : str or Path, optional
        Directory for the CSV file; created if missing.

    Returns
    -------
    None
    """
    from collections import Counter

    # `display` is only defined under IPython/Jupyter; use print elsewhere so
    # the function also works in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    n_samples, n_features = X.shape
    var = np.nanvar(X, axis=0)
    n_zero = np.sum(X == 0, axis=0)
    n_nan = np.sum(np.isnan(X), axis=0)
    df = pd.DataFrame({
        'feature_idx': np.arange(n_features),
        'variance': var,
        'pct_zero': n_zero / n_samples,
        'pct_nan': n_nan / n_samples,
    })
    df = df.sort_values('variance', ascending=False).reset_index(drop=True)
    print('n_features:', n_features)
    print('constant features (var==0):', int((df['variance'] == 0).sum()))
    print('top features by variance:')
    show(df.head(top_n))
    # The newline must be an escape sequence: a raw line break inside a
    # single-quoted literal is a SyntaxError.
    print('\nLabel distribution (train):')
    print(Counter(np.asarray(y).tolist()))
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    df.to_csv(out / 'feature_diagnostics.csv', index=False)
    print('Saved feature_diagnostics.csv')

# Run the diagnostics on the training arrays, showing the 10 highest-variance features.
diagnostics(Xtr, ytr, top_n=10)

In [None]:
# Cell 4: Pre-filter using f_classif (fast) and save top-K indices
from sklearn.feature_selection import f_classif
K = 500
# Number of rows used to compute the ANOVA F-scores (smaller in FAST_RUN).
mi_subsample = 5000 if not FAST_RUN else 2000
print('Using subsample for f_classif:', mi_subsample)
if Xtr.shape[0] > mi_subsample:
    # Stratified subsample so the class proportions of ytr are preserved.
    from sklearn.model_selection import StratifiedShuffleSplit
    sss = StratifiedShuffleSplit(n_splits=1, train_size=mi_subsample, random_state=SEED)
    idx, _ = next(sss.split(Xtr, ytr))
    Xs = Xtr[idx]
    ys = ytr[idx]
else:
    Xs = Xtr
    ys = ytr
f_vals, p_vals = f_classif(Xs, ys)
# f_classif returns NaN for constant features, and np.argsort places NaN
# *last* — i.e. they would be picked as the "best" features. Rank them lowest.
f_rank = np.where(np.isnan(f_vals), -np.inf, f_vals)
# Never ask for more features than exist.
top_k = int(min(K, Xtr.shape[1]))
# Indices of the top_k highest F-scores, best first.
top_idx = np.argsort(f_rank)[-top_k:][::-1]
print('Selected top_k=', top_k)
# Make sure the output directory exists even if earlier cells were skipped.
Path('results').mkdir(parents=True, exist_ok=True)
np.save('results/top_k_indices.npy', top_idx)
print('Saved results/top_k_indices.npy')

In [None]:
# Cell 5: Run ABA on reduced feature set (fast mode).
# This cell will import the small ABA implementation in src/feature_selection/aba.py
from src.feature_selection.aba import ArtificialButterfly
from sklearn.model_selection import StratifiedKFold, cross_val_score
try:
    import lightgbm as lgb
except Exception:
    lgb = None
# Restrict train/test to the pre-filtered columns: column j of the reduced
# matrices is original feature top_idx[j].
top_idx = np.load('results/top_k_indices.npy')
Xtr_red, Xte_red = Xtr[:, top_idx], Xte[:, top_idx]
print('Reduced shapes:', *(arr.shape for arr in (Xtr_red, Xte_red)))
# define fitness: class-weighted LGB with 3-fold CV scoring f1_macro
def make_lgb_fitness(X, y, n_splits=3, seed=SEED):
    """Build the fitness callable used by the feature-selection search.

    Parameters
    ----------
    X, y : array-like
        Accepted for interface compatibility; not used here. The returned
        callable scores whatever ``(X_sub, y_sub)`` it is given.
    n_splits : int, optional
        Number of stratified CV folds.
    seed : int, optional
        Random state for both the fold shuffling and the classifier.

    Returns
    -------
    callable
        ``fitness(X_sub, y_sub) -> float``: mean macro-F1 over the CV folds,
        or ``0.0`` when there are no columns, the evaluation fails, or the
        score is not finite.

    Raises
    ------
    ImportError
        If lightgbm could not be imported (``lgb`` is ``None``).
    """
    if lgb is None:
        raise ImportError('lightgbm could not be imported; it is required for the ABA fitness function')
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    clf = lgb.LGBMClassifier(
        n_estimators=50 if not FAST_RUN else 10,
        learning_rate=0.1,
        # Weight classes inversely to their frequency so that the fitness is
        # actually class-weighted, as the comment above states.
        class_weight='balanced',
        random_state=seed,
        n_jobs=1,
    )

    def fitness(X_sub, y_sub):
        # An empty feature subset cannot be scored.
        if X_sub.shape[1] == 0:
            return 0.0
        try:
            scores = cross_val_score(clf, X_sub, y_sub, cv=skf, scoring='f1_macro', n_jobs=1)
        except Exception as e:
            # Deliberate best-effort: one bad candidate must not abort the search.
            print('Fitness eval failed:', e)
            return 0.0
        value = float(np.mean(scores))
        # cross_val_score reports failed folds as NaN by default; a NaN
        # fitness would poison comparisons, so treat it as the worst score.
        return value if np.isfinite(value) else 0.0

    return fitness

import csv

fitness = make_lgb_fitness(Xtr_red, ytr, n_splits=3)
# Population size and iteration count for the search (smaller in FAST_RUN).
aba_pop = 12 if not FAST_RUN else 6
aba_iter = 20 if not FAST_RUN else 6
print('ABA settings pop,iter =', aba_pop, aba_iter)
aba = ArtificialButterfly(pop_size=aba_pop, n_iter=aba_iter, random_state=SEED)
# run ABA (this may take time)
best_mask_red, best_score = aba.fit(Xtr_red, ytr, fitness)
print('ABA done: best_score=', best_score, 'n_features_selected=', int(best_mask_red.sum()))
# map back to original feature indices and save
sel_indices = top_idx[best_mask_red.astype(bool)]
# Neither output directory is guaranteed to exist at this point.
Path('models').mkdir(parents=True, exist_ok=True)
Path('results').mkdir(parents=True, exist_ok=True)
# NOTE: despite the file name, this stores original column *indices*, not a
# boolean mask.
np.save('models/aba_best_mask.npy', sel_indices)
# save history
with open('results/aba_history.csv', 'w', newline='') as fh:
    w = csv.writer(fh)
    w.writerow(['iteration','best_score'])
    for i,s in enumerate(aba.history_):
        w.writerow([i,s])
print('Saved models/aba_best_mask.npy and results/aba_history.csv')

In [None]:
# Cell 6: Retrain final model using ABA-selected features (or fallback to top_k) and evaluate on test set
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
if lgb is None:
    raise ImportError('lightgbm could not be imported; it is required to train the final model')
# load selected features if present
sel_path = Path('models') / 'aba_best_mask.npy'
if sel_path.exists():
    sel = np.load(sel_path)
    sel_kind = 'ABA-selected'
else:
    print('ABA mask not found: using top_k indices')
    sel = np.load('results/top_k_indices.npy')
    sel_kind = 'top_k'
# Both files hold column indices into the original feature matrix; turn them
# into a boolean mask so train and test are sliced identically.
sel_mask = np.zeros(Xtr.shape[1], dtype=bool)
sel_mask[sel] = True
print('Using ' + sel_kind + ' features count=', sel_mask.sum())
Xtr_sel = Xtr[:, sel_mask]
Xte_sel = Xte[:, sel_mask]
# Train a final weighted LGB on a stratified subsample for speed
# (8000 rows in FAST_RUN, otherwise 30000); use every row when the training
# set is not larger than that.
tr_size = 30000 if not FAST_RUN else 8000
if tr_size < Xtr_sel.shape[0]:
    sss = StratifiedShuffleSplit(n_splits=1, train_size=tr_size, random_state=SEED)
    idx_train, _ = next(sss.split(Xtr_sel, ytr))
else:
    # StratifiedShuffleSplit rejects train_size == n_samples, so skip it.
    idx_train = np.arange(Xtr_sel.shape[0])
Xtrain_sub = Xtr_sel[idx_train]
ytrain_sub = ytr[idx_train]
# Sample weights inversely proportional to class frequency:
# weight(c) = n_samples / (n_classes * count(c)). Computed with np.unique so
# it works for any label dtype (no int() cast of the labels).
classes, inverse, counts = np.unique(ytrain_sub, return_inverse=True, return_counts=True)
total = len(ytrain_sub)
per_class_weight = total / (len(classes) * counts)
class_weight = dict(zip(classes.tolist(), per_class_weight.tolist()))
sample_weight = per_class_weight[np.ravel(inverse)]
clf = lgb.LGBMClassifier(n_estimators=200 if not FAST_RUN else 50, learning_rate=0.05, random_state=SEED, n_jobs=1)
clf.fit(Xtrain_sub, ytrain_sub, sample_weight=sample_weight)
y_pred = clf.predict(Xte_sel)
acc = accuracy_score(yte, y_pred)
f1 = f1_score(yte, y_pred, average='macro')
print('Final model -> acc:', acc, 'f1_macro:', f1)
# save final model and results (create the output directories if needed)
Path('models').mkdir(parents=True, exist_ok=True)
Path('results').mkdir(parents=True, exist_ok=True)
joblib.dump(clf, 'models/aba_lgb_final.joblib')
pd.DataFrame([{'setup':'aba_lgb_final','accuracy':acc,'f1_macro':f1,'n_features':int(sel_mask.sum())}]).to_csv('results/aba_results.csv', index=False)
print('Saved models/aba_lgb_final.joblib and results/aba_results.csv')

In [None]:
# Cell 7: Binary detection validation (normal vs attack)
def to_binary(y, normal_label='normal'):
    """Map labels to a binary target: 0 for normal traffic, 1 for anything else.

    Labels are compared by their text form, case-insensitively and ignoring
    surrounding whitespace. Byte-string labels are decoded first, so
    ``b'normal'`` is recognised as normal.

    Parameters
    ----------
    y : iterable
        Labels (strings, bytes, or anything with a meaningful ``str()``).
    normal_label : optional
        The label that denotes normal traffic; compared via ``str()``.
        Pass e.g. the integer code of the normal class for encoded labels.

    Returns
    -------
    numpy.ndarray
        Integer array with one 0/1 entry per label (empty for empty input).
    """
    import warnings

    target = str(normal_label).strip().lower()

    def _as_text(value):
        # str(b'normal') would be "b'normal'", which never matches.
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='replace')
        return str(value).strip().lower()

    # dtype=str keeps the comparison element-wise even for empty input.
    ys = np.array([_as_text(v) for v in y], dtype=str)
    binary = (ys != target).astype(int)
    if binary.size and binary.all():
        # Usually means the labels are encoded differently than expected.
        warnings.warn(
            'to_binary: no label matched normal_label=%r; every sample is marked as attack'
            % (normal_label,)
        )
    return binary
ytr_bin = to_binary(ytr)
yte_bin = to_binary(yte)
# If every training label maps to the same class the scores below are
# meaningless; this typically means the label encoding does not match what
# to_binary expects.
if np.unique(ytr_bin).size < 2:
    raise ValueError('Binary training labels contain a single class; check that to_binary matches the label encoding of ytr')
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
clf_bin = lgb.LGBMClassifier(n_estimators=100 if not FAST_RUN else 30, random_state=SEED, n_jobs=1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# 5-fold stratified CV on the full training set (all features, no subsampling)
scores = cross_val_score(clf_bin, Xtr, ytr_bin, cv=skf, scoring='f1', n_jobs=1)
print('Binary cv f1 (5-fold) mean/std:', scores.mean(), scores.std())
# Refit on the whole training set and evaluate once on the held-out test set.
clf_bin.fit(Xtr, ytr_bin)
y_pred_bin = clf_bin.predict(Xte)
print('Binary test acc:', accuracy_score(yte_bin, y_pred_bin))
print('Binary test f1:', f1_score(yte_bin, y_pred_bin))
# save binary model (create the output directory if needed)
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(clf_bin, 'models/lgb_binary_full.joblib')
print('Saved models/lgb_binary_full.joblib')

In [None]:
# Cell 8: Optional CatBoost experiment (handles categorical natively)
try:
    from catboost import CatBoostClassifier
    cat_available = True
except Exception:
    # Best-effort: any import-time failure just disables this optional cell.
    cat_available = False

if cat_available:
    # Import what this cell needs itself instead of relying on names leaked
    # from earlier cells.
    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedShuffleSplit
    print('CatBoost available — running a quick experiment (fast mode subsample)')
    # To run CatBoost using original categorical columns we would need the original DataFrame and pipeline.
    # For speed, we run CatBoost on numeric processed features as a comparison.
    cat_size = 8000 if FAST_RUN else 30000
    if cat_size < Xtr.shape[0]:
        sss = StratifiedShuffleSplit(n_splits=1, train_size=cat_size, random_state=SEED)
        idx, _ = next(sss.split(Xtr, ytr))
    else:
        # StratifiedShuffleSplit rejects train_size >= n_samples; use all rows.
        idx = np.arange(Xtr.shape[0])
    Xs = Xtr[idx]
    ys = ytr[idx]
    clf_cat = CatBoostClassifier(iterations=200 if not FAST_RUN else 50, random_seed=SEED, verbose=False)
    clf_cat.fit(Xs, ys)
    # CatBoost may return multiclass predictions as a column vector; flatten
    # so the metric always receives a 1-D label array.
    ycat = np.ravel(clf_cat.predict(Xte))
    print('CatBoost multiclass f1_macro:', f1_score(yte, ycat, average='macro'))
    Path('models').mkdir(parents=True, exist_ok=True)
    joblib.dump(clf_cat, 'models/catboost_quick.joblib')
else:
    print('CatBoost not installed — skip this cell or install catboost in the venv')

In [None]:
# Cell 9: Simple stacking ensemble scaffold (LGB + optional CatBoost + LR meta)
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Base learners: the LightGBM model always, CatBoost only if Cell 8 produced one.
estimators = [('lgb', clf)] + (
    [('cat', clf_cat)] if 'clf_cat' in globals() and cat_available else []
)
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=200),
    n_jobs=1,
)
# Quick 3-fold CV of the stack: first 8000 rows in FAST_RUN, all rows otherwise.
rows = slice(8000) if FAST_RUN else slice(None)
sc = cross_val_score(stack, Xtr[rows], ytr[rows], cv=3, scoring='f1_macro', n_jobs=1)
print('Stacking cv f1_macro (3-fold):', sc.mean(), sc.std())
# Placeholder only: fitting on the full training set is left as a follow-up.
print('If stacking helps, fit on full training set and evaluate on test set (left as follow-up).')

In [None]:
# Cell 10: Save experiment config and short conclusions
# Settings from cells that may not have been run are recorded as None.
cfg = dict(
    date=time.asctime(),
    seed=SEED,
    FAST_RUN=FAST_RUN,
    aba_pop=globals().get('aba_pop'),
    aba_iter=globals().get('aba_iter'),
    top_k=int(top_k) if 'top_k' in globals() else None,
)
Path('results').mkdir(parents=True, exist_ok=True)
with open('results/experiment_config.json', 'w') as fh:
    fh.write(json.dumps(cfg, indent=2))
print('Saved results/experiment_config.json')

# Follow-up work, printed one item per line.
print(
    'Next steps (short):',
    '- Validate binary detector via repeated CV and different splits (aim for stable 99%+).',
    '- Run ABA full-scale on top-1000 features (longer run) and hybridize with PSO/GA.',
    '- Train CatBoost using original categorical pipeline (avoids OHE explosion) and compare.',
    '- Implement adaptive voting based on per-sample confidences for stacking/ensemble.',
    sep='\n',
)