<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/Cyber_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data_inspect.py

import pandas as pd
from pathlib import Path

# 1) Configure where your files live
base_dir = Path('/content')   # or wherever you uploaded the CSVs
files = ['Merged01.csv', 'Merged02.csv', 'Merged03.csv', 'Merged04.csv']

# 2) What names to look for as the label column
candidate_labels = [
    'Label', 'label',
    'attack_label', 'attack_cat',
    'class', 'category'
]

for fname in files:
    path = base_dir / fname
    print(f"\n=== Inspecting {fname} ===")
    if not path.exists():
        print(f"  ✗ File not found at {path}")
        continue

    # Read a small sample to inspect headers & dtypes
    df_sample = pd.read_csv(path, nrows=5)
    print("  • Columns:")
    for col, dt in df_sample.dtypes.items():
        print(f"     - {col} : {dt}")

    # Quick row‐count via shell (optional)
    try:
        import subprocess
        cnt = int(subprocess.check_output(
            ["wc", "-l", str(path)], text=True).split()[0]) - 1
        print(f"  • Approx. rows: {cnt}")
    except Exception:
        pass

    # 3) Locate the label column
    label_col = None
    for cand in candidate_labels:
        if cand in df_sample.columns:
            label_col = cand
            break

    # 4) Fallback: detect any column containing the value "BENIGN"
    if label_col is None:
        for col in df_sample.columns:
            # sample up to 10k values
            vals = pd.read_csv(path, usecols=[col], nrows=10000)[col] \
                     .astype(str).str.upper().unique()
            if 'BENIGN' in vals:
                label_col = col
                print(f"  → Detected '{col}' as label column (contains 'BENIGN').")
                break

    if not label_col:
        print("  ✗ No label column found among candidates or by 'BENIGN' fallback.")
        continue

    print(f"  ✓ Using '{label_col}' as label column.")
    # Show all distinct labels in this column
    full = pd.read_csv(path, usecols=[label_col])
    uniq = full[label_col].astype(str).unique()
    print(f"  • Unique label values ({len(uniq)}): {uniq[:10]}{'...' if len(uniq)>10 else ''}")



=== Inspecting Merged01.csv ===
  • Columns:
     - Header_Length : float64
     - Protocol Type : int64
     - Time_To_Live : float64
     - Rate : float64
     - fin_flag_number : float64
     - syn_flag_number : float64
     - rst_flag_number : float64
     - psh_flag_number : float64
     - ack_flag_number : float64
     - ece_flag_number : float64
     - cwr_flag_number : float64
     - ack_count : int64
     - syn_count : int64
     - fin_count : int64
     - rst_count : int64
     - HTTP : float64
     - HTTPS : float64
     - DNS : float64
     - Telnet : float64
     - SMTP : float64
     - SSH : float64
     - IRC : float64
     - TCP : float64
     - UDP : float64
     - DHCP : float64
     - ARP : float64
     - ICMP : float64
     - IGMP : float64
     - IPv : float64
     - LLC : float64
     - Tot sum : int64
     - Min : int64
     - Max : int64
     - AVG : float64
     - Std : float64
     - Tot size : float64
     - IAT : float64
     - Number : int64
     - Varianc

In [2]:
# preprocess.py

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

def _winsorize_array(arr: np.ndarray) -> np.ndarray:
    lower = np.nanpercentile(arr, 0.5, axis=0)
    upper = np.nanpercentile(arr, 99.5, axis=0)
    return np.clip(arr, lower, upper)

def _replace_inf_with_nan(arr: np.ndarray) -> np.ndarray:
    # any infinite values become NaN
    return np.where(np.isfinite(arr), arr, np.nan)

def build_preprocessor(train_df: pd.DataFrame):
    X_train = train_df.drop(columns=['Label'])
    numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()

    numeric_pipe = Pipeline([
        ('finite', FunctionTransformer(_replace_inf_with_nan, validate=False)),
        ('imputer', SimpleImputer(strategy='median')),
        ('winsor', FunctionTransformer(_winsorize_array, validate=False)),
        ('scaler', RobustScaler())
    ])
    categorical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='UNK')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_pipe, numeric_cols),
        ('cat', categorical_pipe, categorical_cols),
    ], remainder='drop', verbose_feature_names_out=False)

    preprocessor.fit(X_train)
    return preprocessor, numeric_cols, categorical_cols

def transform_dfs(preprocessor, df: pd.DataFrame) -> np.ndarray:
    X = df.drop(columns=['Label'])
    return preprocessor.transform(X)

# Example usage:
if __name__ == '__main__':
    from pathlib import Path
    import pickle

    base = Path('/content')
    train_df = pd.concat([
        pd.read_csv(base/'Merged01.csv'),
        pd.read_csv(base/'Merged02.csv')
    ], ignore_index=True)
    val_df = pd.read_csv(base/'Merged03.csv')
    test_df = pd.read_csv(base/'Merged04.csv')

    preproc, num_cols, cat_cols = build_preprocessor(train_df)
    print(f"Numeric columns ({len(num_cols)}): {num_cols}")
    print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")

    X_train = transform_dfs(preproc, train_df)
    X_val   = transform_dfs(preproc, val_df)
    X_test  = transform_dfs(preproc, test_df)
    print("Shapes:", X_train.shape, X_val.shape, X_test.shape)

    with open('preprocessor.pkl','wb') as f:
        pickle.dump(preproc, f)

Numeric columns (39): ['Header_Length', 'Protocol Type', 'Time_To_Live', 'Rate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IGMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Variance']
Categorical columns (0): []
Shapes: (1460896, 39) (697289, 39) (676620, 39)


In [3]:
# feature_engineering.py

import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

def add_handcrafted_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # Safeguard against division by zero
    eps = 1e-6

    # 1) Flag ratios
    flags = ['fin', 'syn', 'rst', 'psh', 'ack', 'ece', 'cwr']
    for flag in tqdm(flags, desc="Flag ratios"):
        num_col = f"{flag}_flag_number"
        df[f"{flag}_ratio"] = df[num_col] / (df['Number'] + eps)

    # 2) Mean duration per flow in window
    df['mean_duration'] = df['Tot sum'] / (df['Number'] + eps)

    # 3) Mean packet size per flow
    df['mean_pkt_size'] = df['Tot size'] / (df['Number'] + eps)

    # 4) Low-IAT indicator: 1 if IAT below median, else 0
    median_iat = df['IAT'].median()
    df['low_iat_pct'] = (df['IAT'] < median_iat).astype(int)

    return df

if __name__ == "__main__":
    base = Path('/content')  # adjust if your CSVs are elsewhere

    # 1) Load splits
    splits = {
        'train': pd.concat([
            pd.read_csv(base/'Merged01.csv'),
            pd.read_csv(base/'Merged02.csv')
        ], ignore_index=True),
        'val':   pd.read_csv(base/'Merged03.csv'),
        'test':  pd.read_csv(base/'Merged04.csv'),
    }

    # 2) Process each split with tqdm
    for split_name, df in tqdm(splits.items(), desc="Processing splits"):
        print(f"\n→ {split_name.upper()} (before): {df.shape}")
        df_feat = add_handcrafted_features(df)
        print(f"  {split_name.upper()} (after):  {df_feat.shape}")

        # 3) Save augmented CSV (or parquet if preferred)
        out_path = base / f"{split_name}_with_features.csv"
        df_feat.to_csv(out_path, index=False)
        print(f"  Saved → {out_path}")

Processing splits:   0%|          | 0/3 [00:00<?, ?it/s]


→ TRAIN (before): (1460896, 40)


Flag ratios:   0%|          | 0/7 [00:00<?, ?it/s]

  TRAIN (after):  (1460896, 50)
  Saved → /content/train_with_features.csv

→ VAL (before): (697289, 40)


Flag ratios:   0%|          | 0/7 [00:00<?, ?it/s]

  VAL (after):  (697289, 50)
  Saved → /content/val_with_features.csv

→ TEST (before): (676620, 40)


Flag ratios:   0%|          | 0/7 [00:00<?, ?it/s]

  TEST (after):  (676620, 50)
  Saved → /content/test_with_features.csv


In [4]:
# train_models.py

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# XGBoost & sklearn
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----- 0) Dynamic loader with CSV→Feather fallback -----
def load_data(base: Path):
    def _load_split(name: str) -> pd.DataFrame:
        feather = base / f"{name}_with_features.feather"
        csvf    = base / f"{name}_with_features.csv"
        if feather.exists():
            print(f"Loading {feather} …")
            return pd.read_feather(feather)
        elif csvf.exists():
            print(f"Feather not found; reading {csvf} in chunks…")
            chunks = []
            for c in pd.read_csv(csvf, chunksize=200_000):
                chunks.append(c)
            df = pd.concat(chunks, ignore_index=True)
            print(f"Saving Feather → {feather}")
            df.to_feather(feather)
            return df
        else:
            raise FileNotFoundError(f"Neither {feather} nor {csvf} found.")
    return (_load_split('train'),
            _load_split('val'),
            _load_split('test'))

# ----- Helper to sanitize arrays -----
def sanitize_arrays(X_train: np.ndarray, X_other: np.ndarray):
    X_train = np.where(np.isfinite(X_train), X_train, np.nan)
    X_other = np.where(np.isfinite(X_other), X_other, np.nan)
    med = np.nanmedian(X_train, axis=0)
    inds1 = np.where(np.isnan(X_train))
    X_train[inds1] = np.take(med, inds1[1])
    inds2 = np.where(np.isnan(X_other))
    X_other[inds2] = np.take(med, inds2[1])
    return X_train, X_other

# ----- 1) Split features & labels -----
def split_xy(df: pd.DataFrame):
    X = df.drop(columns=['Label']).values
    y = (df['Label'].astype(str) != 'BENIGN').astype(int).values
    return X, y

# ----- 2) Baseline Tree -----
def train_tree_baseline(X_train, y_train, X_val, y_val):
    print("\n=== Training XGBoost baseline ===")
    X_train, X_val = sanitize_arrays(X_train, X_val)
    pos, neg = y_train.sum(), len(y_train) - y_train.sum()
    clf = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.075,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=neg/pos,
        use_label_encoder=False,
        eval_metric='logloss',
        verbosity=1
    )
    clf.fit(X_train, y_train)
    joblib.dump(clf, 'xgb_baseline.pkl')
    preds = clf.predict_proba(X_val)[:,1]
    print(f"Baseline Val AUROC: {roc_auc_score(y_val,preds):.4f}, "
          f"AP: {average_precision_score(y_val,preds):.4f}, "
          f"F1: {f1_score(y_val,preds.round()):.4f}")
    return clf

# ----- 3) Encoder & Windowed Dataset -----
class WindowedDataset(Dataset):
    def __init__(self, X, y, L, S):
        windows, labels = [], []
        N = len(X)
        for start in tqdm(range(0, N-L+1, S), desc="Creating windows"):
            win = X[start:start+L]
            lbl = int(y[start:start+L].any())
            windows.append(win)
            labels.append(lbl)
        self.windows = np.stack(windows)
        self.labels  = np.array(labels, int)

    def __len__(self): return len(self.labels)
    def __getitem__(self, idx): return self.windows[idx], self.labels[idx]

class CNNBiGRUEncoder(nn.Module):
    def __init__(self, in_feats, hidden=64, emb_dim=64):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_feats, 64, kernel_size=5, padding=2), nn.ReLU(),
            nn.Conv1d(64, 64, kernel_size=3, padding=1),      nn.ReLU()
        )
        self.gru  = nn.GRU(64, hidden, batch_first=True, bidirectional=True)
        self.proj = nn.Linear(hidden*2, emb_dim)
        self.cls  = nn.Linear(emb_dim, 1)

    def forward(self, x):
        x = x.transpose(1,2)          # (batch, F, L)
        x = self.cnn(x)               # (batch, 64, L)
        x = x.transpose(1,2)          # (batch, L, 64)
        out, _ = self.gru(x)          # (batch, L, hidden*2)
        emb = out.mean(dim=1)         # (batch, hidden*2)
        emb = self.proj(emb)          # (batch, emb_dim)
        logit = self.cls(emb).squeeze(-1)  # (batch,)
        return emb, logit

def train_encoder(X_tr, y_tr, X_va, y_va, L=64, S=32):
    print("\n=== Training CNN+BiGRU encoder ===")
    ds_tr = WindowedDataset(X_tr, y_tr, L, S)
    ds_va = WindowedDataset(X_va, y_va, L, S)
    dl_tr = DataLoader(ds_tr, batch_size=256, shuffle=True)
    dl_va = DataLoader(ds_va, batch_size=256)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CNNBiGRUEncoder(in_feats=X_tr.shape[1]).to(device)
    opt   = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    pos, neg = ds_tr.labels.sum(), len(ds_tr) - ds_tr.labels.sum()
    crit = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([neg/pos], device=device))

    best_auc = 0.0
    for epoch in range(1,11):
        model.train()
        for xb, yb in tqdm(dl_tr, desc=f"Epoch {epoch} [train]"):
            xb, yb = xb.to(device).float(), yb.to(device).float()
            emb, logit = model(xb)
            loss = crit(logit, yb)
            opt.zero_grad(); loss.backward(); opt.step()

        model.eval()
        all_p, all_l = [], []
        with torch.no_grad():
            for xb, yb in tqdm(dl_va, desc=f"Epoch {epoch} [val]"):
                xb = xb.to(device).float()
                _, logit = model(xb)
                p = torch.sigmoid(logit).cpu().numpy()
                all_p.append(p); all_l.append(yb.numpy())
        p = np.concatenate(all_p); l = np.concatenate(all_l)
        auc = roc_auc_score(l, p)
        print(f"  Val AUROC: {auc:.4f}")
        if auc > best_auc:
            best_auc = auc
            torch.save(model.state_dict(), 'best_encoder.pth')

    print(f"→ Best encoder AUROC {best_auc:.4f}")
    return model, ds_tr, ds_va

# ----- 4) Hybrid Head -----
def extract_embeddings(model, ds, batch_size=512):
    loader = DataLoader(ds, batch_size=batch_size)
    device = next(model.parameters()).device
    embs, labs = [], []
    model.eval()
    with torch.no_grad():
        for xb, yb in tqdm(loader, desc="Extracting embeddings"):
            xb = xb.to(device).float()
            emb, _ = model(xb)
            embs.append(emb.cpu().numpy()); labs.append(yb.numpy())
    return np.vstack(embs), np.concatenate(labs)

def train_hybrid(model, ds_tr, ds_va):
    print("\n=== Training XGBoost head on embeddings ===")
    Xtr_emb, ytr = extract_embeddings(model, ds_tr)
    Xva_emb, yva = extract_embeddings(model, ds_va)
    Xtr_emb, Xva_emb = sanitize_arrays(Xtr_emb, Xva_emb)
    pos, neg = ytr.sum(), len(ytr) - ytr.sum()

    clf = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.075,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=neg/pos,
        use_label_encoder=False,
        eval_metric='logloss',
        verbosity=1
    )
    clf.fit(Xtr_emb, ytr)
    joblib.dump(clf, 'xgb_hybrid_head.pkl')
    preds = clf.predict_proba(Xva_emb)[:,1]
    print(f"Hybrid Val AUROC: {roc_auc_score(yva,preds):.4f}, "
          f"AP: {average_precision_score(yva,preds):.4f}, "
          f"F1: {f1_score(yva,preds.round()):.4f}")
    return clf

# ----- 5) Main with stratified fallback -----
if __name__ == '__main__':
    base = Path('/content')
    df_tr, df_va, df_te = load_data(base)

    # Split into arrays
    X_tr, y_tr = split_xy(df_tr)
    X_va, y_va = split_xy(df_va)
    X_te, y_te = split_xy(df_te)

    # If validation contains only one binary class, do stratified hold-out
    if len(np.unique(y_va)) < 2:
        print("⚠️ Val has only one class. Carving 10% stratified from train for val.")
        X_tr, X_va, y_tr, y_va = train_test_split(
            X_tr, y_tr, test_size=0.10, stratify=y_tr, random_state=42
        )

    # Sanitize before any modeling
    X_tr, X_va = sanitize_arrays(X_tr, X_va)
    X_tr, X_te = sanitize_arrays(X_tr, X_te)

    # 1) Baseline
    train_tree_baseline(X_tr, y_tr, X_va, y_va)

    # 2) Encoder
    encoder, ds_tr, ds_va = train_encoder(X_tr, y_tr, X_va, y_va)

    # 3) Hybrid
    train_hybrid(encoder, ds_tr, ds_va)

    print("\n✅ Done. Artifacts:")
    print(" - xgb_baseline.pkl")
    print(" - best_encoder.pth")
    print(" - xgb_hybrid_head.pkl")

Feather not found; reading /content/train_with_features.csv in chunks…
Saving Feather → /content/train_with_features.feather
Feather not found; reading /content/val_with_features.csv in chunks…
Saving Feather → /content/val_with_features.feather
Feather not found; reading /content/test_with_features.csv in chunks…
Saving Feather → /content/test_with_features.feather

=== Training XGBoost baseline ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Baseline Val AUROC: 0.9981, AP: 1.0000, F1: 0.9927

=== Training CNN+BiGRU encoder ===


Creating windows:   0%|          | 0/45652 [00:00<?, ?it/s]

Creating windows:   0%|          | 0/21789 [00:00<?, ?it/s]

Epoch 1 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 1 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 2 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 2 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 3 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 3 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 4 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 4 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 5 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 5 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 6 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 6 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 7 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 7 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 8 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 8 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 9 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 9 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan




Epoch 10 [train]:   0%|          | 0/179 [00:00<?, ?it/s]

Epoch 10 [val]:   0%|          | 0/86 [00:00<?, ?it/s]

  Val AUROC: nan
→ Best encoder AUROC 0.0000

=== Training XGBoost head on embeddings ===




Extracting embeddings:   0%|          | 0/90 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/43 [00:00<?, ?it/s]

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]