Individual AI/ML/DL Models Implementation
Below is a complete Python script covering preprocessing, feature engineering, model training, testing, and saving outputs. Run on a standard server (CPU OK) — GPU speeds up the autoencoder training.

Save as train_models.py. It uses: pandas, numpy, scikit-learn, imblearn, xgboost, tensorflow (keras), matplotlib, seaborn.

In [18]:
# train_models_fixed.py
"""
Fixed, runnable version of your training script (XGBoost + Autoencoder + Stacked meta-classifier).
Put dataset CSV(s) in ./data/ and set DATASET to "UNSW" or "CICIDS".

Dependencies:
pip install pandas numpy scikit-learn imbalanced-learn xgboost tensorflow joblib matplotlib seaborn
"""

import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, auc)
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.linear_model import LogisticRegression

# ---------------- Config ----------------
DATA_PATH = "data"   # folder with datasets
DATASET = "UNSW"     # "UNSW" or "CICIDS"
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Define column names for UNSW-NB15_1.csv, assuming 49 columns based on common dataset structure
# This list includes 47 generic feature names plus 'attack_cat' and 'label' as the last two.
UNSW_NB15_COL_NAMES = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
    'spkts', 'dpkts', 'swin', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',
    'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_ftp_cmd',
    'ct_src_ltm', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
    'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm_2',
    'ct_srv_dst_2', 'is_sm_ips_ports', 'conn_id', 'ts', 'uid', 'id.orig_h',
    'id.orig_p', 'id.resp_h', 'id.resp_p', 'attack_cat', 'label' # Last two are labels
]

# ---------------- Loading helpers ----------------
def load_unsw(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"UNSW CSV not found at {path}")
    # Load without header and assign column names explicitly for UNSW-NB15_1.csv
    df = pd.read_csv(path, header=None, names=UNSW_NB15_COL_NAMES, low_memory=False)
    return df

def load_cicids(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"CICIDS CSV not found at {path}")
    df = pd.read_csv(path, low_memory=False)
    return df

# ---------------- Preprocessing helpers ----------------
def basic_preprocess(df, label_col="label"):
    """
    - Consolidate label to binary: normal/benign -> 0, others -> 1
    - Keep numeric columns + label
    - Drop numeric columns that contain NaNs
    """
    df = df.copy()
    # Attempt to normalize label column name if it's not present
    if label_col not in df.columns:
        possible = [c for c in df.columns if c.lower() in ("label", "attack_cat", "class", "result")]
        if possible:
            label_col = possible[0]
        else:
            raise ValueError("Label column not found. Provide a label_col present in the dataframe.")

    # Normalize label values to binary
    def map_label(x):
        s = str(x).strip().lower()
        if s in ("normal", "benign", "normal traffic", "normal_traffic", "benign_traffic", "benignpacket", "0", "none"):
            return 0
        # sometimes CSVs have 'BENIGN' uppercase etc
        if "normal" in s or "benign" in s:
            return 0
        return 1

    df[label_col] = df[label_col].apply(map_label).astype(int)

    # Keep numeric features only (float/int)
    numeric = df.select_dtypes(include=[np.number]).copy()

    # Ensure label is present in numeric (if label was non-numeric earlier, add it)
    if label_col not in numeric.columns:
        numeric[label_col] = df[label_col].values

    # Drop columns with any NaN in numeric (safer for modeling; you can change policy if you prefer)
    cols_before = numeric.shape[1]
    numeric = numeric.dropna(axis=1)
    cols_after = numeric.shape[1]
    dropped = cols_before - cols_after
    if dropped > 0:
        print(f"[preprocess] Dropped {dropped} numeric columns due to NaNs.")

    return numeric, label_col

def split_xy(df, label_col="label"):
    X = df.drop(columns=[label_col])
    y = df[label_col].astype(int)
    return X, y

# ---------------- Load dataset ----------------
# ========== Load dataset ==========
if DATASET == "UNSW":
    csv_path = os.path.join(DATA_PATH, "UNSW-NB15_1.csv")  # <-- correct filename
    df = load_unsw(csv_path)

    # set label column
    label_col = "label"     # or "attack_cat" if you want multi-class


elif DATASET == "CICIDS":
    csv_path = os.path.join(DATA_PATH, "CICIDS2017.csv")
    df = load_cicids(csv_path)
    label_col = "Label" if "Label" in df.columns else ("label" if "label" in df.columns else None)

else:
    raise ValueError("Set DATASET variable to UNSW or CICIDS")


# ---------------- Preprocess ----------------
df_proc, label_col = basic_preprocess(df, label_col=label_col)
X, y = split_xy(df_proc, label_col=label_col)
print(f"Features: {X.shape}, Label distribution: {y.value_counts().to_dict()}")

# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                    stratify=y, random_state=RANDOM_STATE)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler for later use
joblib.dump(scaler, "scaler.joblib")

# ---------------- Handle imbalance with SMOTE on train ----------------
sm = SMOTE(random_state=RANDOM_STATE)
# SMOTE expects 2D array, 1D labels
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train.values)
print("After SMOTE class counts:", np.bincount(y_train_res.astype(int)))

# ---------------- Model A: XGBoost ----------------
xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE
)
xgb_clf.fit(X_train_res, y_train_res)
joblib.dump(xgb_clf, "xgb_model.joblib")

# Predictions
y_pred_xgb = xgb_clf.predict(X_test_scaled)
y_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# ---------------- Model B: Autoencoder (unsupervised anomaly detection) ----------------
n_features = X_train_scaled.shape[1]
encoding_dim = max(8, n_features // 4)

autoencoder = Sequential([
    Dense(encoding_dim, activation='relu', input_shape=(n_features,)),
    Dense(max(4, encoding_dim // 2), activation='relu'),
    Dense(encoding_dim, activation='relu'),
    Dense(n_features, activation='linear')
])
autoencoder.compile(optimizer='adam', loss='mse')

# Train autoencoder only on normal samples (y_train == 0)
mask_train_normal = (y_train.values == 0)
if mask_train_normal.sum() < 10:
    raise ValueError("Too few normal samples to train the autoencoder. Check dataset and label mapping.")

X_train_norm = X_train_scaled[mask_train_normal]
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = autoencoder.fit(X_train_norm, X_train_norm,
                          epochs=100, batch_size=256, validation_split=0.1,
                          callbacks=[es], verbose=1)

autoencoder.save("autoencoder.h5")

# Calculate reconstruction error as anomaly score for test set
X_test_pred = autoencoder.predict(X_test_scaled)
mse = np.mean(np.square(X_test_scaled - X_test_pred), axis=1)

# choose threshold (e.g., mean + 3*std of train normal errors)
train_norm_pred = autoencoder.predict(X_train_norm)
train_mse = np.mean(np.square(X_train_norm - train_norm_pred), axis=1)
threshold = np.mean(train_mse) + 3 * np.std(train_mse)
print(f"[autoencoder] threshold = {threshold:.6f}")

y_pred_ae = (mse > threshold).astype(int)
ae_scores = mse  # continuous anomaly score

# ---------------- Hybrid model (stacking): Logistic Regression combining XGB probability + AE score ----------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
oof_xgb = np.zeros(len(X_train_scaled), dtype=float)
oof_ae = np.zeros(len(X_train_scaled), dtype=float)

# Generate out-of-fold predictions for stacking training set
for train_idx, val_idx in skf.split(X_train_scaled, y_train):
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]

    # Resample training fold to handle imbalance
    X_tr_res, y_tr_res = sm.fit_resample(X_tr, y_tr)

    # Fit a temporary XGB on the fold
    clf = xgb.XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        n_jobs=-1, use_label_encoder=False, eval_metric='logloss',
        random_state=RANDOM_STATE
    )
    clf.fit(X_tr_res, y_tr_res)

    # XGB OOF probability for val
    oof_xgb[val_idx] = clf.predict_proba(X_val)[:, 1]

    # AE recon error for the validation fold
    X_val_pred = autoencoder.predict(X_val)
    oof_ae[val_idx] = np.mean(np.square(X_val - X_val_pred), axis=1)

# Prepare stacking training set and test set features
stack_X_train = np.vstack([oof_xgb, oof_ae]).T
stack_y_train = y_train.values

X_test_xgb_proba = y_proba_xgb  # from earlier trained full xgb_clf on whole train
stack_X_test = np.vstack([X_test_xgb_proba, ae_scores]).T

meta_clf = LogisticRegression(max_iter=1000)
meta_clf.fit(stack_X_train, stack_y_train)
y_pred_stack = meta_clf.predict(stack_X_test)
y_proba_stack = meta_clf.predict_proba(stack_X_test)[:, 1]

# ---------------- Evaluation helper ----------------
def evaluate(y_true, y_pred, y_proba=None, name="Model"):
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall:", recall_score(y_true, y_pred, zero_division=0))
    print("F1:", f1_score(y_true, y_pred, zero_division=0))
    if y_proba is not None:
        try:
            print("AUC:", roc_auc_score(y_true, y_proba))
        except Exception as e:
            print("AUC could not be computed:", e)
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Evaluate all
evaluate(y_test.values, y_pred_xgb, y_proba_xgb, "XGBoost")
evaluate(y_test.values, y_pred_ae, ae_scores, "Autoencoder (threshold)")
evaluate(y_test.values, y_pred_stack, y_proba_stack, "Hybrid (Stacked)")

# Save meta model
joblib.dump(meta_clf, "meta_logreg.joblib")

# ---------------- Plot ROC curves ----------------
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_test.values, y_proba_xgb)
plt.plot(fpr, tpr, label=f'XGB (AUC={auc(fpr, tpr):.3f})')
# For AE use the anomaly scores as "probabilities" (higher means more likely positive)
fpr, tpr, _ = roc_curve(y_test.values, ae_scores)
plt.plot(fpr, tpr, label=f'Autoencoder (AUC={auc(fpr, tpr):.3f})')
fpr, tpr, _ = roc_curve(y_test.values, y_proba_stack)
plt.plot(fpr, tpr, label=f'Hybrid (AUC={auc(fpr, tpr):.3f})')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.grid(True)
plt.savefig("roc_curves.png", dpi=200)
plt.close()

print("\nAll done. Models and scaler saved (xgb_model.joblib, autoencoder.h5, meta_logreg.joblib, scaler.joblib).")


Features: (700001, 40), Label distribution: {0: 677786, 1: 22215}
After SMOTE class counts: [542228 542228]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.5422 - val_loss: 0.3471
Epoch 2/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.3227 - val_loss: 0.3145
Epoch 3/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.3241 - val_loss: 0.2977
Epoch 4/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.3131 - val_loss: 0.2864
Epoch 5/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.2930 - val_loss: 0.2801
Epoch 6/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.2837 - val_loss: 0.2777
Epoch 7/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.2741 - val_loss: 0.2761
Epoch 8/100
[1m1907/1907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.3064 - val_loss: 0.2765
Epoch 9/100
[1m



[1m4376/4376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step
[1m16945/16945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1ms/step
[autoencoder] threshold = 29.200563


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step

--- XGBoost ---
Accuracy: 0.9977214448468226
Precision: 0.937791932059448
Recall: 0.9941480981318929
F1: 0.9651480388943516
AUC: 0.9998948801900652
Confusion matrix:
 [[135265    293]
 [    26   4417]]
              precision    recall  f1-score   support

           0     0.9998    0.9978    0.9988    135558
           1     0.9378    0.9941    0.9651      4443

    accuracy                         0.9977    140001
   macro avg     0.9688    0.9960    0.9820    140001
weighted avg     0.9978    0.9977    0.9978    140001


--- Autoencoder (threshold) ---
Accuracy: 0.9682930836208313
Precision: 0.5294117647058824
Recall: 0.008102633355840648
F1: 0.015960984260696077
AUC: 0.9823479279285221
Confusion matrix:
 [[135526     32]
 [  4407     36]]
              precision    recall  f1-score   support

           0     0.9685    0.9998    0.9839    135558
           1     0.5294    0.0081    0.0160      4443

    a