In [10]:
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "Individual AI/ML/DL Models Implementation\n",
        "Below is a complete Python script covering preprocessing, feature engineering, model training, testing, and saving outputs. Run on a standard server (CPU OK) — GPU speeds up the autoencoder training.\n",
        "\n",
        "Save as train_models.py. It uses: pandas, numpy, scikit-learn, imblearn, xgboost, tensorflow (keras), matplotlib, seaborn."
      ],
      "metadata": {
        "id": "TKey1l7nVkad"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# train_models_fixed.py\n",
        "\"\"\"\n",
        "Fixed, runnable version of your training script (XGBoost + Autoencoder + Stacked meta-classifier).\n",
        "Put dataset CSV(s) in ./data/ and set DATASET to \"UNSW\" or \"CICIDS\".\n",
        "\n",
        "Dependencies:\n",
        "pip install pandas numpy scikit-learn imbalanced-learn xgboost tensorflow joblib matplotlib seaborn\n",
        "\"\"\"\n",
        "\n",
        "import os\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import joblib\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "\n",
        "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.metrics import (accuracy_score, precision_score, recall_score,\n",
        "                             f1_score, roc_auc_score, confusion_matrix, classification_report,\n",
        "                             roc_curve, auc)\n",
        "from imblearn.over_sampling import SMOTE\n",
        "import xgboost as xgb\n",
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.layers import Dense\n",
        "from tensorflow.keras.callbacks import EarlyStopping\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "\n",
        "# ---------------- Config ----------------\n",
        "DATA_PATH = \"data\"   # folder with datasets\n",
        "DATASET = \"UNSW\"     # \"UNSW\" or \"CICIDS\"\n",
        "RANDOM_STATE = 42\n",
        "TEST_SIZE = 0.2\n",
        "\n",
        "# Define column names for UNSW-NB15_1.csv, assuming 49 columns based on common dataset structure\n",
        "# This list includes 47 generic feature names plus 'attack_cat' and 'label' as the last two.\n",
        "UNSW_NB15_COL_NAMES = [\n",
        "    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',\n",
        "    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',\n",
        "    'spkts', 'dpkts', 'swin', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',\n",
        "    'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_ftp_cmd',\n",
        "    'ct_src_ltm', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',\n",
        "    'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm_2',\n",
        "    'ct_srv_dst_2', 'is_sm_ips_ports', 'conn_id', 'ts', 'uid', 'id.orig_h',\n",
        "    'id.orig_p', 'id.resp_h', 'id.resp_p', 'attack_cat', 'label' # Last two are labels\n",
        "]\n",
        "\n",
        "# ---------------- Loading helpers ----------------\n",
        "def load_unsw(path):\n",
        "    if not os.path.exists(path):\n",
        "        raise FileNotFoundError(f\"UNSW CSV not found at {path}\")\n",
        "    # Load without header and assign column names explicitly for UNSW-NB15_1.csv\n",
        "    df = pd.read_csv(path, header=None, names=UNSW_NB15_COL_NAMES, low_memory=False)\n",
        "    return df\n",
        "\n",
        "def load_cicids(path):\n",
        "    if not os.path.exists(path):\n",
        "        raise FileNotFoundError(f\"CICIDS CSV not found at {path}\")\n",
        "    df = pd.read_csv(path, low_memory=False)\n",
        "    return df\n",
        "\n",
        "# ---------------- Preprocessing helpers ----------------\n",
        "def basic_preprocess(df, label_col=\"label\"):\n",
        "    \"\"\"\n",
        "    - Consolidate label to binary: normal/benign -> 0, others -> 1\n",
        "    - Keep numeric columns + label\n",
        "    - Drop numeric columns that contain NaNs\n",
        "    \"\"\"\n",
        "    df = df.copy()\n",
        "    # Attempt to normalize label column name if it's not present\n",
        "    if label_col not in df.columns:\n",
        "        possible = [c for c in df.columns if c.lower() in (\"label\", \"attack_cat\", \"class\", \"result\")]\n",
        "        if possible:\n",
        "            label_col = possible[0]\n",
        "        else:\n",
        "            raise ValueError(\"Label column not found. Provide a label_col present in the dataframe.\")\n",
        "\n",
        "    # Normalize label values to binary\n",
        "    def map_label(x):\n",
        "        s = str(x).strip().lower()\n",
        "        if s in (\"normal\", \"benign\", \"normal traffic\", \"normal_traffic\", \"benign_traffic\", \"benignpacket\", \"0\", \"none\"):\n",
        "            return 0\n",
        "        # sometimes CSVs have 'BENIGN' uppercase etc\n",
        "        if \"normal\" in s or \"benign\" in s:\n",
        "            return 0\n",
        "        return 1\n",
        "\n",
        "    df[label_col] = df[label_col].apply(map_label).astype(int)\n",
        "\n",
        "    # Keep numeric features only (float/int)\n",
        "    numeric = df.select_dtypes(include=[np.number]).copy()\n",
        "\n",
        "    # Ensure label is present in numeric (if label was non-numeric earlier, add it)\n",
        "    if label_col not in numeric.columns:\n",
        "        numeric[label_col] = df[label_col].values\n",
        "\n",
        "    # Drop columns with any NaN in numeric (safer for modeling; you can change policy if you prefer)\n",
        "    cols_before = numeric.shape[1]\n",
        "    numeric = numeric.dropna(axis=1)\n",
        "    cols_after = numeric.shape[1]\n",
        "    dropped = cols_before - cols_after\n",
        "    if dropped > 0:\n",
        "        print(f\"[preprocess] Dropped {dropped} numeric columns due to NaNs.\")\n",
        "\n",
        "    return numeric, label_col\n",
        "\n",
        "def split_xy(df, label_col=\"label\"):\n",
        "    X = df.drop(columns=[label_col])\n",
        "    y = df[label_col].astype(int)\n",
        "    return X, y\n",
        "\n",
        "# ---------------- Load dataset ----------------\n",
        "# ========== Load dataset ==========\n",
        "if DATASET == \"UNSW\":\n",
        "    csv_path = os.path.join(DATA_PATH, \"UNSW-NB15_1.csv\")  # <-- correct filename\n",
        "    df = load_unsw(csv_path)\n",
        "\n",
        "    # set label column\n",
        "    label_col = \"label\"     # or \"attack_cat\" if you want multi-class\n",
        "\n",
        "\n",
        "elif DATASET == \"CICIDS\":\n",
        "    csv_path = os.path.join(DATA_PATH, \"CICIDS2017.csv\")\n",
        "    df = load_cicids(csv_path)\n",
        "    label_col = \"Label\" if \"Label\" in df.columns else (\"label\" if \"label\" in df.columns else None)\n",
        "\n",
        "else:\n",
        "    raise ValueError(\"Set DATASET variable to UNSW or CICIDS\")\n",
        "\n",
        "\n",
        "# ---------------- Preprocess ----------------\n",
        "df_proc, label_col = basic_preprocess(df, label_col=label_col)\n",
        "X, y = split_xy(df_proc, label_col=label_col)\n",
        "print(f\"Features: {X.shape}, Label distribution: {y.value_counts().to_dict()}\")\n",
        "\n",
        "# Train/test split (stratified)\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,\n",
        "                                                    stratify=y, random_state=RANDOM_STATE)\n",
        "\n",
        "# Scale features\n",
        "scaler = StandardScaler()\n",
        "X_train_scaled = scaler.fit_transform(X_train)\n",
        "X_test_scaled = scaler.transform(X_test)\n",
        "\n",
        "# Save scaler for later use\n",
        "joblib.dump(scaler, \"scaler.joblib\")\n",
        "\n",
        "# ---------------- Handle imbalance with SMOTE on train ----------------\n",
        "sm = SMOTE(random_state=RANDOM_STATE)\n",
        "# SMOTE expects 2D array, 1D labels\n",
        "X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train.values)\n",
        "print(\"After SMOTE class counts:\", np.bincount(y_train_res.astype(int)))\n",
        "\n",
        "# ---------------- Model A: XGBoost ----------------\n",
        "xgb_clf = xgb.XGBClassifier(\n",
        "    n_estimators=200,\n",
        "    max_depth=6,\n",
        "    learning_rate=0.1,\n",
        "    n_jobs=-1,\n",
        "    use_label_encoder=False,\n",
        "    eval_metric='logloss',\n",
        "    random_state=RANDOM_STATE\n",
        ")\n",
        "xgb_clf.fit(X_train_res, y_train_res)\n",
        "joblib.dump(xgb_clf, \"xgb_model.joblib\")\n",
        "\n",
        "# Predictions\n",
        "y_pred_xgb = xgb_clf.predict(X_test_scaled)\n",
        "y_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]\n",
        "\n",
        "# ---------------- Model B: Autoencoder (unsupervised anomaly detection) ----------------\n",
        "n_features = X_train_scaled.shape[1]\n",
        "encoding_dim = max(8, n_features // 4)\n",
        "\n",
        "autoencoder = Sequential([\n",
        "    Dense(encoding_dim, activation='relu', input_shape=(n_features,)),\n",
        "    Dense(max(4, encoding_dim // 2), activation='relu'),\n",
        "    Dense(encoding_dim, activation='relu'),\n",
        "    Dense(n_features, activation='linear')\n",
        "])\n",
        "autoencoder.compile(optimizer='adam', loss='mse')\n",
        "\n",
        "# Train autoencoder only on normal samples (y_train == 0)\n",
        "mask_train_normal = (y_train.values == 0)\n",
        "if mask_train_normal.sum() < 10:\n",
        "    raise ValueError(\"Too few normal samples to train the autoencoder. Check dataset and label mapping.\")\n",
        "\n",
        "X_train_norm = X_train_scaled[mask_train_normal]\n",
        "es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n",
        "history = autoencoder.fit(X_train_norm, X_train_norm,\n",
        "                          epochs=100, batch_size=256, validation_split=0.1,\n",
        "                          callbacks=[es], verbose=1)\n",
        "\n",
        "autoencoder.save(\"autoencoder.h5\")\n",
        "\n",
        "# Calculate reconstruction error as anomaly score for test set\n",
        "X_test_pred = autoencoder.predict(X_test_scaled)\n",
        "mse = np.mean(np.square(X_test_scaled - X_test_pred), axis=1)\n",
        "\n",
        "# choose threshold (e.g., mean + 3*std of train normal errors)\n",
        "train_norm_pred = autoencoder.predict(X_train_norm)\n",
        "train_mse = np.mean(np.square(X_train_norm - train_norm_pred), axis=1)\n",
        "threshold = np.mean(train_mse) + 3 * np.std(train_mse)\n",
        "print(f\"[autoencoder] threshold = {threshold:.6f}\")\n",
        "\n",
        "y_pred_ae = (mse > threshold).astype(int)\n",
        "ae_scores = mse  # continuous anomaly score\n",
        "\n",
        "# ---------------- Hybrid model (stacking): Logistic Regression combining XGB probability + AE score ----------------\n",
        "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)\n",
        "oof_xgb = np.zeros(len(X_train_scaled), dtype=float)\n",
        "oof_ae = np.zeros(len(X_train_scaled), dtype=float)\n",
        "\n",
        "# Generate out-of-fold predictions for stacking training set\n",
        "for train_idx, val_idx in skf.split(X_train_scaled, y_train):\n",
        "    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]\n",
        "    y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]\n",
        "\n",
        "    # Resample training fold to handle imbalance\n",
        "    X_tr_res, y_tr_res = sm.fit_resample(X_tr, y_tr)\n",
        "\n",
        "    # Fit a temporary XGB on the fold\n",
        "    clf = xgb.XGBClassifier(\n",
        "        n_estimators=100, max_depth=6, learning_rate=0.1,\n",
        "        n_jobs=-1, use_label_encoder=False, eval_metric='logloss',\n",
        "        random_state=RANDOM_STATE\n",
        "    )\n",
        "    clf.fit(X_tr_res, y_tr_res)\n",
        "\n",
        "    # XGB OOF probability for val\n",
        "    oof_xgb[val_idx] = clf.predict_proba(X_val)[:, 1]\n",
        "\n",
        "    # AE recon error for the validation fold\n",
        "    X_val_pred = autoencoder.predict(X_val)\n",
        "    oof_ae[val_idx] = np.mean(np.square(X_val - X_val_pred), axis=1)\n",
        "\n",
        "# Prepare stacking training set and test set features\n",
        "stack_X_train = np.vstack([oof_xgb, oof_ae]).T\n",
        "stack_y_train = y_train.values\n",
        "\n",
        "X_test_xgb_proba = y_proba_xgb  # from earlier trained full xgb_clf on whole train\n",
        "stack_X_test = np.vstack([X_test_xgb_proba, ae_scores]).T\n",
        "\n",
        "meta_clf = LogisticRegression(max_iter=1000)\n",
        "meta_clf.fit(stack_X_train, stack_y_train)\n",
        "y_pred_stack = meta_clf.predict(stack_X_test)\n",
        "y_proba_stack = meta_clf.predict_proba(stack_X_test)[:, 1]\n",
        "\n",
        "# ---------------- Evaluation helper ----------------\n",
        "def evaluate(y_true, y_pred, y_proba=None, name=\"Model\"):\n",
        "    print(f\"\\n--- {name} ---\")\n",
        "    print(\"Accuracy:\", accuracy_score(y_true, y_pred))\n",
        "    print(\"Precision:\", precision_score(y_true, y_pred, zero_division=0))\n",
        "    print(\"Recall:\", recall_score(y_true, y_pred, zero_division=0))\n",
        "    print(\"F1:\", f1_score(y_true, y_pred, zero_division=0))\n",
        "    if y_proba is not None:\n",
        "        try:\n",
        "            print(\"AUC:\", roc_auc_score(y_true, y_proba))\n",
        "        except Exception as e:\n",
        "            print(\"AUC could not be computed:\", e)\n",
        "    print(\"Confusion matrix:\\n\", confusion_matrix(y_true, y_pred))\n",
        "    print(classification_report(y_true, y_pred, digits=4, zero_division=0))\n",
        "\n",
        "# Evaluate all\n",
        "evaluate(y_test.values, y_pred_xgb, y_proba_xgb, \"XGBoost\")\n",
        "evaluate(y_test.values, y_pred_ae, ae_scores, \"Autoencoder (threshold)\")\n",
        "evaluate(y_test.values, y_pred_stack, y_proba_stack, \"Hybrid (Stacked)\")\n",
        "\n",
        "# Save meta model\n",
        "joblib.dump(meta_clf, \"meta_logreg.joblib\")\n",
        "\n",
        "# ---------------- Plot ROC curves ----------------\n",
        "plt.figure(figsize=(8, 6))\n",
        "fpr, tpr, _ = roc_curve(y_test.values, y_proba_xgb)\n",
        "plt.plot(fpr, tpr, label=f'XGB (AUC={auc(fpr, tpr):.3f})')\n",
        "# For AE use the anomaly scores as \"probabilities\" (higher means more likely positive)\n",
        "fpr, tpr, _ = roc_curve(y_test.values, ae_scores)\n",
        "plt.plot(fpr, tpr, label=f'Autoencoder (AUC={auc(fpr, tpr):.3f})')\n",
        "fpr, tpr, _ = roc_curve(y_test.values, y_proba_stack)\n",
        "plt.plot(fpr, tpr, label=f'Hybrid (AUC={auc(fpr, tpr):.3f})')\n",
        "plt.plot([0, 1], [0, 1], '--', color='gray')\n",
        "plt.legend()\n",
        "plt.xlabel(\"False Positive Rate\")\n",
        "plt.ylabel(\"True Positive Rate\")\n",
        "plt.title(\"ROC Curves\")\n",
        "plt.grid(True)\n",
        "plt.savefig(\"roc_curves.png\", dpi=200)\n",
        "plt.close()\n",
        "\n",
        "print(\"\\nAll done. Models and scaler saved (xgb_model.joblib, autoencoder.h5, meta_logreg.joblib, scaler.joblib).\")\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "15ozKO6hgePz",
        "outputId": "476bfc05-942e-43ec-e995-77d4cf81ac34"
      },
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Features: (700001, 40), Label distribution: {0: 677786, 1: 22215}\n",
            "After SMOTE class counts: [542228 542228]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [18:59:31] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n",
            "/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:93: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
            "  super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 3ms/step - loss: 0.5422 - val_loss: 0.3471\n",
            "Epoch 2/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.3227 - val_loss: 0.3145\n",
            "Epoch 3/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.3241 - val_loss: 0.2977\n",
            "Epoch 4/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.3131 - val_loss: 0.2864\n",
            "Epoch 5/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2930 - val_loss: 0.2801\n",
            "Epoch 6/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2837 - val_loss: 0.2777\n",
            "Epoch 7/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2741 - val_loss: 0.2761\n",
            "Epoch 8/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.3064 - val_loss: 0.2765\n",
            "Epoch 9/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2711 - val_loss: 0.2751\n",
            "Epoch 10/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2972 - val_loss: 0.2747\n",
            "Epoch 11/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 5ms/step - loss: 0.2804 - val_loss: 0.2742\n",
            "Epoch 12/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2689 - val_loss: 0.2750\n",
            "Epoch 13/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2853 - val_loss: 0.2740\n",
            "Epoch 14/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2888 - val_loss: 0.2727\n",
            "Epoch 15/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2713 - val_loss: 0.2727\n",
            "Epoch 16/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2635 - val_loss: 0.2727\n",
            "Epoch 17/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2754 - val_loss: 0.2725\n",
            "Epoch 18/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 3ms/step - loss: 0.2996 - val_loss: 0.2741\n",
            "Epoch 19/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 4ms/step - loss: 0.2612 - val_loss: 0.2724\n",
            "Epoch 20/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2803 - val_loss: 0.2726\n",
            "Epoch 21/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2743 - val_loss: 0.2737\n",
            "Epoch 22/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2814 - val_loss: 0.2723\n",
            "Epoch 23/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2869 - val_loss: 0.2726\n",
            "Epoch 24/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 2ms/step - loss: 0.2692 - val_loss: 0.2730\n",
            "Epoch 25/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2757 - val_loss: 0.2719\n",
            "Epoch 26/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2800 - val_loss: 0.2722\n",
            "Epoch 27/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2719 - val_loss: 0.2714\n",
            "Epoch 28/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2859 - val_loss: 0.2719\n",
            "Epoch 29/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2797 - val_loss: 0.2713\n",
            "Epoch 30/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 4ms/step - loss: 0.2845 - val_loss: 0.2734\n",
            "Epoch 31/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2856 - val_loss: 0.2711\n",
            "Epoch 32/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2776 - val_loss: 0.2713\n",
            "Epoch 33/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2941 - val_loss: 0.2723\n",
            "Epoch 34/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2777 - val_loss: 0.2711\n",
            "Epoch 35/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2726 - val_loss: 0.2718\n",
            "Epoch 36/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2746 - val_loss: 0.2718\n",
            "Epoch 37/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2832 - val_loss: 0.2717\n",
            "Epoch 38/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2721 - val_loss: 0.2707\n",
            "Epoch 39/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 4ms/step - loss: 0.2617 - val_loss: 0.2724\n",
            "Epoch 40/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2847 - val_loss: 0.2717\n",
            "Epoch 41/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2874 - val_loss: 0.2712\n",
            "Epoch 42/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2838 - val_loss: 0.2711\n",
            "Epoch 43/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2907 - val_loss: 0.2705\n",
            "Epoch 44/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 3ms/step - loss: 0.2676 - val_loss: 0.2713\n",
            "Epoch 45/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 3ms/step - loss: 0.2716 - val_loss: 0.2712\n",
            "Epoch 46/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2594 - val_loss: 0.2708\n",
            "Epoch 47/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 3ms/step - loss: 0.2848 - val_loss: 0.2716\n",
            "Epoch 48/100\n",
            "\u001b[1m1907/1907\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 4ms/step - loss: 0.2643 - val_loss: 0.2713\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m4376/4376\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m11s\u001b[0m 2ms/step\n",
            "\u001b[1m16945/16945\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m25s\u001b[0m 1ms/step\n",
            "[autoencoder] threshold = 29.200563\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [19:05:56] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m3500/3500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 2ms/step\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [19:06:24] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m3500/3500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 1ms/step\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [19:06:49] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m3500/3500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 1ms/step\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [19:07:15] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m3500/3500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 1ms/step\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/xgboost/training.py:199: UserWarning: [19:07:40] WARNING: /workspace/src/learner.cc:790: \n",
            "Parameters: { \"use_label_encoder\" } are not used.\n",
            "\n",
            "  bst.update(dtrain, iteration=i, fobj=obj)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[1m3500/3500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 1ms/step\n",
            "\n",
            "--- XGBoost ---\n",
            "Accuracy: 0.9977214448468226\n",
            "Precision: 0.937791932059448\n",
            "Recall: 0.9941480981318929\n",
            "F1: 0.9651480388943516\n",
            "AUC: 0.9998948801900652\n",
            "Confusion matrix:\n",
            " [[135265    293]\n",
            " [    26   4417]]\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0     0.9998    0.9978    0.9988    135558\n",
            "           1     0.9378    0.9941    0.9651      4443\n",
            "\n",
            "    accuracy                         0.9977    140001\n",
            "   macro avg     0.9688    0.9960    0.9820    140001\n",
            "weighted avg     0.9978    0.9977    0.9978    140001\n",
            "\n",
            "\n",
            "--- Autoencoder (threshold) ---\n",
            "Accuracy: 0.9682930836208313\n",
            "Precision: 0.5294117647058824\n",
            "Recall: 0.008102633355840648\n",
            "F1: 0.015960984260696077\n",
            "AUC: 0.9823479279285221\n",
            "Confusion matrix:\n",
            " [[135526     32]\n",
            " [  4407     36]]\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0     0.9685    0.9998    0.9839    135558\n",
            "           1     0.5294    0.0081    0.0160      4443\n",
            "\n",
            "    accuracy                         0.9683    140001\n",
            "   macro avg     0.7490    0.5039    0.4999    140001\n",
            "weighted avg     0.9546    0.9683    0.9532    140001\n",
            "\n",
            "\n",
            "--- Hybrid (Stacked) ---\n",
            "Accuracy: 0.9981571560203142\n",
            "Precision: 0.9659318637274549\n",
            "Recall: 0.9763673193787981\n",
            "F1: 0.9711215580926796\n",
            "AUC: 0.999894855284879\n",
            "Confusion matrix:\n",
            " [[135405    153]\n",
            " [   105   4338]]\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0     0.9992    0.9989    0.9990    135558\n",
            "           1     0.9659    0.9764    0.9711      4443\n",
            "\n",
            "    accuracy                         0.9982    140001\n",
            "   macro avg     0.9826    0.9876    0.9851    140001\n",
            "weighted avg     0.9982    0.9982    0.9982    140001\n",
            "\n",
            "\n",
            "All done. Models and scaler saved (xgb_model.joblib, autoencoder.h5, meta_logreg.joblib, scaler.joblib).\n"
          ]
        }
      ]
    }
  ]
}

{'nbformat': 4,
 'nbformat_minor': 0,
 'metadata': {'colab': {'provenance': []},
  'kernelspec': {'name': 'python3', 'display_name': 'Python 3'},
  'language_info': {'name': 'python'}},
 'cells': [{'cell_type': 'markdown',
   'source': ['Individual AI/ML/DL Models Implementation\n',
    'Below is a complete Python script covering preprocessing, feature engineering, model training, testing, and saving outputs. Run on a standard server (CPU OK) — GPU speeds up the autoencoder training.\n',
    '\n',
    'Save as train_models.py. It uses: pandas, numpy, scikit-learn, imblearn, xgboost, tensorflow (keras), matplotlib, seaborn.'],
   'metadata': {'id': 'TKey1l7nVkad'}},
  {'cell_type': 'code',
   'source': ['# train_models_fixed.py\n',
    '"""\n',
    'Fixed, runnable version of your training script (XGBoost + Autoencoder + Stacked meta-classifier).\n',
    'Put dataset CSV(s) in ./data/ and set DATASET to "UNSW" or "CICIDS".\n',
    '\n',
    'Dependencies:\n',
    'pip install pandas num