# Modeling Dataset 2 using MultiHeadSelfAttention BiLSTM With ADWIN Drift Detection

In [10]:
from keras.layers import (
    Input,
    Dense,
    Dropout,
    LayerNormalization,
    MultiHeadAttention,
    Bidirectional,
    LSTM,
    GlobalAveragePooling1D,
)
from keras.models import Model
from keras.optimizers.legacy import RMSprop
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.utils import to_categorical
import os
from keras.callbacks import ModelCheckpoint, Callback
import prettytable
from performance_evaluator.metrics import evaluate
from performance_evaluator.plots import (
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
import shutil
from sklearn.model_selection import train_test_split
from river.drift import ADWIN

plt.rcParams["font.family"] = "Space Mono"

In [11]:
def reset_random():
    seed = 1
    import os
    os.environ['PYTHONHASHSEED'] = str(seed)
    import random
    random.seed(seed)
    import numpy as np
    import scipy
    _ = scipy
    np.random.seed(seed)
    import warnings
    warnings.filterwarnings('ignore', category=Warning)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
    import tensorflow as tf
    tf.compat.v1.random.set_random_seed(seed)
    tf.compat.v1.set_random_seed(seed)


reset_random()

In [12]:
df = pd.read_csv('Data/preprocessed/Dataset2.csv')
df.head()

Unnamed: 0,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Label
0,0.0,0.0,0.0,0.0,0.0,0.977311,0.001573,0.001336,0.0,0.0,...,1.7e-05,4e-06,1.2e-05,1.7e-05,0.163713,0.000702,0.159448,0.164662,0.0,0.0
1,0.066667,0.53092,0.035714,0.108126,0.352941,0.969805,0.023666,0.022209,0.052303,0.01686,...,0.027877,0.006956,0.021157,0.029935,0.18812,0.001975,0.184363,0.188154,0.016182,0.0
2,0.066667,0.0,0.035714,0.0,0.0,0.96936,0.004036,0.00334,0.0,0.0,...,0.022181,2.1e-05,0.012877,0.032818,0.189789,0.002607,0.18702,0.189766,0.0,0.0
3,0.066667,0.0,0.071429,0.0,0.0,0.971944,0.004036,0.00334,0.0,0.0,...,0.018322,2.9e-05,0.010657,0.027109,0.191493,0.005139,0.193939,0.190919,0.0,0.0
4,0.133333,0.855045,0.035714,0.108126,0.352941,0.971025,0.033105,0.017366,0.049688,0.016068,...,0.027831,0.00616,0.019091,0.03583,0.19937,0.010594,0.200126,0.192519,0.016202,0.0


In [13]:
from performance_evaluator.make_cls import make_cls
CLASSES = range(0, 12)
# x, y = df.values[:, :-1], df.values[:, -1]
x, y = make_cls(df, 'Label', CLASSES)
x = np.expand_dims(x, axis=1)
y_cat = to_categorical(y, len(CLASSES))

In [24]:
def buildModel(time_steps, features, num_classes):
    inputs = Input(shape=(time_steps, features))

    # ---- BiLSTM Encoder ----
    x = Bidirectional(LSTM(64, return_sequences=True))(inputs)
    x = LayerNormalization()(x)

    # ---- Multi-Head Self Attention ----
    attn = MultiHeadAttention(num_heads=4, key_dim=32)(x, x)

    x = x + attn
    x = LayerNormalization()(x)

    # ---- Temporal Aggregation ----
    x = GlobalAveragePooling1D()(x)

    x = Dense(64, activation="relu")(x)
    x = Dropout(0.3)(x)

    outputs = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs, outputs)

    model.compile(
        optimizer=RMSprop(0.0008),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [25]:
RESULTS_PLOT = {
    "Train": {
        "CONF_MAT": plt.figure(num=1, figsize=(6, 6)),
        "PR_CURVE": plt.figure(num=2),
        "ROC_CURVE": plt.figure(num=3),
    },
    "Test": {
        "CONF_MAT": plt.figure(num=4, figsize=(6, 6)),
        "PR_CURVE": plt.figure(num=5),
        "ROC_CURVE": plt.figure(num=6),
    },
}
ACC_PLOT = plt.figure(num=7)
LOSS_PLOT = plt.figure(num=8)

<Figure size 600x600 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [26]:
def print_df_to_table(df, p=True):
    field_names = list(df.columns)
    p_table = prettytable.PrettyTable(field_names=field_names)
    p_table.add_rows(df.values.tolist())
    d = "\n".join(
        ["\t\t{0}".format(p_) for p_ in p_table.get_string().splitlines(keepends=False)]
    )
    if p:
        print(d)
    return d


def plot_line(plt_, y1, y2, epochs, for_, save_path):
    ax = plt_.gca()
    ax.clear()
    ax.plot(range(epochs), y1, label="Training", color="dodgerblue")
    ax.plot(range(epochs), y2, label="Validation", color="orange")
    ax.set_title("Training and Validation {0}".format(for_))
    ax.set_xlabel("Epochs")
    ax.set_ylabel(for_)
    ax.set_xlim([0, epochs])
    ax.legend()
    plt_.tight_layout()
    plt_.savefig(save_path)


def plot_acc_loss(df, plt1, plt2, save_dir):
    epochs = len(df)
    acc = df["accuracy"].values
    val_acc = df["val_accuracy"].values
    loss = df["loss"].values
    val_loss = df["val_loss"].values
    plot_line(
        plt1, acc, val_acc, epochs, "Accuracy", os.path.join(save_dir, "accuracy.png")
    )
    plot_line(plt2, loss, val_loss, epochs, "Loss", os.path.join(save_dir, "loss.png"))


def plot(y, pred, prob, plts, results_dir):
    for_ = os.path.basename(results_dir)
    print("\t[INFO] Evaluating {0} Data".format(for_))
    os.makedirs(results_dir, exist_ok=True)

    m = evaluate(y, pred, prob, CLASSES)
    df = m.overall_metrics
    df.to_csv(os.path.join(results_dir, "metrics.csv"), index=False)
    print_df_to_table(df)

    fig = plts[for_]["CONF_MAT"]
    ax = fig.gca()
    ax.clear()
    confusion_matrix(
        y,
        pred,
        CLASSES,
        ax=ax,
        title="{0}ing Phase - Confusion matrix".format(for_),
    )
    fig.tight_layout()
    fig.savefig(os.path.join(results_dir, "conf_mat.png"))
    fig.show()

    fig = plts[for_]["PR_CURVE"]
    ax = fig.gca()
    ax.clear()
    precision_recall_curve(
        y,
        prob,
        CLASSES,
        ax=ax,
        legend_ncol=2,
        title="Precision-Recall Curve",
    )
    fig.tight_layout()
    fig.savefig(os.path.join(results_dir, "pr_curve.png"))
    fig.show()

    fig = plts[for_]["ROC_CURVE"]
    ax = fig.gca()
    ax.clear()
    roc_curve(
        y,
        prob,
        CLASSES,
        ax=ax,
        legend_ncol=2,
        title="ROC Curve",
    )
    fig.tight_layout()
    fig.savefig(os.path.join(results_dir, "roc_curve.png"))
    fig.show()


class TrainingCallback(Callback):
    def __init__(self, acc_loss_path, plt1, plt2):
        self.acc_loss_path = acc_loss_path
        self.plt1 = plt1
        self.plt2 = plt2
        if os.path.isfile(self.acc_loss_path):
            self.df = pd.read_csv(self.acc_loss_path)
            plot_acc_loss(
                self.df, self.plt1, self.plt2, os.path.dirname(self.acc_loss_path)
            )
        else:
            self.df = pd.DataFrame(
                [], columns=["epoch", "accuracy", "val_accuracy", "loss", "val_loss"]
            )
            self.df.to_csv(self.acc_loss_path, index=False)
        Callback.__init__(self)

    def on_epoch_end(self, epoch, logs=None):
        self.df.loc[len(self.df.index)] = [
            int(epoch + 1),
            round(logs["accuracy"], 4),
            round(logs["val_accuracy"], 4),
            round(logs["loss"], 4),
            round(logs["val_loss"], 4),
        ]
        self.df.to_csv(self.acc_loss_path, index=False)
        plot_acc_loss(
            self.df, self.plt1, self.plt2, os.path.dirname(self.acc_loss_path)
        )


class ADWINDriftCallback(Callback):

    def __init__(self, monitor="val_loss"):
        super().__init__()
        self.monitor = monitor
        self.adwin = ADWIN()

    def on_epoch_end(self, epoch, logs=None):

        value = logs.get(self.monitor)

        if value is None:
            return

        in_drift, _ = self.adwin.update(value)

        if in_drift:
            print("\n⚠️ Concept Drift Detected by ADWIN")

            # Example reaction: reduce learning rate
            lr = self.model.optimizer.learning_rate.numpy()
            new_lr = lr * 0.5
            self.model.optimizer.learning_rate.assign(new_lr)

            print(f"Learning rate reduced → {new_lr}")

In [27]:
model_dir = "models/Dataset2"
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)
os.makedirs(model_dir, exist_ok=True)
acc_loss_csv_path = os.path.join(model_dir, "acc_loss.csv")
model_path = os.path.join(model_dir, "model.h5")
training_cb = TrainingCallback(acc_loss_csv_path, ACC_PLOT, LOSS_PLOT)
checkpoint = ModelCheckpoint(
    model_path,
    save_best_only=True,
    save_weights_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=False,
)
model = buildModel(x.shape[1], x.shape[2], len(CLASSES))
initial_epoch = 0
if os.path.isfile(model_path) and os.path.isfile(acc_loss_csv_path):
    print("[INFO] Loading Pre-Trained Model :: {0}".format(model_path))
    model.load_weights(model_path)
    initial_epoch = len(pd.read_csv(acc_loss_csv_path))
print("[INFO] Fitting Data")
model.fit(
    x,
    y_cat,
    validation_data=(x, y_cat),
    batch_size=4096,
    epochs=50,
    verbose=1,
    initial_epoch=initial_epoch,
    callbacks=[training_cb, checkpoint],
)
model.load_weights(model_path)

[INFO] Fitting Data
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
print("[INFO] Evaluating Training|Testing ==> 70:30")
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=1
)
train_prob = model.predict(train_x, verbose=False)
train_pred = np.argmax(train_prob, axis=1).ravel().astype(int)
plot(
    train_y.ravel().astype(int),
    train_pred,
    train_prob,
    RESULTS_PLOT,
    "results/Dataset2/Train",
)
test_prob = model.predict(test_x, verbose=False)
test_pred = np.argmax(test_prob, axis=1).ravel().astype(int)
plot(
    test_y.ravel().astype(int),
    test_pred,
    test_prob,
    RESULTS_PLOT,
    "results/Dataset2/Test",
)

[INFO] Evaluating Training|Testing ==> 70:30
	[INFO] Evaluating Train Data
		+-----------+--------+
		|  Metrics  | Values |
		+-----------+--------+
		|  Accuracy | 0.9998 |
		| Precision | 0.9987 |
		|   Recall  | 0.9781 |
		|  F1-Score | 0.9877 |
		|    MCC    | 0.9880 |
		+-----------+--------+
	[INFO] Evaluating Test Data
		+-----------+--------+
		|  Metrics  | Values |
		+-----------+--------+
		|  Accuracy | 0.9997 |
		| Precision | 0.9968 |
		|   Recall  | 0.9652 |
		|  F1-Score | 0.9793 |
		|    MCC    | 0.9800 |
		+-----------+--------+
