NSL-KDD Multi-class 5
---

Accuracy: 0.9925193905271861

Precision: 0.9925327544029282

F1-score: 0.9925215669204571


# Imports

In [3]:
!pip install keras-tuner
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM

from kerastuner.tuners import RandomSearch, BayesianOptimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Data

In [4]:
from google.colab import drive

# Expose Google Drive inside the Colab VM; the dataset CSV is read from there.
drive.mount('/content/drive')

# Load the preprocessed NSL-KDD table (feature columns plus a `class` label column).
normalized_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/nslkdd/NSL-KDD preprocessed multi_classification.csv"
)

# Bare last expression so the notebook renders the frame.
normalized_df

Mounted at /content/drive


Unnamed: 0,land,logged_in,is_host_login,is_guest_login,class,duration,src_bytes,dst_bytes,wrong_fragment,urgent,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.0,0.0,0.0,1,-0.112481,-0.007246,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0,-0.112481,-0.007318,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1,-0.112481,-0.007436,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0,-0.112481,-0.007332,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,3,0.001308,0.044992,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171999,0.0,0.0,0.0,0.0,1,-0.112481,-0.007436,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172000,0.0,0.0,0.0,0.0,1,-0.112481,-0.007436,-0.004614,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172001,0.0,1.0,0.0,0.0,0,-0.112481,-0.007388,-0.004212,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
172002,0.0,1.0,0.0,0.0,0,-0.112481,-0.007394,-0.004107,-0.085488,-0.010403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Separate the feature matrix (every column except `class`) from the integer labels.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# saved row index; only 'class' is dropped here, so that column stays in x as a
# feature -- confirm whether this is intended.
x = normalized_df.drop(columns=['class']).to_numpy()
y = normalized_df['class'].to_numpy().astype('int')

# Fit a 108-component PCA and project the features onto it.
# NOTE(review): the PCA is fitted on every row of x, i.e. before the train /
# validation / test split performed in a later cell -- confirm this is acceptable.
pca = PCA(n_components=108).fit(x)
x_r = pca.transform(x)

print("# Of original features: ", x.shape[1])
print("# Features after reduction: ", x_r.shape[1])




# Of original features:  112
# Features after reduction:  108


In [6]:
# Two-stage partition of the full feature set: 70% goes to training, and the
# remaining 30% hold-out is halved into validation and test sets.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.3, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

# Same two-stage partition (same seed) applied to the PCA-reduced features.
xr_train, xr_holdout, yr_train, yr_holdout = train_test_split(
    x_r, y, test_size=0.3, random_state=42
)
xr_val, xr_test, yr_val, yr_test = train_test_split(
    xr_holdout, yr_holdout, test_size=0.5, random_state=42
)

# Sanity-check the resulting sizes.
print("x_train shape : ", x_train.shape)
print("y_train shape : ", y_train.shape)

print("x_test shape : ", x_test.shape)
print("y_test shape : ", y_test.shape)

x_train shape :  (120402, 112)
y_train shape :  (120402,)
x_test shape :  (25801, 112)
y_test shape :  (25801,)


# Models and evaluation


In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix

# Empty dict, presumably meant to collect per-model evaluation results;
# none of the cells shown here reads from or writes to it.
evaluations = {}

def evaluate_model(model, X_test, y_true):
    """Score a classifier on held-out data, print a report and return the metrics.

    The predicted class of each sample is the argmax over axis 1 of
    ``model.predict(X_test)``, so ``predict`` is expected to return one row
    of per-class scores per sample.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns a 2-D array of
        per-class scores.
    X_test : array-like
        Inputs forwarded unchanged to ``model.predict``.
    y_true : array-like
        Ground-truth integer class labels, one per sample.

    Returns
    -------
    dict
        ``{"accuracy", "precision", "f1_score", "confusion_matrix"}``.
        Precision and F1 use ``average='weighted'``. Previously the values
        were only printed and then lost; returning them lets a caller keep
        them, e.g. ``evaluations["cnn_lstm"] = evaluate_model(...)``.
        End the call with ``;`` in a notebook cell to avoid echoing the dict.
    """
    # Collapse the per-class scores into one predicted label per sample.
    class_scores = model.predict(X_test)
    y_pred = np.argmax(class_scores, axis=1)

    results = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted'),
        "f1_score": f1_score(y_true, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }

    # Human-readable report (same text as before).
    print("Accuracy:", results["accuracy"])
    print("Precision:", results["precision"])
    print("F1-score:", results["f1_score"])
    print("Confusion Matrix:")
    print(results["confusion_matrix"])

    return results

# Models


In [8]:
def build_model(hp, n_features=112, n_classes=5):
    """Build and compile one Dense -> Conv1D -> LSTM candidate for Keras Tuner.

    Keras Tuner calls this with only ``hp``; the two extra keyword
    parameters keep the original defaults, so that call is unchanged. They
    let the same search space be reused for a different input width (for
    example the PCA-reduced arrays) or label count, e.g. via
    ``functools.partial(build_model, n_features=108)``.

    Parameters
    ----------
    hp : keras-tuner HyperParameters
        Source of the tunable values. The names registered here are
        ``Dense_units1``, ``Filters``, ``max_pool``, ``lstm_units``,
        ``Dropout``, ``Dense_units2`` and ``learning_rate``.
    n_features : int, default 112
        Length of each input sample; the declared input shape is
        ``(n_features, 1)``.
    n_classes : int, default 5
        Number of units in the softmax output layer.

    Returns
    -------
    A compiled ``Sequential`` model using Adam, the
    ``sparse_categorical_crossentropy`` loss and the ``accuracy`` metric.
    """
    model = Sequential()

    # Each sample is declared as n_features steps with a single channel.
    model.add(Dense(units=hp.Choice('Dense_units1', values=[32, 64, 128]),
                    activation='relu',
                    input_shape=(n_features, 1)))
    model.add(Conv1D(filters=hp.Choice('Filters', values=[32, 64, 128]),
                     kernel_size=3,
                     activation='relu'))
    model.add(MaxPooling1D(pool_size=hp.Choice('max_pool', values=[2, 3, 4])))

    # return_sequences=True keeps the whole sequence, which is flattened below.
    model.add(LSTM(units=hp.Choice('lstm_units', values=[32, 64, 128]),
                   return_sequences=True))
    model.add(Dropout(rate=hp.Choice('Dropout', values=[0.3, 0.4, 0.5])))
    model.add(Flatten())

    # Classification head.
    model.add(Dense(units=hp.Choice('Dense_units2', values=[32, 64, 128]),
                    activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))

    # The learning rate is tuned as well; it is registered last, as before.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", values=[1e-2, 1e-3])),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    return model

# Bayesian-optimisation search over the hyperparameters declared in build_model.
# Up to 45 trials, each built and trained twice, ranked by validation accuracy;
# the tuner is given a Google Drive directory for its project files.
tunner = BayesianOptimization(
    build_model,
    objective="val_accuracy",
    max_trials=45,
    executions_per_trial=2,
    directory="/content/drive/MyDrive/Colab Notebooks/hp2/",
    project_name="hpt_bayesian",
)

# Every execution trains for 40 epochs on the full-feature training split and
# is scored on the validation split.
tunner.search(
    x_train,
    y_train,
    epochs=40,
    batch_size=64,
    validation_data=(x_val, y_val),
)

# Keep the top-ranked model and the hyperparameter set that produced it.
best_model = tunner.get_best_models(num_models=1)[0]
best_hp = tunner.get_best_hyperparameters(num_trials=1)[0]



Search: Running Trial #41

Value             |Best Value So Far |Hyperparameter
32                |32                |Dense_units1
128               |128               |Filters
2                 |2                 |max_pool
32                |128               |lstm_units
0.3               |0.3               |Dropout
128               |128               |Dense_units2
0.001             |0.001             |learning_rate

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
  37/1882 [..............................] - ETA: 3:37 - loss: 0.0241 - accuracy: 0.9932

KeyboardInterrupt: ignored

Trial 26 Complete [00h 22m 49s]
val_accuracy: 0.9922483563423157

Best val_accuracy So Far: 0.9922643601894379
Total elapsed time: 02h 59m 17s

Search: Running Trial #27

Value             |Best Value So Far |Hyperparameter
128               |128               |Dense_units1
64                |128               |Filters
2                 |2                 |max_pool
128               |128               |lstm_units
0.3               |0.4               |Dropout
32                |64                |Dense_units2
0.001             |0.001             |learning_rate



Search: Running Trial #41

Value             |Best Value So Far |Hyperparameter
32                |32                |Dense_units1
128               |128               |Filters
2                 |2                 |max_pool
32                |128               |lstm_units
0.3               |0.3               |Dropout
128               |128               |Dense_units2
0.001             |0.001             |learning_rate

In [None]:
# Disabled cell: the training/evaluation code below is wrapped in a string
# literal, so nothing in it executes.
# NOTE(review): it refers to `model`, which no cell shown here defines (the
# tuner cell produces `best_model`) -- confirm the intended name before
# re-enabling.
"""model.summary()
# Train the model
history = model.fit(x_train, y_train, epochs=100, validation_data=(x_val,y_val),batch_size = 64)
evaluate_model(model, x_test, y_test)
"""