# LSTM-Multiclass with NSL-KDD

Currently, I am using the same inputs as the compared SDN-DNN solution to enable qualitative comparsion amongst the two.  
The NSL-KDD version used is the preprocessed one by the University of New Brunswick, Canada.

In [1]:
from datetime import datetime
import json
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

## Data loading and prep

As we've pickled the normalized and encoded dataset, we only need to load these pickles to get the Pandas DataFrames back.  
**Hint**: If you miss the pickles, go ahead and run the notebook named *Pickle-NSL-KDD.ipynb*

In [2]:
def load_df(filename):
    filepath = os.path.join('NSL_KDD', filename+'.pkl')
    return pd.read_pickle(filepath)

In [3]:
kdd_train_data = load_df('kdd_train_data')
kdd_test_data = load_df('kdd_test_data')
kdd_train_labels = load_df('kdd_train_labels')
kdd_test_labels = load_df('kdd_test_labels')

In [4]:
kdd_train_data.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
125968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360078,0.048924,1.0,1.0,0.0,0.0,0.14,0.06,0.0,1.0,0.098039,0.1,0.06,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.028986,0.1
125969,0.000186,7.608895e-08,1.106923e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003914,0.003914,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.956863,0.96,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.5,0.028986,0.0
125970,0.0,1.616709e-06,2.931438e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001957,0.001957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.117647,0.12,0.06,0.0,0.0,0.72,0.0,0.01,0.0,0.0,0.217391,0.0
125971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2818,0.015656,1.0,1.0,0.0,0.0,0.06,0.05,0.0,1.0,0.031373,0.03,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.507246,0.1
125972,0.0,1.094232e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001957,0.001957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.301961,0.3,0.03,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The paper mentions that they only use six features of the full dataset which is why we filter the dataframes for these.

In [5]:
# only keep columns that are actually used
used_fields = ['duration', 'protocol_type', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
kdd_train_data = kdd_train_data.filter(used_fields)
kdd_test_data = kdd_test_data.filter(used_fields)
kdd_train_data.tail()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
125968,0.0,0.0,0.0,0.0,0.360078,0.048924
125969,0.000186,0.5,7.608895e-08,1.106923e-07,0.003914,0.003914
125970,0.0,0.0,1.616709e-06,2.931438e-07,0.001957,0.001957
125971,0.0,0.0,0.0,0.0,0.2818,0.015656
125972,0.0,0.0,1.094232e-07,0.0,0.001957,0.001957


## Label Translation

As we are doing binary classification, we only need to know if the entry is normal/benign (0) or malicious (1)

In [6]:
with open(os.path.join('NSL_KDD','kdd_label_wordindex.json')) as json_in:
    data = json.load(json_in)
    print(data)
    normal_index = data['normal']

{'normal': 1, 'neptune': 2, 'warezclient': 3, 'ipsweep': 4, 'portsweep': 5, 'teardrop': 6, 'nmap': 7, 'satan': 8, 'smurf': 9, 'pod': 10, 'back': 11, 'guess_passwd': 12, 'ftp_write': 13, 'multihop': 14, 'rootkit': 15, 'buffer_overflow': 16, 'imap': 17, 'warezmaster': 18, 'phf': 19, 'land': 20, 'loadmodule': 21, 'spy': 22, 'perl': 23, 'saint': 24, 'mscan': 25, 'apache2': 26, 'snmpgetattack': 27, 'processtable': 28, 'httptunnel': 29, 'ps': 30, 'snmpguess': 31, 'mailbomb': 32, 'named': 33, 'sendmail': 34, 'xterm': 35, 'worm': 36, 'xlock': 37, 'xsnoop': 38, 'sqlattack': 39, 'udpstorm': 40}


In [7]:
def f(x):
    return 0 if x == normal_index else 1
f = np.vectorize(f)

In [8]:
kdd_train_labels.head()

Unnamed: 0,label,difficulty_level,label_encoded
0,normal,20,1
1,normal,15,1
2,neptune,19,2
3,normal,21,1
4,normal,21,1


In [9]:
# We only want to know if it's benign or not, so we switch to 0 or 1
kdd_train_labels = f(kdd_train_labels['label_encoded'].values)
kdd_test_labels = f(kdd_test_labels['label_encoded'].values)

In [10]:
kdd_train_labels[:5]

array([0, 0, 1, 0, 0])

In [11]:
print("No of train entries:\t", len(kdd_train_data))
print("No of train labels:\t", len(kdd_train_labels))
print("-----------")
print("No of test entries:\t", len(kdd_test_data))
print("No of test labels:\t", len(kdd_test_labels))

No of train entries:	 125973
No of train labels:	 125973
-----------
No of test entries:	 22543
No of test labels:	 22543


## Runtime Preqs

Let's go ahead and set some crucial parameters for the learning process

In [12]:
epochs = 50
batch_size = 32
no_of_classes = len(np.unique(np.concatenate((kdd_train_labels, kdd_test_labels))))

In [13]:
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
runtype_name = 'nsl-kdd-lstm-2multiclass-b{}-e{}'.format(batch_size, epochs)
log_folder_path = os.path.join('logs',runtype_name + '-{}'.format(run_date))

In [14]:
print("No of scaled train entries:\t", len(kdd_train_data))
print("No of train labels:\t\t", len(kdd_train_labels))
print("-----------")
print("No of test entries:\t\t", len(kdd_test_data))
print("No of test labels:\t\t", len(kdd_test_labels))

No of scaled train entries:	 125973
No of train labels:		 125973
-----------
No of test entries:		 22543
No of test labels:		 22543


The Keras Embedding Layer expects a maximum vocabulary size, which we can simply calculate by finding max() of the encoded data

In [30]:
all_data = pd.concat([kdd_train_data, kdd_test_data])

In [32]:
voc_size = (all_data.max().max()+1).astype('int64')
print("Maximum vocabulary size:", voc_size)

Maximum vocabulary size: 2


## Building and Training the Model

In [16]:
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard

callbacks = [
    ModelCheckpoint(
        filepath='models/'+runtype_name+'-{}.h5'.format(run_date),
        monitor='val_loss',   
        save_best_only=True    # Only save one. Only overwrite this one if val_loss has improved
    ),
    TensorBoard(
        log_dir=log_folder_path
    )
]

Using TensorFlow backend.


In [25]:
print(kdd_test_data.values[0:5])

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.48140900e-01 1.95694716e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  2.66144814e-01 1.95694716e-03]
 [4.66113545e-05 0.00000000e+00 9.40821721e-06 0.00000000e+00
  1.95694716e-03 1.95694716e-03]
 [0.00000000e+00 1.00000000e+00 1.44931329e-08 0.00000000e+00
  1.95694716e-03 1.27201566e-01]
 [2.33056773e-05 0.00000000e+00 0.00000000e+00 1.14509289e-08
  1.95694716e-03 1.56555773e-02]]


In [33]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.utils import plot_model

# see https://stackoverflow.com/a/49436133/3864726
# This is especially important in an environment like Jupyter, where the Kernel keeps on running
from keras import backend as K
K.clear_session()


model = Sequential()
model.add(Embedding(voc_size, 32)) 
model.add(LSTM(32, name='LSTMnet'))
model.add(Dense(2, activation='softmax')) # Multiclass classification. For binary, one would use i.e. sigmoid

model.compile(optimizer=RMSprop(lr=0.001),
              loss='sparse_categorical_crossentropy', # Multiclass classification! Binary would be binary_crossentropy
              metrics=['acc'])

model.summary()

history = model.fit(kdd_train_data, kdd_train_labels,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=1,
                    validation_data=(kdd_test_data, kdd_test_labels),
                    callbacks=callbacks)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          64        
_________________________________________________________________
LSTMnet (LSTM)               (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 8,450
Trainable params: 8,450
Non-trainable params: 0
_________________________________________________________________
Train on 125973 samples, validate on 22543 samples
Epoch 1/50
Epoch 2/50
 13728/125973 [==>...........................] - ETA: 18s - loss: 0.6661 - acc: 0.5818

KeyboardInterrupt: 