# LSTM-Multiclass with NSL-KDD

Currently, I am using the same inputs as the compared SDN-DNN solution to enable a qualitative comparison between the two.

## Data loading and prep

In [1]:
import os
import pandas as pd
import numpy as np
# Do not truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# The field-name file has no header row; column 0 is later used as the list of column names.
header_col = pd.read_csv(os.path.join('NSL_KDD', 'Field Names.csv'), header=None)

# Add two more column names ('label' and 'unknown') to the field list.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the original row indices.
extra_fields = pd.DataFrame([['label', 'symbolic'], ['unknown', 'continuous']])
header_col = pd.concat([header_col, extra_fields])

# Column names handed to pd.read_csv(names=...) when the data files are loaded.
header_names = header_col[0].values

### Training Set

In [2]:
# Load the raw training file. It has no header row, so the column names are supplied explicitly.
ftrain = os.path.join('NSL_KDD', 'KDDTrain+.csv')
kdd_train = pd.read_csv(ftrain, names=header_names, header=None)

# Keep the label column separately before the frame is reduced to the feature columns.
kdd_train_labels = kdd_train.loc[:, 'label']

# Feature columns used in this experiment; every other column is discarded.
used_fields = [
    'duration',
    'protocol_type',
    'src_bytes',
    'dst_bytes',
    'count',
    'srv_count',
]
kdd_train = kdd_train.filter(items=used_fields)
kdd_train.head()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
0,0,tcp,491,0,2,2
1,0,udp,146,0,13,1
2,0,tcp,0,0,123,6
3,0,tcp,232,8153,5,5
4,0,tcp,199,420,30,32


### Test Set

In [3]:
# Load the raw test file with the same column names as the training file.
ftest = os.path.join('NSL_KDD', 'KDDTest+.csv')
kdd_test = pd.read_csv(ftest, names=header_names, header=None)

# Keep the label column separately before the frame is reduced to the feature columns.
kdd_test_labels = kdd_test.loc[:, 'label']

# Reduce the test frame to the same feature columns as the training frame.
kdd_test = kdd_test.filter(items=used_fields)

kdd_test.tail()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
22538,0,tcp,794,333,1,1
22539,0,tcp,317,938,2,11
22540,0,tcp,54540,8314,5,10
22541,0,udp,42,42,4,6
22542,0,tcp,0,0,4,10


## Text encoding

In [4]:
from keras.preprocessing.text import Tokenizer
# Tokenize the LABELS: the tokenizer is fitted on the train and test labels together
# so that both sets share a single label -> integer mapping.
label_tokenizer = Tokenizer(num_words=50, filters='')
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
label_tokenizer.fit_on_texts(pd.concat([kdd_train_labels, kdd_test_labels]).values)

# Run the fitted tokenizer on the training labels. texts_to_sequences returns one
# list per label, so the lists are concatenated into a single flat array.
kdd_train_labels = label_tokenizer.texts_to_sequences(kdd_train_labels)
kdd_train_labels = np.concatenate(kdd_train_labels).ravel()

# As the tokenizer documentation states, 0 is never assigned to a label.
# The class indices need to start at 0, so subtract 1 from every class.
kdd_train_labels = kdd_train_labels - 1

# Do the same for the test labels.
kdd_test_labels = label_tokenizer.texts_to_sequences(kdd_test_labels)
kdd_test_labels = np.concatenate(kdd_test_labels).ravel()
kdd_test_labels = kdd_test_labels - 1

Using TensorFlow backend.


In [5]:
# Stack the train and test rows into one frame; it is used to fit the tokenizer.
full = pd.concat(objs=[kdd_train, kdd_test], axis=0)

In [6]:
# Fit a tokenizer on the protocol_type values of the combined train + test frame,
# then encode the train and test columns with that same fitted tokenizer.
protocol_tokenizer = Tokenizer(filters='', num_words=50)
protocol_tokenizer.fit_on_texts(full['protocol_type'])

train_enc, test_enc = (
    protocol_tokenizer.texts_to_sequences(frame['protocol_type'])
    for frame in (kdd_train, kdd_test)
)

In [7]:
# Replace the string-valued protocol_type column with its encoded counterpart:
# remove the original column and attach the encoded values as the last column.
train_protocol_enc = pd.DataFrame(train_enc, columns=['protocol_type'])
kdd_train = pd.concat(
    [kdd_train.drop(columns='protocol_type'), train_protocol_enc],
    axis=1,
    sort=False,
)
kdd_train.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,protocol_type
125968,0,0,0,184,25,1
125969,8,105,145,2,2,2
125970,0,2231,384,1,1,1
125971,0,0,0,144,8,1
125972,0,151,0,1,1,1


In [8]:
# Same replacement for the test set: remove the string column, attach the encoded one.
test_protocol_enc = pd.DataFrame(test_enc, columns=['protocol_type'])
kdd_test = pd.concat(
    [kdd_test.drop(columns='protocol_type'), test_protocol_enc],
    axis=1,
    sort=False,
)
kdd_test.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,protocol_type
22538,0,794,333,1,1,1
22539,0,317,938,2,11,1
22540,0,54540,8314,5,10,1
22541,0,42,42,4,6,2
22542,0,0,0,4,10,1


## Data Normalization

In [9]:
from sklearn import preprocessing

# Fit a min-max scaler on the training features and scale them.
# fit_transform returns a NumPy array, so kdd_train is no longer a DataFrame afterwards.
min_max_scaler = preprocessing.MinMaxScaler()
kdd_train = min_max_scaler.fit_transform(kdd_train)

print("Training Dataset:")
for caption, value in (
    ("Shape of the final training dataset:", kdd_train.shape),
    ("Outer type:", type(kdd_train)),
    ("Single entry type:", type(kdd_train[0])),
):
    print(caption, value)

Training Dataset:
Shape of the final training dataset: (125973, 6)
Outer type: <class 'numpy.ndarray'>
Single entry type: <class 'numpy.ndarray'>


In [10]:
# Scale the test set with the scaler that was fitted on the training data.
# Fitting a second MinMaxScaler on the test set would use the test set's own
# min/max, so the same raw value would be mapped to different scaled values
# in the two sets and the model would be validated on inconsistent inputs.
kdd_test = min_max_scaler.transform(kdd_test)

# Test values outside the range seen during training land outside [0, 1]
# after the transform; clip them so both sets share the same value range.
kdd_test = np.clip(kdd_test, 0.0, 1.0)

print("Test Dataset:")
print("Shape of the final test dataset:", kdd_test.shape)
print("Outer type:", type(kdd_test))
print("Single entry type:", type(kdd_test[0]))

Test Dataset:
Shape of the final test dataset: (22543, 6)
Outer type: <class 'numpy.ndarray'>
Single entry type: <class 'numpy.ndarray'>


## Runtime Prerequisites

In [11]:
from datetime import datetime
from os.path import exists, join

# Settings shared by the cells below.

# No train/test split ratio is defined here: there is a separate test set.
batch_size = 10

# Number of distinct classes found across the train and test labels.
all_labels = np.concatenate((kdd_train_labels, kdd_test_labels))
no_of_classes = len(np.unique(all_labels))

# Names and timestamp used to build the log folder path for this run.
runtype_name = 'nsl-kdd-lstm-multiclass'
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_folder_path = os.path.join('logs', runtype_name + '-{}'.format(run_date))

In [12]:
# Sanity check: print the number of entries next to the number of labels for each set.
for caption, rows in (
    ("No of scaled train entries:\t", kdd_train),
    ("No of train labels:\t\t", kdd_train_labels),
):
    print(caption, len(rows))
print("-----------")
for caption, rows in (
    ("No of test entries:\t\t", kdd_test),
    ("No of test labels:\t\t", kdd_test_labels),
):
    print(caption, len(rows))

No of scaled train entries:	 125973
No of train labels:		 125973
-----------
No of test entries:		 22543
No of test labels:		 22543


The Keras Embedding layer expects a maximum vocabulary size, which we derive here from the largest value in the scaled training data plus one.

In [13]:
# Largest value in the training array plus one, cast to an integer;
# this is passed to the Embedding layer as its input dimension.
voc_size = (np.max(kdd_train) + 1).astype('int64')
print("Maximum vocabulary size:", voc_size)

Maximum vocabulary size: 2


## Building and Training the Model

In [14]:
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard

# ModelCheckpoint writes into the 'models' folder; create it up front so that
# saving the checkpoint cannot fail on a missing directory.
os.makedirs('models', exist_ok=True)

callbacks = [
    EarlyStopping(
        monitor='acc', # Which metric to monitor
        patience=3     # Interrupt training once acc has gone 3 epochs without improving
    ),
    ModelCheckpoint(
        filepath='models/'+runtype_name+'-{}.h5'.format(run_date),
        monitor='val_loss',
        save_best_only=True    # Only save one. Only overwrite this one if val_loss has improved
    ),
    TensorBoard(
        log_dir=log_folder_path # Write the TensorBoard logs to the per-run log folder
    )
]

In [15]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.utils import plot_model

# see https://stackoverflow.com/a/49436133/3864726
# This is especially important in an environment like Jupyter, where the Kernel keeps on running
from keras import backend as K
# Reset the backend session before the model is (re)built in this cell.
K.clear_session()

# Embedding -> LSTM -> softmax classifier with one output unit per class.
# NOTE(review): kdd_train was min-max scaled in an earlier cell, while an Embedding
# layer looks its inputs up as integer indices — confirm this input encoding is intended.
model = Sequential([
    Embedding(voc_size, 32),
    LSTM(32, name='LSTMnet'),
    Dense(no_of_classes, activation='softmax'),  # Multiclass classification. For binary, one would use i.e. sigmoid
])

# The labels are integer class indices, hence the sparse categorical loss
# (binary classification would use binary_crossentropy instead).
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=RMSprop(lr=0.001),
    metrics=['acc'],
)

model.summary()
plot_model(
    model,
    to_file='model-nsl-kdd-lstm-multiclass.png',
    show_shapes=True,
    show_layer_names=True,
)

# Train on the training set; the separate test set is used as validation data.
history = model.fit(
    x=kdd_train,
    y=kdd_train_labels,
    batch_size=batch_size,
    epochs=10,
    verbose=1,
    validation_data=(kdd_test, kdd_test_labels),
    callbacks=callbacks,
)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          64        
_________________________________________________________________
LSTMnet (LSTM)               (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 40)                1320      
Total params: 9,704
Trainable params: 9,704
Non-trainable params: 0
_________________________________________________________________
Train on 125973 samples, validate on 22543 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
