## IMPORTING THE REQUIRED LIBRARIES

In [1]:
#Make sure all the following libraries are installed before running the application.
import pandas as pd
import numpy as np
import sklearn
from sklearn.utils import shuffle
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings

from keras.preprocessing import sequence
from keras import optimizers
# from keras.utils import np_utils # This import is causing the error
from tensorflow.keras.utils import to_categorical # Use to_categorical from tensorflow.keras.utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, SimpleRNN, BatchNormalization
from keras.models import model_from_json

warnings.filterwarnings("ignore")
%matplotlib inline

## LOADING TRAINING AND TESTING DATA

In [2]:
#load the csv file containing the column names
column_name = pd.read_csv("Field Names.csv", header = None)

In [3]:
#Convert the array into list
new_columns = list(column_name[0].values)

In [4]:
#adding difficulty
new_columns += ['class', 'difficulty']

In [5]:
#loading train and test data files
train_data = pd.read_csv('KDDTrain+.txt', names = new_columns)
test_data = pd.read_csv('KDDTest+.txt', names = new_columns)

In [6]:
#Training data sample
print("The training data is")
train_data.tail()

The training data is


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
89854,0,tcp,private,S0,0,0,0,0,0,0,...,0.02,0.08,0.0,0.0,1.0,1.0,0.0,0.0,neptune,21.0
89855,31,tcp,ftp,SF,1495,4152,0,0,0,30,...,0.31,0.06,0.01,0.0,0.0,0.0,0.0,0.0,normal,20.0
89856,0,tcp,supdup,S0,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune,20.0
89857,0,udp,domain_u,SF,44,44,0,0,0,0,...,0.98,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal,18.0
89858,0,tcp,http,SF,245,285,0,0,0,0,...,,,,,,,,,,


In [7]:
#Output total rows and columns of dataframe
print(f"The shape of the training dataframe is : {train_data.shape}")

The shape of the training dataframe is : (89859, 43)


In [8]:
#Same for testing
print("The testing data is")
test_data.head()

The testing data is


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [9]:
#Idem dito ^
print(f"The shape of the testing dataframe is : {test_data.shape}")

The shape of the testing dataframe is : (22544, 43)


In [10]:
#Load attacks.txt containing the attack categories
map_attacks = [x.strip().split() for x in open('attacks.txt', 'r')]
map_attacks = {k:v for (k,v) in map_attacks}

FileNotFoundError: [Errno 2] No such file or directory: 'attacks.txt'

In [None]:
#Replace the "class" column values to 5 attack categories in training and testing dataframe
train_data['class'] = train_data['class'].replace(map_attacks)
test_data['class'] = test_data['class'].replace(map_attacks)

In [None]:
train_data = shuffle(train_data)

## DATA PREPROCESSING

In [None]:
#separate the training dataframe into feature columns and label columns
X = train_data.drop('class', axis = 1) #Independent features
y = train_data['class'] #Dependent features (Labels)

In [None]:
#Converting String to Integer with get_dummies by pandas
columns = ['protocol_type', 'service', 'flag']
X_new = pd.get_dummies(X, columns = columns, drop_first = True)

In [None]:
#Idem dito for class ^
y_new = train_data['class']
y_new = pd.get_dummies(y_new)

In [None]:
#Split data: 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size = 0.2, random_state = 101)

In [None]:
#Use StandardScaler() to standardize data - explained in Honours Project
sc = StandardScaler()
sc.fit(np.array(X_train))
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

MODEL 1:

First model. This model contains all the dense layers. In dense layers each node or neuron in a layer is connected to all the nodes/neurons in the preceding layer, further explained in Honours Document.

In [None]:
#Use the keras's sequential API
#First dense layer takes an input parameter as 256 (number of neurons in the first layer).
#The second parameter "input_dim" corresponds to the input features.
#Use "relu" as activation function.
#The activation function for last dense layer is "softmax" because of the multiple classes, further explained in document.
#Set dropout for 10%.
model1 = Sequential()
model1.add(Dense(64, input_dim = 120, activation = "relu", kernel_initializer = "lecun_normal"))
model1.add(Dense(128, activation = "relu"))
model1.add(Dense(5, activation = "softmax"))

In [None]:
#Summary of model architecture listing information about parameters per layer.
model1.summary()

In [None]:
#Three paramaters:
#Loss - The loss function.
#Optimizer - To minimize the loss function.
#Metrics - The mode of evaluation for our model.
#"categorical_loss" - is used because of the multi-class classifcation problem.
#"adam" - The updated version of SGD.
optim = optimizers.SGD(learning_rate = 0.0001)
model1.compile(loss = 'categorical_crossentropy', optimizer = optim, metrics = ['accuracy'])

In [None]:
#Fit the model on our data.
#X_train - The feature columns of the training data.
#y_train - The labels columns of the training data.
#validation_data - The validation data
#batch_size and epochs further explained in document.
history = model1.fit(X_train, y_train,
          validation_data = (X_test, y_test),
          batch_size = 32,
          epochs = 20)

In [None]:
#use matplitlib to draw the plots
plt.figure(figsize = (15, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label = "TRAINING LOSS")
plt.plot(history.history['val_loss'], label = "VALIDATION LOSS")
plt.title("TRAINING LOSS vs VALIDATION LOSS")
plt.xlabel("EPOCH'S")
plt.ylabel("TRAINING LOSS vs VALIDATION LOSS")
plt.legend(loc = "best")

plt.subplot(1, 2, 2)
plt.plot(history.history['acc'], label = "TRAINING ACCURACY")
plt.plot(history.history['val_acc'], label = "VALIDATION ACCURACY")
plt.title("TRAINING ACCURACY vs VALIDATION ACCURACY")
plt.xlabel("EPOCH'S")
plt.ylabel("TRAINING ACC vs VALIDATION ACCURACY")
plt.legend(loc = "best")

Explained in Results section.

Load and save model 1:

In [None]:
#Serialize model 1 , save with json.
model_json = model1.to_json()
with open("model1.json", "w") as json_file:
    json_file.write(model_json)
model1.save_weights('model1_weights.h5')
print("Saved model to disk")

In [None]:
# load model 1.
json_file = open("model1.json", "r")
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("model1_weights.h5")
print("Loaded model from disk")

Model 2:

Single Dense Hidden Layer with Output Layer

In [None]:
model2 = Sequential()
model2.add(Dense(32, input_dim = 120, activation = "relu", kernel_initializer = "lecun_normal"))
model2.add(BatchNormalization())
model2.add(Dropout(0.2))
model2.add(Dense(32, activation = "relu"))
model2.add(Dense(5, activation = "softmax"))

In [None]:
#List summary again
model2.summary()

In [None]:
#SGD used instead of Adam!
optim2 = optimizers.Adam(lr = 0.0001)
model2.compile(loss = 'categorical_crossentropy', optimizer = optim, metrics = ['accuracy'])

In [None]:
#Train model and validate.
history2 = model2.fit(X_train, y_train,
           batch_size = 32,
           epochs = 30,
           validation_data = (X_test, y_test))

In [None]:
plt.figure(figsize = (15, 4))
plt.subplot(1, 2, 1)
plt.plot(history2.history['loss'], label = "TRAINING LOSS")
plt.plot(history2.history['val_loss'], label = "VALIDATION LOSS")
plt.title("TRAINING LOSS vs VALIDATION LOSS")
plt.xlabel("EPOCH'S")
plt.ylabel("TRAINING ACC vs VALIDATION LOSS")
plt.legend(loc = "best")

plt.subplot(1, 2, 2)
plt.plot(history2.history['accuracy'], label = "TRAINING ACCURACY")
plt.plot(history2.history['val_accuracy'], label = "VALIDATION ACCURACY")
plt.title("TRAINING ACCURACY vs VALIDATION ACCURACY")
plt.xlabel("EPOCH'S")
plt.ylabel("TRAINING ACC vs VALIDATION ACCURACY")
plt.legend(loc = "best")

Explained in results section.