In [1]:
"""keras_model.py: 

This model is the implementation of Gaussian Naive Bayes Classification of KDD datasets.
"""

__author__ = 'Youngseok Joung'
__copyright__ = "Copyright 2007, The Cogent Project"
__credits__ = ["Youngseok Joung"]
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Youngseok Joung"
__email__ = "none"
__status__ = "Production"


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as splitter
from keras import regularizers
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import cProfile
import pstats
import os
import sys
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pickle
from joblib import dump, load

In [2]:
def labelEncoding(model_name, data):
    for column in data.columns:
        # If the data type of the cell is 'object'(Categorical), it will be transformed as a numerical 
        if data[column].dtype == type(object):
            le_file_path = 'result/' + model_name + '/' + model_name + '_' + column + '_encoder.pkl'
            print(os.path.exists(le_file_path))
            if os.path.exists(le_file_path):
                pkl_file = open(le_file_path, 'rb')
                le = pickle.load(pkl_file) 
                pkl_file.close()
                data[column] = le.transform(data[column])            
            else:
                le = LabelEncoder()
                data[column] = le.fit_transform(data[column])
                #exporting the departure encoder
                output = open(le_file_path, 'wb')
                pickle.dump(le, output)
                output.close()
            if column == 'result':
                le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
                print(le_name_mapping)
                
    return data, le

In [3]:
def Preprocessing(model_name, data):
    y = data.result
    x = data.drop('result', axis=1)
    
    # Preprocessing: Split 7:3 Train: Test
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)
    
    return x_train, x_test, y_train, y_test

In [4]:
def train_and_test(model_name, x_train, x_test, y_train, y_test):
    # Profile: Start 
    profile = cProfile.Profile()
    profile.enable()
    
    # train and test
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    val_indices = 200
    x_val = x_train[-val_indices:]
    y_val = y_train[-val_indices:]
    # train and test
    model = Sequential()
    model.add(Dense(1024, activation='relu', input_dim=x_train.shape[1], kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=15, batch_size=512, validation_data=(x_val, y_val))
    y_pred = model.predict(x_test)

    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)

    # Profile: End 
    profile.disable()
    profile.dump_stats('output.prof')
    stream = open('result/' + model_name + '/' + model_name + '_profiling.txt', 'w')
    stats = pstats.Stats('output.prof', stream=stream)
    stats.sort_stats('cumtime')
    stats.print_stats()
    os.remove('output.prof')
    
    # Estimation: Confusion Matrix & classification-report 
    _confusion_matrix = confusion_matrix(y_test, y_pred)
    _classification_report = classification_report(y_test, y_pred)
    
    with open('result/' + model_name + '/' + model_name + '_output.txt', 'w') as f:
        f.write("\n---Confusion Matrix---\n")
        f.write(np.array2string(_confusion_matrix, separator=', '))
        f.write("\n---Classification Report---\n")
        f.write(_classification_report)

    # Freezing model for production 
    dump(model, 'result/' + model_name + '/' + model_name + '_model.joblib') 
    
    return _confusion_matrix, _classification_report

In [5]:
model_name = 'keras_kdd'
# model_name = 'keras_nsl_kdd'
dataset_name = 'kdd_prediction'
# dataset_name = 'kdd_prediction_NSL'

data = pd.read_csv('./dataset/' + dataset_name + '.csv', delimiter=',', dtype={'protocol_type': str, 'service': str, 'flag': str, 'result': str})
print(data.head)

<bound method NDFrame.head of        duration protocol_type   service  flag  src_bytes  dst_bytes      land  \
0     -0.106216           tcp      smtp    SF  -0.003736  -0.040352 -0.011722   
1     -0.107850           tcp      http    SF  -0.004276  -0.036652 -0.011722   
2     -0.107850           tcp      http    SF  -0.004262   0.005956 -0.011722   
3     -0.107033           tcp       ftp    SF  -0.003699  -0.006723 -0.011722   
4     -0.107850           udp  domain_u    SF  -0.004368  -0.044940 -0.011722   
...         ...           ...       ...   ...        ...        ...       ...   
13446 -0.107850           tcp      http    SF  -0.004225   0.049683 -0.011722   
13447 -0.107850           tcp      nntp  RSTO  -0.004392  -0.047028 -0.011722   
13448 -0.107033           tcp      smtp    SF  -0.003734  -0.041519 -0.011722   
13449 -0.107850           tcp      nnsp   REJ  -0.004392  -0.047028 -0.011722   
13450 -0.107850           tcp      link    S0  -0.004392  -0.047028 -0.011722  

In [6]:
# labeling
data, _ = labelEncoding(model_name, data)

True
True
True
True
{'dos': 0, 'normal': 1, 'probe': 2, 'r2l': 3, 'u2r': 4}




In [7]:
# Preprocessing
x_train, x_test, y_train, y_test = Preprocessing(model_name, data)

In [8]:
# Train and Test
cm, cr = train_and_test(model_name, x_train, x_test, y_train, y_test)
print('\n-----Confusion Matrix-----\n')
print(cm)
print('\n-----Classification Report-----\n')
print(cr)

Epoch 1/15


 1/19 [>.............................] - ETA: 0s - loss: 7.8301 - accuracy: 0.3574

 2/19 [==>...........................] - ETA: 0s - loss: 45.6475 - accuracy: 0.4766

 3/19 [===>..........................] - ETA: 1s - loss: 33.1495 - accuracy: 0.4349

 4/19 [=====>........................] - ETA: 1s - loss: 26.6264 - accuracy: 0.4204

































Epoch 2/15
 1/19 [>.............................] - ETA: 0s - loss: 5.1612 - accuracy: 0.7402

 2/19 [==>...........................] - ETA: 0s - loss: 5.1782 - accuracy: 0.6367

 3/19 [===>..........................] - ETA: 1s - loss: 5.2530 - accuracy: 0.5436

 4/19 [=====>........................] - ETA: 1s - loss: 5.2484 - accuracy: 0.4854

































Epoch 3/15


 1/19 [>.............................] - ETA: 0s - loss: 4.2872 - accuracy: 0.3633

 2/19 [==>...........................] - ETA: 1s - loss: 4.1741 - accuracy: 0.5107

 3/19 [===>..........................] - ETA: 1s - loss: 4.0366 - accuracy: 0.5859

 4/19 [=====>........................] - ETA: 1s - loss: 3.9490 - accuracy: 0.6362

































Epoch 4/15


 1/19 [>.............................] - ETA: 0s - loss: 3.7264 - accuracy: 0.7969

 2/19 [==>...........................] - ETA: 1s - loss: 3.5492 - accuracy: 0.7939

 3/19 [===>..........................] - ETA: 1s - loss: 3.3783 - accuracy: 0.8040

 4/19 [=====>........................] - ETA: 1s - loss: 3.2590 - accuracy: 0.8110

































Epoch 5/15


 1/19 [>.............................] - ETA: 0s - loss: 2.6103 - accuracy: 0.7070

 2/19 [==>...........................] - ETA: 0s - loss: 2.5903 - accuracy: 0.7998

 3/19 [===>..........................] - ETA: 1s - loss: 2.5433 - accuracy: 0.7917

 4/19 [=====>........................] - ETA: 1s - loss: 2.5029 - accuracy: 0.8145

































Epoch 6/15


 1/19 [>.............................] - ETA: 0s - loss: 1.6093 - accuracy: 0.9551

 2/19 [==>...........................] - ETA: 0s - loss: 1.6053 - accuracy: 0.9375

 3/19 [===>..........................] - ETA: 1s - loss: 1.7813 - accuracy: 0.8958

 4/19 [=====>........................] - ETA: 1s - loss: 2.0468 - accuracy: 0.8384

































Epoch 7/15


 1/19 [>.............................] - ETA: 0s - loss: 1.3098 - accuracy: 0.9258

 2/19 [==>...........................] - ETA: 0s - loss: 1.3378 - accuracy: 0.9131

 3/19 [===>..........................] - ETA: 1s - loss: 1.3252 - accuracy: 0.9062

 4/19 [=====>........................] - ETA: 1s - loss: 1.2969 - accuracy: 0.9111

































Epoch 8/15


 1/19 [>.............................] - ETA: 0s - loss: 0.8526 - accuracy: 0.9473

 2/19 [==>...........................] - ETA: 1s - loss: 0.8710 - accuracy: 0.9375

 3/19 [===>..........................] - ETA: 1s - loss: 0.8569 - accuracy: 0.9401

 4/19 [=====>........................] - ETA: 1s - loss: 0.8582 - accuracy: 0.9399

































Epoch 9/15


 1/19 [>.............................] - ETA: 0s - loss: 0.6913 - accuracy: 0.9258

 2/19 [==>...........................] - ETA: 0s - loss: 0.6774 - accuracy: 0.9268

 3/19 [===>..........................] - ETA: 1s - loss: 0.6842 - accuracy: 0.9225

 4/19 [=====>........................] - ETA: 1s - loss: 0.6704 - accuracy: 0.9238

































Epoch 10/15


 1/19 [>.............................] - ETA: 0s - loss: 0.5849 - accuracy: 0.9316

 2/19 [==>...........................] - ETA: 0s - loss: 0.5905 - accuracy: 0.9258

 3/19 [===>..........................] - ETA: 1s - loss: 0.5708 - accuracy: 0.9303

 4/19 [=====>........................] - ETA: 1s - loss: 0.5666 - accuracy: 0.9282

































Epoch 11/15


 1/19 [>.............................] - ETA: 0s - loss: 0.4516 - accuracy: 0.9434

 2/19 [==>...........................] - ETA: 1s - loss: 0.4387 - accuracy: 0.9443

 3/19 [===>..........................] - ETA: 1s - loss: 0.4538 - accuracy: 0.9382

 4/19 [=====>........................] - ETA: 1s - loss: 0.4558 - accuracy: 0.9370

































Epoch 12/15


 1/19 [>.............................] - ETA: 0s - loss: 0.4718 - accuracy: 0.9336

 2/19 [==>...........................] - ETA: 0s - loss: 0.4287 - accuracy: 0.9424

 3/19 [===>..........................] - ETA: 1s - loss: 0.4240 - accuracy: 0.9453

 4/19 [=====>........................] - ETA: 1s - loss: 0.4397 - accuracy: 0.9399





















KeyboardInterrupt: 

In [None]:
def production(model_name, data):
    real_data, le = labelEncoding(model_name, data)
    real_y = real_data.result
    real_x = real_data.drop('result', axis=1)
#     print(real_y)
#     print(real_x)

    clf = load('result/' + model_name + '/' + model_name + '_model.joblib')
    yy_pred = clf.predict(real_x)
    pred_label = le.inverse_transform(yy_pred)
    real_label = le.inverse_transform(real_y)

    return pred_label, real_label

In [None]:
# Production
real_data = pd.read_csv('./dataset/kdd_prediction.csv', delimiter=',', dtype={'protocol_type': str, 'service': str, 'flag': str, 'result': str})
real_data = real_data.head(1)

pred_label, real_label = production(model_name, real_data)
print(pred_label, real_label)