In [1]:
# %reload_ext tensorboard
import numpy as np
import tensorflow as tf
import csv
import pandas as pd
import datetime
import logging
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
# from tensorflow.keras import backend as K
from keras_self_attention import SeqSelfAttention
import keras
import os
from imblearn.over_sampling import SMOTE, ADASYN
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Route TensorFlow's Python logger to a file so long training runs leave a trace.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# Timestamped record layout used by the file handler below.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Reuse the file handler left by a previous execution of this cell instead of
# stacking a new one: the 'tensorflow' logger object survives cell re-runs, so
# unconditionally adding a handler would write every record to tensorflow.log
# once per execution of the cell.
_log_path = os.path.abspath('tensorflow.log')
fh = next((handler for handler in log.handlers
           if isinstance(handler, logging.FileHandler)
           and handler.baseFilename == _log_path),
          None)
if fh is None:
    fh = logging.FileHandler('tensorflow.log')
    log.addHandler(fh)

# The handler records everything down to DEBUG level.
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# Switch the Keras default float type to float64; this is a global backend
# setting, so it must run before any model/layer is built.
tf.keras.backend.set_floatx('float64')

# Day stems of the per-day CSV files, in the order they are loaded.
# NOTE(review): "Thuesday" is spelled exactly as in the original list; it
# presumably matches the file name on disk -- confirm before "fixing" it.
_CAPTURE_DAYS = (
    'Friday-02-03-2018',     # index 0
    'Friday-16-02-2018',
    'Friday-23-02-2018',
    'Thursday-01-03-2018',
    'Thursday-15-02-2018',
    'Thursday-22-02-2018',
    'Wednesday-14-02-2018',
    'Wednesday-21-02-2018',
    'Wednesday-28-02-2018',  # index 8
    'Thuesday-20-02-2018',
)
filenames = ['%s_TrafficForML_CICFlowMeter.csv' % day for day in _CAPTURE_DAYS]
def get_data(class_num=7, data_dir="7_classes/processed"):
    """Load every processed per-day CSV and stack them into one DataFrame.

    Args:
        class_num: Which label variant of the dataset to load. Only the
            7-class variant is implemented.
        data_dir: Directory containing the files named in the module-level
            ``filenames`` list.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all files, re-indexed
        from 0, without the ``"Unnamed: 0"`` column.

    Raises:
        ValueError: If ``class_num`` is not 7. (Previously the function fell
            through and returned ``None``, which only failed later and far
            away from the cause.)
    """
    if class_num != 7:
        raise ValueError(
            "get_data only supports class_num=7, got %r" % (class_num,))

    frames = []
    for filename in filenames:
        print(filename)
        frames.append(pd.read_csv(os.path.join(data_dir, filename), index_col=None))

    frame = pd.concat(frames, axis=0, ignore_index=True)
    # "Unnamed: 0" is presumably the index column written when the CSVs were
    # saved; drop it when present, but do not fail on files saved without it.
    return frame.drop(columns=["Unnamed: 0"], errors="ignore")

def _frame_to_dataset(features, labels, class_num, batch_size):
    """Turn a feature frame and integer labels into a batched tf.data.Dataset."""
    # One "time step" per feature column, one channel: (rows, n_features, 1).
    sequences = features.values.reshape([-1, features.shape[1], 1])
    one_hot = tf.keras.utils.to_categorical(labels, class_num)
    return tf.data.Dataset.from_tensor_slices((sequences, one_hot)).batch(batch_size)


def preprocess_data(frame, class_num=7, batch_size=128, rnd_seed=None,
                    max_per_class=2000000):
    """Split ``frame`` per class into 70/15/15 train/validation/test datasets.

    For every label ``i`` in ``range(class_num)`` the rows with
    ``frame['Label'] == i`` are shuffled once (capped at ``max_per_class``
    rows) and cut into consecutive train / validation / test slices, so the
    class proportions are the same in all three splits.

    Args:
        frame: DataFrame with a ``"Label"`` column of integer class ids; every
            other column is treated as a feature.
        class_num: Number of classes; also the width of the one-hot labels.
        batch_size: Batch size of the returned datasets.
        rnd_seed: Seed for the per-class shuffle. When ``None`` a module-level
            ``rnd_seed`` is used if one exists, otherwise 42. (The original
            code read an ``rnd_seed`` global that this notebook never defines.)
        max_per_class: Upper bound on the rows taken from any single class.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)`` -- batched
        ``tf.data.Dataset`` objects yielding ``(features, one_hot_labels)``
        with features shaped ``(batch, n_features, 1)``.
    """
    if rnd_seed is None:
        rnd_seed = globals().get("rnd_seed", 42)

    train_li, validate_li, test_li = [], [], []
    for i in range(class_num):
        print(i)
        class_rows = frame[frame['Label'] == i]
        count = min(len(class_rows), max_per_class)
        train_count = int(0.7 * count)
        validate_count = int(0.15 * count)
        # Shuffle once and slice; the test split takes whatever remains so no
        # sampled row is dropped by the integer rounding above.
        shuffled = class_rows.sample(n=count, random_state=rnd_seed)
        train_li.append(shuffled.iloc[:train_count])
        validate_li.append(shuffled.iloc[train_count:train_count + validate_count])
        test_li.append(shuffled.iloc[train_count + validate_count:])

    train_frame = pd.concat(train_li, axis=0, ignore_index=True)
    validate_frame = pd.concat(validate_li, axis=0, ignore_index=True)
    test_frame = pd.concat(test_li, axis=0, ignore_index=True)

    train_labels = train_frame.pop("Label")
    validate_labels = validate_frame.pop("Label")
    test_labels = test_frame.pop("Label")

    train_dataset = _frame_to_dataset(train_frame, train_labels, class_num, batch_size)
    val_dataset = _frame_to_dataset(validate_frame, validate_labels, class_num, batch_size)
    test_dataset = _frame_to_dataset(test_frame, test_labels, class_num, batch_size)
    return (train_dataset, val_dataset, test_dataset)

def _to_sequences(values):
    """Reshape a 2-D (rows, features) array to (rows, features, 1)."""
    return values.reshape([-1, values.shape[1], 1])


def load_tmp_dataset(sampling_parameters, class_num=7, random_state=None):
    """Load the cached train/val/test CSVs and oversample the training split.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from the working
    directory, removes the ``"Label"`` (target) and ``"Dst Port"`` columns
    from the features, shuffles the training rows and applies SMOTE to the
    training split only.

    Args:
        sampling_parameters: Passed to ``SMOTE`` as ``sampling_strategy``.
        class_num: Number of classes; width of the one-hot label arrays.
        random_state: Optional seed used for both the training shuffle and
            SMOTE. ``None`` (the default) keeps the original unseeded
            behaviour.

    Returns:
        ``(train_x, train_y, val_x, val_y, test_x, test_y)`` where each ``*_x``
        has shape ``(rows, n_features, 1)`` and each ``*_y`` is one-hot with
        ``class_num`` columns.
    """
    train_frame = pd.read_csv("train.csv", index_col=None)
    # frac=1 draws every row once, i.e. a full shuffle of the training set.
    train_frame = train_frame.sample(frac=1, random_state=random_state)
    log.info("train_frame loaded")
    validate_frame = pd.read_csv("val.csv", index_col=None)
    log.info("validate_frame loaded")
    test_frame = pd.read_csv("test.csv", index_col=None)
    log.info("test_frame loaded")

    train_labels = train_frame.pop("Label")
    validate_labels = validate_frame.pop("Label")
    test_labels = test_frame.pop("Label")

    # "Dst Port" is excluded from the model inputs in every split.
    for split in (train_frame, validate_frame, test_frame):
        split.pop("Dst Port")

    # SMOTE needs the integer labels; one-hot encoding happens afterwards.
    oversample = SMOTE(sampling_strategy=sampling_parameters,
                       random_state=random_state)
    train_x, train_y = oversample.fit_resample(train_frame.values, train_labels)
    # The feature width is taken from the data instead of a hard-coded 77.
    train_x = _to_sequences(train_x)
    train_y = tf.keras.utils.to_categorical(train_y, class_num)

    test_x = _to_sequences(test_frame.values)
    test_y = tf.keras.utils.to_categorical(test_labels, class_num)

    val_x = _to_sequences(validate_frame.values)
    val_y = tf.keras.utils.to_categorical(validate_labels, class_num)
    log.info("dataset loaded")
    return (train_x, train_y, val_x, val_y, test_x, test_y)

def get_compiled_model(parameters, out_classes=7, input_shape=(77, 1),
                       learning_rate=0.00003):
    """Build and compile the stacked LSTM + self-attention classifier.

    Architecture: LSTM -> LSTM -> SeqSelfAttention -> LSTM -> Dense -> Dense
    -> softmax.

    Args:
        parameters: Mapping with the layer sizes: ``'lstm1'``, ``'lstm2'``,
            ``'lstm3'`` (LSTM units), ``'att'`` (attention width) and
            ``'dense1'``, ``'dense2'`` (Dense units).
        out_classes: Number of units of the softmax output layer. The original
            code accepted this argument but always built 7 outputs.
        input_shape: Shape of one input sample, ``(time_steps, channels)``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A ``keras.Sequential`` model compiled with categorical cross-entropy
        loss and the ``categorical_accuracy`` metric.
    """
    model = keras.Sequential([
        keras.layers.LSTM(parameters['lstm1'], input_shape=input_shape,
                          return_sequences=True),
        keras.layers.LSTM(parameters['lstm2'], return_sequences=True, dropout=0.1),
        SeqSelfAttention(attention_width=parameters['att'],
                         attention_activation='sigmoid',
                         name='Attention'),
        # No return_sequences here: only the final state feeds the dense head.
        keras.layers.LSTM(parameters['lstm3'], dropout=0.1),
        keras.layers.Dense(parameters['dense1'], activation='relu'),
        keras.layers.Dense(parameters['dense2'], activation='relu'),
        keras.layers.Dense(out_classes, activation='softmax'),
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])
    return model

Using TensorFlow backend.


In [2]:
# SMOTE targets to sweep, one dict per experiment, mapping class id -> number of
# samples that class should have after oversampling. Only class 2 is resampled.
#
# Per-class sample counts noted by the original author (presumably the
# training split before oversampling -- confirm):
#   0: 1 400 000   1: 266 664   2: 649   3: 884 753
#   4:   458 010   5: 200 333   6: 113 353
#
# Disabled earlier variants: classes 1, 2, 4, 5, 6 each raised to 500000, and
# classes 2 and 6 each raised to 200000.
sampling_parameters = [{2: target} for target in (100000, 150000, 200000, 250000)]
# Network sizes to sweep, one dict per architecture. Each row below lists the
# values for _LAYER_KEYS in order.
# Disabled earlier variants: (256, 256, 128, 128, 128, 64) and
# (92, 92, 92, 64, 64, 32).
_LAYER_KEYS = ('lstm1', 'lstm2', 'att', 'lstm3', 'dense1', 'dense2')
DNN_parameters = [
    dict(zip(_LAYER_KEYS, sizes))
    for sizes in (
        (256, 256, 256, 128, 100, 80),
        (128, 128, 128, 92, 80, 80),
        (64, 64, 64, 64, 64, 32),
    )
]

In [8]:
# Grid search: train one model per (oversampling config, architecture) pair,
# reload the checkpoint with the lowest validation loss, and score it on the
# test split.

# ModelCheckpoint and np.save do not create missing directories.
os.makedirs('GRU_models', exist_ok=True)
os.makedirs('GRU_results', exist_ok=True)

predictions = {}
for num1, samp_par in enumerate(sampling_parameters):
    # Reading the CSVs and running SMOTE depends only on the sampling config,
    # so do it once per config rather than once per architecture. This also
    # makes every architecture train on the same resampled data.
    train_x, train_y, val_x, val_y, test_x, test_y = load_tmp_dataset(samp_par)
    for num2, net_par in enumerate(DNN_parameters):
        result_key = "GRU_results/s%s_n%s" % (num1, num2)
        model_path = 'GRU_models/model_%s_%s.h5' % (num1, num2)

        model = get_compiled_model(net_par)
        log_dir = 'logs/long_fit_GRU/' + "s%s_n%s/" % (num1, num2)
        # The huge profile_batch value presumably keeps the profiler from ever
        # triggering during training -- TODO confirm.
        tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch = 100000000)
        mcp_save = ModelCheckpoint(model_path, save_best_only=True, monitor='val_loss', mode='min')
        train_history = model.fit(train_x,
                                  train_y,
                                  epochs=10,
                                  batch_size=512,
                                  validation_data=(val_x, val_y),
                                  callbacks=[tensorboard_callback, mcp_save]
                                 )
        # Evaluate the best checkpoint, not the weights of the last epoch.
        model = keras.models.load_model(model_path, custom_objects={'SeqSelfAttention':SeqSelfAttention})
        predictions[result_key] = model.predict(test_x)
        np.save("%s.npy" % result_key, predictions[result_key])

        true_classes = test_y.argmax(axis=1)
        predicted_classes = predictions[result_key].argmax(axis=1)
        print('===============')
        print(confusion_matrix(true_classes, predicted_classes))
        print(classification_report(true_classes, predicted_classes))
        print(num1,num2)
        print('===============')

INFO:tensorflow:train_frame loaded
INFO:tensorflow:validate_frame loaded
INFO:tensorflow:test_frame loaded
INFO:tensorflow:dataset loaded
Train on 3423113 samples, validate on 712233 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[297655     23    181    416    179    254   1292]
 [     6  53438      0      0   3699      0      0]
 [     5      0    131      0      0      4      0]
 [   189      0      7 189394      0      1      0]
 [    28  10264      0      0  87852      0      1]
 [   181      0    215      1      0  42531      2]
 [ 19170      3     25     62     10     28   4993]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96    300000
           1       0.84      0.94      0.88     57143
           2       0.23      0.94      0.37       140
           3       1.00      1.00      1.00    189591
           4       0.96      0.90      0.93     98145
        

              precision    recall  f1-score   support

           0       0.94      0.99      0.96    300000
           1       0.82      0.95      0.88     57143
           2       0.10      0.88      0.18       140
           3       1.00      1.00      1.00    189591
           4       0.96      0.88      0.92     98145
           5       0.99      0.99      0.99     42930
           6       0.79      0.18      0.29     24291

    accuracy                           0.95    712240
   macro avg       0.80      0.84      0.75    712240
weighted avg       0.94      0.95      0.94    712240

0 2
INFO:tensorflow:train_frame loaded
INFO:tensorflow:validate_frame loaded
INFO:tensorflow:test_frame loaded
INFO:tensorflow:dataset loaded
Train on 3473113 samples, validate on 712233 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[297618     12    221    423    186    267   1273]
 [     2  53573      0      0   3568      0  

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[295404     38   1293   1407    230    407   1221]
 [    27  53637      0      0   3478      0      1]
 [     7      0    128      0      0      5      0]
 [   379      0    111 189100      0      1      0]
 [   153  10887      2      7  87096      0      0]
 [   166      0    139      3     13  42609      0]
 [ 19636     12    241     68     24     42   4268]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96    300000
           1       0.83      0.94      0.88     57143
           2       0.07      0.91      0.12       140
           3       0.99      1.00      0.99    189591
           4       0.96      0.89      0.92     98145
           5       0.99      0.99      0.99     42930
           6       0.78      0.18      0.29     24291

    accuracy                           0.94    712240
   macro avg       0.79      0.84      0.74    712240
weighted avg       0.94      0.94    

In [4]:
# Human-readable class names, indexed by integer label (0..6).
target_names = ['Benign', 'bruteforce', 'Web', 'DDOS', 'dos', 'Bot', 'Infilteration']

# Re-print the metrics of every stored prediction with named classes.
# Relies on `test_y` and `predictions` left behind by the training cell.
for name in predictions:
    pred = predictions[name]
    print("-----------%s---------------"%name)
    print('Confusion Matrix',
          confusion_matrix(np.argmax(test_y, axis=1), np.argmax(pred, axis=1)),
          sep='\n')
    print('Classification Report',
          classification_report(np.argmax(test_y, axis=1),
                                np.argmax(pred, axis=1),
                                target_names=target_names),
          sep='\n')

-----------s0_n0---------------
Confusion Matrix
[[297702      9    142    408    154    201   1384]
 [     0  53746      1      0   3396      0      0]
 [     5      0    131      0      0      4      0]
 [    82      0     10 189499      0      0      0]
 [    15  10178      0      0  87952      0      0]
 [   137      0    203      1      0  42589      0]
 [ 18789      3      5     59      1     15   5419]]
Classification Report
               precision    recall  f1-score   support

       Benign       0.94      0.99      0.97    300000
   bruteforce       0.84      0.94      0.89     57143
          Web       0.27      0.94      0.41       140
         DDOS       1.00      1.00      1.00    189591
          dos       0.96      0.90      0.93     98145
          Bot       0.99      0.99      0.99     42930
Infilteration       0.80      0.22      0.35     24291

     accuracy                           0.95    712240
    macro avg       0.83      0.85      0.79    712240
 weighted av

[[297348      5    239    479    140    259   1530]
 [     1  53414      1      0   3726      0      1]
 [     5      0    131      0      0      4      0]
 [   157      0     98 189335      1      0      0]
 [    17   9938      0      0  88190      0      0]
 [    45      0    124      1      0  42760      0]
 [ 18602      3     78     65      1     26   5516]]
Classification Report
               precision    recall  f1-score   support

       Benign       0.94      0.99      0.97    300000
   bruteforce       0.84      0.93      0.89     57143
          Web       0.20      0.94      0.32       140
         DDOS       1.00      1.00      1.00    189591
          dos       0.96      0.90      0.93     98145
          Bot       0.99      1.00      0.99     42930
Infilteration       0.78      0.23      0.35     24291

     accuracy                           0.95    712240
    macro avg       0.82      0.85      0.78    712240
 weighted avg       0.95      0.95      0.94    712240

-----

In [6]:
# Write each stored prediction array to "<key>.npy"; the keys already carry
# their target directory as a prefix.
for name in predictions:
    pred = predictions[name]
    np.save('%s.npy'%name, pred)

In [4]:
# Debug cell: build one model with device-placement logging switched on to see
# which device each op runs on. Uses the first architecture, DNN_parameters[0]
# (sampling_parameters[0] was the intended data config for the disabled fit).
# sampling_parameters[0] DNN_parameters[0]

# import tensorflow as tf
# NOTE: this is a global switch and stays on for the rest of the kernel
# session; every op executed afterwards prints an "Executing op ..." line.
tf.debugging.set_log_device_placement(True)
# Disabled multi-GPU experiment (MirroredStrategy):
# strategy = tf.distribute.MirroredStrategy()
# # with strategy.scope():
model = get_compiled_model(DNN_parameters[0])
# Disabled training run kept for reference:
#     log_dir=os.path.join('logs/long_fit/'+"testtt/")
#     tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch = 100000000)
#     train_history = model.fit(train_x,
#                               train_y,
#                               epochs=20,
#                               batch_size=512,
#                               validation_data=(val_x, val_y),
#                               callbacks=[tensorboard_callback]
#                              )

Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0