In [0]:
import numpy as np
import random
import math
import keras
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, GlobalMaxPooling2D, Flatten, Input, Reshape, LSTM

In [0]:
def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
def print_result(y_pred, y_test, clf_name):
    matrix = confusion_matrix(y_test, y_pred)
    TP, FP = matrix[0]
    FN, TN = matrix[1]
    PPV = (TP * 1.0) / (TP + FP)
    TPR = (TP * 1.0) / (TP + FN)
    TNR = (FP * 1.0) / (TN + FP)
    ACC = (TP + TN) * 1.0 / (TP + TN + FP + FN)
    F1 = 2.0 * PPV * TPR / (PPV + TPR)
    print("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" %
          (clf_name, PPV, TPR, TNR, ACC, F1))
def process_raw_data(normal_data, anomalous_data):
    # create dict
    char_dict = {}
    char_smpl = ' '.join(anomalous_data)
    char_smpl = sorted(list(set(char_smpl)))
    for idx, ch in enumerate(char_smpl):
        char_dict[ch] = idx
    # convert
    normal_data = [[char_dict[el] for el in line] for line in normal_data]
    anomalous_data = [[char_dict[el] for el in line] for line in anomalous_data]
    # merge data and create target data
    data = normal_data + anomalous_data
    # train_target = np.ones(len(normal_data)).tolist() + np.zeros(len(anomalous_data)).tolist()
    target = [1]*len(normal_data) + [0]*len(anomalous_data)
    print('Good requests:', len(normal_data))
    print('Bad requests:', len(anomalous_data))
    print('Total requests:', len(target))
    # set max len element of data
    for i in range(len(data)):
        if (len(data[i]) < 300):
            data[i] = data[i] + [0]*(300 - len(data[i]))
        else:
            data[i] = data[i][:300]
    # split
    train_data, test_data, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 21)
    # one-hot vector
    X_train = np.asarray([to_categorical(i, num_classes=63) for i in train_data])
    X_test = np.asarray([to_categorical(i, num_classes=63) for i in test_data])
    # print
    print("Requests for Train: ", len(y_train))
    print("Requests for Test: ", len(y_test))
    print("Split Train:Test = 8:2")
    return X_train, X_test, y_train, y_test

In [0]:
normal_data = loadData('normalRequestTraining.txt')
anomalous_data = loadData('anomalousRequestTest.txt')

In [4]:
X_train, X_test, y_train, y_test = process_raw_data(normal_data, anomalous_data)

Good requests: 36000
Bad requests: 25065
Total requests: 61065
Requests for Train:  48852
Requests for Test:  12213
Split Train:Test = 8:2


In [5]:
shape = X_train.shape
print(shape)

(48852, 300, 63)


In [6]:
model = Sequential()
model.add(Reshape((shape[1], shape[2], 1), input_shape=(shape[1], shape[2])))

model.add(Conv2D(32, (3, 63), activation='relu'))
model.add(Conv2D(32, (3, 1), activation='relu'))
model.add(Conv2D(32, (3, 1), activation='relu'))

model.add(Dropout(0.25))
model.add(GlobalMaxPooling2D())

model.add(Dense(64))
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

y_pred = model.predict_classes(X_test)
print_result(y_pred, y_test, 'CNN Conv2d: ')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 300, 63, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 298, 1, 32)        6080      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 296, 1, 32)        3104      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 294, 1, 32)        3104      
_________________________________________________________________
dropout_1 (Dropout)          (None, 294, 1, 32)        0         
_________________________________________________________________
global_max_pooling2d_

In [7]:
model2 = Sequential()
model2.add(Reshape((shape[1], shape[2], 1), input_shape=(shape[1], shape[2])))

model2.add(Conv2D(32, (3, 63), activation='relu'))
model2.add(Conv2D(32, (3, 1), activation='relu'))
model2.add(Conv2D(32, (3, 1), activation='relu'))
model2.add(Conv2D(32, (3, 1), activation='relu'))
model2.add(Conv2D(32, (3, 1), activation='relu'))

model2.add(Dropout(0.25))
model2.add(GlobalMaxPooling2D())

model2.add(Dense(64))
model2.add(Dense(1, activation='sigmoid'))

model2.summary()
model2.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['acc'])

model2.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

y_pred2 = model2.predict_classes(X_test)
print_result(y_pred2, y_test, 'CNN Conv2d: ')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_2 (Reshape)          (None, 300, 63, 1)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 298, 1, 32)        6080      
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 296, 1, 32)        3104      
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 294, 1, 32)        3104      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 292, 1, 32)        3104      
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 290, 1, 32)        3104      
_________________________________________________________________
dropout_2 (Dropout)          (None, 290, 1, 32)        0         
__________

In [8]:
model3 = Sequential()

model3.add(Reshape((shape[1], shape[2], 1), input_shape=(shape[1], shape[2])))

model3.add(Conv2D(128, (3, 63), activation='relu'))
model3.add(Conv2D(128, (3, 1), activation='relu'))
model3.add(Conv2D(128, (3, 1), activation='relu'))

model3.add(Dropout(0.25))
model3.add(GlobalMaxPooling2D())

model3.add(Dense(128))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()
model3.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model3.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

y_pred3 = model3.predict_classes(X_test)
print_result(y_pred3, y_test, 'CNN Conv2d: ')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_3 (Reshape)          (None, 300, 63, 1)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 298, 1, 128)       24320     
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 296, 1, 128)       49280     
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 294, 1, 128)       49280     
_________________________________________________________________
dropout_3 (Dropout)          (None, 294, 1, 128)       0         
_________________________________________________________________
global_max_pooling2d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
__________

In [9]:
model4 = Sequential()

model4.add(Reshape((shape[1], shape[2], 1), input_shape=(shape[1], shape[2])))

model4.add(Conv2D(128, (3, 63), activation='relu'))
model4.add(Conv2D(128, (3, 1), activation='relu'))
model4.add(Conv2D(128, (3, 1), activation='relu'))
model4.add(Conv2D(128, (3, 1), activation='relu'))
model4.add(Conv2D(128, (3, 1), activation='relu'))

model4.add(Dropout(0.25))
model4.add(GlobalMaxPooling2D())

model4.add(Dense(128))
model4.add(Dense(1, activation='sigmoid'))

model4.summary()
model4.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model4.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

y_pred4 = model4.predict_classes(X_test)
print_result(y_pred4, y_test, 'CNN Conv2d: ')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_4 (Reshape)          (None, 300, 63, 1)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 298, 1, 128)       24320     
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 296, 1, 128)       49280     
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 294, 1, 128)       49280     
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 292, 1, 128)       49280     
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 290, 1, 128)       49280     
_________________________________________________________________
dropout_4 (Dropout)          (None, 290, 1, 128)       0         
__________

In [17]:
model5 = Sequential()

model5.add(LSTM(64, input_shape=(shape[1], shape[2])))
# model5.add(LSTM(32, return_sequences=True))
# model5.add(LSTM(32))

model5.add(Dropout(0.25))
model5.add(Dense(32, activation='relu'))
model5.add(Dense(1, activation='sigmoid'))

model5.summary()
model5.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

model5.fit(X_train, y_train, batch_size=256, epochs=10, validation_split=0.2)

y_pred5 = model5.predict_classes(X_test)
print_result(y_pred5, y_test, 'RNN LSTM: ')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 64)                32768     
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 33        
Total params: 34,881
Trainable params: 34,881
Non-trainable params: 0
_________________________________________________________________
Train on 39081 samples, validate on 9771 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN LSTM: 	0.13733	0.66412	0.39158	0.61320	0.22760
