In [1]:
import numpy as np
import random
import math
import keras
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, GlobalMaxPooling2D, Flatten, Input, Reshape

Using TensorFlow backend.


In [2]:
def loadData(file):
    """Read a UTF-8 text file and return its non-blank lines, stripped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file, with leading/trailing
        whitespace removed. Lines that are empty after stripping are skipped.
    """
    with open(file, 'r', encoding="utf8") as handle:
        raw_lines = handle.readlines()
    # Strip lazily, then keep only the lines that still have content.
    stripped_lines = (raw.strip() for raw in raw_lines)
    return [line for line in stripped_lines if line]
def print_result(y_pred, y_test, clf_name):
    """Print precision, recall, specificity, accuracy and F1 of a binary classifier.

    Class 0 is treated as the positive class (process_raw_data labels anomalous
    requests 0 and normal requests 1).

    Args:
        y_pred: Predicted labels (0 or 1).
        y_test: Ground-truth labels (0 or 1).
        clf_name: Label printed in front of the metrics.

    Prints one tab-separated line: name, PPV, TPR, TNR, ACC, F1. A metric whose
    denominator is zero is printed as nan.
    """
    # confusion_matrix is indexed [true label][predicted label]; labels=[0, 1]
    # guarantees a 2x2 matrix even when one class is missing from the inputs.
    matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TP, FN = matrix[0]  # true class 0: predicted 0 / predicted 1
    FP, TN = matrix[1]  # true class 1: predicted 0 / predicted 1

    def ratio(numerator, denominator):
        # Undefined metrics become nan instead of raising or emitting warnings.
        return (numerator * 1.0) / denominator if denominator else float('nan')

    PPV = ratio(TP, TP + FP)                    # precision
    TPR = ratio(TP, TP + FN)                    # recall / sensitivity
    TNR = ratio(TN, TN + FP)                    # specificity
    ACC = ratio(TP + TN, TP + TN + FP + FN)     # accuracy
    F1 = ratio(2.0 * PPV * TPR, PPV + TPR)
    print("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" %
          (clf_name, PPV, TPR, TNR, ACC, F1))
def process_raw_data(normal_data, anomalous_data, max_len=300, num_classes=63):
    """Encode request strings as one-hot character matrices and split train/test.

    Args:
        normal_data: List of normal request strings (labelled 1).
        anomalous_data: List of anomalous request strings (labelled 0).
        max_len: Every request is truncated or zero-padded to this many characters.
        num_classes: Width of the one-hot encoding; must be at least the number
            of distinct characters in the data.

    Returns:
        (X_train, X_test, y_train, y_test): X_* are numpy arrays built from
        (max_len, num_classes) one-hot matrices, y_* are lists of 0/1 labels.
        The split is 80/20 with a fixed random_state of 21.

    Raises:
        ValueError: If the data contains more distinct characters than num_classes.
    """
    # Build the character vocabulary. The anomalous requests are joined with a
    # space exactly as before, so indices are unchanged whenever the normal
    # requests use no extra characters; characters that only occur in normal
    # requests are added so that encoding them cannot fail with KeyError.
    alphabet = set(' '.join(anomalous_data)).union(*normal_data)
    char_dict = {ch: idx for idx, ch in enumerate(sorted(alphabet))}
    if len(char_dict) > num_classes:
        raise ValueError(
            'Found %d distinct characters but num_classes is %d'
            % (len(char_dict), num_classes))

    def encode(line):
        # NOTE: padding uses index 0, which is also the code of the first
        # vocabulary character, so padding is indistinguishable from it.
        codes = [char_dict[ch] for ch in line[:max_len]]
        return codes + [0] * (max_len - len(codes))

    # merge data and create target data (1 = normal, 0 = anomalous)
    data = [encode(line) for line in normal_data] + \
           [encode(line) for line in anomalous_data]
    target = [1] * len(normal_data) + [0] * len(anomalous_data)
    print('Good requests:', len(normal_data))
    print('Bad requests:', len(anomalous_data))
    print('Total requests:', len(target))
    # split
    train_data, test_data, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=21)
    # one-hot vector
    X_train = np.asarray([to_categorical(seq, num_classes=num_classes) for seq in train_data])
    X_test = np.asarray([to_categorical(seq, num_classes=num_classes) for seq in test_data])
    # print
    print("Requests for Train: ", len(y_train))
    print("Requests for Test: ", len(y_test))
    print("Split Train:Test = 8:2")
    return X_train, X_test, y_train, y_test

In [3]:
# Load the raw request strings; loadData returns the stripped, non-blank lines
# of each file. Both sets are pooled and re-split later by process_raw_data,
# so the "Training"/"Test" suffixes in the file names do not define the split.
normal_data = loadData('normalRequestTraining.txt')
anomalous_data = loadData('anomalousRequestTest.txt')

In [4]:
# Encode every request as a one-hot character matrix and split 80/20.
# X_* are numpy arrays; y_* are lists of labels (1 = normal, 0 = anomalous).
X_train, X_test, y_train, y_test = process_raw_data(normal_data, anomalous_data)

Good requests: 36000
Bad requests: 25065
Total requests: 61065
Requests for Train:  48852
Requests for Test:  12213
Split Train:Test = 8:2


In [5]:
# Keep the training tensor's shape; the model definition reads shape[1] and
# shape[2] to size its input.
shape = X_train.shape
print(shape)

(48852, 300, 63)


In [6]:
# Character-level CNN for binary classification of requests.
# Each sample of shape (shape[1], shape[2]) gets a trailing channel axis so
# that Conv2D can consume it.
model = Sequential()
model.add(Reshape((shape[1], shape[2], 1), input_shape=(shape[1], shape[2])))

# The first kernel spans the whole one-hot width, so it is taken from shape[2]
# (like the Reshape above) rather than hard-coded; the width axis collapses to 1
# and the remaining convolutions only slide along the sequence axis.
model.add(Conv2D(128, (3, shape[2])))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 1)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 1)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 1)))
model.add(Activation('relu'))

model.add(Dropout(0.25))
# Reduce the feature maps to one value per filter.
model.add(GlobalMaxPooling2D())

# NOTE(review): Dense(64) has no activation, so together with Dense(1) it is a
# purely linear map before the sigmoid — confirm whether a non-linearity was intended.
model.add(Dense(64))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.summary()

# Single sigmoid output -> binary cross-entropy; accuracy is tracked as 'acc'.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 300, 63, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 298, 1, 128)       24320     
_________________________________________________________________
activation_1 (Activation)    (None, 298, 1, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 296, 1, 64)        24640     
_________________________________________________________________
activation_2 (Activation)    (None, 296, 1, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)    

In [7]:
# Train for up to 10 epochs, holding out 20% of the training samples for
# validation. No callbacks are passed (EarlyStopping is imported but not used).
# NOTE(review): y_train is a plain Python list; this relies on the installed
# Keras version accepting list targets — confirm before upgrading Keras.
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 39081 samples, validate on 9771 samples
Epoch 1/10
Epoch 2/10
 7936/39081 [=====>........................] - ETA: 2:05 - loss: 0.0793 - acc: 0.9729

KeyboardInterrupt: 

In [None]:
# Turn the sigmoid probabilities into 0/1 class labels with a 0.5 threshold.
# This is what Sequential.predict_classes does for a single-unit output, but
# predict_classes is deprecated and absent from newer Keras/TensorFlow releases,
# so the threshold is applied explicitly to stay runnable on both.
y_pred = (model.predict(X_test) > 0.5).astype('int32')
print_result(y_pred, y_test, 'CNN Conv2d: ')