In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import urllib.parse
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import io
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import os
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Embedding, Reshape

Using TensorFlow backend.


In [2]:
def loadData(file):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of strings in file order. Each line has leading and trailing
        whitespace removed; lines that are empty after stripping are dropped.
    """
    with open(file, 'r', encoding="utf8") as handle:
        stripped = [raw.strip() for raw in handle.readlines()]
    return [line for line in stripped if len(line) > 0]
def print_result(y_pred, y_test, clf_name):
    """Print precision, recall, specificity, accuracy and F1 for a binary classifier.

    Label 1 is treated as the positive class and label 0 as the negative
    class (in this notebook label 1 marks the anomalous requests).

    Args:
        y_pred: Predicted 0/1 labels.
        y_test: True 0/1 labels, in the same order as ``y_pred``.
        clf_name: Text printed at the start of the result row.

    Prints one tab-separated row: name, PPV (precision), TPR (recall),
    TNR (specificity), ACC (accuracy) and F1. A ratio whose denominator is
    zero is reported as 0.0.
    """
    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]: rows are the
    # true labels, columns the predicted labels, both in sorted label order.
    # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    def ratio(numerator, denominator):
        # Avoid nan/inf (or a crash) when a class is missing from the data.
        return (numerator * 1.0) / denominator if denominator else 0.0

    PPV = ratio(TP, TP + FP)
    TPR = ratio(TP, TP + FN)
    # Specificity is TN / (TN + FP); FP / (TN + FP) is the false-positive rate.
    TNR = ratio(TN, TN + FP)
    ACC = ratio(TP + TN, TP + TN + FP + FN)
    F1 = ratio(2.0 * PPV * TPR, PPV + TPR)
    print("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" %
          (clf_name, PPV, TPR, TNR, ACC, F1))

In [3]:
# Read the anomalous and the normal request corpora, in that order.
bad_requests, good_requests = [
    loadData(path)
    for path in ('anomalousRequestTest.txt', 'normalRequestTraining.txt')
]

In [4]:
# Labels follow the concatenation order: anomalous requests (1) first,
# then normal requests (0).
yBad = [1 for _ in bad_requests]
yGood = [0 for _ in good_requests]
all_requests = bad_requests + good_requests
y = yBad + yGood

In [5]:
# Report how many requests were loaded overall and per class.
for caption, group in (
    ("Total requests : ", all_requests),
    ("Bad requests: ", bad_requests),
    ("Good requests: ", good_requests),
):
    print(caption, len(group))

Total requests :  61065
Bad requests:  25065
Good requests:  36000


In [6]:
# Character-level TF-IDF restricted to 3-grams, with sublinear term-frequency
# scaling enabled.
# NOTE(review): the vectorizer is fitted on every request, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the vocabulary and IDF weights — confirm this is acceptable.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 3),
    sublinear_tf=True,
    min_df=0.0,
)
X = vectorizer.fit_transform(all_requests)

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [8]:
# Report the size of each partition, then the experiment setup.
for caption, labels in (
    ("Requests for Train: ", y_train),
    ("Requests for Test: ", y_test),
):
    print(caption, len(labels))
print("Use Trigram (n=3). Split Train:Test = 8:2.\n")

Requests for Train:  48852
Requests for Test:  12213
Use Trigram (n=3). Split Train:Test = 8:2.



In [9]:
# Dimensions of the TF-IDF matrix; shape[1] (the number of columns) is used
# as the input width of the networks defined below.
shape = X.shape

In [10]:
# Baseline network: one 64-unit dense layer over the TF-IDF features,
# followed by a single sigmoid output unit for the binary label.
model = Sequential([
    Dense(64, input_shape=(shape[1],)),
    Activation('relu'),
    Dropout(0.2),
    # NOTE(review): this second relu follows Dropout directly, with no Dense
    # layer in between — confirm it is intended.
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                1846272   
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
_________________________________________________________________
activation_3 (Activat

In [11]:
# Train for 5 epochs in mini-batches of 16, holding out 20% of the training
# rows for validation.
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

Instructions for updating:
Use tf.cast instead.
Train on 39081 samples, validate on 9771 samples
Epoch 1/5

KeyboardInterrupt: 

In [None]:
# Threshold the sigmoid output at 0.5 to get 0/1 labels. This is what
# Sequential.predict_classes did for a single-unit output; that helper was
# removed in newer Keras/TensorFlow releases, whereas predict() is available
# in both old and new versions.
y_pred = (model.predict(X_test) > 0.5).astype('int32')
print_result(y_pred, y_test, 'Deeplearning dense: ')

## m2 có Dense ban đầu 128 units

In [None]:
# Variant with a 128-unit first dense layer; the rest of the stack matches
# the baseline model.
m2 = Sequential([
    Dense(128, input_shape=(shape[1],)),
    Activation('relu'),
    Dropout(0.2),
    # NOTE(review): this second relu follows Dropout directly, with no Dense
    # layer in between — confirm it is intended.
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
m2.summary()
m2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs in mini-batches of 16, holding out 20% of the training
# rows for validation.
m2.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Threshold the sigmoid output at 0.5 to get 0/1 labels. This is what
# Sequential.predict_classes did for a single-unit output; that helper was
# removed in newer Keras/TensorFlow releases, whereas predict() is available
# in both old and new versions.
y_pred2 = (m2.predict(X_test) > 0.5).astype('int32')
print_result(y_pred2, y_test, 'Deeplearning dense 128 units: ')

## m3 ban đầu có Dense 32 units

In [None]:
# Variant with a 32-unit first dense layer; the rest of the stack matches
# the baseline model.
m3 = Sequential([
    Dense(32, input_shape=(shape[1],)),
    Activation('relu'),
    Dropout(0.2),
    # (disabled) Dense(64),
    # NOTE(review): with the Dense(64) layer disabled, this second relu
    # follows Dropout directly — confirm it is intended.
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
m3.summary()
m3.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs in mini-batches of 16, holding out 20% of the training
# rows for validation.
m3.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Threshold the sigmoid output at 0.5 to get 0/1 labels. This is what
# Sequential.predict_classes did for a single-unit output; that helper was
# removed in newer Keras/TensorFlow releases, whereas predict() is available
# in both old and new versions.
y_pred3 = (m3.predict(X_test) > 0.5).astype('int32')
print_result(y_pred3, y_test, 'Deeplearning dense 32 units: ')

## m4 không cần activation relu

In [None]:
# Variant with a 32-unit dense layer and no relu activations: Dense and
# Dropout feed straight into the sigmoid output unit.
m4 = Sequential([
    Dense(32, input_shape=(shape[1],)),
    Dropout(0.2),
    Dense(1),
    Activation('sigmoid'),
])
m4.summary()
m4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs in mini-batches of 16, holding out 20% of the training
# rows for validation.
m4.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Threshold the sigmoid output at 0.5 to get 0/1 labels. This is what
# Sequential.predict_classes did for a single-unit output; that helper was
# removed in newer Keras/TensorFlow releases, whereas predict() is available
# in both old and new versions.
y_pred4 = (m4.predict(X_test) > 0.5).astype('int32')
# The label names the missing relu so this row can be told apart from the
# m3 result, which also uses a 32-unit dense layer.
print_result(y_pred4, y_test, 'Deeplearning dense 32 units, no relu: ')

## m5 sử dụng các hàm kích hoạt sigmoid, có vẻ như hiệu quả kém hơn một chút

In [None]:
# Variant with a 32-unit dense layer that uses sigmoid activations in place
# of relu.
m5 = Sequential([
    Dense(32, input_shape=(shape[1],)),
    Activation('sigmoid'),
    Dropout(0.2),
    # NOTE(review): this second sigmoid follows Dropout directly, with no
    # Dense layer in between — confirm it is intended.
    Activation('sigmoid'),
    Dense(1),
    Activation('sigmoid'),
])
m5.summary()
m5.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs in mini-batches of 16, holding out 20% of the training
# rows for validation.
m5.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)