In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from sklearn.metrics import accuracy_score, f1_score
from keras.layers import Dense, Dropout, Activation, Reshape
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [0]:
def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
def print_result(y_pred, y_test, clf_name):
    ACC = accuracy_score(y_pred, y_test)
    F1 = f1_score(y_pred, y_test, average='macro')
    print("%s\t(accuracy, f1) = (%.5f, %.5f)"%(clf_name, ACC, F1))

In [0]:
bad_requests = loadData('anomalousRequest.txt')
good_requests = loadData('normalRequest.txt')

In [0]:
all_requests = bad_requests + good_requests
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

In [5]:
print("Total requests : ", len(all_requests))
print("Bad requests: ", len(bad_requests))
print("Good requests: ", len(good_requests))

Total requests :  61065
Bad requests:  25065
Good requests:  36000


In [0]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(all_requests)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 22)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 11)

In [8]:
print("Requests for Train data: ", len(y_train))
print("Requests for Validation data: ", len(y_val))
print("Requests for Test data: ", len(y_test))
print("Use Trigram (n=3). Split Train:Validation:Test = 6:2:2\n")

Requests for Train data:  36639
Requests for Validation data:  12213
Requests for Test data:  12213
Use Trigram (n=3). Split Train:Validation:Test = 6:2:2



In [0]:
shape = X.shape

## Model cơ bản

In [10]:
model = Sequential()
model.add(Dense(32, input_shape=(shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

y_pred = model.predict_classes(X_test)
print_result(y_pred, y_test, "Deep learning standard: ")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                923136    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 923,169
Trainable params: 923,169
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 36639 samples, validate on 12213 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep

## Tăng số lượng tầng Dense
Kết quả đạt được thì lại kém hơn so với mạng standard cơ bản
### Kết luận: Việc tăng số lượng tầng không đem lại hiệu quả

In [11]:
model2 = Sequential()
model2.add(Dense(32, input_shape=(shape[1],), activation='relu'))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.summary()

model2.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['acc'])
model2.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

y_pred2 = model2.predict_classes(X_test)
print_result(y_pred2, y_test, "Deep learning standard: ")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                923136    
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 925,281
Trainable params: 925,281
Non-trainable params: 0
_________________________________________________________________
Train on 36639 samples, validate on 12213 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/1

## Model 3 có độ rộng của tầng Dense tăng lên từ 32 lên 128
Nhưng kết quả vẫn kém hơn so với model 1
### Kết luận: Việc tăng độ rộng tầng Dense không đem lại hiệu quả

In [12]:
model3 = Sequential()
model3.add(Dense(128, input_shape=(shape[1],), activation='relu'))
model3.add(Dropout(0.2))
model3.add(Dense(1, activation='sigmoid'))
model3.summary()

model3.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['acc'])
model3.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

y_pred3 = model3.predict_classes(X_test)
print_result(y_pred2, y_test, "Deep learning standard: ")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 128)               3692544   
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total params: 3,692,673
Trainable params: 3,692,673
Non-trainable params: 0
_________________________________________________________________
Train on 36639 samples, validate on 12213 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep learning standard: 	(accuracy, f1) = (0.99116, 0.99087)
