In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import urllib.parse
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import io
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import os

In [2]:
def loadData(file):
    """Read a UTF-8 text file and return its non-blank lines.

    Each line has leading and trailing whitespace stripped; lines that are
    empty after stripping are dropped.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: The stripped, non-empty lines in file order.
    """
    with open(file, 'r', encoding="utf8") as handle:
        # Strip first, then filter, so whitespace-only lines are discarded too.
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [text for text in stripped_lines if text]
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if the denominator is 0."""
    return float(numerator) / denominator if denominator else 0.0

def print_result(X_train, X_test, y_train, y_test, clf, clf_name):
    """Fit ``clf`` on the training split and print binary metrics on the test split.

    The larger of the two labels (in sorted order) is treated as the positive
    class; in this notebook that is 1, the anomalous requests.

    Prints one tab-separated line:
    ``clf_name, PPV (precision), TPR (recall), TNR (specificity), ACC, F1``,
    each metric formatted with five decimals. A metric whose denominator is
    zero is reported as 0.0.

    Args:
        X_train: Training features passed to ``clf.fit``.
        X_test: Test features passed to ``clf.predict``.
        y_train: Training labels passed to ``clf.fit``.
        y_test: True test labels compared against the predictions.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        clf_name: Label printed in the first column.

    Returns:
        None. The metrics are only printed.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # confusion_matrix puts true labels on rows and predicted labels on
    # columns, with labels in sorted order, so the binary layout is
    # [[TN, FP], [FN, TP]] when the second label is the positive class.
    (TN, FP), (FN, TP) = confusion_matrix(y_test, y_pred)
    # Plain Python ints so zero denominators are handled by _safe_ratio
    # instead of silently producing NaN.
    TN, FP, FN, TP = int(TN), int(FP), int(FN), int(TP)
    PPV = _safe_ratio(TP, TP + FP)
    TPR = _safe_ratio(TP, TP + FN)
    # Specificity is TN / (TN + FP); FP / (TN + FP) would be the false-positive rate.
    TNR = _safe_ratio(TN, TN + FP)
    ACC = _safe_ratio(TP + TN, TP + TN + FP + FN)
    F1 = _safe_ratio(2.0 * PPV * TPR, PPV + TPR)
    print("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" %
          (clf_name, PPV, TPR, TNR, ACC, F1))

In [3]:
# Load the two request corpora, one request per non-blank line.
bad_requests = loadData(file='anomalousRequest.txt')
good_requests = loadData(file='normalRequest.txt')

In [4]:
# Label convention: 1 = anomalous request, 0 = normal request.
yBad = [1 for _ in bad_requests]
yGood = [0 for _ in good_requests]

# Samples and labels are concatenated in the same order (bad first, then good)
# so that all_requests[i] is labelled by y[i].
all_requests = bad_requests + good_requests
y = yBad + yGood

In [5]:
# Report the size of each corpus and of the combined data set.
for _caption, _requests in (
    ("Total requests : ", all_requests),
    ("Bad requests: ", bad_requests),
    ("Good requests: ", good_requests),
):
    print(_caption, len(_requests))

Total requests :  61065
Bad requests:  25065
Good requests:  36000


In [6]:
# Character-level trigram TF-IDF features with sublinear term-frequency scaling.
# NOTE(review): the vectorizer is fitted on all requests before the train/test
# split, so the IDF statistics also see the rows that later become the test
# set — consider fitting on the training portion only.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 3),
    sublinear_tf=True,
    min_df=0.0,
)
X = vectorizer.fit_transform(all_requests)

In [7]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [8]:
# Print the TF-IDF matrix; the recorded output lists (row, column) index pairs
# with their weights.
print(X)

  (0, 16824)	0.023397999047973718
  (0, 15969)	0.023386328381543045
  (0, 25962)	0.01735528642034234
  (0, 17512)	0.01735528642034234
  (0, 26170)	0.01735528642034234
  (0, 26099)	0.01735528642034234
  (0, 23045)	0.01735528642034234
  (0, 9621)	0.01735528642034234
  (0, 2264)	0.01735528642034234
  (0, 2442)	0.01735528642034234
  (0, 19972)	0.01735528642034234
  (0, 22239)	0.01735528642034234
  (0, 13711)	0.01735528642034234
  (0, 12379)	0.01735528642034234
  (0, 19818)	0.01735528642034234
  (0, 17462)	0.01735528642034234
  (0, 22689)	0.01735528642034234
  (0, 25345)	0.01735528642034234
  (0, 25750)	0.01735528642034234
  (0, 9625)	0.01735528642034234
  (0, 8300)	0.01735528642034234
  (0, 2925)	0.01735528642034234
  (0, 8291)	0.017421062352646448
  (0, 2645)	0.017922932745589364
  (0, 2521)	0.017932919878393086
  :	:
  (61064, 26121)	0.10972608485041455
  (61064, 15885)	0.1699500568168579
  (61064, 15905)	0.14219486189792674
  (61064, 12672)	0.23027670513340467
  (61064, 25364)	0.1345185

In [9]:
# Report how many labelled requests ended up in each split.
for _caption, _labels in (
    ("Requests for Train: ", y_train),
    ("Requests for Test: ", y_test),
):
    print(_caption, len(_labels))
print("Use Trigram (n=3). Split Train:Test = 8:2.\n")

Requests for Train:  48852
Requests for Test:  12213
Use Trigram (n=3). Split Train:Test = 8:2.



In [10]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Embedding, Reshape

Using TensorFlow backend.


In [11]:
# Take the input width from the feature matrix instead of hard-coding the
# vocabulary size (28847 in the original cell), so the model still matches
# the data when the fitted vocabulary changes.
n_features = X_train.shape[1]

model = Sequential()
# Add a trailing channel axis: (n_features,) -> (n_features, 1), the 3-D
# (batch, steps, channels) layout Conv1D expects.
model.add(Reshape((n_features, 1), input_shape=(n_features,)))
model.add(Conv1D(32, 3, activation='relu'))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
# Single sigmoid unit: the output is a score in (0, 1) for the label-1 class.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
# Train for 5 epochs in mini-batches of 64, holding out 20% of the training
# split for validation. The recorded run showed an ETA of over two hours for
# the first epoch and was interrupted.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Instructions for updating:
Use tf.cast instead.
Train on 39081 samples, validate on 9771 samples
Epoch 1/5
  128/39081 [..............................] - ETA: 2:10:32 - loss: 0.6932 - acc: 0.4922

KeyboardInterrupt: 