In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import urllib.parse
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import io
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
def print_result(X_train, X_test, y_train, y_test, clf, clf_name):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    ACC = accuracy_score(y_pred, y_test)
    F1 = f1_score(y_pred, y_test, average='macro')
    print("%s\t(accuracy, f1) = (%.5f, %.5f)"%(clf_name, ACC, F1))

In [3]:
bad_requests = loadData('anomalousRequestTest.txt')
good_requests = loadData('normalRequestTraining.txt')

In [4]:
all_requests = bad_requests + good_requests
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

In [5]:
print("Total requests : ", len(all_requests))
print("Bad requests: ", len(bad_requests))
print("Good requests: ", len(good_requests))

Total requests :  61065
Bad requests:  25065
Good requests:  36000


In [6]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(all_requests)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

In [8]:
print("Requests for Train: ", len(y_train))
print("Requests for Test: ", len(y_test))
print("Use Trigram (n=3). Split Train:Test = 8:2")

Requests for Train:  48852
Requests for Test:  12213
Use Trigram (n=3). Split Train:Test = 8:2


In [9]:
lgs = LogisticRegression()
dtc = tree.DecisionTreeClassifier()
linear_svm = LinearSVC(C=1)
rfc = RandomForestClassifier(n_estimators=50)

In [10]:
print_result(X_train, X_test, y_train, y_test, lgs, "Logistic Regression         ")
print_result(X_train, X_test, y_train, y_test, dtc, "Decision Tree               ")
print_result(X_train, X_test, y_train, y_test, linear_svm, "Linear SVM (C=1)             ")
print_result(X_train, X_test, y_train, y_test, rfc, "Random Forest(tree=50)      ")

Logistic Regression         	(accuracy, f1) = (0.97544, 0.97462)
Decision Tree               	(accuracy, f1) = (0.98338, 0.98286)
Linear SVM (C=1)             	(accuracy, f1) = (0.99329, 0.99307)
Random Forest(tree=50)      	(accuracy, f1) = (0.98035, 0.97971)
