In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns=100

In [2]:
normalTrainingDataFrame = pd.read_csv("normalTrafficTraining_http_requests.csv")
normalTestDataFrame = pd.read_csv("normalTrafficTest_http_requests.csv")
anomalousTestDataFrame = pd.read_csv("anomalousTrafficTest_http_requests.csv")

In [3]:
testDataFrame = normalTestDataFrame.append(anomalousTestDataFrame, ignore_index=True)

In [4]:
normalTrainingDataFrame = normalTrainingDataFrame.drop(columns="Unnamed: 0")
testDataFrame = testDataFrame.drop(columns="Unnamed: 0")

Dropping Trivial Features' columns

In [5]:
# Finding the indexes of the columns to drop

num_columns = len(testDataFrame.columns)
cols_to_drop_index = []

for i in range(num_columns):
    if len(testDataFrame.iloc[:,i].value_counts()) == 1:
        cols_to_drop_index = cols_to_drop_index + [i]

In [6]:
# Finding the name of the columns to drop

df_columns = testDataFrame.columns
cols_to_drop_name = []

for i in range(len(cols_to_drop_index)):
    next_drop = df_columns[cols_to_drop_index[i]]
    cols_to_drop_name = cols_to_drop_name + [next_drop]

In [7]:
normalTrainingDataFrame.drop(cols_to_drop_name, axis=1, inplace=True)
testDataFrame.drop(cols_to_drop_name, axis=1, inplace=True)

Pre-Processing of Columns

In [8]:
# Re-ordering columns
def re_order_last_col_to_front(df):
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    return df

In [9]:
normalTrainingDataFrame = re_order_last_col_to_front(normalTrainingDataFrame)
testDataFrame = re_order_last_col_to_front(testDataFrame)

Features Manipulation - Based on experience in the binary classifier

In [10]:
normalTrainingDataFrame = normalTrainingDataFrame.drop(columns=['Host', 'Cookie'])
testDataFrame = testDataFrame.drop(columns=['Host', 'Cookie'])

In [11]:
# Dropping "http://localhost:8080" from request_addres
normalTrainingDataFrame.rename(columns={'Request_Address':'Request_Address_URI'}, inplace=True)
testDataFrame.rename(columns={'Request_Address':'Request_Address_URI'}, inplace=True)
normalTrainingDataFrame['Request_Address_URI'] = normalTrainingDataFrame['Request_Address_URI'].apply(lambda x: x[21:])
testDataFrame['Request_Address_URI'] = testDataFrame['Request_Address_URI'].apply(lambda x: x[21:])

In [12]:
normalTrainingDataFrame['Content-Length'] = normalTrainingDataFrame['Content-Length'].replace(np.nan, 0)
testDataFrame['Content-Length'] = testDataFrame['Content-Length'].replace(np.nan, 0)

In [13]:
normalTrainingDataFrame['No_of_slashes'] = normalTrainingDataFrame.apply(lambda row: row.Request_Address_URI.count("/"), axis=1)
normalTrainingDataFrame['No_of_questions'] = normalTrainingDataFrame.apply(lambda row: row.Request_Address_URI.count("?"), axis=1)
normalTrainingDataFrame['No_of_equals'] = normalTrainingDataFrame.apply(lambda row: row.Request_Address_URI.count("="), axis=1)

testDataFrame['No_of_slashes'] = testDataFrame.apply(lambda row: row.Request_Address_URI.count("/"), axis=1)
testDataFrame['No_of_questions'] = testDataFrame.apply(lambda row: row.Request_Address_URI.count("?"), axis=1)
testDataFrame['No_of_equals'] = testDataFrame.apply(lambda row: row.Request_Address_URI.count("="), axis=1)

In [14]:
normalTrainingDataFrame['Put_Request'] = normalTrainingDataFrame['Request_Type'] == "PUT"
testDataFrame['Put_Request'] = testDataFrame['Request_Type'] == "PUT"

In [15]:
normalTrainingDataFrame['No_of_ampersands'] = normalTrainingDataFrame.apply(lambda row: row.Request_Address_URI.count("&"), axis=1)
normalTrainingDataFrame['No_of_periods'] = normalTrainingDataFrame.apply(lambda row: row.Request_Address_URI.count("."), axis=1)

testDataFrame['No_of_ampersands'] = testDataFrame.apply(lambda row: row.Request_Address_URI.count("&"), axis=1)
testDataFrame['No_of_periods'] = testDataFrame.apply(lambda row: row.Request_Address_URI.count("."), axis=1)

In [16]:
normalTrainingDataFrame['Length_of_URI'] = normalTrainingDataFrame.apply(lambda row: len(row.Request_Address_URI), axis=1)

testDataFrame['Length_of_URI'] = testDataFrame.apply(lambda row: len(row.Request_Address_URI), axis=1)

In [17]:
def get_ext(uri):
    tokens = uri.split(".")
    if len(tokens)==1:
        return ""
    else:
        return tokens[len(tokens)-1]
    
def get_final_file(uri):
    tokens = uri.split("/")
    return tokens[len(tokens)-1]

# To try and find bad extensions
def max_no_of_periods_between_slashes(uri):
    num_found = 0
    tokens = uri.split("/")
    for token in tokens:
        num_in_token = token.count(".")
        if num_in_token > num_found:
            num_found = num_in_token
    return num_found

In [18]:
normalTrainingDataFrame['URI_ext'] = normalTrainingDataFrame.apply(lambda row: get_ext(row.Request_Address_URI), axis=1)
normalTrainingDataFrame['URI_file'] = normalTrainingDataFrame.apply(lambda row: get_final_file(row.Request_Address_URI), axis=1)
normalTrainingDataFrame['Max_no_periods_between_slashes'] = normalTrainingDataFrame.apply(lambda row: max_no_of_periods_between_slashes(row.Request_Address_URI), axis=1)
normalTrainingDataFrame['URI_ext_end_with_alpha'] = normalTrainingDataFrame.apply(lambda row: row.URI_ext[-1:].isalpha(), axis=1)
normalTrainingDataFrame['URI_ext_end_with_numeric'] = normalTrainingDataFrame.apply(lambda row: row.URI_ext[-1:].isdigit(), axis=1)
normalTrainingDataFrame['URI_has_CAPS'] = normalTrainingDataFrame.apply(lambda row: any(x.isupper() for x in row.Request_Address_URI), axis=1)

testDataFrame['URI_ext'] = testDataFrame.apply(lambda row: get_ext(row.Request_Address_URI), axis=1)
testDataFrame['URI_file'] = testDataFrame.apply(lambda row: get_final_file(row.Request_Address_URI), axis=1)
testDataFrame['Max_no_periods_between_slashes'] = testDataFrame.apply(lambda row: max_no_of_periods_between_slashes(row.Request_Address_URI), axis=1)
testDataFrame['URI_ext_end_with_alpha'] = testDataFrame.apply(lambda row: row.URI_ext[-1:].isalpha(), axis=1)
testDataFrame['URI_ext_end_with_numeric'] = testDataFrame.apply(lambda row: row.URI_ext[-1:].isdigit(), axis=1)
testDataFrame['URI_has_CAPS'] = testDataFrame.apply(lambda row: any(x.isupper() for x in row.Request_Address_URI), axis=1)

In [19]:
normalTrainingDataFrame['URI_ext_shortened'] = normalTrainingDataFrame.apply(lambda row: row.URI_ext[:3], axis=1)

testDataFrame['URI_ext_shortened'] = testDataFrame.apply(lambda row: row.URI_ext[:3], axis=1)

In [20]:
import math

# Referenced from https://stackoverflow.com/questions/1547899/which-characters-make-a-url-invalid 
URI_CHARS_LIST = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="

def shannon_entropy(data, iterator):
    """
    Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html
    """
    if not data:
        return 0
    entropy = 0
    for x in iterator:
        p_x = float(data.count(x))/len(data)
        if p_x > 0:
            entropy += - p_x*math.log(p_x, 2)
    return entropy

In [21]:
normalTrainingDataFrame['Entropy'] = normalTrainingDataFrame.apply(lambda row: shannon_entropy(row.Request_Address_URI, URI_CHARS_LIST), axis=1)

testDataFrame['Entropy'] = testDataFrame.apply(lambda row: shannon_entropy(row.Request_Address_URI, URI_CHARS_LIST), axis=1)

In [22]:
normalTrainingDataFrame['Length_of_URI_File'] = normalTrainingDataFrame.apply(lambda row: len(row.URI_file), axis=1)

testDataFrame['Length_of_URI_File'] = testDataFrame.apply(lambda row: len(row.URI_file), axis=1)

In [23]:
# Static Attacks Detection

# To detect JSESSION_ID in URL Rewrite, none of rows contain "jsessionid="
normalTrainingDataFrame['Has_set-cookie'] = normalTrainingDataFrame.apply(lambda row: "Set-cookie" in row.Request_Address_URI, axis=1)

testDataFrame['Has_set-cookie'] = testDataFrame.apply(lambda row: "Set-cookie" in row.Request_Address_URI, axis=1)

# Identify accesses to backup files
normalTrainingDataFrame['Access_backup'] = normalTrainingDataFrame.apply(lambda row: (row.URI_ext).lower() == "bak", axis=1)

testDataFrame['Access_backup'] = testDataFrame.apply(lambda row: (row.URI_ext).lower() == "bak", axis=1)

# Identify accesses to config files
normalTrainingDataFrame['Access_config'] = normalTrainingDataFrame.apply(lambda row: (row.URI_ext).lower() == "cnf", axis=1)

testDataFrame['Access_config'] = testDataFrame.apply(lambda row: (row.URI_ext).lower() == "cnf", axis=1)

# Identify accesses to default index.html file
normalTrainingDataFrame['Access_index_html'] = normalTrainingDataFrame.apply(lambda row: "index.html" in row.Request_Address_URI, axis=1)

testDataFrame['Access_index_html'] = testDataFrame.apply(lambda row: "index.html" in row.Request_Address_URI, axis=1)

In [24]:
# Dynamic Attacks Detection

# Identify possible SQL Commands
def spot_sql(uri):
    return ("SELECT" in uri or "FROM" in uri or "UNION" in uri or "OR" in uri or "--" in uri or "/**/" in uri or "INSERT" in uri or "UPDATE" in uri or "DELETE" in uri or "CREATE" in uri or "ALTER" in uri or "DROP" in uri)

normalTrainingDataFrame['Possible_SQL_Injection'] = normalTrainingDataFrame.apply(lambda row: spot_sql(row.Request_Address_URI), axis=1)
testDataFrame['Possible_SQL_Injection'] = testDataFrame.apply(lambda row: spot_sql(row.Request_Address_URI), axis=1)

In [25]:
# Cross site scripting
normalTrainingDataFrame['Contains_script_word'] = normalTrainingDataFrame.apply(lambda row: "script" in row.Request_Address_URI.lower(), axis=1)
normalTrainingDataFrame['Contains_script_word'] = normalTrainingDataFrame.apply(lambda row: "script" in row.Request_Address_URI.lower(), axis=1)

testDataFrame['Contains_another_http'] = testDataFrame.apply(lambda row: "http" in row.Request_Address_URI.lower(), axis=1)
testDataFrame['Contains_another_http'] = testDataFrame.apply(lambda row: "http" in row.Request_Address_URI.lower(), axis=1)

Tidying up of Table - Enumeration and Dropping Features

In [26]:
normalTrainingDataFrame = normalTrainingDataFrame.drop(columns=['URI_ext', 'URI_file', 'URI_ext_shortened', 'Request_Address_URI', 'Request_Type'])
testDataFrame = testDataFrame.drop(columns=['URI_ext', 'URI_file', 'URI_ext_shortened', 'Request_Address_URI', 'Request_Type'])

In [27]:
normalTrainingDataFrame = normalTrainingDataFrame.replace([True, False], [1, 0])
normalTrainingDataFrame = normalTrainingDataFrame.replace(['No', 'Yes'], [-1,1])

testDataFrame = testDataFrame.replace([True, False], [1, 0])
testDataFrame = testDataFrame.replace(['No', 'Yes'], [-1,1])


In [28]:
X_train = normalTrainingDataFrame.drop(columns='Normal_Access')
y_train = normalTrainingDataFrame['Normal_Access']

In [29]:
X_test = testDataFrame.loc[testDataFrame['Normal_Access']==1].drop(columns='Normal_Access')
y_test = testDataFrame.loc[testDataFrame['Normal_Access']==1]['Normal_Access']

X_outliers = testDataFrame.loc[testDataFrame['Normal_Access']==-1].drop(columns='Normal_Access')
y_outliers = testDataFrame.loc[testDataFrame['Normal_Access']==-1]['Normal_Access']

## Baseline Models

In [30]:
# One class svm
from sklearn import svm

svm_clf = svm.OneClassSVM(nu=0.05)
svm_clf.fit(X_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.05, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [31]:
y_pred_train = svm_clf.predict(X_train)
y_pred_test = svm_clf.predict(X_test)
y_pred_outliers = svm_clf.predict(X_outliers)

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

result_df_columns = ['Dataset', 'Accuracy', 'Precision (Micro)', 'Precision (Weighted)', 'Recall (Micro)', 'Recall (Weighted)', 'F-1 (Micro)', 'F1 (Weighted)']
results_svm_dataframe = pd.DataFrame(columns=result_df_columns)


In [33]:
# X_train

x_train_acc = accuracy_score(y_train, y_pred_train)
x_train_prec_mic = precision_score(y_train, y_pred_train, average='micro')
x_train_prec_mac = precision_score(y_train, y_pred_train, average='weighted')
x_train_rec_mic = recall_score(y_train, y_pred_train, average='micro')
x_train_rec_mac = recall_score(y_train, y_pred_train, average='weighted')
x_train_f1_mic = f1_score(y_train, y_pred_train, average='micro')
x_train_f1_mac = f1_score(y_train, y_pred_train, average='weighted')

x_train_res = ["Train", x_train_acc, x_train_prec_mic, x_train_prec_mac, x_train_rec_mic, x_train_rec_mac, x_train_f1_mic, x_train_f1_mac]
x_train_df = pd.DataFrame([x_train_res], columns=result_df_columns)

results_svm_dataframe = results_svm_dataframe.append(x_train_df, ignore_index=True)

In [34]:
# X_test

x_test_acc = accuracy_score(y_test, y_pred_test)
x_test_prec_mic = precision_score(y_test, y_pred_test, average='micro')
x_test_prec_mac = precision_score(y_test, y_pred_test, average='weighted')
x_test_rec_mic = recall_score(y_test, y_pred_test, average='micro')
x_test_rec_mac = recall_score(y_test, y_pred_test, average='weighted')
x_test_f1_mic = f1_score(y_test, y_pred_test, average='micro')
x_test_f1_mac = f1_score(y_test, y_pred_test, average='weighted')

x_test_res = ["Test", x_test_acc, x_test_prec_mic, x_test_prec_mac, x_test_rec_mic, x_test_rec_mac, x_test_f1_mic, x_test_f1_mac]
x_test_df = pd.DataFrame([x_test_res], columns=result_df_columns)

results_svm_dataframe = results_svm_dataframe.append(x_test_df, ignore_index=True)

In [35]:
# X_outliers
x_outliers_acc = accuracy_score(y_outliers, y_pred_outliers)
x_outliers_prec_mic = precision_score(y_outliers, y_pred_outliers, average='micro')
x_outliers_prec_mac = precision_score(y_outliers, y_pred_outliers, average='weighted')
x_outliers_rec_mic = recall_score(y_outliers, y_pred_outliers, average='micro')
x_outliers_rec_mac = recall_score(y_outliers, y_pred_outliers, average='weighted')
x_outliers_f1_mic = f1_score(y_outliers, y_pred_outliers, average='micro')
x_outliers_f1_mac = f1_score(y_outliers, y_pred_outliers, average='weighted')

x_outliers_res = ["Outliers", x_outliers_acc, x_outliers_prec_mic, x_outliers_prec_mac, x_outliers_rec_mic, x_outliers_rec_mac, x_outliers_f1_mic, x_outliers_f1_mac]
x_outliers_df = pd.DataFrame([x_outliers_res], columns=result_df_columns)

results_svm_dataframe = results_svm_dataframe.append(x_outliers_df, ignore_index=True)

In [38]:
results_svm_dataframe

Unnamed: 0,Dataset,Accuracy,Precision (Micro),Precision (Weighted),Recall (Micro),Recall (Weighted),F-1 (Micro),F1 (Weighted)
0,Train,0.713083,0.713083,1.0,0.713083,0.713083,0.713083,0.832514
1,Test,0.712361,0.712361,1.0,0.712361,0.712361,0.712361,0.832022
2,Outliers,0.690006,0.690006,1.0,0.690006,0.690006,0.690006,0.816572


In [45]:
# isolation forest

## Future Iterations

In [43]:
# Reduced Features
# Curse of dimensionality crucial in outlier detection


In [44]:
# plots???

## Conclusion