In [1]:
import warnings

import numpy as np
import pandas as pd

# Notebook-wide configuration: show up to 200 columns when a frame is
# displayed, and silence every Python warning for the rest of the session.
pd.set_option("display.max_columns", 200)
warnings.filterwarnings('ignore')

In [2]:
# Attack dataset: a single compressed CSV.
df = pd.read_csv("../../1_Attack dataset/Internet/attack_dataset.csv.gz")

# Bonafide traffic from mawilab: three compressed CSVs, stacked in this order.
bonafide_paths = [
    '../../2_Bonafide dataset/data/bonafide_dataset_20191121.csv.gz',
    '../../2_Bonafide dataset/data/bonafide_dataset_20201110.csv.gz',
    '../../2_Bonafide dataset/data/bonafide_dataset_20201129.csv.gz',
]
bonafide = pd.concat([pd.read_csv(path) for path in bonafide_paths])

# Quick sanity check of (rows, columns) for both frames.
print(df.shape, bonafide.shape)

(455503, 42) (380438, 41)


# Pre-processing

In [3]:
# Tag every row of the bonafide frame with the constant label "bonafide".
bonafide = bonafide.assign(label="bonafide")

### Convert specific hex fields to integer

In [4]:
# Columns whose values are re-read as base-16 numbers (reused for the
# bonafide frame in the next step).
fields = [
    'eth.type', 'ip.id', 'ip.flags', 'ip.checksum',
    'ip.dsfield', 'tcp.flags', 'tcp.checksum',
]


def hex_to_int(value):
    """Return the integer obtained by parsing ``str(value)`` as base 16."""
    return int(str(value), 16)


# No NA handling here: a missing value would stringify to 'nan' and make
# int(..., 16) raise ValueError.
for column in fields:
    df[column] = df[column].map(hex_to_int)
    

### Fill NA with 0 and convert hex to integer

In [5]:
# Missing values become 0 first, so they stringify to '0' and parse cleanly
# as base 16 below.
bonafide = bonafide.fillna(0)

# Same base-16 conversion as for the attack frame, over the same `fields`.
for column in fields:
    bonafide[column] = bonafide[column].map(lambda raw: int(str(raw), 16))

### Create a dataset with all packets (bonafide and attack)

In [6]:
# Stack bonafide rows on top of attack rows. Each frame keeps its own index
# labels (no ignore_index), so labels are not renumbered here.
full_data = pd.concat((bonafide, df), axis=0)

### Check if there are packets with protocol field different than TCP (value 6)

In [7]:
proto = full_data['ip.proto']

# Per-label row counts of everything that is not TCP (ip.proto != 6);
# this is an array with one entry per label present among those rows.
wrong_proto = full_data.loc[proto.ne(6), 'label'].value_counts().values

# Keep TCP packets only.
full_data = full_data.loc[proto.eq(6)]
print("Found and removed", wrong_proto,"packets from the original dataset.")

Found and removed [52177] packets from the original dataset.


### Features not applicable to this work

> Remove features from layer 2 (link):
> - frame_info.time
> - frame_info.encap_type
> - frame_info.time_epoch
> - frame_info.number
> - frame_info.len
> - frame_info.cap_len
> - eth.type

> Remove redundant or invariant (constant) features:
> - ip.version - we are considering only IPv4
> - ip.proto - we are considering only TCP
> - ip.src
> - ip.dst
> - ip.flags
> - tcp.flags

In [8]:
# Link-layer / capture metadata that is not used as a feature.
layer2_columns = [
    'frame_info.time', 'frame_info.encap_type', 'frame_info.time_epoch',
    'frame_info.number', 'frame_info.len', 'frame_info.cap_len', 'eth.type',
]
# Redundant or constant fields (IPv4-only, TCP-only, addresses, packed flags).
redundant_columns = [
    'ip.flags', 'ip.src', 'ip.dst', 'ip.version', 'ip.proto', 'tcp.flags',
]
full_data.drop(columns=layer2_columns + redundant_columns, inplace=True)

### Remove columns with variance zero

In [9]:
# Columns listed by the notebook as having zero variance; the list is
# hard-coded rather than computed here.
zero_variance_columns = [
    'ip.hdr_len', 'ip.tos', 'ip.flags.rb', 'ip.flags.mf', 'ip.frag_offset',
]
full_data.drop(columns=zero_variance_columns, inplace=True)

### Replace labels by 0 (bonafide) and 1 (attack)

In [10]:
# Binary target: 0 for bonafide traffic, 1 for every attack (scan) label.
#
# The column is rebuilt with a single whole-column assignment instead of
# chained indexing (`full_data.label[mask] = ...`). Chained assignment is not
# guaranteed to write back into `full_data`, and it never does when pandas
# Copy-on-Write is enabled.
#
# Rows that are already 0 are treated as bonafide too, so re-running this
# cell leaves an already-encoded column unchanged.
current_label = full_data['label']
is_bonafide = (current_label == "bonafide") | (current_label == 0)
full_data['label'] = (~is_bonafide).astype(int)
full_data['label'].value_counts()

1    455503
0    328261
Name: label, dtype: int64

### Remove more columns
>
> - the checksum and acknowledgment fields (`ip.checksum`, `tcp.checksum`, `tcp.ack`) are random
> - `tcp.dstport` would lead the models to learn the testbed (some tools targeted specific services)

In [11]:
# Checksums and the acknowledgment number, plus the destination port.
extra_columns = ["ip.checksum", "tcp.checksum", "tcp.ack", "tcp.dstport"]
full_data.drop(columns=extra_columns, inplace=True)

### Drop duplicates

In [12]:
# Keep the first occurrence of each distinct row and renumber the index 0..n-1.
full_data = full_data.drop_duplicates().reset_index(drop=True)

# Generate models

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# preprocessor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Best Params from Grid Search (previous step)

- MLP: {'hidden_layer_sizes': 10}
- SVM: {}
- KNN: {'n_neighbors': 1}
- XGB: {}
-  NB: {}
-  LR: {}
-  RF: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 15}
-  DT: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 15}

### Prepare data

In [14]:
# Any remaining missing value is treated as 0.
full_data = full_data.fillna(0)

# Target vector and integer-typed feature matrix (every column except the label).
y = full_data["label"]
X = full_data.drop(columns="label").astype(int)

# Show the feature names, in the column order the models are trained on.
X.columns.values

array(['ip.id', 'ip.flags.df', 'ip.ttl', 'ip.len', 'ip.dsfield',
       'tcp.srcport', 'tcp.seq', 'tcp.len', 'tcp.hdr_len',
       'tcp.flags.fin', 'tcp.flags.syn', 'tcp.flags.reset',
       'tcp.flags.push', 'tcp.flags.ack', 'tcp.flags.urg',
       'tcp.flags.cwr', 'tcp.window_size', 'tcp.urgent_pointer',
       'tcp.options.mss_val'], dtype=object)

### Prepare data - Pre-processing for ML

In [15]:
# Fit the standardiser on the full feature matrix. X itself is not
# transformed in this cell.
prep = StandardScaler().fit(X)
prep

StandardScaler()

In [16]:
import os

from joblib import dump

# Persist the fitted scaler next to the exported models.
# The output directory is created if missing so the cell works on a fresh
# checkout, and the file is opened in a `with` block so the handle is
# flushed and closed even if serialisation fails.
os.makedirs('Models', exist_ok=True)
with open('Models/preprocessor.pkl', 'wb') as preprocessor_file:
    dump(prep, preprocessor_file)

# Generate ML Models

Models with the parameters determined with grid search in the previous step.

In [17]:
# Shared seed so every stochastic estimator is reproducible.
RANDOM_STATE = 17

# NOTE(review): the grid-search summary above lists {'hidden_layer_sizes': 10},
# while this builds two hidden layers of 10 units — confirm which is intended.
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=RANDOM_STATE)
svm = LinearSVC(random_state=RANDOM_STATE)
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
xgb = XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE)
nb = GaussianNB()
lr = LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)
rf = RandomForestClassifier(
    class_weight="balanced_subsample",
    criterion="entropy",
    max_depth=10,
    n_estimators=15,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
dt = DecisionTreeClassifier(
    class_weight=None,
    criterion="entropy",
    max_depth=15,
    random_state=RANDOM_STATE,
)

Generate the _full model_ (supervised learning) with the entire dataset

In [18]:
# Train every estimator on the complete feature matrix, in a fixed order.
# NOTE(review): the models are fit on the raw (unscaled) X, although a
# StandardScaler is fitted and exported earlier — confirm whether the
# consumer of these models applies that scaler before predicting.
for estimator in (mlp, svm, knn, xgb, nb, lr, rf, dt):
    estimator.fit(X, y)

# Display the last fitted estimator as the cell output.
dt



DecisionTreeClassifier(criterion='entropy', max_depth=15, random_state=17)

Exporting the models to be used by the IDS application

In [19]:
# Serialise each trained model to its own pickle under Models/.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the model is written; the original left all eight handles open.
model_exports = (
    (mlp, 'Models/mlp.pkl'),
    (svm, 'Models/svm.pkl'),
    (knn, 'Models/knn.pkl'),
    (xgb, 'Models/xgb.pkl'),
    (nb, 'Models/nb.pkl'),
    (lr, 'Models/lr.pkl'),
    (rf, 'Models/rf.pkl'),
    (dt, 'Models/dt.pkl'),
)
for model, model_path in model_exports:
    with open(model_path, 'wb') as model_file:
        dump(model, model_file)