# XGBoost with CICIDS

This notebook provides comparison statistics.  
The CICIDS2017 version used is [from the University of New Brunswick, Canada](http://www.unb.ca/cic/datasets/ids-2017.html).

Problem / ToDo summary:

- is my keras layer architecture right? 4 vs. 5 layers
    - especially the last layer - image shows 2 nodes -> I am using 1 node with binary crossentropy
- Float issue / overflow with infinite values being recognized by numpy/pandas
- count & srv_count don't have an appropriate representation in CICIDS
- training dropout not defined by authors
- activation functions not defined by authors
- normalization strategy not defined by authors

# Data loading and prep

In [None]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# CICIDS2017 capture files, listed in chronological order (Monday -> Friday).
datafile_names_sorted = [
    'Monday-WorkingHours.pcap_ISCX_clean.csv',
    'Tuesday-WorkingHours.pcap_ISCX_clean.csv',
    'Wednesday-WorkingHours.pcap_ISCX_clean.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_clean.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_clean.csv'
]

# Collect one frame per file and concatenate once at the end.
# DataFrame.append is deprecated (removed in pandas 2.0), and growing a frame
# inside a loop copies every previously loaded row again on each iteration.
loaded_frames = []
for filename in datafile_names_sorted:
    inputFileName = os.path.join('CICIDS2017', filename)
    print('Appending', inputFileName)
    new_flows = pd.read_csv(inputFileName)
    if 'external_ip' not in new_flows:  # This field is not in all datafiles
        new_flows['external_ip'] = "0.0.0.0"
    loaded_frames.append(new_flows)

# ignore_index gives the combined frame a fresh 0..n-1 index;
# sort=False keeps the column order of the input files.
flows = pd.concat(loaded_frames, ignore_index=True, sort=False)

print('Found these class labels:', str(flows.label.unique()))
flows.tail()


We only need 6 features, so we create a new DF that only holds them.  
The mapping is as follows:  

| NSL-KDD field | CICIDS2017 field |
|---------------|---------------------|
| duration | flow_duration |
| protocol_type | protocol |
| src_bytes | total_fwd_packets |
| dst_bytes | total_backward_packets |
| count | flow_packets_per_s |
| srv_count | destination_port |


In [None]:
# The six CICIDS2017 columns used as model input (see the mapping table above).
selected_columns = [
    'flow_duration',
    'protocol',
    'total_fwd_packets',
    'total_backward_packets',
    'flow_packets_per_s',
    'destination_port',
]
features = flows.filter(items=selected_columns, axis=1)

# Keep a plain float64 ndarray copy of the selected columns for the numeric steps below.
features_nd = features.astype('float64').values
features.head()

In [None]:
# Zero out every infinite entry. np.isinf matches both +inf and -inf, so the
# "only finite values" check below cannot fail because of a negative infinity.
# NaN entries are deliberately left untouched here and are only reported.
# FIXME: replace with something sensible
features_nd[np.isinf(features_nd)] = 0
print("Data has NaN:",np.any(np.isnan(features_nd)))
print("Data has only finite values",np.all(np.isfinite(features_nd)))

In [None]:
from keras.preprocessing.text import Tokenizer

# Tokenize the LABELS. filters='' disables character filtering so that
# one entry is meant to map to one label.
# NOTE(review): presumably every label is a single whitespace-free token;
# a label containing spaces would yield more than one index per row - confirm.
label_tokenizer = Tokenizer(num_words=20, filters='')
label_column = flows['label']
label_tokenizer.fit_on_texts(label_column)

# Encode the label column with the fitted tokenizer (one sequence per row).
enc_labels = label_tokenizer.texts_to_sequences(label_column)

# Flatten the per-row sequences into one 1-D array. The tokenizer never hands
# out index 0, but the following steps need classes starting at 0, so every
# index is shifted down by one.
enc_labels = np.concatenate(enc_labels).ravel() - 1


In [None]:
def f(x):
    """Binarize encoded class indices.

    Parameters
    ----------
    x : scalar or array-like of numbers
        Encoded class index/indices.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``x`` (0-d for a scalar):
        1 where the value is greater than 0, otherwise 0.

    The comparison is done by NumPy on the whole array at once instead of
    calling a Python function per element, and an empty input returns an
    empty array instead of raising.
    """
    return (np.asarray(x) > 0).astype(int)

# We only want to know if it's benign or not, so we switch to 0 or 1:
# index 0 stays 0, every index greater than 0 becomes 1.
# NOTE(review): this assumes the benign class was encoded as index 0 above - confirm.
enc_labels = f(enc_labels)

## Data Split and Normalization

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the flows held out for evaluation.
test_size = 0.3

flow_count = len(features_nd)
label_count = len(enc_labels)
print("No of flows:", flow_count)
print("No of labels:", label_count)

# shuffle=False keeps the original row order: the first 70% of the rows
# become the training split and the remaining 30% the held-out split.
data_train, data_test, labels_train, labels_test = train_test_split(
    features_nd,
    enc_labels,
    test_size=test_size,
    shuffle=False,
)

train_count = len(labels_train)
held_out_count = len(labels_test)
print("Training Set Size:", train_count)
print("Validation Set Size:", held_out_count)

In [None]:
from sklearn import preprocessing

# Learn the per-feature min/max from the training split only, then apply
# that same scaling to both the training and the held-out split.
min_max_scaler = preprocessing.MinMaxScaler().fit(data_train)
data_train = min_max_scaler.transform(data_train)
data_test = min_max_scaler.transform(data_test)

In [None]:
# Quick sanity report on the scaled training data: shape, container type, row type.
dataset_summary = (
    ("Shape of the final netflow dataset:", data_train.shape),
    ("Outer type:", type(data_train)),
    ("Single entry type:", type(data_train[0])),
)
for caption, value in dataset_summary:
    print(caption, value)

## Runtime prerequisites

In [None]:
from datetime import datetime
from os.path import exists, join

# Notebook-level settings
batch_size, epochs, learn_rate = 10, 100, 0.001

# Each run logs into its own folder: logs/<run type>-<timestamp>
runtype_name = 'cicids2017-sdn-dnn'
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_folder_path = os.path.join('logs', f'{runtype_name}-{run_date}')

In [None]:
# https://github.com/keras-team/keras/blob/master/examples/tensorboard_embeddings_mnist.py

# Save the held-out class labels to disk to color data points in TensorBoard accordingly.
filename = os.path.join(log_folder_path,'metadata.tsv')
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The file handle is deliberately not named `f`: that name is bound to the
# label-binarizing function in this notebook, and rebinding it to a closed
# file would break re-running the cell that calls it.
with open(filename, 'w') as metadata_file:
    np.savetxt(metadata_file, labels_test)

## Building and training the model

In [None]:
from xgboost import XGBClassifier

# Classifier settings: 100 boosting rounds, n_jobs=-1, and a fixed seed.
xgb_params = {
    'n_estimators': 100,
    'n_jobs': -1,
    'random_state': 0,
}
xgb = XGBClassifier(**xgb_params)

# Train on the scaled training split; the fitted estimator is the cell output.
xgb.fit(data_train, labels_train)

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate on the held-out CICIDS split built in this notebook
# (data_test / labels_test); no `kdd_test*` variables exist here.
# The result variable keeps its existing name so any later cell that
# refers to it keeps working.
kdd_test_predicitions = xgb.predict(data_test)
# Make sure every prediction is a plain integer class label before scoring.
predicted = [round(value) for value in kdd_test_predicitions]

accuracy = accuracy_score(labels_test, predicted)
print(f'Mean accuracy score: {accuracy:.3}')