## Dataset Loading and Selection

In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Load every cleaned CICIDS2017 CSV and combine them into one DataFrame.
DATA_DIR = 'CICIDS2017' # this is where the CIC CSV files live

# Collect the per-file frames first and concatenate once at the end:
# growing a DataFrame inside the loop copies all previous rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
flow_frames = []

# os.listdir() returns entries in arbitrary order; sort them so the row order
# of the combined frame is the same on every machine and every run.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith('_clean.csv'):
        continue
    inputFileName = os.path.join(DATA_DIR, filename)
    print('Appending', inputFileName)
    new_flows = pd.read_csv(inputFileName)
    if 'external_ip' not in new_flows: # This field is not in all datafiles
        new_flows['external_ip'] = np.nan
    flow_frames.append(new_flows)

if not flow_frames:
    # Fail early with a clear message instead of an AttributeError further down
    raise FileNotFoundError(
        "No '*_clean.csv' files found in directory '{}'".format(DATA_DIR)
    )

flows = pd.concat(flow_frames, ignore_index=True, sort=False)

print('Found these class labels:', str(flows.label.unique()))

Appending CICIDS2017\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Appending CICIDS2017\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_clean.csv
Appending CICIDS2017\Friday-WorkingHours-Morning.pcap_ISCX_clean.csv
Appending CICIDS2017\Monday-WorkingHours.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Appending CICIDS2017\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Appending CICIDS2017\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_clean.csv
Appending CICIDS2017\Tuesday-WorkingHours.pcap_ISCX_clean.csv
Appending CICIDS2017\Wednesday-workingHours.pcap_ISCX_clean.csv
Found these class labels: ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration' 'BruteForce' 'XSS'
 'SQLInjection' 'FTPPatator' 'SSHPatator' 'DoSSlowloris' 'DoSSlowhttptest'
 'DoSHulk' 'DoSGoldenEye' 'Heartbleed']


There are a lot of additional, calculated fields in the CIC dataset. Whilst these are interesting to have for research purposes, I am mostly interested in staying as close as possible to conventional netflows.  
For starters, we'll drop the flow_id as well as the timestamp, as both fields introduce problems and are irrelevant.  
As a continuation, we'll drop most of the remaining calculated fields to keep the [Curse of Dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) at bay and keep complexity and training times under control.

In [2]:
# Remove the identifier columns and the calculated statistics that are not
# used as model input.
unused_fields = [
    'flow_id', 'timestamp',
    # per-direction packet length statistics
    'fwd_packet_length_max', 'fwd_packet_length_min',
    'fwd_packet_length_mean', 'fwd_packet_length_std',
    'bwd_packet_length_max', 'bwd_packet_length_min',
    'bwd_packet_length_mean', 'bwd_packet_length_std',
    # inter-arrival time statistics
    'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
    'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
    'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min',
    # overall packet length / size statistics
    'min_packet_length', 'max_packet_length', 'packet_length_mean',
    'packet_length_std', 'packet_length_variance', 'average_packet_size',
    'avg_fwd_segment_size', 'avg_bwd_segment_size',
    # bulk statistics
    'fwd_avg_bytes_per_bulk', 'fwd_avg_packets_per_bulk', 'fwd_avg_bulk_rate',
    'bwd_avg_bytes_per_bulk', 'bwd_avg_packets_per_bulk', 'bwd_avg_bulk_rate',
    # active / idle time statistics
    'active_mean', 'active_std', 'active_max', 'active_min',
    'idle_mean', 'idle_std', 'idle_max', 'idle_min',
    'min_seg_size_forward',
]
flows.drop(columns=unused_fields, inplace=True)
flows.head()

Unnamed: 0,source_ip,source_port,destination_ip,destination_port,protocol,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,flow_bytes_per_s,flow_packets_per_s,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fwd_header_length,bwd_header_length,fwd_packets_per_s,bwd_packets_per_s,fin_flag_count,syn_flag_count,rst_flag_count,psh_flag_count,ack_flag_count,urg_flag_count,cwe_flag_count,ece_flag_count,down_per_up_ratio,fwd_header_length.1,subflow_fwd_packets,subflow_fwd_bytes,subflow_bwd_packets,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,act_data_pkt_fwd,label,external_ip
0,192.168.10.16,41936,199.244.48.55,443,6,143347,47,60,1325.0,108751.0,767898.8748,746.4404557,0,0,0,0,1200,1928,327.875714,418.564742,0,0,0,1,0,0,0,0,1.0,1200,47,1325,60,108751,29200,61,30,BENIGN,
1,192.168.10.16,42970,54.210.195.63,80,6,50905,1,1,0.0,0.0,0.0,39.28887143,0,0,0,0,32,32,19.644436,19.644436,0,0,0,0,1,1,0,0,1.0,32,1,0,1,0,251,110,0,BENIGN,
2,192.168.10.16,41944,199.244.48.55,443,6,143899,46,58,1325.0,110185.0,774918.5192,722.7291364,0,0,0,0,1168,1864,319.668656,403.06048,0,0,0,1,0,0,0,0,1.0,1168,46,1325,58,110185,29200,61,30,BENIGN,
3,192.168.10.17,12886,192.168.10.3,53,17,313,2,2,90.0,206.0,945686.901,12779.55272,0,0,0,0,40,64,6389.776358,6389.776358,0,0,0,0,0,0,0,0,1.0,40,2,90,2,206,-1,-1,1,BENIGN,
4,192.168.10.16,41942,199.244.48.55,443,6,142605,45,58,1325.0,108751.0,771894.3936,722.274815,0,0,0,0,1136,1864,315.556958,406.717857,0,0,0,1,0,0,0,0,1.0,1136,45,1325,58,108751,29200,61,30,BENIGN,


## Data Encoding

There's still a problem: How can we encode IP addresses in a way that the neural network can make use of them while preserving the hierarchical information they contain?  
Encoding IPs through One Hot lets complexity and training times explode, so for now I am splitting each IP into its four octets and interpreting them as numbers.  
Maybe there's a better way to represent them (especially because I am only able to encode IPv4 right now)

In [3]:
# https://stackoverflow.com/questions/14745022/how-to-split-a-column-into-two-columns
# FIXME: Right now, only IPv4 (4 octets)

# Split the string representation of each IP into its four octets, which are
# delimited by a dot, and store them as <field>_o1 .. <field>_o4.
# expand=True returns one column per part. The previous tuple-unpacking of the
# `.str` accessor relied on iterating it, which newer pandas versions no longer
# support, and it failed when a column held no address at all.
ip_fields = ['source_ip', 'destination_ip', 'external_ip']

for ip_field in ip_fields:
    # astype(object) keeps the .str accessor usable when the column is all-NaN
    # (and therefore float typed), e.g. when no file provides external_ip.
    octets = flows[ip_field].astype(object).str.split('.', expand=True)
    if octets.shape[1] > 4:
        raise ValueError(
            "Column '{}' contains values with more than 4 dot-separated parts; "
            "only IPv4 addresses are supported".format(ip_field)
        )
    # Guarantee exactly four octet columns; missing ones are filled with NaN
    octets = octets.reindex(columns=range(4))
    for position in range(4):
        flows['{}_o{}'.format(ip_field, position + 1)] = octets[position]

# After completion, drop the initial columns, as they aren't needed anymore
flows.drop(columns=ip_fields, inplace=True)

# Finally, let's inspect the outcome
flows.head()

Unnamed: 0,source_port,destination_port,protocol,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,flow_bytes_per_s,flow_packets_per_s,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fwd_header_length,bwd_header_length,fwd_packets_per_s,bwd_packets_per_s,fin_flag_count,syn_flag_count,rst_flag_count,psh_flag_count,ack_flag_count,urg_flag_count,cwe_flag_count,ece_flag_count,down_per_up_ratio,fwd_header_length.1,subflow_fwd_packets,subflow_fwd_bytes,subflow_bwd_packets,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,act_data_pkt_fwd,label,source_ip_o1,source_ip_o2,source_ip_o3,source_ip_o4,destination_ip_o1,destination_ip_o2,destination_ip_o3,destination_ip_o4,external_ip_o1,external_ip_o2,external_ip_o3,external_ip_o4
0,41936,443,6,143347,47,60,1325.0,108751.0,767898.8748,746.4404557,0,0,0,0,1200,1928,327.875714,418.564742,0,0,0,1,0,0,0,0,1.0,1200,47,1325,60,108751,29200,61,30,BENIGN,192,168,10,16,199,244,48,55,,,,
1,42970,80,6,50905,1,1,0.0,0.0,0.0,39.28887143,0,0,0,0,32,32,19.644436,19.644436,0,0,0,0,1,1,0,0,1.0,32,1,0,1,0,251,110,0,BENIGN,192,168,10,16,54,210,195,63,,,,
2,41944,443,6,143899,46,58,1325.0,110185.0,774918.5192,722.7291364,0,0,0,0,1168,1864,319.668656,403.06048,0,0,0,1,0,0,0,0,1.0,1168,46,1325,58,110185,29200,61,30,BENIGN,192,168,10,16,199,244,48,55,,,,
3,12886,53,17,313,2,2,90.0,206.0,945686.901,12779.55272,0,0,0,0,40,64,6389.776358,6389.776358,0,0,0,0,0,0,0,0,1.0,40,2,90,2,206,-1,-1,1,BENIGN,192,168,10,17,192,168,10,3,,,,
4,41942,443,6,142605,45,58,1325.0,108751.0,771894.3936,722.274815,0,0,0,0,1136,1864,315.556958,406.717857,0,0,0,1,0,0,0,0,1.0,1136,45,1325,58,108751,29200,61,30,BENIGN,192,168,10,16,199,244,48,55,,,,


The labels of the dataset (as in: *Benign*, *DDoS*, *Portscan*, etc) are converted into a list of integers and split off of the main DataFrame.  
After this step there is a variable `enc_labels` that holds an integer-encoded list of labels.
A humble example (not representative):  

|Label         | Value          |
|------------- |---------:|
|Benign      | 0|
|DDoS        | 1|
|Portscan    | 2|  

So if the order of the first three Netflows would be *Benign*, *Benign*, *DDoS*,  
the resulting `enc_labels` would look like this: `[0,0,1]`

In [4]:
from keras.preprocessing.text import Tokenizer

# Tokenize the LABELS. With filters='' no character is stripped,
# so 1 entry = 1 label.
label_tokenizer = Tokenizer(num_words=20, filters='')
label_column = flows['label']
label_tokenizer.fit_on_texts(label_column)

# Run the fitted tokenizer on the label column; this yields one list of
# indices per row, which is flattened into a single 1-D array.
label_sequences = label_tokenizer.texts_to_sequences(label_column)

# As the Tokenizer documentation states, 0 is never assigned to a label.
# The class indices have to start at 0, so subtract 1 from every entry.
enc_labels = np.concatenate(label_sequences).ravel() - 1

# The labels now live in enc_labels; remove them from the feature frame
flows.drop(columns=['label'], inplace=True)

Using TensorFlow backend.


While we're at it, we make sure to never have any infinite (+/- infinity) or NaN values in the data.  
These are all replaced by zeros, which is a temporary fix and definitely a FIXME for the future

In [5]:
# Weed out all NaN and infinite values.
# DataFrame.replace returns a new frame unless inplace=True is given, so the
# result has to be assigned; otherwise the infinities silently stay in `flows`.
flows = flows.replace([np.inf, -np.inf], np.nan)
flows = flows.fillna(0) # FIXME: 0 for now, find a better way

As Keras seems to be a bit picky about the presented datatypes, we'll convert the Pandas DataFrame into its underlying representation of Numpy arrays and work with these from this point onwards.

In [6]:
# Cast every column (including the octet strings) to float64 and take the
# underlying 2-D numpy array.
flows_nd = flows.astype(np.float64).values

In [7]:
# The pandas-level cleanup is seemingly not enough, so check the numpy array
# once again: every entry that is +inf or -inf is overwritten with 0.
from numpy import inf
flows_nd[np.abs(flows_nd) == inf] = 0

In [8]:
# Sanity check: report whether any NaN is left and whether all values are finite
has_nan = np.isnan(flows_nd).any()
all_finite = np.isfinite(flows_nd).all()
print("Data has NaN:", has_nan)
print("Data has only finite values", all_finite)

Data has NaN: False
Data has only finite values True


## Data Normalization and Finishing Touches

Best performance is achieved if all values are normalized. In this approach I am using [sklearn's MinMaxScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html), which implements feature scaling through MinMax-Normalization (Rescaling).

In [9]:
# FIXME: Don't do normalization on test data!
# The scaler is fitted on the complete dataset here, i.e. before the train/test
# split, so the min/max of the later test rows influence the scaling.
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(flows_nd)
flows_scaled = min_max_scaler.transform(flows_nd)

Let's have a final glance at a single entry of our dataset:

In [10]:
# Show the first row of the scaled feature matrix
print(flows_scaled[0, :])

[6.39902342e-01 6.75974670e-03 3.52941176e-01 1.19466656e-03
 2.09321162e-04 2.05534355e-04 1.02713178e-04 1.65917305e-04
 1.12250385e-01 3.33457740e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 9.99855861e-01 9.94593719e-01
 1.09291905e-04 2.09282371e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 6.41025641e-03 9.99855861e-01
 2.09321162e-04 1.02949899e-04 2.05534355e-04 1.65917305e-04
 4.45571899e-01 9.46044922e-04 1.40477718e-04 8.60986547e-01
 6.58823529e-01 3.92156863e-02 6.27450980e-02 7.80392157e-01
 9.56862745e-01 1.88235294e-01 2.15686275e-01 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00]


In [11]:
# Summarize the container types and dimensions of the final dataset
first_entry = flows_scaled[0]
print("Shape of the final netflow dataset:", flows_scaled.shape)
print("Outer type:", type(flows_scaled))
print("Single entry type:", type(first_entry))

Shape of the final netflow dataset: (2830743, 47)
Outer type: <class 'numpy.ndarray'>
Single entry type: <class 'numpy.ndarray'>


Last but not least, the [Keras Embedding Layer](https://keras.io/layers/embeddings/#embedding) expects a maximum vocabulary size, which we can simply calculate by finding max() in our scaled data:

In [12]:
# Find the maximum vocabulary size: the largest value in the scaled data
# plus one, truncated to an integer.
largest_value = flows_scaled.max()
voc_size = (largest_value + 1).astype(np.int64)
print("Maximum vocabulary size:", voc_size)

Maximum vocabulary size: 2


## Building and Training the Model

We'll use some nice callbacks for the model at hand. As training of LSTM nets is computationally expensive, we'll save the best model to disk.  
Furthermore, we'll implement a callback that stops the training process as soon as the accuracy stops improving.  
Finally, we register the tensorboard callback, which allows for detailed insights and nice visualizations during and after training.

**Also, this is where we define the percentages of train and test**

In [13]:
# Define some semi-global stuff
test_size = 0.3
batch_size = 64
random_seed = 42   # fixed seed so the split is reproducible across runs
no_of_classes = len(np.unique(enc_labels))

# https://stackoverflow.com/questions/3674409/how-to-split-partition-a-dataset-into-training-and-test-datasets-for-e-g-cros/18544946#18544946
# Split training and test data, as the tensorboard embedding stuff needs embedding data, too
from sklearn.model_selection import train_test_split
print("No of scaled flows:", len(flows_scaled))
print("No of labels:", len(enc_labels))

# The rows are the per-file CSVs concatenated one after another. An unshuffled
# split would put only the rows of the last files into the test set, so the
# classes seen during training and evaluation would differ.
# Shuffle, and stratify on the labels so both parts keep the class distribution.
data_train, data_test, labels_train, labels_test = train_test_split(
    flows_scaled,
    enc_labels,
    test_size=test_size,
    shuffle=True,
    stratify=enc_labels,
    random_state=random_seed,
)

print("Training Set Size:",len(labels_train))
print("Validation Set Size:",len(labels_test))

No of scaled flows: 2830743
No of labels: 2830743
Training Set Size: 1981520
Validation Set Size: 849223


In [14]:
from datetime import datetime
from os.path import exists, join

# Timestamp used to name the log directory of this run
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# https://github.com/keras-team/keras/blob/master/examples/tensorboard_embeddings_mnist.py

# Save the class labels of the test set to disk, so that TensorBoard can color
# data points accordingly.
filename = join('logs', 'lstm-{}'.format(run_date), 'metadata.tsv')
metadata_dir = os.path.dirname(filename)
os.makedirs(metadata_dir, exist_ok=True)
with open(filename, 'w') as metadata_file:
    np.savetxt(metadata_file, labels_test)

In [15]:
# Time for some nice visualization stuff. Set this up and include as callback, then:
# tensorboard --logdir=path/to/logdir
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard

# ModelCheckpoint writes into this directory; create it up front so the first
# save does not fail after an epoch of training has already been spent.
os.makedirs('models', exist_ok=True)

callbacks = [
    EarlyStopping(
        # Monitor the validation accuracy rather than the training accuracy:
        # training accuracy usually keeps improving while the model overfits,
        # so it is a poor signal for when to stop.
        monitor='val_acc',
        patience=1     # Interrupt training after val_acc has stopped improving for more than 1 epoch
    ),
    ModelCheckpoint(
        filepath='models/lstm-{}.h5'.format(run_date),
        monitor='val_loss',
        save_best_only=True    # Only save one. Only overwrite this one if val_loss has improved
    ),
    TensorBoard(
        # Histogram and embedding logging are intentionally left disabled:
        # Keras crashed when the embedding options were enabled.
        log_dir='logs/lstm-{}'.format(run_date),
    )
]

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.layers import LSTM

# see https://stackoverflow.com/a/49436133/3864726
# This is especially important in an environment like Jupyter, where the Kernel keeps on running
from keras import backend as K
K.clear_session()

# An Embedding layer expects integer indices. The input here is min-max scaled
# to [0, 1], so every value would be treated as index 0 or 1 and the feature
# values would be lost (which is why the vocabulary size came out as 2).
# Instead, feed the real-valued features to the LSTM directly: each flow becomes
# a sequence of n_features time steps with one value per step.
n_features = data_train.shape[1]

model = Sequential()
model.add(Reshape((n_features, 1), input_shape=(n_features,)))
model.add(LSTM(32, name='LSTMnet'))
model.add(Dense(no_of_classes, activation='softmax')) # Multiclass classification. For binary, one would use i.e. sigmoid

model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy', # Multiclass classification! Binary would be binary_crossentropy
              metrics=['acc'])

history = model.fit(data_train, labels_train,
                    epochs=10,
                    batch_size=batch_size,
                    verbose=1,
                    validation_data=(data_test, labels_test),
                    callbacks=callbacks
                   )

Train on 1981520 samples, validate on 849223 samples
Epoch 1/10


KeyboardInterrupt: 