# Deep Learning Approach for Network Intrusion Detection in Software Defined Networking

This is a practical implementation and adaptation of the paper of Tuan A Tang et al.: [10.1109/WINCOM.2016.7777224](https://doi.org/10.1109/WINCOM.2016.7777224).  
Tang et al. built a deep neural network for anomaly-based intrusion detection in a software-defined networking infrastructure and achieved impressive results.  
Besides the practical implementation they made use of the NSL-KDD Dataset.  
As I am using the CICIDS2017 dataset, some tuning of input parameters is required. Mostly, the *count* and *srv_count* variables need to be adapted.  
These variables, which serve as two of six inputs of the neural network at hand, are calculated as the number of connections to the same host/service as the current connection __in the last two seconds__.  
As the CICIDS2017 dataset does not count the number of connections, it remains to be defined how to deal with this.

# Data loading and prep

In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Capture files of the CICIDS2017 dataset, listed Monday through Friday so
# that the combined frame keeps the flows in file order.
datafile_names_sorted = [
    'Monday-WorkingHours.pcap_ISCX_clean.csv',
    'Tuesday-WorkingHours.pcap_ISCX_clean.csv',
    'Wednesday-WorkingHours.pcap_ISCX_clean.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_clean.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_clean.csv',
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_clean.csv'
]

# Collect the per-file frames and concatenate them once at the end.
# Growing a DataFrame with DataFrame.append inside the loop copies all rows
# read so far on every iteration, and the method no longer exists in
# pandas >= 2.0; a single pd.concat yields the same combined frame.
flow_frames = []
for filename in datafile_names_sorted:
    inputFileName = os.path.join('CICIDS2017', filename)
    print('Appending', inputFileName)
    new_flows = pd.read_csv(inputFileName)
    if 'external_ip' not in new_flows:  # This field is not in all datafiles
        new_flows['external_ip'] = "0.0.0.0"
    flow_frames.append(new_flows)

# ignore_index renumbers the rows 0..n-1 across all files; sort=False keeps
# the column order of the files instead of sorting columns alphabetically.
flows = pd.concat(flow_frames, ignore_index=True, sort=False)

print('Found these class labels:', str(flows.label.unique()))
flows.tail()


Appending CICIDS2017\Monday-WorkingHours.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Appending CICIDS2017\Tuesday-WorkingHours.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Appending CICIDS2017\Wednesday-WorkingHours.pcap_ISCX_clean.csv
Appending CICIDS2017\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_clean.csv
Appending CICIDS2017\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_clean.csv
Appending CICIDS2017\Friday-WorkingHours-Morning.pcap_ISCX_clean.csv
Appending CICIDS2017\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_clean.csv
Appending CICIDS2017\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_clean.csv


  interactivity=interactivity, compiler=compiler, result=result)


Found these class labels: ['BENIGN' 'FTPPatator' 'SSHPatator' 'DoSSlowloris' 'DoSSlowhttptest'
 'DoSHulk' 'DoSGoldenEye' 'Heartbleed' 'BruteForce' 'XSS' 'SQLInjection'
 'Infiltration' 'Bot' 'PortScan' 'DDoS']


Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,timestamp,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,bwd_packet_length_max,bwd_packet_length_min,bwd_packet_length_mean,bwd_packet_length_std,flow_bytes_per_s,flow_packets_per_s,flow_iat_mean,flow_iat_std,flow_iat_max,flow_iat_min,fwd_iat_total,fwd_iat_mean,fwd_iat_std,fwd_iat_max,fwd_iat_min,bwd_iat_total,bwd_iat_mean,bwd_iat_std,bwd_iat_max,bwd_iat_min,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fwd_header_length,bwd_header_length,fwd_packets_per_s,bwd_packets_per_s,min_packet_length,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,fin_flag_count,syn_flag_count,rst_flag_count,psh_flag_count,ack_flag_count,urg_flag_count,cwe_flag_count,ece_flag_count,down_per_up_ratio,average_packet_size,avg_fwd_segment_size,avg_bwd_segment_size,fwd_header_length.1,fwd_avg_bytes_per_bulk,fwd_avg_packets_per_bulk,fwd_avg_bulk_rate,bwd_avg_bytes_per_bulk,bwd_avg_packets_per_bulk,bwd_avg_bulk_rate,subflow_fwd_packets,subflow_fwd_bytes,subflow_bwd_packets,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label,external_ip
2830738,192.168.10.17-192.168.10.50-37758-22-6,192.168.10.50,22,192.168.10.17,37758,6,2017-07-07T17:02:00,85,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35294.1,42.5,7.778175,48.0,37.0,0.0,0.0,0.0,0.0,0.0,48.0,48.0,0.0,48.0,48.0,0,0,0,0,32,64,11764.70588,23529.41176,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,2.0,0.0,0.0,0.0,32,0,0,0,0,0,0,1,0,2,0,243,290,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,
2830739,192.168.10.14-23.10.108.151-59111-443-6,192.168.10.14,59111,23.10.108.151,443,6,2017-07-07T17:02:00,113,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,106195.0,17699.1,113.0,0.0,113.0,113.0,113.0,113.0,0.0,113.0,113.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40,0,17699.11504,0.0,6.0,6.0,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,256,-1,1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,
2830740,192.168.10.17-192.168.10.50-37759-22-6,192.168.10.17,37759,192.168.10.50,22,6,2017-07-07T17:02:00,115,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17391.3,115.0,0.0,115.0,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,32,32,8695.652174,8695.652174,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,1.0,0.0,0.0,0.0,32,0,0,0,0,0,0,1,0,1,0,290,243,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,
2830741,162.213.33.50-192.168.10.51-443-59581-6,192.168.10.51,59581,162.213.33.50,443,6,2017-07-07T17:02:00,191310,3,2,148.0,0.0,148.0,0.0,49.333333,85.44784,0.0,0.0,0.0,0.0,773.614,26.1356,47827.5,54961.65969,95485.0,47.0,95825.0,47912.5,67175.85132,95413.0,412.0,95944.0,95944.0,0.0,95944.0,95944.0,0,0,0,0,104,72,15.681355,10.454237,0.0,148.0,24.666667,60.420747,3650.666667,0,0,0,1,0,0,0,0,0.0,29.6,49.333333,0.0,104,0,0,0,0,0,0,3,148,2,0,29200,235,1,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,
2830742,192.168.10.17-192.168.10.50-37759-22-6,192.168.10.50,22,192.168.10.17,37759,6,2017-07-07T17:02:00,81,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37037.0,40.5,12.020815,49.0,32.0,0.0,0.0,0.0,0.0,0.0,49.0,49.0,0.0,49.0,49.0,0,0,0,0,32,64,12345.67901,24691.35802,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,2.0,0.0,0.0,0.0,32,0,0,0,0,0,0,1,0,2,0,243,290,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,


We only need 6 features, so we create a new DF that only holds them.  
The mapping is as follows:  

| NSL-KDD field | CICIDS2017 field |
|---------------|---------------------|
| duration | flow_duration |
| protocol_type | protocol |
| src_bytes | total_fwd_packets |
| dst_bytes | total_backward_packets |
| count | flow_packets_per_s |
| srv_count | destination_port |


In [2]:
# Keep only the six CICIDS2017 columns that stand in for the NSL-KDD inputs
# (see the mapping table above).
features = flows.filter(
    items=[
        'flow_duration',
        'protocol',
        'total_fwd_packets',
        'total_backward_packets',
        'flow_packets_per_s',
        'destination_port',
    ],
    axis=1,
)
# Keras is fed a plain float64 ndarray instead of the DataFrame
features_nd = np.asarray(features.astype('float64'))
features.head()

Unnamed: 0,flow_duration,protocol,total_fwd_packets,total_backward_packets,flow_packets_per_s,destination_port
0,4,6,2,0,500000.0,49188
1,1,6,2,0,2000000.0,49188
2,1,6,2,0,2000000.0,49188
3,1,6,2,0,2000000.0,49188
4,3,6,2,0,666666.6666666666,49486


In [3]:
# Zero out infinite entries in place before scaling. np.isinf matches both
# +inf and -inf, so no infinite value of either sign is left behind.
# NaN entries are deliberately not touched here; the check below reports them.
features_nd[np.isinf(features_nd)] = 0 # FIXME: replace with something sensible
print("Data has NaN:",np.any(np.isnan(features_nd)))
print("Data has only finite values",np.all(np.isfinite(features_nd)))

Data has NaN: False
Data has only finite values True


In [4]:
from keras.preprocessing.text import Tokenizer

# Encode the LABELS as integers. With an empty filter string no character is
# stripped, so 1 entry = 1 label (assuming the label strings contain no
# whitespace, which would split them into several tokens).
label_tokenizer = Tokenizer(num_words=20, filters='')
label_tokenizer.fit_on_texts(flows['label'])

# The fitted tokenizer returns one list of indices per flow
label_sequences = label_tokenizer.texts_to_sequences(flows['label'])

# Flatten the per-flow lists into a single 1-D array. The Tokenizer never
# assigns index 0 to a word, but the class indices need to start at 0, so
# subtract 1 from every class.
enc_labels = np.concatenate(label_sequences).ravel() - 1


Using TensorFlow backend.


## Data Normalization

In [5]:
# FIXME: Don't do normalization on test data!
from sklearn import preprocessing

# Learn the per-column minimum and maximum from the complete feature matrix,
# then rescale every column with them (MinMaxScaler defaults to the [0, 1] range).
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(features_nd)
flows_scaled = min_max_scaler.transform(features_nd)

In [6]:
# Sanity check: report the dimensions and container types of the scaled data
for caption, value in (
    ("Shape of the final netflow dataset:", flows_scaled.shape),
    ("Outer type:", type(flows_scaled)),
    ("Single entry type:", type(flows_scaled[0])),
):
    print(caption, value)

Shape of the final netflow dataset: (2830743, 6)
Outer type: <class 'numpy.ndarray'>
Single entry type: <class 'numpy.ndarray'>


## Runtime prerequisites

In [7]:
from datetime import datetime
from os.path import exists, join

# Settings shared by the cells below

test_size = 0.3     # fraction of the flows held back for validation
batch_size = 10
no_of_classes = np.unique(enc_labels).size  # number of distinct encoded labels

# Every run logs into its own folder, named after the run type and start time
runtype_name = 'sdn-dnn'
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_folder_path = os.path.join('logs', '{}-{}'.format(runtype_name, run_date))

In [8]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
print("No of scaled flows:", len(flows_scaled))
print("No of labels:", len(enc_labels))

# Split the *unscaled* features. shuffle=False keeps the row order, so the
# first (1 - test_size) share of the flows is used for training and the
# remainder for validation.
raw_train, raw_test, labels_train, labels_test = train_test_split(features_nd, enc_labels, test_size=test_size, shuffle=False)

# Fit the min/max statistics on the training part only and reuse them for the
# validation part. Fitting the scaler on all flows would leak information
# about the validation data into the training inputs.
# Validation values may therefore fall outside the [0, 1] range.
split_scaler = preprocessing.MinMaxScaler()
data_train = split_scaler.fit_transform(raw_train)
data_test = split_scaler.transform(raw_test)

print("Training Set Size:",len(labels_train))
print("Validation Set Size:",len(labels_test))

No of scaled flows: 2830743
No of labels: 2830743
Training Set Size: 1981520
Validation Set Size: 849223


In [9]:
# https://github.com/keras-team/keras/blob/master/examples/tensorboard_embeddings_mnist.py

# Store the validation labels next to the TensorBoard logs so that data points
# can be colored by class in TensorBoard.
filename = os.path.join(log_folder_path,'metadata.tsv')
# The metadata file lives directly inside the log folder, so creating that
# folder is all that is needed before writing.
os.makedirs(log_folder_path, exist_ok=True)
with open(filename, 'w') as metadata_file:
    np.savetxt(metadata_file, labels_test)

## Building and training the model

In [10]:
# Time for some nice visualization stuff. Set this up and include as callback, then:
# tensorboard --logdir=path/to/logdir
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard

# Make sure the checkpoint target directory exists before training starts;
# NOTE(review): depending on the Keras version, saving the .h5 file may fail
# if the directory is missing.
os.makedirs('models', exist_ok=True)

callbacks = [
    EarlyStopping(
        # Monitor the validation loss: it measures generalisation rather than
        # fit to the training data, it is the same quantity ModelCheckpoint
        # uses below, and its log name does not depend on the Keras version
        # (the accuracy metric is logged as 'acc' or 'accuracy' depending on it).
        monitor='val_loss',
        patience=3     # Interrupt training after 3 epochs without improvement
    ),
    ModelCheckpoint(
        filepath='models/'+runtype_name+'-{}.h5'.format(run_date),
        monitor='val_loss',   
        save_best_only=True    # Only save one. Only overwrite this one if val_loss has improved
    ),
    TensorBoard(
        log_dir=log_folder_path,
        #histogram_freq=1,     # Record activation histograms every epoch
        #embeddings_freq=1,     # Record embedding data every epoch -> There's something wrong with the embeddings here. Keras crashed with them enabled
        #embeddings_layer_names=['LSTMnet'],
        #embeddings_metadata='metadata.tsv',
        #embeddings_data=data_test,
       # batch_size=batch_size
    )
]

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop
from keras.utils import plot_model

# see implementation/sdn-dnn.py for details, alternatives and comments
# TODO: Isn't there some kind of dropout missing?

# The network ends in a single sigmoid unit trained with binary cross-entropy,
# so it needs 0/1 targets. The encoded labels are multi-class indices, hence
# they are collapsed to "benign" (0) vs. "any attack" (1) here.
# The Tokenizer lower-cases its input by default, so the BENIGN label is
# stored under the key 'benign'; the -1 mirrors the shift applied to enc_labels.
benign_class = label_tokenizer.word_index['benign'] - 1
binary_labels_train = (labels_train != benign_class).astype('float32')
binary_labels_test = (labels_test != benign_class).astype('float32')

# Fully connected 6 -> 12 -> 6 -> 3 -> 1 network over the six flow features
model = Sequential()
model.add(Dense(12, activation='relu', input_dim=6))
model.add(Dense(6, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(RMSprop(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()
plot_model(model, to_file='model-sdn-dnn-alt.png', show_layer_names=True, show_shapes=True)

# Train for at most 100 epochs; the callbacks may stop earlier and take care
# of checkpointing and TensorBoard logging.
history = model.fit(data_train, binary_labels_train, 
                    epochs=100, 
                    batch_size=batch_size,
                    verbose=1,
                    validation_data=(data_test, binary_labels_test),
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 12)                84        
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 78        
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 21        
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 4         
Total params: 187
Trainable params: 187
Non-trainable params: 0
_________________________________________________________________
Train on 1981520 samples, validate on 849223 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
