# Random Forest with NSL-KDD

This notebook provides comparison stats.  
The NSL-KDD version used is the [preprocessed one by the University of New Brunswick, Canada](http://www.unb.ca/cic/datasets/nsl.html).

In [1]:
import json
import os
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## Data loading and prep

As we've pickled the normalized and encoded dataset, we only need to load these pickles to get the Pandas DataFrames back.  
Hint: If the pickles are missing, run the notebook named Pickle-NSL-KDD.ipynb first to create them.

In [2]:
def load_df(filename):
    """Load a pickled pandas object from the ``NSL_KDD`` directory.

    Parameters
    ----------
    filename : str
        Base name of the pickle, without the ``.pkl`` extension.

    Returns
    -------
    The object stored in ``NSL_KDD/<filename>.pkl``, as returned by
    ``pd.read_pickle``.
    """
    pickle_name = filename + '.pkl'
    return pd.read_pickle(os.path.join('NSL_KDD', pickle_name))

In [3]:
# Restore the four pickled objects: features and labels for the train and test splits.
kdd_train_data = load_df(filename='kdd_train_data')
kdd_test_data = load_df(filename='kdd_test_data')
kdd_train_labels = load_df(filename='kdd_train_labels')
kdd_test_labels = load_df(filename='kdd_test_labels')

In [4]:
# Peek at the last five rows of the training features.
kdd_train_data.tail(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
125968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360078,0.048924,1.0,1.0,0.0,0.0,0.14,0.06,0.0,1.0,0.098039,0.1,0.06,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.028986,0.1
125969,0.000186,7.608895e-08,1.106923e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003914,0.003914,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.956863,0.96,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.5,0.028986,0.0
125970,0.0,1.616709e-06,2.931438e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001957,0.001957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.117647,0.12,0.06,0.0,0.0,0.72,0.0,0.01,0.0,0.0,0.217391,0.0
125971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2818,0.015656,1.0,1.0,0.0,0.0,0.06,0.05,0.0,1.0,0.031373,0.03,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.507246,0.1
125972,0.0,1.094232e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001957,0.001957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.301961,0.3,0.03,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The paper mentions that only six features of the full dataset are used, which is why we filter the DataFrames down to these.

In [5]:
# Only keep the columns that are actually used.
used_fields = ['duration', 'protocol_type', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
# Label-based selection raises a KeyError if any listed feature is absent,
# whereas DataFrame.filter would silently drop it and leave the model with
# fewer features than intended. Column order follows `used_fields` either way.
kdd_train_data = kdd_train_data.loc[:, used_fields]
kdd_test_data = kdd_test_data.loc[:, used_fields]
kdd_train_data.tail()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
125968,0.0,0.0,0.0,0.0,0.360078,0.048924
125969,0.000186,0.5,7.608895e-08,1.106923e-07,0.003914,0.003914
125970,0.0,0.0,1.616709e-06,2.931438e-07,0.001957,0.001957
125971,0.0,0.0,0.0,0.0,0.2818,0.015656
125972,0.0,0.0,1.094232e-07,0.0,0.001957,0.001957


## Label Translation

In [6]:
# Read the label -> encoded-index mapping and remember the code of the 'normal' label.
label_index_path = os.path.join('NSL_KDD', 'kdd_label_wordindex.json')
with open(label_index_path) as index_file:
    data = json.load(index_file)
normal_index = data['normal']

In [7]:
def f(x, benign_index=None):
    """Collapse encoded labels into a binary benign (0) / not benign (1) flag.

    Parameters
    ----------
    x : scalar or array-like
        Encoded label value(s).
    benign_index : optional
        Encoded value that marks the 'normal' label. When omitted, the
        notebook-level ``normal_index`` is used.

    Returns
    -------
    numpy.ndarray
        Integer array with the shape of ``x``: 0 where the value equals the
        benign index, 1 everywhere else.
    """
    if benign_index is None:
        benign_index = normal_index
    # np.where evaluates the comparison on the whole array at once.
    # np.vectorize is a Python-level loop and refuses empty input unless
    # `otypes` is given; this form handles empty arrays as well.
    return np.where(np.asarray(x) == benign_index, 0, 1)

In [8]:
# Peek at the first five label rows before they are reduced to 0/1.
kdd_train_labels.head(5)

Unnamed: 0,label,difficulty_level,label_encoded
0,normal,20,1
1,normal,15,1
2,neptune,19,2
3,normal,21,1
4,normal,21,1


In [9]:
# We only want to know if it's benign or not, so we switch to 0 or 1.
# This cell rebinds the label names from DataFrames to NumPy arrays; the type
# checks make re-running it a no-op instead of failing on the
# 'label_encoded' lookup against an array.
if isinstance(kdd_train_labels, pd.DataFrame):
    kdd_train_labels = f(kdd_train_labels['label_encoded'].values)
if isinstance(kdd_test_labels, pd.DataFrame):
    kdd_test_labels = f(kdd_test_labels['label_encoded'].values)

In [10]:
# First five binary labels (0 = benign, 1 = not benign).
kdd_train_labels[0:5]

array([0, 0, 1, 0, 0])

In [12]:
# Sanity check: features and labels must have matching lengths per split.
train_sizes = (
    ("No of scaled train entries:\t", kdd_train_data),
    ("No of train labels:\t\t", kdd_train_labels),
)
test_sizes = (
    ("No of test entries:\t\t", kdd_test_data),
    ("No of test labels:\t\t", kdd_test_labels),
)
for caption, collection in train_sizes:
    print(caption, len(collection))
print("-----------")
for caption, collection in test_sizes:
    print(caption, len(collection))

No of scaled train entries:	 125973
No of train labels:		 125973
-----------
No of test entries:		 22543
No of test labels:		 22543


## Building and Training the Model

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters gathered in one place so they are easy to read and tweak.
forest_params = dict(
    n_estimators=100,  # Number of trees in the forest
    oob_score=True,    # Also compute the out-of-bag score during fitting
    n_jobs=-1,         # Set number of jobs = CPU cores
    random_state=0,    # Fixed init state
)
rf = RandomForestClassifier(**forest_params)

# fit() returns the estimator, so the fitted model's repr is the cell output.
rf.fit(kdd_train_data, kdd_train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

In [14]:
from sklearn.metrics import accuracy_score

# Score the fitted forest on the separate test split.
predicted = rf.predict(kdd_test_data)
accuracy = accuracy_score(y_true=kdd_test_labels, y_pred=predicted)

# Report the out-of-bag estimate next to the test-set accuracy, both to
# three significant digits.
print(
    f'Out-of-bag score estimate: {rf.oob_score_:.3}',
    f'Mean accuracy score: {accuracy:.3}',
    sep='\n',
)

Out-of-bag score estimate: 0.962
Mean accuracy score: 0.728
