# XGBoost with NSL-KDD

This notebook provides comparison stats.  
The NSL-KDD version used is the [preprocessed one by the University of New Brunswick, Canada](http://www.unb.ca/cic/datasets/nsl.html).

## Data loading and prep

In [1]:
import os
import pandas as pd
import numpy as np
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# 'Field Names.csv' has no header row: column 0 holds the field name, column 1 its type.
header_col = pd.read_csv(os.path.join('NSL_KDD', 'Field Names.csv'), header=None)
# Add the two extra fields ('label' and 'unknown') to the end of the field list.
# NOTE(review): presumably the data files carry these two trailing columns - confirm.
# DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in replacement
# (same defaults: the original row index is kept, columns are not sorted).
header_col = pd.concat([header_col, pd.DataFrame([['label', 'symbolic'], ['unknown', 'continuous']])])

# Only the names (column 0) are needed to label the data files.
header_names = header_col[0].values

### Training Set

In [2]:
# The training file has no header row, so the column names are supplied explicitly.
ftrain = os.path.join('NSL_KDD', 'KDDTrain+.csv')
kdd_train = pd.read_csv(ftrain, names=header_names, header=None)

# Set the label column aside before the frame is narrowed down to the features.
kdd_train_labels = kdd_train['label']

# Restrict the frame to the features that are actually used.
used_fields = [
    'duration',
    'protocol_type',
    'src_bytes',
    'dst_bytes',
    'count',
    'srv_count',
]
kdd_train = kdd_train.filter(items=used_fields)
kdd_train.head()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
0,0,tcp,491,0,2,2
1,0,udp,146,0,13,1
2,0,tcp,0,0,123,6
3,0,tcp,232,8153,5,5
4,0,tcp,199,420,30,32


### Test Set

In [3]:
# The test file has no header row either; reuse the same column names.
ftest = os.path.join('NSL_KDD', 'KDDTest+.csv')
kdd_test = pd.read_csv(ftest, names=header_names, header=None)

# Set the label column aside before the frame is narrowed down to the features.
kdd_test_labels = kdd_test['label']

# Restrict the frame to the same feature subset as the training set.
kdd_test = kdd_test.filter(items=used_fields)

kdd_test.tail()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
22538,0,tcp,794,333,1,1
22539,0,tcp,317,938,2,11
22540,0,tcp,54540,8314,5,10
22541,0,udp,42,42,4,6
22542,0,tcp,0,0,4,10


Encode the labels as a numeric list

In [4]:
from keras.preprocessing.text import Tokenizer
# Tokenize the LABELS. filters='' disables character filtering so label text is kept as-is.
label_tokenizer = Tokenizer(num_words=50, filters='')
# Fit on the train and test labels together so both sets share one label -> index mapping.
# Series.append was removed in pandas 2.0; pd.concat is the replacement.
label_tokenizer.fit_on_texts(pd.concat([kdd_train_labels, kdd_test_labels]).values)
# NOTE(review): the Tokenizer numbers words by descending frequency and only emits the
# num_words - 1 most frequent ones; this assumes fewer than 50 distinct labels - confirm.

# Run the fitted tokenizer on the label column and flatten the per-label sequences into a 1-D array.
# (assumes every label is a single token, i.e. yields exactly one index - TODO confirm)
kdd_train_labels = label_tokenizer.texts_to_sequences(kdd_train_labels)
kdd_train_labels = np.concatenate(kdd_train_labels).ravel()

# The Tokenizer never assigns index 0 to a word, but the class ids should start at 0,
# so subtract 1 from every class.
# NOTE(review): class 0 is therefore the most frequent label - confirm it is the benign one.
kdd_train_labels = kdd_train_labels - 1

# Do the same for the test labels.
kdd_test_labels = label_tokenizer.texts_to_sequences(kdd_test_labels)
kdd_test_labels = np.concatenate(kdd_test_labels).ravel()
kdd_test_labels = kdd_test_labels - 1

Using TensorFlow backend.


In [5]:
def f(x):
    """Collapse an encoded class id into a binary flag.

    Returns 1 when ``x`` is greater than 0, otherwise 0.
    """
    return 1 if x > 0 else 0
# Vectorize so f can be applied element-wise to a whole array.
# otypes is set explicitly: without it np.vectorize infers the output dtype from a
# first call and therefore raises ValueError on size-0 input.
f = np.vectorize(f, otypes=[int])

# Only benign vs. not benign matters from here on, so every class id is reduced to 0 or 1.
kdd_train_labels, kdd_test_labels = f(kdd_train_labels), f(kdd_test_labels)

In [6]:
# Stack the train and test frames into one frame; it is used to fit the tokenizer.
full = pd.concat(objs=[kdd_train, kdd_test])
kdd_test.tail()

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,count,srv_count
22538,0,tcp,794,333,1,1
22539,0,tcp,317,938,2,11
22540,0,tcp,54540,8314,5,10
22541,0,udp,42,42,4,6
22542,0,tcp,0,0,4,10


In [7]:
# Fit a tokenizer on the protocol_type values of the combined frame.
protocol_tokenizer = Tokenizer(filters='', num_words=50)
protocol_tokenizer.fit_on_texts(full['protocol_type'])

# Encode the protocol_type column of each split with the fitted tokenizer (train first, then test).
train_enc, test_enc = (
    protocol_tokenizer.texts_to_sequences(split['protocol_type'])
    for split in (kdd_train, kdd_test)
)

In [8]:
# Replace the textual protocol_type column with its tokenized counterpart.
# drop() is used without inplace=True so the swap is a single expression: if the concat
# fails, kdd_train is left untouched instead of half-modified.
kdd_train = pd.concat(
    [
        kdd_train.drop(columns='protocol_type'),
        pd.DataFrame(train_enc, columns=['protocol_type']),
    ],
    axis=1,   # place the encoded column next to the remaining features (aligned on the row index)
    sort=False,
)
kdd_train.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,protocol_type
125968,0,0,0,184,25,1
125969,8,105,145,2,2,2
125970,0,2231,384,1,1,1
125971,0,0,0,144,8,1
125972,0,151,0,1,1,1


In [9]:
# Replace the textual protocol_type column with its tokenized counterpart.
# drop() is used without inplace=True so the swap is a single expression: if the concat
# fails, kdd_test is left untouched instead of half-modified.
kdd_test = pd.concat(
    [
        kdd_test.drop(columns='protocol_type'),
        pd.DataFrame(test_enc, columns=['protocol_type']),
    ],
    axis=1,   # place the encoded column next to the remaining features (aligned on the row index)
    sort=False,
)
kdd_test.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,protocol_type
22538,0,794,333,1,1,1
22539,0,317,938,2,11,1
22540,0,54540,8314,5,10,1
22541,0,42,42,4,6,2
22542,0,0,0,4,10,1


## Data Normalization

In [10]:
# Fit on train data, transform test data with that
# https://stats.stackexchange.com/questions/174823/how-to-apply-standardization-normalization-to-train-and-testset-if-prediction-i
from sklearn import preprocessing

# Learn the per-feature min/max from the training data only ...
min_max_scaler = preprocessing.MinMaxScaler().fit(kdd_train)
# ... and apply that same scaling to both splits.
kdd_train = min_max_scaler.transform(kdd_train)
kdd_test = min_max_scaler.transform(kdd_test)

In [12]:
# Quick look at what the scaled training data is: its shape and container types.
for caption, value in (
    ("Shape of the final netflow dataset:", kdd_train.shape),
    ("Outer type:", type(kdd_train)),
    ("Single entry type:", type(kdd_train[0])),
):
    print(caption, value)

Shape of the final netflow dataset: (125973, 6)
Outer type: <class 'numpy.ndarray'>
Single entry type: <class 'numpy.ndarray'>


## Runtime Prerequisites

In [13]:
from datetime import datetime
from os.path import exists, join

# Semi-global settings for this run

batch_size = 10
epochs = 100
learn_rate = 0.001

# A timestamp in the log folder name gives every run its own folder.
run_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
runtype_name = 'nsl-kdd-sdn-dnn'
log_folder_path = os.path.join('logs', f'{runtype_name}-{run_date}')

# Echo the settings so they are recorded in the cell output.
print('Using set globals: ')
for caption, setting in (
    ('Batch Size:', batch_size),
    ('Epochs:', epochs),
    ('Learn rate:', learn_rate),
):
    print(caption, setting)

Using set globals: 
Batch Size: 10
Epochs: 100
Learn rate: 0.001


In [14]:
# Entry and label counts per split, printed side by side so a mismatch is easy to spot.
train_rows, train_label_rows = len(kdd_train), len(kdd_train_labels)
test_rows, test_label_rows = len(kdd_test), len(kdd_test_labels)

print("No of scaled train entries:\t", train_rows)
print("No of train labels:\t\t", train_label_rows)
print("-----------")
print("No of test entries:\t\t", test_rows)
print("No of test labels:\t\t", test_label_rows)

No of scaled train entries:	 125973
No of train labels:		 125973
-----------
No of test entries:		 22543
No of test labels:		 22543


## Building and Training the Model

In [16]:
from xgboost import XGBClassifier

# Settings passed to the classifier; everything else stays at the library defaults.
xgb_params = dict(n_estimators=100, n_jobs=-1, random_state=0)
xgb = XGBClassifier(**xgb_params)

# Train on the scaled features and the binary labels; the fitted model is the cell output.
xgb.fit(kdd_train, kdd_train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [17]:
from sklearn.metrics import accuracy_score

# Predict a class for every entry of the scaled test set.
kdd_test_predicitions = xgb.predict(kdd_test)
# Round to integer class ids in one vectorized step instead of a per-element Python loop.
# np.rint rounds half to even, like the built-in round() used before.
predicted = np.rint(kdd_test_predicitions).astype(int)

# Fraction of test entries whose predicted class equals the true binary label.
accuracy = accuracy_score(kdd_test_labels, predicted)
# ':.3' formats the score with three significant digits.
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.767


  if diff:
