In [35]:
import os
import pandas as pd

# Directory holding the NSL-KDD CSV files; a relative path, so it is resolved
# against the notebook's working directory.
DATA_PATH = os.path.join('data', 'NSL-KDD')
# File names of the training and test CSVs; the loading cells join them
# onto DATA_PATH.
TRAIN_FILE_NAME = 'kdd-train.csv'
TEST_FILE_NAME = 'kdd-test.csv'

def load_data_set(data_path, filename):
    """Read a single CSV file from a data directory into a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(data_path, filename))

# Load the training CSV.
packets = load_data_set(DATA_PATH, TRAIN_FILE_NAME)
last_row = len(packets)

# Clean-up, expressed as one chain:
#   1. drop the row carrying the last index label
#      (presumably a malformed trailer row -- confirm against the raw file),
#   2. drop the 'idk' column,
#   3. coerce 'duration' to a numeric dtype.
# NOTE(review): unlike the test-set cell, 'class' is not passed through
# convert_class here -- confirm the training CSV already stores numeric labels.
packets = (
    packets
    .drop(packets.index[-1])
    .drop('idk', axis=1)
    .assign(duration=lambda frame: pd.to_numeric(frame['duration']))
)

In [36]:
def one_hot_encode(data, column, prefix=None):
    """Replace one categorical column with its one-hot (dummy) columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    column : str
        Name of the column to encode.
    prefix : str, optional
        Prefix for the generated dummy column names (``<prefix>_<value>``).
        With the default ``None`` the dummy columns are named after the raw
        category values, exactly as before. Pass a prefix when a category
        value could collide with an existing column name, since ``join``
        raises on overlapping column names.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the dummy columns appended.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    ValueError
        If a dummy column name clashes with a remaining column of ``data``.
    """
    # Use a distinct local name: the original shadowed the function itself.
    dummies = pd.get_dummies(data[column], prefix=prefix)
    return data.drop(column, axis=1).join(dummies)

In [37]:
def convert_class(x):
    """Map a class label to a binary flag.

    Returns 0 when ``x`` equals the string ``'normal'`` and 1 for any
    other value.
    """
    return 1 if x != 'normal' else 0

# Load the test CSV.
test_packets = load_data_set(DATA_PATH, TEST_FILE_NAME)
last_row = len(test_packets)

# Drop the row carrying the last index label, then the 'idk' column.
test_packets = (
    test_packets
    .drop(test_packets.index[-1])
    .drop('idk', axis=1)
)

# Binarise the label (0 = 'normal', 1 = anything else) and make 'duration'
# numeric. 'class' is a Python keyword, hence the dict unpacking.
test_packets = test_packets.assign(**{
    'class': test_packets['class'].apply(convert_class),
    'duration': pd.to_numeric(test_packets['duration']),
})

In [38]:
# Remember where the training rows end so the combined frame can be split
# back into its two partitions afterwards.
train_len = len(packets)

# Stack train on top of test and encode them together, so both halves end up
# with the same set of dummy columns.
frames = [packets, test_packets]
temp = pd.concat(frames)

# Called without a `columns` argument, get_dummies encodes every
# object/category column in one pass, so no per-column encoding calls
# (protocol_type, service, flag) are needed.
temp_preprocessed = pd.get_dummies(temp)

# Positional split: the first train_len rows are the training set, the rest
# is the test set.
packets = temp_preprocessed.iloc[:train_len]
test_packets = temp_preprocessed.iloc[train_len:]

In [39]:
def generate_arr(dataset, classification, features=('src_bytes', 'dst_bytes')):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both the feature columns and the label column.
    classification : str
        Name of the label column.
    features : str or sequence of str, optional
        Feature column(s) to extract. Defaults to
        ``('src_bytes', 'dst_bytes')``, the pair that used to be hard-coded,
        so existing two-argument calls behave exactly as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(dataset_arr, classification_arr)``: a 2-D array with one column
        per requested feature, and a 1-D array of labels.

    Raises
    ------
    KeyError
        If the label column or any feature column is missing.
    """
    # A bare string would be split into characters by list(); wrap it so a
    # single feature still yields a 2-D matrix.
    if isinstance(features, str):
        features = [features]
    feature_columns = list(features)

    dataset_arr = dataset[feature_columns].values
    classification_arr = dataset[classification].values
    return dataset_arr, classification_arr

# Build (features, labels) array pairs for the training and the test frame,
# using 'class' as the label column in both.
packets_arr, classification_arr = generate_arr(packets, classification='class')
test_packets_arr, test_classification_arr = generate_arr(test_packets, classification='class')

In [40]:
from sklearn import ensemble

# AdaBoost with library defaults, trained on the training feature matrix.
# NOTE(review): no random_state is passed, so repeated runs may not reproduce
# the exact same model -- confirm whether that matters here.
clf = ensemble.AdaBoostClassifier()
clf.fit(X=packets_arr, y=classification_arr)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [41]:
# Spot-check one test row (position 29). predict() wants a 2-D batch, so the
# single row is reshaped to (1, n_features).
clf.predict(test_packets_arr[29].reshape(1, -1))

array([ 0.])

In [42]:
# Inspect the encoded test frame: schema summary first, then its last rows.
test_packets.info()
test_packets.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22543 entries, 0 to 22542
Columns: 123 entries, duration to flag_SH
dtypes: float64(38), int64(1), uint8(84)
memory usage: 8.7 MB


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
22538,0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
22539,0,794.0,333.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
22540,0,317.0,938.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
22541,0,54540.0,8314.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
22542,0,42.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
# Same inspection for the encoded training frame: schema summary, then its last rows.
packets.info()
packets.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125973 entries, 0 to 125972
Columns: 123 entries, duration to flag_SH
dtypes: float64(38), int64(1), uint8(84)
memory usage: 48.5 MB


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
125968,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
125969,8,105.0,145.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
125970,0,2231.0,384.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
125971,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
125972,0,151.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [48]:
# Display the test feature matrix produced by generate_arr (numpy abbreviates the output).
test_packets_arr

array([[  0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00],
       [  1.29830000e+04,   0.00000000e+00],
       ..., 
       [  3.17000000e+02,   9.38000000e+02],
       [  5.45400000e+04,   8.31400000e+03],
       [  4.20000000e+01,   4.20000000e+01]])

In [45]:
from sklearn.metrics import zero_one_loss

# Predict labels for the whole test set and measure the misclassification
# rate against the true labels.
pred = clf.predict(test_packets_arr)
error_rate = zero_one_loss(y_true=test_classification_arr, y_pred=pred)
error_rate

0.22911768619970718

In [46]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=test_classification_arr, y_pred=pred)

0.77088231380029282