# Network Intrusion Detection with Deep Learning

In [1]:
import os


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


## The Data

In [3]:
# For the original '99 KDD dataset: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
# For the NSL-KDD Train+/Test+ data: https://github.com/defcom17/NSL_KDD

In [4]:
# Read the feature-name file; every line after the first has the form
# "<name>: <type>", so the text before the colon is the column name.
with open('../data_NSL/kddcup.names', 'r') as infile:
    kdd_names = list(infile)
kdd_cols = [line.partition(':')[0] for line in kdd_names[1:]]

In [5]:
# The Train+/Test+ datasets include sample difficulty rating and the attack class

In [6]:
# Append the two trailing columns of the Train+/Test+ files (extends the list in place)
kdd_cols.extend(['class', 'difficulty'])

In [7]:
# Column names come from kdd_cols; because names= is given without header=,
# pandas treats the first line of each file as data.
kdd, kdd_t = (
    pd.read_csv(path, names=kdd_cols)
    for path in ('../data_NSL/KDDTrain+.txt', '../data_NSL/KDDTest+.txt')
)

In [8]:
kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [9]:
# Consult the linked references for attack categories: 
# https://www.researchgate.net/post/What_are_the_attack_types_in_the_NSL-KDD_TEST_set_For_example_processtable_is_a_attack_type_in_test_set_Im_wondering_is_it_prob_DoS_R2L_U2R
# The traffic can be grouped into 5 categories: Normal, DOS, U2R, R2L, Probe
# or more coarsely into Normal vs Anomalous for the binary classification task

In [10]:
# Column order used after one-hot encoding: the first raw column, then the
# sorted levels of protocol_type, service and flag (taken from the training
# frame), then every raw column from position 4 onwards.
kdd_cols = [
    kdd.columns[0],
    *sorted(kdd['protocol_type'].unique()),
    *sorted(kdd['service'].unique()),
    *sorted(kdd['flag'].unique()),
    *kdd.columns[4:],
]

In [11]:
# Build the attack-name -> category lookup from a whitespace-separated
# two-column text file. The context manager closes the file handle as soon as
# parsing finishes, and blank lines are skipped so a trailing newline in the
# file cannot break the two-token unpacking.
with open("../data_NSL/training_attack_types.txt", 'r') as infile:
    attack_map = dict(line.split() for line in infile if line.strip())

In [12]:
# Build the second attack-name -> category lookup from the "clean" mapping
# file (same whitespace-separated two-column layout). The context manager
# closes the file handle as soon as parsing finishes, and blank lines are
# skipped so a trailing newline in the file cannot break the two-token unpacking.
with open("../data_NSL/training_attack_types_clean.txt", 'r') as infile:
    attack_map2 = dict(line.split() for line in infile if line.strip())

In [13]:
attack_map

{'apache2': 'dos',
 'arppoison': 'dos',
 'back': 'dos',
 'buffer_overflow': 'u2r',
 'casesen': 'u2r',
 'crashiis': 'dos',
 'desnuke': 'dos',
 'dict': 'u2r',
 'eject': 'u2r',
 'fdformat': 'u2r',
 'ffbconfig': 'u2r',
 'framespoof': 'u2r',
 'ftp_write': 'r2l',
 'ftpwrite': 'r2l',
 'guess_passwd': 'r2l',
 'guest': 'r2l',
 'httptunnel': 'u2r',
 'illegal-sniffer': 'u2r',
 'imap': 'r2l',
 'ipsweep': 'probe',
 'land': 'dos',
 'loadmodule': 'u2r',
 'lsdomain': 'u2r',
 'mailbomb': 'dos',
 'mscan': 'probe',
 'msscan': 'probe',
 'multihop': 'r2l',
 'named': 'r2l',
 'ncftp': 'r2l',
 'neptune': 'dos',
 'netbus': 'r2l',
 'netcat': 'r2l',
 'nmap': 'probe',
 'ntfsdos': 'u2r',
 'ntinfoscan': 'u2r',
 'nukepw': 'u2r',
 'perl': 'u2r',
 'phf': 'r2l',
 'pod': 'dos',
 'portsweep': 'probe',
 'ppmacro': 'r2l',
 'processtable': 'dos',
 'ps': 'u2r',
 'queso': 'u2r',
 'rootkit': 'u2r',
 'saint': 'probe',
 'satan': 'probe',
 'sechole': 'u2r',
 'secret': 'u2r',
 'selfping': 'dos',
 'sendmail': 'r2l',
 'smurf': 'dos'

In [14]:
attack_map2

{'arppoison': 'dos',
 'back': 'dos',
 'buffer_overflow': 'u2r',
 'casesen': 'u2r',
 'crashiis': 'dos',
 'desnuke': 'dos',
 'dict': 'u2r',
 'eject': 'u2r',
 'fdformat': 'u2r',
 'ffbconfig': 'u2r',
 'framespoof': 'u2r',
 'ftp_write': 'r2l',
 'ftpwrite': 'r2l',
 'guess_passwd': 'r2l',
 'guest': 'r2l',
 'illegal-sniffer': 'u2r',
 'imap': 'r2l',
 'ipsweep': 'probe',
 'land': 'dos',
 'loadmodule': 'u2r',
 'lsdomain': 'u2r',
 'msscan': 'probe',
 'multihop': 'r2l',
 'ncftp': 'r2l',
 'neptune': 'dos',
 'netbus': 'r2l',
 'netcat': 'r2l',
 'nmap': 'probe',
 'ntfsdos': 'u2r',
 'ntinfoscan': 'u2r',
 'nukepw': 'u2r',
 'perl': 'u2r',
 'phf': 'r2l',
 'pod': 'dos',
 'portsweep': 'probe',
 'ppmacro': 'r2l',
 'queso': 'u2r',
 'rootkit': 'u2r',
 'satan': 'probe',
 'sechole': 'u2r',
 'secret': 'u2r',
 'selfping': 'dos',
 'smurf': 'dos',
 'snmpget': 'r2l',
 'spy': 'r2l',
 'sshtrojan': 'r2l',
 'syslogd': 'dos',
 'teardrop': 'dos',
 'tepreset': 'dos',
 'ucpstorm': 'dos',
 'warezclient': 'r2l',
 'warezmaster':

In [15]:
kdd['class'].value_counts()

normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: class, dtype: int64

In [16]:
# Independent copy of the KDD test frame, used for the "clean" test set
kdd_t2 = kdd_t.copy(deep=True)

In [17]:
# Here we opt for the 5-class problem: raw attack names are replaced by their
# category. Train and the full test set use attack_map; the clean test copy
# uses attack_map2. Names without a mapping entry are left unchanged.
for frame, mapping in ((kdd, attack_map), (kdd_t, attack_map), (kdd_t2, attack_map2)):
    frame['class'] = frame['class'].replace(mapping)


In [18]:
kdd['class'].value_counts()

normal    67343
dos       45927
probe     11656
r2l         995
u2r          52
Name: class, dtype: int64

In [19]:
kdd_t['class'].value_counts()

normal    9711
dos       7458
r2l       2754
probe     2421
u2r        200
Name: class, dtype: int64

In [20]:
kdd_t2['class'].value_counts()

normal           9711
dos              5741
r2l              2199
probe            1106
mscan             996
apache2           737
processtable      685
snmpguess         331
saint             319
mailbomb          293
snmpgetattack     178
httptunnel        133
u2r                37
named              17
ps                 15
sendmail           14
xterm              13
xlock               9
xsnoop              4
worm                2
udpstorm            2
sqlattack           2
Name: class, dtype: int64

In [21]:
kdd.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,125973.0,287.14465,2604.515,0.0,0.0,0.0,0.0,42908.0
src_bytes,125973.0,45566.743,5870331.0,0.0,0.0,44.0,276.0,1379964000.0
dst_bytes,125973.0,19779.114421,4021269.0,0.0,0.0,0.0,516.0,1309937000.0
land,125973.0,0.000198,0.01408607,0.0,0.0,0.0,0.0,1.0
wrong_fragment,125973.0,0.022687,0.25353,0.0,0.0,0.0,0.0,3.0
urgent,125973.0,0.000111,0.01436603,0.0,0.0,0.0,0.0,3.0
hot,125973.0,0.204409,2.149968,0.0,0.0,0.0,0.0,77.0
num_failed_logins,125973.0,0.001222,0.04523914,0.0,0.0,0.0,0.0,5.0
logged_in,125973.0,0.395736,0.4890101,0.0,0.0,0.0,1.0,1.0
num_compromised,125973.0,0.27925,23.94204,0.0,0.0,0.0,0.0,7479.0


In [22]:
kdd.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [23]:
def cat_encode(df, col):
    """Return a copy of ``df`` with column ``col`` replaced by one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical column.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col``, followed by one indicator column per distinct
        value of ``col`` (named after the value itself, no prefix).
    """
    # Encode the Series rather than its bare ``.values`` so the indicator
    # frame carries df's index. concat(axis=1) aligns on the index, so a
    # fresh RangeIndex would misalign and add NaN rows for any frame whose
    # index is not 0..n-1 (e.g. after row filtering).
    dummies = pd.get_dummies(df[col])
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)

In [24]:
def log_trns(df, col):
    """Return column ``col`` of ``df`` transformed with ``log(1 + x)``.

    ``df`` itself is not modified; the caller assigns the returned Series
    back if it wants to overwrite the column.
    """
    column = df[col]
    return column.apply(np.log1p)

In [25]:
# One-hot encode each categorical column in all three frames
cat_lst = ['protocol_type', 'service', 'flag']
for col in cat_lst:
    kdd, kdd_t, kdd_t2 = (cat_encode(frame, col) for frame in (kdd, kdd_t, kdd_t2))

In [26]:
kdd_t.shape

(22544, 118)

In [27]:
kdd_t2.shape

(22544, 118)

In [28]:
# Raw attack names that are still present in kdd_t2['class'] after the
# attack_map2 replacement; rows carrying them are removed from the clean set.
unmapped_attacks = [
    'apache2', 'httptunnel', 'mscan', 'named', 'processtable', 'ps',
    'saint', 'sendmail', 'snmpgetattack', 'snmpguess', 'sqlattack',
    'udpstorm', 'worm', 'xlock', 'xsnoop', 'xterm', 'mailbomb',
]
# A single boolean mask replaces one filtering pass per label. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
kdd_t2 = kdd_t2[~kdd_t2['class'].isin(unmapped_attacks)].copy()

In [29]:
kdd_t2['class'].value_counts()

normal    9711
dos       5741
r2l       2199
probe     1106
u2r         37
Name: class, dtype: int64

In [30]:
kdd_t2.shape

(18794, 118)

In [31]:
# Apply log(1 + x) to these three columns in every frame
log_lst = ['duration', 'src_bytes', 'dst_bytes']
for frame in (kdd, kdd_t, kdd_t2):
    for col in log_lst:
        frame[col] = log_trns(frame, col)

In [32]:
kdd_t2.shape

(18794, 118)

In [33]:
# Bring all three frames to the same column set and order (kdd_cols).
# Columns missing from a test frame are added as all-zero columns first.
kdd = kdd[kdd_cols]
for frame in (kdd_t, kdd_t2):
    for col in kdd_cols:
        if col not in frame.columns:
            frame[col] = 0
kdd_t = kdd_t[kdd_cols]
kdd_t2 = kdd_t2[kdd_cols]

In [34]:
kdd_t2.shape

(18794, 124)

In [35]:
# Now we have used one-hot encoding and log scaling

In [36]:
kdd.head()

Unnamed: 0,duration,icmp,tcp,udp,IRC,X11,Z39_50,aol,auth,bgp,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0.0,0,1,0,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0.0,0,0,1,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0.0,0,1,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos,19
3,0.0,0,1,0,0,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0.0,0,1,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [37]:
# Training labels, kept for the class-weight computation
target_class = kdd.loc[:, 'class']

In [38]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the training labels. np.unique returns the labels
# sorted, and the weights are reported in that same order.
class_labels = np.unique(target_class)
class_weight = compute_class_weight(class_weight='balanced', classes=class_labels, y=target_class)
print(class_weight)

[5.48579267e-01 3.74123517e-01 2.16151338e+00 2.53212060e+01
 4.84511538e+02]


In [39]:
# Remove the difficulty and class columns from each feature frame and keep
# them as separate Series (pop mutates the frame in place).
difficulty, target = kdd.pop('difficulty'), kdd.pop('class')
y_diff, y_test = kdd_t.pop('difficulty'), kdd_t.pop('class')
y_diff2, y_test2 = kdd_t2.pop('difficulty'), kdd_t2.pop('class')

In [40]:
# Report the feature-matrix shape of every split
for label, frame in (("data Train: ", kdd), ("data Test: ", kdd_t), ("data Test2: ", kdd_t2)):
    print(label, frame.shape)

data Train:  (125973, 122)
data Test:  (22544, 122)
data Test2:  (18794, 122)


In [41]:
# Convert the class labels from categorical values to one-hot encoded columns
y_train, y_test, y_test2 = (
    pd.get_dummies(labels) for labels in (target, y_test, y_test2)
)

In [42]:
y_train.head()

Unnamed: 0,dos,normal,probe,r2l,u2r
0,0,1,0,0,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0


In [43]:
y_test.head()

Unnamed: 0,dos,normal,probe,r2l,u2r
0,1,0,0,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,1,0,0


In [44]:
kdd.dtypes

duration                       float64
icmp                             uint8
tcp                              uint8
udp                              uint8
IRC                              uint8
X11                              uint8
Z39_50                           uint8
aol                              uint8
auth                             uint8
bgp                              uint8
courier                          uint8
csnet_ns                         uint8
ctf                              uint8
daytime                          uint8
discard                          uint8
domain                           uint8
domain_u                         uint8
echo                             uint8
eco_i                            uint8
ecr_i                            uint8
efs                              uint8
exec                             uint8
finger                           uint8
ftp                              uint8
ftp_data                         uint8
gopher                   

In [45]:
# Inspect which feature columns are stored as float64
gl_float = kdd.select_dtypes(include='float64')
print(gl_float.columns)
print(gl_float.shape)
# Only 18 columns are float; the others are integer-typed

Index(['duration', 'src_bytes', 'dst_bytes', 'serror_rate', 'srv_serror_rate',
       'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')
(125973, 18)


In [46]:
# Convert the feature frames and the one-hot label frames to NumPy arrays
train, test, test2 = kdd.values, kdd_t.values, kdd_t2.values
y_train, y_test, y_test2 = y_train.values, y_test.values, y_test2.values


In [47]:
# We rescale features to [0, 1]

In [48]:
# Fit the scaler on the training matrix only, then apply that same scaling to
# every split; test values outside the training range can fall outside [0, 1].
min_max_scaler = MinMaxScaler().fit(train)
train, test, test2 = (min_max_scaler.transform(split) for split in (train, test, test2))

In [65]:
def save_matrices(A, B, C, D, E, F, G, file_name):
    """Write seven arrays back-to-back into one binary file.

    The arrays are stored with ``np.save`` in argument order (A first, G
    last) on a single file handle opened in ``'wb'`` mode, so an existing
    file at ``file_name`` is overwritten. To read them back, open the file
    once and call ``np.load`` on the handle seven times in the same order.
    """
    with open(file_name, 'wb') as out:
        for matrix in (A, B, C, D, E, F, G):
            np.save(out, matrix)

In [66]:
# Persist the three feature matrices, their one-hot labels and the class
# weights into a single file, in this order.
my_file = '../data_NSL/data.npy'
save_matrices(train, test, test2, y_train, y_test, y_test2, class_weight, file_name=my_file)


In [67]:
train

array([[0.  , 0.  , 1.  , ..., 0.  , 0.05, 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , ..., 1.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 1.  , ..., 0.  , 0.01, 0.  ],
       [0.  , 0.  , 1.  , ..., 1.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ]])

In [68]:
test2

array([[0.        , 0.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.10299326, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.07      ,
        0.07      ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [61]:
class_weight

array([5.48579267e-01, 3.74123517e-01, 2.16151338e+00, 2.53212060e+01,
       4.84511538e+02])