
# Network Intrusion Detection with Deep Learning

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## The Data

In [2]:
# Read the training and test flow tables from their CSV exports
# (the '10Train' / '2.5Test' folder names presumably denote sampling fractions — TODO confirm).
kdd, kdd_t = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../CIDS-2018/10Train/DataTrain10.csv',
        '../CIDS-2018/2.5Test/DataTest2_5.csv',
    )
)

In [3]:
kdd.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,8080,6,02/03/2018 12:27:17,597,2,0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
1,8080,6,02/03/2018 11:17:23,10578,3,4,326.0,129.0,326.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
2,8080,6,02/03/2018 11:43:28,523,2,0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
3,8080,6,02/03/2018 11:06:22,538,2,0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
4,53,17,02/03/2018 11:19:42,12870,1,1,51.0,136.0,51.0,51.0,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [4]:
kdd_t.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,51252,6,02/03/2018 03:53:06,11868,5,2,129.0,326.0,112.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
1,8080,6,02/03/2018 03:05:55,11164,3,4,326.0,129.0,326.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
2,8080,6,02/03/2018 11:26:37,704,2,0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
3,3389,6,02/03/2018 12:39:48,4187266,14,8,1460.0,1731.0,741.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,8080,6,02/03/2018 02:23:53,10521,3,4,326.0,129.0,326.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot


In [5]:
kdd.isnull().any().any()

False

In [6]:
list(kdd)

['Dst Port',
 'Protocol',
 'Timestamp',
 'Flow Duration',
 'Tot Fwd Pkts',
 'Tot Bwd Pkts',
 'TotLen Fwd Pkts',
 'TotLen Bwd Pkts',
 'Fwd Pkt Len Max',
 'Fwd Pkt Len Min',
 'Fwd Pkt Len Mean',
 'Fwd Pkt Len Std',
 'Bwd Pkt Len Max',
 'Bwd Pkt Len Min',
 'Bwd Pkt Len Mean',
 'Bwd Pkt Len Std',
 'Flow Byts/s',
 'Flow Pkts/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Tot',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Tot',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'Fwd Header Len',
 'Bwd Header Len',
 'Fwd Pkts/s',
 'Bwd Pkts/s',
 'Pkt Len Min',
 'Pkt Len Max',
 'Pkt Len Mean',
 'Pkt Len Std',
 'Pkt Len Var',
 'FIN Flag Cnt',
 'SYN Flag Cnt',
 'RST Flag Cnt',
 'PSH Flag Cnt',
 'ACK Flag Cnt',
 'URG Flag Cnt',
 'CWE Flag Count',
 'ECE Flag Cnt',
 'Down/Up Ratio',
 'Pkt Size Avg',
 'Fwd Seg Size Avg',
 'Bwd Seg Size Avg',
 'Fwd B

In [7]:
kdd.dtypes

Dst Port               int64
Protocol               int64
Timestamp             object
Flow Duration          int64
Tot Fwd Pkts           int64
Tot Bwd Pkts           int64
TotLen Fwd Pkts      float64
TotLen Bwd Pkts      float64
Fwd Pkt Len Max      float64
Fwd Pkt Len Min      float64
Fwd Pkt Len Mean     float64
Fwd Pkt Len Std      float64
Bwd Pkt Len Max      float64
Bwd Pkt Len Min      float64
Bwd Pkt Len Mean     float64
Bwd Pkt Len Std      float64
Flow Byts/s          float64
Flow Pkts/s          float64
Flow IAT Mean        float64
Flow IAT Std         float64
Flow IAT Max         float64
Flow IAT Min         float64
Fwd IAT Tot          float64
Fwd IAT Mean         float64
Fwd IAT Std          float64
Fwd IAT Max          float64
Fwd IAT Min          float64
Bwd IAT Tot          float64
Bwd IAT Mean         float64
Bwd IAT Std          float64
                      ...   
URG Flag Cnt           int64
CWE Flag Count         int64
ECE Flag Cnt           int64
Down/Up Ratio 

In [8]:
# Remove the 'Timestamp' column (object dtype, see the dtypes listing) from the training frame.
kdd = kdd.drop(columns=['Timestamp'])
kdd.head()

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,8080,6,597,2,0,0.0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
1,8080,6,10578,3,4,326.0,129.0,326.0,0.0,108.666667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
2,8080,6,523,2,0,0.0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
3,8080,6,538,2,0,0.0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
4,53,17,12870,1,1,51.0,136.0,51.0,51.0,51.0,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [9]:
# Remove the 'Timestamp' column from the test frame so it matches the training frame's layout.
kdd_t = kdd_t.drop(columns=['Timestamp'])
kdd_t.head()

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,51252,6,11868,5,2,129.0,326.0,112.0,0.0,25.8,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
1,8080,6,11164,3,4,326.0,129.0,326.0,0.0,108.666667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
2,8080,6,704,2,0,0.0,0.0,0.0,0.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot
3,3389,6,4187266,14,8,1460.0,1731.0,741.0,0.0,104.285714,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,8080,6,10521,3,4,326.0,129.0,326.0,0.0,108.666667,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bot


In [10]:
# Build the lookup {fine-grained label -> attack family} from a text file holding one
# comma-separated "label,family" pair per line.
with open("../CIDS-2018/attack_type.txt", 'r') as attack_file:
    # The context manager closes the handle deterministically; blank lines are skipped because
    # they would split into a single field and break the 2-tuple unpacking below.
    attack_pairs = [line.strip().split(',') for line in attack_file if line.strip()]
attack_map = {k: v for (k, v) in attack_pairs}

In [11]:
attack_map

{'DDOS attack-HOIC': 'DDoS',
 'DDoS attacks-LOIC-HTTP': 'DDoS',
 'DoS attacks-Hulk': 'DoS',
 'Bot': 'Bot',
 'FTP-BruteForce': 'BruteForce',
 'SSH-Bruteforce': 'BruteForce',
 'Infilteration': 'Infilteration',
 'DoS attacks-SlowHTTPTest': 'DoS',
 'DoS attacks-GoldenEye': 'DoS',
 'DoS attacks-Slowloris': 'DoS',
 'DDOS attack-LOIC-UDP': 'DDoS',
 'Brute Force -Web': 'Web Attack',
 'Brute Force -XSS': 'Web Attack',
 'SQL Injection': 'Web Attack'}

In [12]:
kdd['Label'].value_counts()

Benign                      803025
DDOS attack-HOIC            192098
DDoS attacks-LOIC-HTTP      161447
DoS attacks-Hulk            138459
Bot                          85842
FTP-BruteForce               58055
SSH-Bruteforce               56332
Infilteration                48347
DoS attacks-SlowHTTPTest     41974
DoS attacks-GoldenEye        25008
DoS attacks-Slowloris         6612
DDOS attack-LOIC-UDP          1362
Brute Force -Web               496
Brute Force -XSS               187
SQL Injection                   71
Name: Label, dtype: int64

In [13]:
kdd_t['Label'].value_counts()

Benign                      201238
DDOS attack-HOIC             48006
DDoS attacks-LOIC-HTTP       40219
DoS attacks-Hulk             34758
Bot                          21479
FTP-BruteForce               14452
SSH-Bruteforce               14013
Infilteration                11892
DoS attacks-SlowHTTPTest     10484
DoS attacks-GoldenEye         6123
DoS attacks-Slowloris         1630
DDOS attack-LOIC-UDP           368
Brute Force -Web               115
Brute Force -XSS                43
SQL Injection                   16
Name: Label, dtype: int64

In [14]:
# Collapse the fine-grained 'Label' values into coarser attack families via attack_map and store
# them in a new 'class' column. Labels that are not keys of attack_map are left unchanged by
# replace() (the recorded value_counts output shows 7 resulting classes, including 'Benign').
for frame in (kdd, kdd_t):
    frame['class'] = frame['Label'].replace(attack_map)



In [15]:
kdd['class'].value_counts()

Benign           803025
DDoS             354907
DoS              212053
BruteForce       114387
Bot               85842
Infilteration     48347
Web Attack          754
Name: class, dtype: int64

In [16]:
kdd_t['class'].value_counts()

Benign           201238
DDoS              88593
DoS               52995
BruteForce        28465
Bot               21479
Infilteration     11892
Web Attack          174
Name: class, dtype: int64

In [17]:
def cat_encode(df, col):
    """Replace column ``col`` of ``df`` with its one-hot (dummy) encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical column.
    col : label
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` removed and one indicator column per distinct
        value of ``col`` appended on the right. The indicator columns are named
        after the raw values themselves (no prefix). Row order and index are
        those of ``df``.
    """
    dummies = pd.get_dummies(df[col].values)
    # get_dummies on a bare array comes back with a fresh 0..n-1 index. Re-attach the frame's
    # own index so the concat below pairs rows positionally instead of aligning on mismatched
    # labels (which would add NaN-filled rows for any non-default index).
    dummies.index = df.index
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)

In [18]:
def log_trns(df, col):
    """Return ``log(1 + x)`` of column ``col`` of ``df`` as a new Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    col : label
        Name of a numeric column.

    Returns
    -------
    pandas.Series
        Element-wise ``numpy.log1p`` of ``df[col]``, with the same index. As with
        any ``log1p``, inputs of exactly -1 give ``-inf`` and inputs below -1
        give ``NaN``.
    """
    values = df[col]
    # NumPy picks the result precision from the input width: 8-bit integer and boolean inputs
    # would yield float16. Integer-like columns are therefore promoted to float64 first so a
    # memory-downcast column does not silently lose precision here.
    if values.dtype.kind in 'iub':
        values = values.astype(np.float64)
    # Suppress floating-point warnings for out-of-domain inputs; the results are still -inf / NaN.
    with np.errstate(all='ignore'):
        return np.log1p(values)

In [19]:
kdd_t['Dst Port'].value_counts()


80       156869
53        64176
443       32430
3389      28516
21        24952
8080      21149
22        14210
445       10842
0          3682
5355        729
3128        443
135         363
137         337
67          224
123         189
139         132
500         111
23          103
138          80
27017        37
1900         24
49152        23
8545         19
50774        19
8443         19
51044        17
50760        17
6667         16
1433         16
49956        15
          ...  
58266         1
17294         1
25482         1
42642         1
59034         1
9347          1
37269         1
56220         1
35734         1
39828         1
2950          1
35478         1
49567         1
64236         1
59802         1
61849         1
63896         1
43410         1
42898         1
47504         1
24971         1
27018         1
62189         1
45201         1
26762         1
58095         1
61339         1
59290         1
63384         1
21108         1
Name: Dst Port, Length: 

In [20]:
kdd_t['Protocol'].value_counts()

6     334704
17     66450
0       3682
Name: Protocol, dtype: int64

def downcast_numeric(df):
    """Shrink the numeric columns of ``df`` in place to reduce memory use.

    float64 columns are passed through ``pd.to_numeric(..., downcast='float')`` and
    int64 columns through ``pd.to_numeric(..., downcast='unsigned')``. Columns of
    any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for column in df.select_dtypes(include=['float64']).columns:
        df[column] = pd.to_numeric(df[column], errors='coerce', downcast='float')
    for column in df.select_dtypes(include=['int64']).columns:
        df[column] = pd.to_numeric(df[column], errors='coerce', downcast='unsigned')
    return df


# Memory footprint of the training frame before downcasting.
kdd.info(memory_usage='deep')

kdd = downcast_numeric(kdd)

kdd.dtypes

# Memory footprint of the training frame after downcasting.
kdd.info(memory_usage='deep')

# Same treatment for the test frame.
kdd_t = downcast_numeric(kdd_t)

kdd_t.dtypes

kdd_t.info(memory_usage='deep')

In [21]:
kdd.isnull().any().any()

False

In [22]:
kdd_t.isnull().any().any()

False

In [23]:
# One-hot encode the categorical feature(s) in both frames.
cat_lst = ['Protocol']
for col in cat_lst:
    kdd = cat_encode(kdd, col)
    kdd_t = cat_encode(kdd_t, col)
# The two frames are encoded independently, so a category value present in only one split would
# give them different columns and shift every feature after conversion to arrays. Force the test
# frame onto the training layout: dummy columns absent from the test split are added as all-zero,
# and columns unseen in training are dropped. When the layouts already match this is a no-op.
kdd_t = kdd_t.reindex(columns=kdd.columns, fill_value=0)

In [24]:
kdd.shape

(1619315, 82)

In [25]:
kdd_t.shape

(404836, 82)

In [26]:
kdd.isnull().any().any()

False

# Summary statistics of the numeric training columns, transposed so each feature is one row.
range_kdd = kdd.describe().T

range_kdd

# Keep a copy of the per-feature value ranges on disk.
range_kdd.to_csv(r'../CIDS-2018/range_kdd.csv', header=True)

# Convert the large-valued features to log scale (log1p) in both the training and test frames.
log_lst = [
    'Dst Port', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
    'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
    'Pkt Len Max', 'Pkt Len Var',
    'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
    'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
]
# NOTE(review): log1p yields -inf/NaN for values <= -1; confirm none of these columns go negative.
for frame in (kdd, kdd_t):
    for col in log_lst:
        frame[col] = log_trns(frame, col)
    

kdd_t.head()

In [27]:

# Detach the target columns from both frames so that only feature columns remain:
# 'Label' is the original fine-grained label, 'class' the attack family derived from it.
y_label, y_target = kdd.pop('Label'), kdd.pop('class')
y_label_t, y_test = kdd_t.pop('Label'), kdd_t.pop('class')

In [28]:
print("data Train: ",kdd.shape)
print("data Test: ",kdd_t.shape)


data Train:  (1619315, 80)
data Test:  (404836, 80)


In [29]:
# Convert the categorical 'class' targets to one-hot encoding.
y_train = pd.get_dummies(y_target)
# Encode the test targets on the training column layout. An independent get_dummies call would
# omit (or reorder) columns if a class were missing from the test split, silently shifting the
# meaning of each output position. Classes absent from the test split become all-zero columns;
# test classes never seen in training are dropped. The final astype keeps the indicator dtype
# identical to y_train even when columns had to be added.
y_test = (
    pd.get_dummies(y_test)
    .reindex(columns=y_train.columns, fill_value=0)
    .astype(y_train.dtypes.to_dict())
)


In [30]:
y_train.head()

Unnamed: 0,Benign,Bot,BruteForce,DDoS,DoS,Infilteration,Web Attack
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0


In [31]:
y_test.head()

Unnamed: 0,Benign,Bot,BruteForce,DDoS,DoS,Infilteration,Web Attack
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0


In [32]:
kdd.isnull().any().any()

False

In [33]:
# Convert the frames to NumPy arrays.
y_train = y_train.values
train = kdd.values
test = kdd_t.values
y_test = y_test.values

# Drop training rows that contain any NaN. The same row mask is applied to the one-hot targets
# and to the fine-grained labels so features and labels stay row-aligned; filtering the feature
# matrix alone would shift every label after the first dropped row.
train_ok = ~np.isnan(train).any(axis=1)
train, y_train, y_label = train[train_ok], y_train[train_ok], y_label[train_ok]

In [34]:
np.isnan(train).any()

False

In [35]:
np.shape(train)

(1619315, 80)

In [36]:
np.isnan(test).any()

False

In [37]:
np.shape(test)

(404836, 80)

# Drop test rows that contain any NaN, applying one mask to the features, the one-hot targets and
# the fine-grained labels so that all three stay row-aligned.
test_ok = ~np.isnan(test).any(axis=1)
test, y_test, y_label_t = test[test_ok], y_test[test_ok], y_label_t[test_ok]

In [38]:
np.shape(test)

(404836, 80)

In [39]:
# We rescale features to [0, 1]

In [40]:
# Learn the per-feature min/max on the training matrix only, then apply that same mapping to both
# splits. Test values outside the training range will therefore fall outside [0, 1].
min_max_scaler = MinMaxScaler().fit(train)
train = min_max_scaler.transform(train)
test = min_max_scaler.transform(test)

In [41]:
train

array([[1.23294778e-01, 4.96666679e-06, 3.22968207e-06, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.23294778e-01, 8.81416689e-05, 6.45936414e-06, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.23294778e-01, 4.35000011e-06, 3.22968207e-06, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [8.08740501e-04, 2.19166672e-06, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.22074038e-03, 9.66141307e-01, 4.84452311e-05, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [6.75984985e-03, 9.85723841e-01, 6.78233235e-05, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [42]:
np.shape(train)

(1619315, 80)

In [43]:
# Convert the original fine-grained 'Label' targets to one-hot encoding.
ylabel_train = pd.get_dummies(y_label)
# Encode the test labels on the training column layout. Some labels are rare, and an independent
# get_dummies call would omit the column of any label missing from the test split, shifting the
# meaning of each output position. Labels absent from the test split become all-zero columns;
# test labels never seen in training are dropped. The final astype keeps the indicator dtype
# identical to ylabel_train even when columns had to be added.
ylabel_test = (
    pd.get_dummies(y_label_t)
    .reindex(columns=ylabel_train.columns, fill_value=0)
    .astype(ylabel_train.dtypes.to_dict())
)

In [44]:
ylabel_train

Unnamed: 0,Benign,Bot,Brute Force -Web,Brute Force -XSS,DDOS attack-HOIC,DDOS attack-LOIC-UDP,DDoS attacks-LOIC-HTTP,DoS attacks-GoldenEye,DoS attacks-Hulk,DoS attacks-SlowHTTPTest,DoS attacks-Slowloris,FTP-BruteForce,Infilteration,SQL Injection,SSH-Bruteforce
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
ylabel_test

Unnamed: 0,Benign,Bot,Brute Force -Web,Brute Force -XSS,DDOS attack-HOIC,DDOS attack-LOIC-UDP,DDoS attacks-LOIC-HTTP,DoS attacks-GoldenEye,DoS attacks-Hulk,DoS attacks-SlowHTTPTest,DoS attacks-Slowloris,FTP-BruteForce,Infilteration,SQL Injection,SSH-Bruteforce
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
# Convert the one-hot label frames to plain NumPy arrays.
ylabel_train, ylabel_test = ylabel_train.values, ylabel_test.values

In [47]:
def save_matrices(A,B,C,D,E,F, file_name):
    """Write six arrays back-to-back into a single binary file.

    The arrays are stored with ``np.save`` in the order A, B, C, D, E, F; to read
    them back, open the file once and call ``np.load`` on the handle six times.

    Parameters
    ----------
    A, B, C, D, E, F : array_like
        Arrays to store, in this order.
    file_name : str
        Destination path; an existing file is overwritten.
    """
    with open(file_name, 'wb') as handle:
        for matrix in (A, B, C, D, E, F):
            np.save(handle, matrix)

In [48]:
# Persist the scaled features plus both label encodings (family-level and fine-grained) in one file.
my_file = '../CIDS-2018/10data.npy'
save_matrices(
    A=train,
    B=test,
    C=y_train,
    D=y_test,
    E=ylabel_train,
    F=ylabel_test,
    file_name=my_file,
)

In [49]:
y_test.dtype

dtype('uint8')

In [50]:
train.dtype

dtype('float64')

In [51]:
# Halve the storage of the feature matrices by converting them from float64 to float32.
test, train = test.astype(np.float32), train.astype(np.float32)

In [52]:
# Save again, this time with the float32 feature matrices, under a separate file name.
my_file = '../CIDS-2018/10data2.npy'
save_matrices(
    A=train,
    B=test,
    C=y_train,
    D=y_test,
    E=ylabel_train,
    F=ylabel_test,
    file_name=my_file,
)

In [53]:
train

array([[1.2329478e-01, 4.9666669e-06, 3.2296821e-06, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.2329478e-01, 8.8141671e-05, 6.4593642e-06, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.2329478e-01, 4.3499999e-06, 3.2296821e-06, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       ...,
       [8.0874050e-04, 2.1916667e-06, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.2207404e-03, 9.6614128e-01, 4.8445232e-05, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [6.7598498e-03, 9.8572385e-01, 6.7823326e-05, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00]], dtype=float32)