In [3]:
# Dataset credit: CICIDS2017
# Iman Sharafaldin, Arash Habibi Lashkari, and Ali A. Ghorbani, “Toward Generating a New Intrusion Detection Dataset and Intrusion Traffic Characterization”, 4th International Conference on Information Systems Security and Privacy (ICISSP), Portugal, January 2018

In [4]:
import pandas as pd 
import numpy as np

In [5]:
df = pd.read_csv('ids_2017_concat.csv')

In [6]:
df = df.drop(columns='DestinationPort')

In [7]:
# 2.8M rows, 79 cols
df.shape
df

Unnamed: 0,FlowDuration,TotalFwdPackets,TotalBackwardPackets,TotalLengthofFwdPackets,TotalLengthofBwdPackets,FwdPacketLengthMax,FwdPacketLengthMin,FwdPacketLengthMean,FwdPacketLengthStd,BwdPacketLengthMax,...,min_seg_size_forward,ActiveMean,ActiveStd,ActiveMax,ActiveMin,IdleMean,IdleStd,IdleMax,IdleMin,Label
0,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,109,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,52,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,34,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2830738,32215,4,2,112,152,28,28,28.0,0.00000,76,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830739,324,2,2,84,362,42,42,42.0,0.00000,181,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830740,82,2,1,31,6,31,0,15.5,21.92031,6,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830741,1048635,6,2,192,256,32,32,32.0,0.00000,128,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [8]:
# 15 unique labels
labels = df['Label'].unique()

columns = df.columns

labels

array(['BENIGN', 'DDoS', 'PortScan', 'Bot', 'Infiltration',
       'WebAttack�BruteForce', 'WebAttack�XSS', 'WebAttack�SqlInjection',
       'FTP-Patator', 'SSH-Patator', 'DoSslowloris', 'DoSSlowhttptest',
       'DoSHulk', 'DoSGoldenEye', 'Heartbleed'], dtype=object)

In [9]:
# explore values
for column in columns:
    print(column, " ", df[column].unique())

FlowDuration   [       3      109       52 ... 11509095 11512230  1048635]
TotalFwdPackets   [   2    1    3 ... 6969 1158 9951]
TotalBackwardPackets   [    0     1     2 ...  4591  1029 14759]
TotalLengthofFwdPackets   [   12     6    37 ... 35380 33901 10775]
TotalLengthofBwdPackets   [     0      6    176 ...  74438 147609 267793]
FwdPacketLengthMax   [   6   31    0 ... 4204 3618 3419]
FwdPacketLengthMin   [   6    0   31   37   41   46   34   35   40   32   36   48   50   45
   43   47   60   54   55   52   56   59   51   65   49   38   74   78
   39   58   81   66   57   62   53   42   61   44   75   64   82   77
   84   80   73   94   79   71   29   30   63   27   67   28   33   70
   83   68   88   76   69   99   92   23  247  188  235  216  165   85
  201   26  187   89   90  108   93   96   97   87   98   72   86  214
  233   18 1472  252  178  176  250  174  248   22   24   12  930  421
  940  526  110   95 1375  164  163  319 1017  929  696  913 1016  914
  931  118  207  2

In [10]:
# missing values? FlowBytes/s - 1358
for column in columns:
    print(column, ": ", df[column].isnull().values.any(), ", total: ", df[column].isnull().sum())


FlowDuration :  False , total:  0
TotalFwdPackets :  False , total:  0
TotalBackwardPackets :  False , total:  0
TotalLengthofFwdPackets :  False , total:  0
TotalLengthofBwdPackets :  False , total:  0
FwdPacketLengthMax :  False , total:  0
FwdPacketLengthMin :  False , total:  0
FwdPacketLengthMean :  False , total:  0
FwdPacketLengthStd :  False , total:  0
BwdPacketLengthMax :  False , total:  0
BwdPacketLengthMin :  False , total:  0
BwdPacketLengthMean :  False , total:  0
BwdPacketLengthStd :  False , total:  0
FlowBytess :  True , total:  1358
FlowPacketss :  False , total:  0
FlowIATMean :  False , total:  0
FlowIATStd :  False , total:  0
FlowIATMax :  False , total:  0
FlowIATMin :  False , total:  0
FwdIATTotal :  False , total:  0
FwdIATMean :  False , total:  0
FwdIATStd :  False , total:  0
FwdIATMax :  False , total:  0
FwdIATMin :  False , total:  0
BwdIATTotal :  False , total:  0
BwdIATMean :  False , total:  0
BwdIATStd :  False , total:  0
BwdIATMax :  False , tot

The authors dropped the destination port as I have done above as it is the only non-quantative value, and replaced NaN values with 0, as I will below.

In [11]:
df = df.replace(np.nan, 0)

# Exploring the dataset class composition & preparing the dataset

Below, the class distribution is calculated and coincides with the authors' findings. 

In [12]:
(c, count) = np.unique(df['Label'], return_counts=True)

for i in range(len(c)):
    print(c[i], ": ", "{:.3f}".format((count[i]/df.shape[0] * 100)),"%")

BENIGN :  80.300 %
Bot :  0.069 %
DDoS :  4.523 %
DoSGoldenEye :  0.364 %
DoSHulk :  8.163 %
DoSSlowhttptest :  0.194 %
DoSslowloris :  0.205 %
FTP-Patator :  0.280 %
Heartbleed :  0.000 %
Infiltration :  0.001 %
PortScan :  5.614 %
SSH-Patator :  0.208 %
WebAttack�BruteForce :  0.053 %
WebAttack�SqlInjection :  0.001 %
WebAttack�XSS :  0.023 %


From the paper: 
```Most neural network models work better with databases with balanced samples in each class. However, neural network models also require a big dataset for better generalization of the task assigned to it. Therefore, it is not feasible to simply trim down the number of samples in the classes with a higher proportion to match with the minority classes. For the best of both worlds, the database is divided into two categories. Namely, the normal attack samples, and minority attack samples```


In [13]:
# normal and minority attack labels
normal_labels = ['BENIGN','DDoS','DoSGoldenEye','DoSHulk','DoSSlowhttptest','DoSslowloris','PortScan']
minority_labels = ['FTP-Patator','SSH-Patator','Bot','Heartbleed','Infiltration','WebAttack�BruteForce','WebAttack�SqlInjection','WebAttack�XSS']

In [14]:
# splitting the two sets
normal = df.loc[df['Label'].isin(normal_labels)]
minority = df.loc[df['Label'].isin(minority_labels)]

In [15]:
# count samples
for label in labels:
    print(label, ": ", df.loc[df['Label']==label].shape[0])

BENIGN :  2273097
DDoS :  128027
PortScan :  158930
Bot :  1966
Infiltration :  36
WebAttack�BruteForce :  1507
WebAttack�XSS :  652
WebAttack�SqlInjection :  21
FTP-Patator :  7938
SSH-Patator :  5897
DoSslowloris :  5796
DoSSlowhttptest :  5499
DoSHulk :  231073
DoSGoldenEye :  10293
Heartbleed :  11


# Inconsistency in BENIGN values - paper has 529,918, my dataset 2,273,097. The paper uses BENIGN samples from the Monday file only.

In [16]:
# get BENIGN samples from Monday file

df_monday = pd.read_csv('Monday-WorkingHours.pcap_ISCX.csv')
benign_monday = df_monday.loc[df_monday[' Label']=='BENIGN']
# remove space from col name
benign_monday = benign_monday.rename(columns={" Label": "Label"})
# remove destination port
benign_monday = benign_monday.drop(columns=' Destination Port')


In [50]:
# replace BENIGN in dataset with benign_monday

tmp = df.loc[df['Label'] != 'BENIGN']
frames = [benign_monday, tmp]
df2 = pd.concat(frames)
df2.reset_index(drop=True, inplace=True)

In [51]:
# count samples
for label in labels:
    print(label, ": ", df2.loc[df2['Label']==label].shape[0])

BENIGN :  529918
DDoS :  128027
PortScan :  158930
Bot :  1966
Infiltration :  36
WebAttack�BruteForce :  1507
WebAttack�XSS :  652
WebAttack�SqlInjection :  21
FTP-Patator :  7938
SSH-Patator :  5897
DoSslowloris :  5796
DoSSlowhttptest :  5499
DoSHulk :  231073
DoSGoldenEye :  10293
Heartbleed :  11


In [52]:
# construct training & test sets

train_labels = [train_benign, train_ftp, train_ssh, train_bot, train_ddos, train_goldeneye, train_hulk, \
    train_slowhttp, train_slowloris, train_heartbleed, train_infiltration, train_portscan, train_bruteforce, \
        train_sql, train_xss
    ]

test_labels = [test_benign, test_ftp, test_ssh, test_bot, test_ddos, test_goldeneye, test_hulk, \
    test_slowhttp, test_slowloris, test_heartbleed, test_infiltration, test_portscan, test_bruteforce, \
        test_sql, test_xss
    ]
# benign
train_benign = df2.loc[df2['Label']=='BENIGN'][:105983]
test_benign = df2.loc[df2['Label']=='BENIGN'][105984:]

# FTP-Patator
train_ftp = df2.loc[df2['Label']=='FTP-Patator'][:6350]
test_ftp = df2.loc[df2['Label']=='FTP-Patator']

# SSH-Patator
train_ssh = df2.loc[df2['Label']=='SSH-Patator'][:4717]
test_ssh = df2.loc[df2['Label']=='SSH-Patator']

# Bot
train_bot = df2.loc[df2['Label']=='Bot'][:1572]
test_bot = df2.loc[df2['Label']=='Bot']

# DDoS
train_ddos = df2.loc[df2['Label']=='DDoS'][:102422]
test_ddos = df2.loc[df2['Label']=='DDoS'][102423:]

# DoS Goldeneye
train_goldeneye = df2.loc[df2['Label']=='DoSGoldenEye'][:4118]
test_goldeneye = df2.loc[df2['Label']=='DoSGoldenEye'][4118:]

# DoS Hulk
train_hulk = df2.loc[df2['Label']=='DoSHulk'][:92430]
test_hulk = df2.loc[df2['Label']=='DoSHulk'][92430:]

# Dos Slowhttptest
train_slowhttp = df2.loc[df2['Label']=='DoSSlowhttptest'][:2200]
test_slowhttp = df2.loc[df2['Label']=='DoSSlowhttptest'][2200:]

# Dos slowloris
train_slowloris = df2.loc[df2['Label']=='DoSslowloris'][:2319]
test_slowloris = df2.loc[df2['Label']=='DoSslowloris'][2319:]

# Heartbleed 
train_heartbleed = df2.loc[df2['Label']=='Heartbleed'][:9]
test_heartbleed = df2.loc[df2['Label']=='Heartbleed']

# Infiltration
train_infiltration = df2.loc[df2['Label']=='Infiltration'][:28]
test_infiltration = df2.loc[df2['Label']=='Infiltration']

# PortScan
train_portscan = df2.loc[df2['Label']=='PortScan'][:103304]
test_portscan = df2.loc[df2['Label']=='PortScan'][103304:]

# BruteForce
train_bruteforce = df2.loc[df2['Label']=='WebAttack�BruteForce'][:1205]
test_bruteforce = df2.loc[df2['Label']=='WebAttack�BruteForce']

# SQL
train_sql = df2.loc[df2['Label']=='WebAttack�SqlInjection'][:16]
test_sql = df2.loc[df2['Label']=='WebAttack�SqlInjection']

# XSS
train_xss = df2.loc[df2['Label']=='WebAttack�XSS'][:521]
test_xss = df2.loc[df2['Label']=='WebAttack�XSS']

train = pd.concat(train_labels)
test = pd.concat(test_labels)

In [53]:
# missing values? 
for column in columns:
    print(column, ": ", df2[column].isnull().values.any(), ", total: ", df2[column].isnull().sum())


FlowDuration :  True , total:  529918
TotalFwdPackets :  True , total:  529918
TotalBackwardPackets :  True , total:  529918
TotalLengthofFwdPackets :  True , total:  529918
TotalLengthofBwdPackets :  True , total:  529918
FwdPacketLengthMax :  True , total:  529918
FwdPacketLengthMin :  True , total:  529918
FwdPacketLengthMean :  True , total:  529918
FwdPacketLengthStd :  True , total:  529918
BwdPacketLengthMax :  True , total:  529918
BwdPacketLengthMin :  True , total:  529918
BwdPacketLengthMean :  True , total:  529918
BwdPacketLengthStd :  True , total:  529918
FlowBytess :  True , total:  529918
FlowPacketss :  True , total:  529918
FlowIATMean :  True , total:  529918
FlowIATStd :  True , total:  529918
FlowIATMax :  True , total:  529918
FlowIATMin :  True , total:  529918
FwdIATTotal :  True , total:  529918
FwdIATMean :  True , total:  529918
FwdIATStd :  True , total:  529918
FwdIATMax :  True , total:  529918
FwdIATMin :  True , total:  529918
BwdIATTotal :  True , tota

In [55]:
# TODO fix concat - merge tmp & df2 without col duplication
df2.shape

(1087564, 154)