# ISCX-IDS-2012 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_columns', None)

In [2]:
CSVS_PATH = 'iscxids2012-master/data/CSV/'

In [3]:
# Load all files within a single file
data = pd.DataFrame()

for file in os.listdir(CSVS_PATH):
    fullpath = os.path.join(CSVS_PATH, file)
    day_data =  pd.read_csv(fullpath)

    print(f'{file} : {len(day_data)}')
    data = pd.concat([data, day_data])

TestbedWedJun16Flows.csv : 522263
TestbedSunJun13Flows.csv : 275528
TestbedTueJun15Flows.csv : 571698
TestbedMonJun14Flows.csv : 171380
TestbedThuJun17Flows.csv : 397595
TestbedSatJun12Flows.csv : 133193


In [4]:
len(data)

2071657

In [5]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2071657 entries, 0 to 133192
Data columns (total 21 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   generated                       object
 1   appName                         object
 2   totalSourceBytes                int64 
 3   totalDestinationBytes           int64 
 4   totalDestinationPackets         int64 
 5   totalSourcePackets              int64 
 6   sourcePayloadAsBase64           object
 7   sourcePayloadAsUTF              object
 8   destinationPayloadAsBase64      object
 9   destinationPayloadAsUTF         object
 10  direction                       object
 11  sourceTCPFlagsDescription       object
 12  destinationTCPFlagsDescription  object
 13  source                          object
 14  protocolName                    object
 15  sourcePort                      int64 
 16  destination                     object
 17  destinationPort                 int64 
 18  sta

In [6]:
data.describe()

Unnamed: 0,totalSourceBytes,totalDestinationBytes,totalDestinationPackets,totalSourcePackets,sourcePort,destinationPort
count,2071657.0,2071657.0,2071657.0,2071657.0,2071657.0,2071657.0
mean,2460.947,34489.11,30.30068,19.84019,14137.59,1882.091
std,751778.5,1187177.0,983.4982,667.9018,20142.97,8623.28
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,256.0,419.0,2.0,3.0,2302.0,80.0
50%,442.0,1177.0,5.0,6.0,3744.0,80.0
75%,845.0,7338.0,11.0,10.0,16993.0,80.0
max,763277600.0,1254005000.0,872224.0,514794.0,65535.0,65535.0


In [7]:
data.head(5)

Unnamed: 0,generated,appName,totalSourceBytes,totalDestinationBytes,totalDestinationPackets,totalSourcePackets,sourcePayloadAsBase64,sourcePayloadAsUTF,destinationPayloadAsBase64,destinationPayloadAsUTF,direction,sourceTCPFlagsDescription,destinationTCPFlagsDescription,source,protocolName,sourcePort,destination,destinationPort,startDateTime,stopDateTime,Label
0,3/11/2014 16:49,DNS,2073,15834,48,26,ISQBAAABAAAAAAAAA3d3dwdsZHVodHJwA25ldAAAAQABIS...,!$wwwlduhtrpnet!%wwwafcyhfcom!g-ecximages-amaz...,,,L2L,,,192.168.4.118,udp_ip,1266,192.168.5.122,53,6/15/2010 23:54,6/16/2010 0:06,Normal
1,3/11/2014 16:49,HTTPImageTransfer,64,0,0,1,,,,,L2R,"F,A",,192.168.4.118,tcp_ip,1592,198.173.85.138,80,6/15/2010 23:55,6/15/2010 23:59,Normal
2,3/11/2014 16:49,HTTPWeb,399,0,0,3,R0VUIC9pbWFnZXMvYXVmYXN0bG9va2Jhbm5lcjc2NS5KUE...,GET /images/aufastlookbanner765.JPG HTTP/1.1Ho...,,,L2R,"F,P,A",,192.168.4.118,tcp_ip,1595,115.178.18.2,80,6/15/2010 23:56,6/16/2010 0:00,Normal
3,3/11/2014 16:49,HTTPWeb,384,0,0,6,,,,,L2R,"F,A",,192.168.2.109,tcp_ip,1683,95.154.240.242,80,6/15/2010 23:56,6/15/2010 23:59,Normal
4,3/11/2014 16:49,HTTPImageTransfer,3303,39226,48,35,,,,,L2R,"P,A","P,A",192.168.4.119,tcp_ip,4576,72.247.111.188,80,6/15/2010 23:56,6/16/2010 0:03,Normal


In [8]:
#There are NaNs in the data, what is their count?
data.isna().sum(axis=0)

generated                               0
appName                                 0
totalSourceBytes                        0
totalDestinationBytes                   0
totalDestinationPackets                 0
totalSourcePackets                      0
sourcePayloadAsBase64             1098051
sourcePayloadAsUTF                1182855
destinationPayloadAsBase64        1188626
destinationPayloadAsUTF           1188687
direction                               0
sourceTCPFlagsDescription          430943
destinationTCPFlagsDescription     493422
source                                  0
protocolName                            0
sourcePort                              0
destination                             0
destinationPort                         0
startDateTime                           0
stopDateTime                            0
Label                                   0
dtype: int64

In [9]:
# Compute NaNs as a ratio relative to the whole dataset size
data.isna().sum(axis=0) / len(data)

generated                         0.000000
appName                           0.000000
totalSourceBytes                  0.000000
totalDestinationBytes             0.000000
totalDestinationPackets           0.000000
totalSourcePackets                0.000000
sourcePayloadAsBase64             0.530035
sourcePayloadAsUTF                0.570970
destinationPayloadAsBase64        0.573756
destinationPayloadAsUTF           0.573786
direction                         0.000000
sourceTCPFlagsDescription         0.208019
destinationTCPFlagsDescription    0.238177
source                            0.000000
protocolName                      0.000000
sourcePort                        0.000000
destination                       0.000000
destinationPort                   0.000000
startDateTime                     0.000000
stopDateTime                      0.000000
Label                             0.000000
dtype: float64

In [10]:
data['Label'].value_counts()

Normal    2002747
Attack      68910
Name: Label, dtype: int64

Although the labels in this case are binary, documentation distintinguishes between different attacks. Since only 1 unique attack type is performed per day, the labels can be directly distinguished without ambiguities when the files are loaded separately. In this brief analysis, we load them all at once.

In [11]:
# appName distribution
data['appName'].value_counts().head(50)

HTTPWeb               738833
HTTPImageTransfer     727295
DNS                   314326
Unknown_UDP            62682
SecureWeb              57216
NetBIOS-IP             29122
Unknown_TCP            28569
WindowsFileSharing     18037
POP                    16362
IMAP                   13763
FTP                    13475
BitTorrent             10195
SSH                     9992
SMTP                    9418
ICMP                    8278
WebMediaDocuments       4213
Flowgen                 2541
MiscApplication         1339
WebFileTransfer          580
IRC                      337
XWindows                 337
Oracle                   252
WebMediaVideo            251
Authentication           240
Yahoo                    222
Real                     183
RPC                      144
Telnet                   132
Filenet                  121
Webmin                   119
DNS-Port                 112
MSMQ                     106
MSN                      104
IPSec                     94
Timbuktu      

In [12]:
# Distribution of popular ports
data['destinationPort'].value_counts()

80       1470875
53        314449
443        53956
138        19972
137        17931
          ...   
33057          1
37112          1
50482          1
21828          1
51425          1
Name: destinationPort, Length: 24238, dtype: int64

In [13]:
data['source'].value_counts()

192.168.5.122     268267
192.168.2.107     208379
192.168.4.118     135374
192.168.1.101     116292
192.168.4.121     105454
                   ...  
93.70.125.65           1
96.47.114.210          1
78.84.208.212          1
58.153.70.147          1
84.221.137.176         1
Name: source, Length: 2478, dtype: int64

Despite the documentation declares the simulation network is small, there are over 2k unique IP addresses.

## Continuity Analysis

Documentation declares that the data consist of a 7-day continuous capture. Let's verify this claim.


In [14]:
# See how the timestamps look like
display(data['startDateTime'].head())

display(data['stopDateTime'].head())

0    6/15/2010 23:54
1    6/15/2010 23:55
2    6/15/2010 23:56
3    6/15/2010 23:56
4    6/15/2010 23:56
Name: startDateTime, dtype: object

0     6/16/2010 0:06
1    6/15/2010 23:59
2     6/16/2010 0:00
3    6/15/2010 23:59
4     6/16/2010 0:03
Name: stopDateTime, dtype: object

In [15]:
data['stime'] = pd.to_datetime(data['startDateTime'], format='%m/%d/%Y %H:%M')
data['etime'] = pd.to_datetime(data['stopDateTime'], format='%m/%d/%Y %H:%M')

data['stime'].head()

0   2010-06-15 23:54:00
1   2010-06-15 23:55:00
2   2010-06-15 23:56:00
3   2010-06-15 23:56:00
4   2010-06-15 23:56:00
Name: stime, dtype: datetime64[ns]

In [16]:
data_sorted = data.sort_values(by='stime', ascending=False)
data_sorted['stime'].head()

397553   2010-06-17 23:58:00
397548   2010-06-17 23:58:00
397563   2010-06-17 23:58:00
397562   2010-06-17 23:58:00
397561   2010-06-17 23:58:00
Name: stime, dtype: datetime64[ns]

In [17]:
# Compute timespan of the capture and the contiguity
# Based on the documentation, the capture time is 7 days -> 60 * 60 * 24 * 7 = 604800 -> 1% of if is 6048 - approx 100 minute gap is allowed
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 6048) -> float:
    """Computes total timespan of the capture. Expects iterable containing timestamps objects sorted in a descending manner"""
    total_dur = pd.Timedelta(seconds=0)
    current_dur = pd.Timedelta(seconds=0)
    cont_durations = []
    last_tstamp = data.iloc[0]
    contiguous = True

    # Iterate through the dataframe to find out gaps
    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= pd.Timedelta(seconds=gap_max_secs):
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            cont_durations.append(current_dur)
            current_dur = pd.Timedelta(seconds=0)

            if contiguous:
                contiguous = False

        last_tstamp = cur_tstamp

    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations


In [18]:
measure_real_capture_dur(data_sorted['stime'])

(Timedelta('6 days 00:02:00'), False, [Timedelta('6 days 00:02:00')])

There is only one gap at the very end of the capture -- 3h 46m. The data can then be, in a way, considered contiguous.