# ICS-Flow Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [2]:
import pandas as pd
import numpy
import os

pd.set_option('display.max_columns', None)

In [3]:
# Root directory of the ICS-Flow dataset. Set the ICS_FLOW_DIR environment
# variable to run the notebook against a copy stored elsewhere; when it is
# unset, the original location is used.
BASE_DIR = os.environ.get('ICS_FLOW_DIR', '/data/disk2/ics_flow')

# Flow records in CSV form (loaded with pandas in the CSV analysis section)
PATH_DATA = os.path.join(BASE_DIR, 'Dataset.csv')
# Raw packet capture (inspected with capinfos in the PCAP analysis section)
PATH_PCAP = os.path.join(BASE_DIR, 'traffic.pcap')

## PCAP Analysis

In [4]:
!capinfos -M $PATH_PCAP

File name:           /data/disk2/ics_flow/traffic.pcap
File type:           pcap
File encapsulation:  ether
File timestamp precision:  microseconds (6)
Packet size limit:   file hdr: 262144 bytes
Number of packets:   25161400
File size:           1971608339 bytes
Data size:           1569025915 bytes
Capture duration:    8382.579185 seconds
First packet time:   2022-09-16 11:14:39.510337
Last packet time:    2022-09-16 13:34:22.089522
Data byte rate:      187176.99 bytes/sec
Data bit rate:       1497415.90 bits/sec
Average packet size: 62.36 bytes
Average packet rate: 3001.63 packets/sec
SHA256:              9d2babd4aff66dd226a438a1b122c0a22ed07761df3653cb105a74e159956134
RIPEMD160:           614db5b29771d365120f68ec3a7f0553587e377e
SHA1:                77387051d58894341f7fdc18658609b9fef5e06b
Strict time order:   False
Number of interfaces in file: 1
Interface #0 info:
                     Encapsulation = Ethernet (1 - ether)
                     Capture length = 262144
              

## CSV Analysis

In [5]:
data = pd.read_csv(PATH_DATA)
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45718 entries, 0 to 45717
Data columns (total 64 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sAddress         45718 non-null  object 
 1   rAddress         45718 non-null  object 
 2   sMACs            45718 non-null  object 
 3   rMACs            45718 non-null  object 
 4   sIPs             44156 non-null  object 
 5   rIPs             44156 non-null  object 
 6   protocol         45718 non-null  object 
 7   startDate        45718 non-null  object 
 8   endDate          45718 non-null  object 
 9   start            45718 non-null  float64
 10  end              45718 non-null  float64
 11  startOffset      45718 non-null  float64
 12  endOffset        45718 non-null  float64
 13  duration         45718 non-null  float64
 14  sPackets         45718 non-null  int64  
 15  rPackets         45718 non-null  int64  
 16  sBytesSum        45718 non-null  int64  
 17  rBytesSum   

In [6]:
len(data)

45718

In [7]:
data.head()

Unnamed: 0,sAddress,rAddress,sMACs,rMACs,sIPs,rIPs,protocol,startDate,endDate,start,end,startOffset,endOffset,duration,sPackets,rPackets,sBytesSum,rBytesSum,sBytesMax,rBytesMax,sBytesMin,rBytesMin,sBytesAvg,rBytesAvg,sLoad,rLoad,sPayloadSum,rPayloadSum,sPayloadMax,rPayloadMax,sPayloadMin,rPayloadMin,sPayloadAvg,rPayloadAvg,sInterPacketAvg,rInterPacketAvg,sttl,rttl,sAckRate,rAckRate,sUrgRate,rUrgRate,sFinRate,rFinRate,sPshRate,rPshRate,sSynRate,rSynRate,sRstRate,rRstRate,sWinTCP,rWinTCP,sFragmentRate,rFragmentRate,sAckDelayMax,rAckDelayMax,sAckDelayMin,rAckDelayMin,sAckDelayAvg,rAckDelayAvg,IT_B_Label,IT_M_Label,NST_B_Label,NST_M_Label
0,192.168.0.11,192.168.0.21,02:42:c0:a8:00:0b,02:42:c0:a8:00:15,192.168.0.11,192.168.0.21,IPV4-TCP,2022-09-16 11:14:39.510337,2022-09-16 11:14:40.010330,1663320000.0,1663320000.0,0.0,0.499993,0.499994,9,17,585,992,65.0,64.0,65.0,52.0,65.0,58.353,9360.115,15872.195,117,108,13.0,12.0,13.0,0.0,13.0,6.353,0.062432,0.031225,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.529,0.0,0.0,0.0,0.0,509.0,502.0,0.0,0.0,0.001,0.497,0.0,0.0,0.000219,0.031154,0,Normal,0,Normal
1,192.168.0.12,192.168.0.21,02:42:c0:a8:00:0c,02:42:c0:a8:00:15,192.168.0.12,192.168.0.21,IPV4-TCP,2022-09-16 11:14:39.513421,2022-09-16 11:14:40.013394,1663320000.0,1663320000.0,0.003084,0.503057,0.499974,9,19,585,1108,65.0,64.0,65.0,52.0,65.0,58.316,9360.486,17728.92,117,120,13.0,12.0,13.0,0.0,13.0,6.316,0.06246,0.027776,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.526,0.0,0.0,0.0,0.0,509.0,502.0,0.0,0.0,0.0,0.497,0.0,0.0,0.000126,0.027727,0,Normal,0,Normal
2,192.168.0.11,192.168.0.12,02:42:c0:a8:00:0b,02:42:c0:a8:00:0c,192.168.0.11,192.168.0.12,IPV4-TCP,2022-09-16 11:14:39.601160,2022-09-16 11:14:40.054139,1663320000.0,1663320000.0,0.090823,0.543802,0.45298,20,15,1187,933,65.0,65.0,52.0,52.0,59.35,62.2,20963.405,16477.554,147,153,13.0,13.0,0.0,0.0,7.35,10.2,0.023833,0.029119,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,0.6,0.8,0.0,0.0,0.0,0.0,503.05,506.2,0.0,0.0,0.198,0.2,0.0,0.0,0.023117,0.030055,0,Normal,0,Normal
3,192.168.0.11,192.168.0.12,02:42:c0:a8:00:0b,02:42:c0:a8:00:0c,192.168.0.11,192.168.0.12,IPV4-TCP,2022-09-16 11:14:40.200719,2022-09-16 11:14:40.602902,1663320000.0,1663320000.0,0.690382,1.092565,0.402184,21,15,1239,933,65.0,65.0,52.0,52.0,59.0,62.2,24645.447,18558.678,147,153,13.0,13.0,0.0,0.0,7.0,10.2,0.020017,0.028723,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,0.571,0.8,0.0,0.0,0.0,0.0,503.0,506.2,0.0,0.0,0.197,0.2,0.0,0.0,0.02018,0.02853,0,Normal,0,Normal
4,192.168.0.11,192.168.0.21,02:42:c0:a8:00:0b,02:42:c0:a8:00:15,192.168.0.11,192.168.0.21,IPV4-TCP,2022-09-16 11:14:40.010348,2022-09-16 11:14:40.507654,1663320000.0,1663320000.0,0.500011,0.997317,0.497307,15,31,975,1792,65.0,64.0,65.0,52.0,65.0,57.806,15684.481,28827.271,195,180,13.0,12.0,13.0,0.0,13.0,5.806,0.035508,0.016577,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.484,0.0,0.0,0.0,0.0,509.0,502.0,0.0,0.0,0.0,0.493,0.0,0.0,9.4e-05,0.017102,0,Normal,0,Normal


In [8]:
data.describe()

Unnamed: 0,start,end,startOffset,endOffset,duration,sPackets,rPackets,sBytesSum,rBytesSum,sBytesMax,rBytesMax,sBytesMin,rBytesMin,sBytesAvg,rBytesAvg,sLoad,rLoad,sPayloadSum,rPayloadSum,sPayloadMax,rPayloadMax,sPayloadMin,rPayloadMin,sPayloadAvg,rPayloadAvg,sInterPacketAvg,rInterPacketAvg,sttl,rttl,sAckRate,rAckRate,sUrgRate,rUrgRate,sFinRate,rFinRate,sPshRate,rPshRate,sSynRate,rSynRate,sRstRate,rRstRate,sWinTCP,rWinTCP,sFragmentRate,rFragmentRate,sAckDelayMax,rAckDelayMax,sAckDelayMin,rAckDelayMin,sAckDelayAvg,rAckDelayAvg,IT_B_Label,NST_B_Label
count,45718.0,45718.0,45718.0,45718.0,45718.0,45718.0,45718.0,45718.0,45718.0,44986.0,45056.0,44986.0,45056.0,44986.0,45056.0,45718.0,45718.0,45718.0,45718.0,44986.0,45056.0,44986.0,45056.0,44986.0,45056.0,43317.0,44444.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43860.0,43983.0,43870.0,44145.0,43450.0,42443.0,43450.0,42443.0,43450.0,42443.0,45718.0,45718.0
mean,1663324000.0,1663324000.0,4393.933046,4394.298533,0.365488,256.037688,294.323221,12085.275471,14537.262588,63.566221,63.265403,59.358223,51.882369,61.696048,58.893635,3983466.0,3847881.0,839.87191,780.525723,12.222514,11.840554,7.994065,0.47057,10.325785,7.444374,0.032138,0.02708105,63.921169,63.488083,0.995508,0.968312,0.0,0.0,0.000163,8.3e-05,0.824215,0.596936,0.004657,0.031683,0.031646,0.004522,504.042236,525.537224,0.0,0.0,0.071445,0.254581,0.000254,0.000367,0.010158,0.025562,0.338641,0.197121
std,2363.962,2363.97,2363.962213,2363.96999,0.184568,1031.987764,1096.329974,44666.593751,51409.321751,5.699014,5.332272,7.573017,3.693346,5.897171,4.794098,35975260.0,44174080.0,3733.929287,3444.934467,3.24684,4.409981,6.396682,4.187008,3.91005,4.360524,0.040336,0.03194589,1.071158,2.796627,0.066869,0.17485,0.0,0.0,0.011031,0.005224,0.25552,0.192508,0.067098,0.174851,0.175058,0.06678,379.558804,289.690578,0.0,0.0,0.096985,0.184493,0.00535,0.006174,0.018204,0.021981,0.473253,0.397829
min,1663320000.0,1663320000.0,0.0,0.499993,1e-06,0.0,0.0,0.0,0.0,40.0,40.0,40.0,40.0,40.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,9.536743e-07,47.698,47.633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-06,-2e-06,0.0,0.0
25%,1663322000.0,1663322000.0,2390.64598,2391.008651,0.401388,7.0,11.0,455.0,680.0,65.0,64.0,52.0,52.0,59.0,58.0,10528.17,17088.29,65.0,65.0,13.0,12.0,0.0,0.0,7.0,6.0,0.001232,0.00639505,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,0.571,0.5,0.0,0.0,0.0,0.0,503.0,502.0,0.0,0.0,0.001,0.154,0.0,0.0,0.000319,0.015734,0.0,0.0
50%,1663324000.0,1663324000.0,4577.24284,4577.515074,0.434423,11.0,15.0,715.0,933.0,65.0,64.0,65.0,52.0,65.0,58.4025,23350.22,18562.64,130.0,120.0,13.0,12.0,13.0,0.0,13.0,6.4,0.02065,0.02811996,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.529,0.0,0.0,0.0,0.0,509.0,502.0,0.0,0.0,0.002,0.201,0.0,0.0,0.000601,0.028446,0.0,0.0
75%,1663326000.0,1663326000.0,6431.504625,6431.981408,0.497859,21.0,20.0,1239.0,1160.0,65.0,65.0,65.0,52.0,65.0,62.2,76736.31,94609.96,147.0,153.0,13.0,13.0,13.0,0.0,13.0,10.2,0.04992,0.0294142,64.0,64.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.8,0.0,0.0,0.0,0.0,509.0,506.2,0.0,0.0,0.196,0.482,0.0,0.0,0.020373,0.029352,1.0,0.0
max,1663328000.0,1663328000.0,8382.533847,8382.579185,0.5,8642.0,8642.0,380248.0,380248.0,107.0,97.0,107.0,97.0,107.0,97.0,856000000.0,776000000.0,39715.0,36660.0,71.0,69.0,71.0,69.0,71.0,69.0,0.499984,0.4998679,64.0,64.0,1.0,1.0,0.0,0.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,37548.635,25364.34,0.0,0.0,0.496,0.5,0.351,0.407,0.351305,0.407444,1.0,1.0


In [9]:
# Source addresses (IPv4 where available, MAC address otherwise)
data['sAddress'].value_counts()

sAddress
192.168.0.11         29556
192.168.0.12         13425
02:42:c0:a8:00:0c      330
192.168.0.21           310
02:42:c0:a8:00:29      302
02:42:c0:a8:00:0b      277
192.168.0.1            264
192.168.0.23           207
192.168.0.41           197
192.168.0.22           197
02:42:c0:a8:00:17      189
02:42:c0:a8:00:15      183
02:42:c0:a8:00:16      181
02:42:23:55:81:7e      100
Name: count, dtype: int64

In [10]:
data['rAddress'].value_counts()

rAddress
192.168.0.21         24412
192.168.0.12         14016
192.168.0.41          4791
02:42:c0:a8:00:29     1049
192.168.0.23           730
02:42:c0:a8:00:17      198
192.168.0.42           197
ff:ff:ff:ff:ff:ff      172
02:42:c0:a8:00:2a      133
224.0.0.251             10
33:33:00:00:00:fb       10
Name: count, dtype: int64

In [11]:
# Protocols breakdown
data['protocol'].value_counts()

protocol
IPV4-TCP     43984
ARP           1552
IPV4-ICMP      162
IPV4-UDP        10
IPV6            10
Name: count, dtype: int64

In [9]:
# Binary labels distribution
data['NST_B_Label'].value_counts()

NST_B_Label
0    36706
1     9012
Name: count, dtype: int64

In [10]:
# Multiclass labels distribution
data['NST_M_Label'].value_counts()

NST_M_Label
Normal       36706
mitm          2584
replay        2358
port-scan     1944
ddos          1934
ip-scan        192
Name: count, dtype: int64

### Timestamps & Continuity Analysis

In [12]:
# Flow end times parsed to datetimes, ordered newest first and re-indexed from 0
tstamps = (
    pd.to_datetime(data['endDate'])
    .sort_values(ascending=False)
    .reset_index(drop=True)
)
tstamps

0       2022-09-16 13:34:22.089522
1       2022-09-16 13:34:22.085566
2       2022-09-16 13:34:22.015860
3       2022-09-16 13:34:21.524326
4       2022-09-16 13:34:21.521619
                   ...            
45713   2022-09-16 11:14:40.510327
45714   2022-09-16 11:14:40.507654
45715   2022-09-16 11:14:40.054139
45716   2022-09-16 11:14:40.013394
45717   2022-09-16 11:14:40.010330
Name: endDate, Length: 45718, dtype: datetime64[ns]

In [13]:
# Compute the exact span; the sorted timestamps above already show it is roughly 2 h 20 min.
capture_span = tstamps.iloc[0] - tstamps.iloc[-1]
capture_span

Timedelta('0 days 02:19:42.079192')

In [14]:
# 1% of the capture span, used as the maximum tolerated gap between consecutive flows
capture_span * 0.01

Timedelta('0 days 00:01:23.820791920')

In [15]:
# Measure capture duration and continuity
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> "tuple[pd.Timedelta, bool, list[pd.Timedelta]]":
    """Measure how much of a capture is actually covered by traffic.

    Walks the timestamps from first to last and sums the differences between
    neighbouring entries. A difference larger than ``gap_max_secs`` is treated
    as a break in the capture: it is excluded from the total and closes the
    current contiguous block.

    Args:
        data: pandas Series of timestamps sorted in descending order (newest
            first). The ordering is not verified; with unsorted input the
            differences become negative and the result is not meaningful.
        gap_max_secs: Largest difference between neighbouring timestamps, in
            seconds, that still counts as continuous capture.

    Returns:
        A tuple ``(total_dur, contiguous, cont_durations)`` where

        * ``total_dur`` is the summed duration of all contiguous blocks
          (breaks are not counted),
        * ``contiguous`` is ``True`` when no break was found,
        * ``cont_durations`` lists the duration of every contiguous block,
          longest first.

        An empty ``data`` yields ``(Timedelta(0), True, [])``.
    """
    # Nothing to measure; also avoids the IndexError from data.iloc[0] below.
    if len(data) == 0:
        return pd.Timedelta(0), True, []

    # Build the threshold once instead of on every loop iteration.
    gap_max = pd.Timedelta(seconds=gap_max_secs)

    total_dur = pd.Timedelta(0)
    current_dur = pd.Timedelta(0)
    cont_durations = []
    contiguous = True

    # Seeding with the first element makes the first computed gap zero.
    last_tstamp = data.iloc[0]

    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= gap_max:
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            # Break found: close the running block and start a new one.
            cont_durations.append(current_dur)
            current_dur = pd.Timedelta(0)
            contiguous = False

        last_tstamp = cur_tstamp

    # Record the final block, then order the blocks from longest to shortest.
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [16]:
measure_real_capture_dur(tstamps, int(capture_span.total_seconds() * 0.01))

(Timedelta('0 days 02:19:42.079192'),
 True,
 [Timedelta('0 days 02:19:42.079192')])

The capture is continuous: no gap between consecutive flow end times exceeds 1% of the capture span (83 seconds).