# CIDDS-002 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the traffic CSV files, relative to the notebook's working directory
TRAFFIC_PATH = 'CIDDS-002/traffic'

In [3]:
# os.listdir() returns entries in arbitrary order; sort them so the order in
# which the files are later loaded (and thus the row order of the combined
# frame) is reproducible across machines and filesystems.
files = sorted(os.listdir(TRAFFIC_PATH))
files

['week1.csv', 'week2.csv']

In [4]:
# Load the data: read every listed file, then concatenate exactly once.
# Calling pd.concat inside the loop re-copies all rows accumulated so far on
# every iteration, and seeding the result with an empty DataFrame makes the
# empty frame take part in the concatenation.
frames = []

for file in files:
    filepath = os.path.join(TRAFFIC_PATH, file)

    frames.append(pd.read_csv(filepath))

# Per-file row indices are kept (no ignore_index), exactly as before.
# With no input files the result is an empty frame, as before.
data = pd.concat(frames) if frames else pd.DataFrame()

  data = pd.concat([data, pd.read_csv(filepath)])
  data = pd.concat([data, pd.read_csv(filepath)])


In [5]:
# Schema overview: per-column dtypes and the frame's memory footprint
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 16161183 entries, 0 to 7975190
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Date first seen    object 
 1   Duration           float64
 2   Proto              object 
 3   Src IP Addr        object 
 4   Src Pt             int64  
 5   Dst IP Addr        object 
 6   Dst Pt             float64
 7   Packets            int64  
 8   Bytes              object 
 9   Flows              int64  
 10  Flags              object 
 11  Tos                int64  
 12  label              object 
 13  attackType         object 
 14  attackID           object 
 15  attackDescription  object 
dtypes: float64(2), int64(4), object(10)
memory usage: 2.0+ GB


In [6]:
# Total number of rows loaded across all files
len(data)

16161183

In [7]:
# Summary statistics of the numeric columns
data.describe()

Unnamed: 0,Duration,Src Pt,Dst Pt,Packets,Flows,Tos
count,16161180.0,16161180.0,16161180.0,16161180.0,16161183.0,16161180.0
mean,0.2157401,24347.69,23611.18,23.05073,1.0,0.4813014
std,2.899081,24552.5,24467.8,1162.225,0.0,9.597083
min,0.0,0.0,0.0,1.0,1.0,0.0
25%,0.0,138.0,138.0,1.0,1.0,0.0
50%,0.0,33023.0,1900.0,2.0,1.0,0.0
75%,0.043,48706.0,48188.0,3.0,1.0,0.0
max,334.421,65535.0,65535.0,205049.0,1.0,192.0


In [8]:
# Peek at the first 10 rows
data.head(10)

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,label,attackType,attackID,attackDescription
0,2017-08-02 00:00:00.419,0.003,TCP,192.168.210.55,44870,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---
1,2017-08-02 00:00:00.421,0.0,TCP,192.168.100.11,445,192.168.210.55,44870.0,1,108,1,.AP...,0,normal,---,---,---
2,2017-08-02 00:00:02.593,0.004,TCP,192.168.220.47,55101,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---
3,2017-08-02 00:00:02.859,0.0,TCP,10000_34,443,192.168.210.54,59628.0,1,100,1,.AP...,0,normal,---,---,---
4,2017-08-02 00:00:02.594,0.0,TCP,192.168.100.11,445,192.168.220.47,55101.0,1,108,1,.AP...,0,normal,---,---,---
5,2017-08-02 00:00:02.847,0.21,TCP,192.168.210.54,59628,10000_34,443.0,2,154,1,.AP...,0,normal,---,---,---
6,2017-08-02 00:00:03.568,0.0,TCP,192.168.100.11,445,192.168.220.42,58001.0,1,108,1,.AP...,0,normal,---,---,---
7,2017-08-02 00:00:03.567,0.002,TCP,192.168.220.42,58001,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---
8,2017-08-02 00:00:04.193,0.0,TCP,192.168.100.11,445,192.168.220.46,49770.0,1,108,1,.AP...,0,normal,---,---,---
9,2017-08-02 00:00:04.192,0.001,TCP,192.168.220.46,49770,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---


In [9]:
# Number of rows per distinct value of the Proto column
data['Proto'].value_counts()

Proto
TCP      13876697
UDP       2233750
ICMP        50025
IGMP          711
Name: count, dtype: int64

In [10]:
# Number of rows per distinct value of the Src IP Addr column
data['Src IP Addr'].value_counts()

Src IP Addr
192.168.220.51    1791847
192.168.220.47    1231061
DNS                988160
192.168.220.42     582085
192.168.220.45     477225
                   ...   
13743_25                1
11686_16                1
11686_149               1
13703_43                1
13551_12                1
Name: count, Length: 11388, dtype: int64

### Time Continuity Determination

In [11]:
# Parse the textual 'Date first seen' column into datetimes (millisecond precision
# in the rows shown above) and store the result as a new 'timestamp' column on `data`.
data['timestamp'] = pd.to_datetime(data['Date first seen'], format='%Y-%m-%d %H:%M:%S.%f')

In [12]:
# Newest rows first, renumbered 0..n-1 so positional and label-based access agree.
data_sorted = data.sort_values('timestamp', ascending=False, ignore_index=True)
data_sorted

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,label,attackType,attackID,attackDescription,timestamp
0,2017-08-16 23:58:27.084,0.000,TCP,192.168.210.46,49135,192.168.100.11,445.0,1,66,1,.A....,0,normal,---,---,---,2017-08-16 23:58:27.084
1,2017-08-16 23:58:24.178,0.000,TCP,192.168.100.11,445,192.168.220.44,47819.0,1,108,1,.AP...,0,normal,---,---,---,2017-08-16 23:58:24.178
2,2017-08-16 23:58:24.177,0.004,TCP,192.168.220.44,47819,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---,2017-08-16 23:58:24.177
3,2017-08-16 23:58:23.180,0.001,UDP,192.168.210.57,64069,10004_250,1900.0,2,332,1,......,0,normal,---,---,---,2017-08-16 23:58:23.180
4,2017-08-16 23:58:21.409,0.000,TCP,192.168.100.11,445,192.168.210.55,56218.0,1,108,1,.AP...,0,normal,---,---,---,2017-08-16 23:58:21.409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16161178,2017-08-02 00:00:02.847,0.210,TCP,192.168.210.54,59628,10000_34,443.0,2,154,1,.AP...,0,normal,---,---,---,2017-08-02 00:00:02.847
16161179,2017-08-02 00:00:02.594,0.000,TCP,192.168.100.11,445,192.168.220.47,55101.0,1,108,1,.AP...,0,normal,---,---,---,2017-08-02 00:00:02.594
16161180,2017-08-02 00:00:02.593,0.004,TCP,192.168.220.47,55101,192.168.100.11,445.0,2,174,1,.AP...,0,normal,---,---,---,2017-08-02 00:00:02.593
16161181,2017-08-02 00:00:00.421,0.000,TCP,192.168.100.11,445,192.168.210.55,44870.0,1,108,1,.AP...,0,normal,---,---,---,2017-08-02 00:00:00.421


In [13]:
# data_sorted is in descending timestamp order, so the first row is the newest
# and the last row the oldest; their difference is the overall span of the data.
time_span = data_sorted['timestamp'].iloc[0] - data_sorted['timestamp'].iloc[-1]
time_span

Timedelta('14 days 23:58:26.665000')

In [14]:
# Span is just under 15 days (14 days 23:58:26.665) -> convert to secs.
time_span.total_seconds()

1295906.665

In [15]:
# Are there gaps in the data (was the capture interrupted?)
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> tuple:
    """Measure how much of a capture is actually covered by data.

    Consecutive timestamps that are at most ``gap_max_secs`` apart are treated
    as belonging to one continuous block. A larger difference is a gap: it is
    not counted towards any duration and it starts a new block.

    Args:
        data: Series of timestamps sorted in descending order (newest first).
        gap_max_secs: Largest difference, in seconds, between two consecutive
            timestamps that is still considered continuous capture.

    Returns:
        A 3-tuple ``(total_dur, contiguous, cont_durations)``:

        * ``total_dur`` -- ``pd.Timedelta`` with the summed length of all
          continuous blocks (gaps excluded).
        * ``contiguous`` -- ``True`` when no gap was found, else ``False``.
        * ``cont_durations`` -- list of ``pd.Timedelta`` with the length of
          each continuous block, longest first.

        Empty input yields ``(Timedelta(0), True, [Timedelta(0)])``.
    """
    zero = pd.Timedelta(seconds=0)

    if len(data) == 0:
        return zero, True, [zero]

    # Positional index so that shifting and grouping align row by row.
    tstamps = pd.to_datetime(data).reset_index(drop=True)

    # Difference between each timestamp and its predecessor; the first
    # element is compared with itself, giving a zero difference.
    gaps = tstamps.shift(1, fill_value=tstamps.iloc[0]) - tstamps
    within = gaps <= pd.Timedelta(seconds=gap_max_secs)

    total_dur = gaps[within].sum()

    # Every gap opens a new block; the gap itself contributes no duration.
    n_gaps = int((~within).sum())
    block_ids = (~within).cumsum()
    block_durs = (
        gaps.where(within, zero)
        .groupby(block_ids)
        .sum()
        # Keep the (empty) leading block when the very first row is a gap.
        .reindex(range(n_gaps + 1), fill_value=zero)
    )

    # Report the continuous blocks from longest to shortest
    cont_durations = sorted(block_durs.tolist(), reverse=True)

    return total_dur, n_gaps == 0, cont_durations

In [16]:
# Gap threshold = 1 % of the overall time span (about 12959 s, i.e. ~3.6 h, for the
# span computed above): any longer silence between two records counts as an interruption.
measure_real_capture_dur(data_sorted['timestamp'], int(time_span.total_seconds() * 0.01))

(Timedelta('13 days 18:41:04.965000'),
 False,
 [Timedelta('6 days 23:58:27.023000'),
  Timedelta('4 days 14:51:06.598000'),
  Timedelta('2 days 03:51:31.344000')])

Because of these gaps, the capture cannot really be considered continuous.