# CIDDS-001 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None)

In [2]:
FOLDER_INTERNAL = 'CIDDS-001/traffic/OpenStack'
FOLDER_EXTERNAL = 'CIDDS-001/traffic/ExternalServer'

## Internal Traffic Analysis

In [3]:
internal_files = os.listdir(FOLDER_INTERNAL)
internal_files

['CIDDS-001-internal-week2.csv',
 'CIDDS-001-internal-week1.csv',
 'CIDDS-001-internal-week3.csv',
 'CIDDS-001-internal-week4.csv']

In [4]:
data = pd.DataFrame()

for data_path in internal_files:
    fullpath = os.path.join(FOLDER_INTERNAL, data_path)

    data = pd.concat([data, pd.read_csv(fullpath)])

  data = pd.concat([data, pd.read_csv(fullpath)])
  data = pd.concat([data, pd.read_csv(fullpath)])
  data = pd.concat([data, pd.read_csv(fullpath)])
  data = pd.concat([data, pd.read_csv(fullpath)])


In [5]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 31287933 entries, 0 to 6175896
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Date first seen    object 
 1   Duration           float64
 2   Proto              object 
 3   Src IP Addr        object 
 4   Src Pt             int64  
 5   Dst IP Addr        object 
 6   Dst Pt             float64
 7   Packets            int64  
 8   Bytes              object 
 9   Flows              int64  
 10  Flags              object 
 11  Tos                int64  
 12  class              object 
 13  attackType         object 
 14  attackID           object 
 15  attackDescription  object 
dtypes: float64(2), int64(4), object(10)
memory usage: 4.0+ GB


In [6]:
data.describe()

Unnamed: 0,Duration,Src Pt,Dst Pt,Packets,Flows,Tos
count,31287930.0,31287930.0,31287930.0,31287930.0,31287933.0,31287930.0
mean,0.1232569,24631.38,24414.51,15.35089,1.0,9.817501
std,0.8024699,25096.84,25091.05,984.7063,0.0,15.04512
min,0.0,0.0,0.0,1.0,1.0,0.0
25%,0.0,80.0,80.0,1.0,1.0,0.0
50%,0.0,8082.0,8082.0,2.0,1.0,0.0
75%,0.031,50080.0,49996.0,3.0,1.0,32.0
max,238.008,65535.0,65535.0,208768.0,1.0,192.0


In [7]:
data.head(5)

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription
0,2017-03-22 00:00:00.071,0.0,TCP,192.168.100.5,445,192.168.220.14,42718.0,1,108,1,.AP...,0,normal,---,---,---
1,2017-03-22 00:00:00.070,0.002,TCP,192.168.220.14,42718,192.168.100.5,445.0,2,174,1,.AP...,0,normal,---,---,---
2,2017-03-22 00:00:05.953,0.002,TCP,192.168.220.10,59594,192.168.100.5,445.0,2,174,1,.AP...,0,normal,---,---,---
3,2017-03-22 00:00:05.955,0.0,TCP,192.168.100.5,445,192.168.220.10,59594.0,1,108,1,.AP...,0,normal,---,---,---
4,2017-03-22 00:00:06.468,0.0,TCP,10022_204,80,192.168.210.5,61252.0,1,66,1,.A....,32,normal,---,---,---


In [8]:
data['class'].value_counts()

class
normal      28051906
attacker     1656605
victim       1579422
Name: count, dtype: int64

In [9]:
data['attackType'].value_counts()

attackType
---           28051906
dos            2959027
portScan        265918
pingScan          6090
bruteForce        4992
Name: count, dtype: int64

In [10]:
data['attackID'].value_counts()

attackID
---    28051906
53       516299
60       333627
18       295302
44       261169
         ...   
70          240
25          201
54          183
15           64
30           46
Name: count, Length: 71, dtype: int64

In [11]:
data['attackDescription'].value_counts()

attackDescription
---                                      28235324
10000 connections on 192.168.100.6:80     2775655
nmap args: -sS -T 2                        103470
nmap args: -sS -T 3                         95361
nmap args: -sS -T 1                         67087
nmap args: -n -sP -PE -T 2                   2804
nmap args: -n -sP -PE -T 1                   2449
192.168.210.4                                1811
nmap args: -n -sP -PE -T 3                    837
192.168.210.5                                 705
192.168.200.8                                 680
192.168.200.4                                 565
192.168.210.3                                 427
192.168.100.4                                 335
192.168.100.2                                 240
192.168.210.2                                 183
Name: count, dtype: int64

### Time Continuity Determination

In [12]:
data['timestamp'] = pd.to_datetime(data['Date first seen'], format='%Y-%m-%d %H:%M:%S.%f')

In [13]:
data_sorted = data.sort_values(by='timestamp', ascending=False).reset_index(drop=True)
data_sorted

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription,timestamp
0,2017-04-18 23:59:30.203,0.083,TCP,10068_66,443,192.168.200.8,58432.0,13,2304,1,.AP.S.,32,normal,---,---,---,2017-04-18 23:59:30.203
1,2017-04-18 23:59:30.183,0.083,TCP,192.168.200.8,58432,10068_66,443.0,13,2168,1,.AP.S.,0,normal,---,---,---,2017-04-18 23:59:30.183
2,2017-04-18 23:59:30.175,0.000,UDP,DNS,53,192.168.200.8,54577.0,1,128,1,......,0,normal,---,---,---,2017-04-18 23:59:30.175
3,2017-04-18 23:59:30.175,0.000,UDP,DNS,53,192.168.200.8,62120.0,1,128,1,......,0,normal,---,---,---,2017-04-18 23:59:30.175
4,2017-04-18 23:59:30.173,0.000,UDP,192.168.200.8,54577,DNS,53.0,1,87,1,......,0,normal,---,---,---,2017-04-18 23:59:30.173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31287928,2017-03-15 00:01:16.631,0.004,TCP,192.168.220.16,58844,192.168.100.5,445.0,2,174,1,.AP...,0,normal,---,---,---,2017-03-15 00:01:16.631
31287929,2017-03-15 00:01:16.631,0.004,TCP,192.168.220.16,58844,192.168.100.5,445.0,2,174,1,.AP...,0,normal,---,---,---,2017-03-15 00:01:16.631
31287930,2017-03-15 00:01:16.552,0.000,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,1,.AP...,0,normal,---,---,---,2017-03-15 00:01:16.552
31287931,2017-03-15 00:01:16.552,0.000,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,1,.AP...,0,normal,---,---,---,2017-03-15 00:01:16.552


In [14]:
time_span = data_sorted['timestamp'].iloc[0] - data_sorted['timestamp'].iloc[-1]
time_span

Timedelta('34 days 23:58:13.652000')

In [15]:
# Span is 35 days -> convert to secs.
time_span.total_seconds()

3023893.652

In [16]:
# Are there gaps in the data (was the capture interrupted?)
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> float:
    """Computes total timespan of the capture. Expects iterable containing timestamps objects sorted in a descending manner"""
    total_dur = pd.Timedelta(seconds=0)
    current_dur = pd.Timedelta(seconds=0)
    cont_durations = []
    last_tstamp = data.iloc[0]
    contiguous = True

    # Iterate through the dataframe to find out gaps
    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= pd.Timedelta(seconds=gap_max_secs):
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            cont_durations.append(current_dur)
            current_dur = pd.Timedelta(seconds=0)

            if contiguous:
                contiguous = False

        last_tstamp = cur_tstamp

    # Log the final continuous block and sort them via their length
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [17]:
measure_real_capture_dur(data_sorted['timestamp'], int(time_span.total_seconds() * 0.01))

(Timedelta('27 days 23:56:34.524000'),
 False,
 [Timedelta('13 days 23:59:29.937000'), Timedelta('13 days 23:57:04.587000')])

Only one discontinuous block directly at the middle. This can definitely be considered as continuous.

## External Traffic Analysis

In [18]:
external_files = os.listdir(FOLDER_EXTERNAL)
external_files

['CIDDS-001-external-week2.csv',
 'CIDDS-001-external-week1.csv',
 'CIDDS-001-external-week4.csv',
 'CIDDS-001-external-week3.csv']

In [19]:
data = pd.DataFrame()

for data_path in external_files:
    fullpath = os.path.join(FOLDER_EXTERNAL, data_path)

    data = pd.concat([data, pd.read_csv(fullpath)])

  data = pd.concat([data, pd.read_csv(fullpath)])


In [20]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 671241 entries, 0 to 153025
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date first seen    671241 non-null  object 
 1   Duration           671241 non-null  float64
 2   Proto              671241 non-null  object 
 3   Src IP Addr        671241 non-null  object 
 4   Src Pt             671241 non-null  int64  
 5   Dst IP Addr        671241 non-null  object 
 6   Dst Pt             671241 non-null  float64
 7   Packets            671241 non-null  int64  
 8   Bytes              671241 non-null  object 
 9   Flows              671241 non-null  int64  
 10  Flags              671241 non-null  object 
 11  Tos                671241 non-null  int64  
 12  class              671241 non-null  object 
 13  attackType         671241 non-null  object 
 14  attackID           671241 non-null  object 
 15  attackDescription  671241 non-null  object 
dtypes: floa

In [21]:
data.describe()

Unnamed: 0,Duration,Src Pt,Dst Pt,Packets,Flows,Tos
count,671241.0,671241.0,671241.0,671241.0,671241.0,671241.0
mean,109.457693,22546.25977,22280.26754,24.01006,1.0,0.0
std,5091.732802,23975.273445,23942.463329,1044.981629,0.0,0.0
min,0.0,0.0,0.0,1.0,1.0,0.0
25%,0.062,22.0,22.0,5.0,1.0,0.0
50%,7.018,8000.0,8000.0,8.0,1.0,0.0
75%,11.692,49302.0,49075.0,19.0,1.0,0.0
max,604817.074,65535.0,65535.0,176609.0,1.0,0.0


In [22]:
data.head(5)

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription
0,2017-03-22 00:00:14.461,0.0,TCP,19394_40,54719,EXT_SERVER,23.0,1,46,1,....S.,0,suspicious,---,---,---
1,2017-03-22 00:00:14.461,0.0,TCP,EXT_SERVER,23,19394_40,54719.0,1,40,1,.A.R..,0,suspicious,---,---,---
2,2017-03-22 00:00:33.525,0.0,TCP,19395_18,4622,EXT_SERVER,7547.0,1,46,1,....S.,0,suspicious,---,---,---
3,2017-03-22 00:00:33.525,0.0,TCP,EXT_SERVER,7547,19395_18,4622.0,1,40,1,.A.R..,0,suspicious,---,---,---
4,2017-03-22 00:00:40.205,0.0,TCP,11246_253,6000,EXT_SERVER,3306.0,1,46,1,....S.,0,suspicious,---,---,---


In [23]:
data['class'].value_counts()

class
suspicious    437911
normal        134240
unknown        77923
attacker       12260
victim          8907
Name: count, dtype: int64

In [24]:
data['attackType'].value_counts()

attackType
---           650074
portScan       18719
bruteForce      2448
Name: count, dtype: int64

In [25]:
data['attackID'].value_counts()

attackID
---    650074
7        6410
1        2008
2        2002
16       2002
10       1991
15       1984
12       1370
19        952
17        200
14        200
13        200
11        200
8         200
9         200
3         200
20        200
5         200
4         200
18        200
6         168
22         40
21         40
Name: count, dtype: int64

In [26]:
data['attackDescription'].value_counts()

attackDescription
---                    654852
nmap args: -sS -T 2     11348
nmap args: -sU -T 2      3361
100 passwords            1600
20 passwords               80
Name: count, dtype: int64

### Duration and Continuity Analysis

In [27]:
data['timestamp'] = pd.to_datetime(data['Date first seen'], format='%Y-%m-%d %H:%M:%S.%f')

In [28]:
data_sorted = data.sort_values(by='timestamp', ascending=False).reset_index(drop=True)
data_sorted

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription,timestamp
0,2017-04-18 23:59:55.951,11.320,TCP,EXT_SERVER,22,32955_33,30190.0,21,3493,1,.AP.S.,0,suspicious,---,---,---,2017-04-18 23:59:55.951
1,2017-04-18 23:59:55.951,11.320,TCP,32955_33,30190,EXT_SERVER,22.0,17,2267,1,.APRSF,0,suspicious,---,---,---,2017-04-18 23:59:55.951
2,2017-04-18 23:59:50.947,33.003,TCP,EXT_SERVER,22,32955_33,36255.0,8,480,1,.A..S.,0,suspicious,---,---,---,2017-04-18 23:59:50.947
3,2017-04-18 23:59:50.947,33.003,TCP,32955_33,36255,EXT_SERVER,22.0,3,180,1,....S.,0,suspicious,---,---,---,2017-04-18 23:59:50.947
4,2017-04-18 23:59:40.840,0.062,TCP,OPENSTACK_NET,54379,EXT_SERVER,8000.0,6,515,1,.AP.SF,0,normal,---,---,---,2017-04-18 23:59:40.840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671236,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal,---,---,---,2017-03-14 17:43:57.172
671237,2017-03-14 17:43:39.011,183418.493,TCP,EXT_SERVER,8082,OPENSTACK_NET,60802.0,13266,33.0 M,1,.AP...,0,normal,---,---,---,2017-03-14 17:43:39.011
671238,2017-03-14 17:43:39.011,183418.493,TCP,OPENSTACK_NET,60802,EXT_SERVER,8082.0,20751,5.8 M,1,.AP...,0,normal,---,---,---,2017-03-14 17:43:39.011
671239,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal,---,---,---,2017-03-14 17:43:26.135


In [29]:
time_span = data_sorted['timestamp'].iloc[0] - data_sorted['timestamp'].iloc[-1]
time_span

Timedelta('35 days 06:16:29.816000')

In [30]:
# Span is 35 days -> convert to secs.
time_span.total_seconds()

3046589.816

In [31]:
measure_real_capture_dur(data_sorted['timestamp'], int(time_span.total_seconds() * 0.01))

(Timedelta('28 days 06:16:06.029000'),
 False,
 [Timedelta('14 days 06:16:19.269000'), Timedelta('13 days 23:59:46.760000')])

Again - one gap appeared. However, at different time - so - if we put the data together, the capture itself is continuous.