# LYCOS-IDS2017 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None)

In [2]:
# Root directory that is scanned for the LYCOS-IDS2017 flow CSV files.
# Despite the folder being named PCAP, there are extracted CSVs after unzipping
DATAFOLDER = '/data/surv/lycos_ids2017/lycos-ids2017'

In [3]:
# Collect the full path of every CSV file located directly inside DATAFOLDER.
# Note: os.listdir yields directory entries in arbitrary order.
csv_names = [name for name in os.listdir(DATAFOLDER) if name.endswith('.csv')]
ALL_CSVS = [os.path.join(DATAFOLDER, name) for name in csv_names]
ALL_CSVS

['/data/surv/lycos_ids2017/lycos-ids2017/Monday-WorkingHours.pcap_lycos.csv',
 '/data/surv/lycos_ids2017/lycos-ids2017/Thursday-WorkingHours.pcap_lycos.csv',
 '/data/surv/lycos_ids2017/lycos-ids2017/Tuesday-WorkingHours.pcap_lycos.csv',
 '/data/surv/lycos_ids2017/lycos-ids2017/Friday-WorkingHours.pcap_lycos.csv',
 '/data/surv/lycos_ids2017/lycos-ids2017/Wednesday-WorkingHours.pcap_lycos.csv']

We have extracted the ZIP files into CSVs and run the `python labelling.py` command on the command line -- this step is not visible here.

In [4]:
# Read every CSV once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside a loop re-copies all previously loaded rows on each
# iteration (quadratic cost) and concatenates against an empty frame.
# Each file keeps its own row index, so index labels repeat across files.
frames = [pd.read_csv(csv_path) for csv_path in ALL_CSVS]

# pd.concat raises on an empty list; fall back to an empty frame in that case
data = pd.concat(frames) if frames else pd.DataFrame()

In [5]:
# List every column with its dtype and non-null count (verbose output is not truncated)
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1837498 entries, 0 to 495236
Data columns (total 83 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   flow_id                 1837498 non-null  object 
 1   src_addr                1837498 non-null  object 
 2   src_port                1837498 non-null  int64  
 3   dst_addr                1837498 non-null  object 
 4   dst_port                1837498 non-null  int64  
 5   ip_prot                 1837498 non-null  int64  
 6   timestamp               1837498 non-null  int64  
 7   flow_duration           1837498 non-null  int64  
 8   down_up_ratio           1837498 non-null  float64
 9   pkt_len_max             1837498 non-null  float64
 10  pkt_len_min             1837498 non-null  float64
 11  pkt_len_mean            1837498 non-null  float64
 12  pkt_len_var             1837498 non-null  float64
 13  pkt_len_std             1837498 non-null  float64
 14  bytes_pe

In [6]:
# Number of rows loaded across all CSV files
len(data)

1837498

In [7]:
# Does this length correspond to plainly counting the lines?
# The raw line count below also includes the header line of each CSV file.
!cat $DATAFOLDER/*.csv | wc -l

1837503


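Correct: the difference of 5 lines corresponds to the header line of each of the 5 CSV files, which Pandas uses as column names rather than counting them as data rows.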

In [8]:
# Preview the first rows to inspect the available columns and value formats
data.head()

Unnamed: 0,flow_id,src_addr,src_port,dst_addr,dst_port,ip_prot,timestamp,flow_duration,down_up_ratio,pkt_len_max,pkt_len_min,pkt_len_mean,pkt_len_var,pkt_len_std,bytes_per_s,pkt_per_s,fwd_pkt_per_s,bwd_pkt_per_s,fwd_pkt_cnt,fwd_pkt_len_tot,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,fwd_pkt_hdr_len_tot,fwd_pkt_hdr_len_min,fwd_non_empty_pkt_cnt,bwd_pkt_cnt,bwd_pkt_len_tot,bwd_pkt_len_max,bwd_pkt_len_min,bwd_pkt_len_mean,bwd_pkt_len_std,bwd_pkt_hdr_len_tot,bwd_pkt_hdr_len_min,bwd_non_empty_pkt_cnt,iat_max,iat_min,iat_mean,iat_std,fwd_iat_tot,fwd_iat_max,fwd_iat_min,fwd_iat_mean,fwd_iat_std,bwd_iat_tot,bwd_iat_max,bwd_iat_min,bwd_iat_mean,bwd_iat_std,active_max,active_min,active_mean,active_std,idle_max,idle_min,idle_mean,idle_std,flag_SYN,flag_fin,flag_rst,flag_ack,flag_psh,fwd_flag_psh,bwd_flag_psh,flag_urg,fwd_flag_urg,bwd_flag_urg,flag_cwr,flag_ece,fwd_bulk_bytes_mean,fwd_bulk_pkt_mean,fwd_bulk_rate_mean,bwd_bulk_bytes_mean,bwd_bulk_pkt_mean,bwd_bulk_rate_mean,fwd_subflow_bytes_mean,fwd_subflow_pkt_mean,bwd_subflow_bytes_mean,bwd_subflow_pkt_mean,fwd_tcp_init_win_bytes,bwd_tcp_init_win_bytes,label
0,224.0.0.22-192.168.10.9-0-0-2,224.0.0.22,0,192.168.10.9,0,2,1499082997567312,84524,0.0,8.0,8.0,8.0,0.0,0.0,378.5907,47.323837,0.0,47.323837,0,0,0.0,0.0,0.0,0.0,0,0,0,4,32,8.0,8.0,8.0,0.0,32,8,4,83220.0,160.0,28174.67,47673.2,0,0.0,0.0,0.0,0.0,84524,83220.0,160.0,28174.67,47673.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,32.0,4.0,378.590696,0.0,0.0,32.0,4.0,-1,-1,benign
1,192.168.10.9-224.0.0.252-60372-5355-17,192.168.10.9,60372,224.0.0.252,5355,17,1499082997568863,98644,0.0,28.0,28.0,28.0,0.0,0.0,5109.282,182.474352,182.474352,0.0,18,504,28.0,28.0,28.0,0.0,144,8,18,0,0,0.0,0.0,0.0,0.0,0,0,0,98619.0,1.0,5802.588,23918.24,98644,98619.0,1.0,5802.588,23918.24,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,504.0,18.0,5109.282,0.0,0.0,0.0,504.0,18.0,0.0,0.0,-1,-1,benign
2,192.168.10.9-192.168.10.3-137-137-17,192.168.10.9,137,192.168.10.3,137,17,1499082997897192,83068408,1.105263,68.0,50.0,59.15,43.724051,6.612416,56.9651,0.963062,0.457454,0.505607,38,2224,68.0,50.0,58.526316,9.108169,304,8,38,42,2508,62.0,0.0,59.714286,2.949045,336,8,42,50778435.0,1.0,1051499.0,5971943.0,83068326,50778573.0,1.0,2245090.0,8632611.0,83068177,50778563.0,1.0,2026053.0,8217163.0,14874848.0,1502094.0,8188471.0,9455965.0,50778435.0,15888498.0,33333466.5,24670910.0,0,0,0,0,0,0,0,0,0,0,0,0,408.0,6.0,6181818.0,281.333333,4.666667,14842.170052,222.4,3.8,250.8,4.2,-1,-1,benign
3,192.168.10.9-192.168.10.3-63683-53-17,192.168.10.9,63683,192.168.10.3,53,17,1499082997908429,292,1.0,131.0,81.0,106.0,833.333333,28.867513,1452055.0,13698.630137,6849.315068,6849.315068,2,162,81.0,81.0,81.0,0.0,16,8,2,2,262,131.0,0.0,131.0,0.0,16,8,2,286.0,3.0,97.33333,163.3901,3,3.0,3.0,3.0,0.0,3,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,162.0,2.0,262.0,2.0,-1,-1,benign
4,192.168.10.9-192.168.10.3-63276-53-17,192.168.10.9,63276,192.168.10.3,53,17,1499082997908639,293,1.0,131.0,81.0,106.0,833.333333,28.867513,1447099.0,13651.877133,6825.938567,6825.938567,2,162,81.0,81.0,81.0,0.0,16,8,2,2,262,131.0,0.0,131.0,0.0,16,8,2,196.0,48.0,97.66667,85.16063,49,49.0,49.0,49.0,0.0,48,48.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,162.0,2.0,262.0,2.0,-1,-1,benign


In [9]:
# Analyze label distribution: number of rows per label, most frequent first
data['label'].value_counts()

label
benign                     1395675
portscan                    160106
dos_hulk                    158988
ddos                         95683
dos_goldeneye                 6765
dos_slowloris                 5674
dos_slowhttptest              4866
ftp_patator                   4003
ssh_patator                   2959
webattack_bruteforce          1360
bot                            735
webattack_xss                  661
webattack_sql_injection         12
heartbleed                      11
Name: count, dtype: int64

In [10]:
# Analyze the number of unique source addresses and their communication:
# rows per source address; the length of the result is the number of unique sources
data['src_addr'].value_counts()

src_addr
172.16.0.1         442471
192.168.10.3       263146
192.168.10.5       135147
192.168.10.9       122744
192.168.10.15      115795
                    ...  
172.217.6.206           1
172.217.10.138          1
172.217.10.1            1
173.194.207.154         1
0.0.0.0                 1
Name: count, Length: 268, dtype: int64

In [11]:
# Timestamp analysis: inspect the raw values to determine their unit and range
data['timestamp']

0         1499082997567312
1         1499082997568863
2         1499082997897192
3         1499082997908429
4         1499082997908639
                ...       
495232    1499285266186369
495233    1499285267192695
495234    1499285269740925
495235    1499285269741570
495236    1499285279170242
Name: timestamp, Length: 1837498, dtype: int64

It seems the timestamps are in microsecond format, referring to the July 2017 capture.

In [18]:
# Newest-first timestamps with a fresh 0..n-1 index, scaled by 1e-6
# (microseconds -> seconds, assuming the raw values are in microseconds)
tstamps = (
    data['timestamp']
    .sort_values(ascending=False)
    .reset_index(drop=True)
    .div(1e6)
)
tstamps.head()

0    1.499458e+09
1    1.499458e+09
2    1.499458e+09
3    1.499458e+09
4    1.499458e+09
Name: timestamp, dtype: float64

In [19]:
# Compute timespan of the capture and the contiguity
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> "tuple[pd.Timedelta, bool]":
    """Measure the active duration of a capture and whether it is contiguous.

    Neighbouring timestamps are compared pairwise. A gap of at most
    ``gap_max_secs`` counts towards the total duration; a larger gap is treated
    as an interruption of the capture, is left out of the total, and marks the
    capture as non-contiguous.

    Args:
        data: Numeric timestamps in seconds, sorted in descending order
            (newest first).
        gap_max_secs: Largest gap, in seconds, between two neighbouring
            timestamps that is still counted as continuous capture.

    Returns:
        A ``(duration, contiguous)`` tuple. ``duration`` is a ``pd.Timedelta``
        holding the sum of all gaps not exceeding ``gap_max_secs``;
        ``contiguous`` is ``True`` only if no gap exceeded the limit. Input
        with fewer than two timestamps yields ``(pd.Timedelta(0), True)``.
    """
    tstamps = data.to_numpy()
    if tstamps.size < 2:
        # No neighbouring pair to compare: zero duration, trivially contiguous
        return pd.Timedelta(0), True

    # With descending order, previous - current is the gap between neighbours
    gaps = tstamps[:-1] - tstamps[1:]
    within_limit = gaps <= gap_max_secs

    # Only gaps within the limit contribute to the measured duration
    total_dur = float(gaps[within_limit].sum())

    return pd.Timedelta(seconds=total_dur), bool(within_limit.all())

In [20]:
# Uses the default maximum gap of 300 s (5 minutes)
measure_real_capture_dur(tstamps)

(Timedelta('1 days 11:40:46.796216964'), False)

According to the CIC-IDS2017 documentation, the capture should span 5 × 8 h = 40 h. In this analysis, we see it lasts 35 h 40 min ≈ 36 h. Given our strict criterion of a maximum 5-minute gap, this can be considered somewhat accurate, although it is highly unlikely that no traffic would be present on the network for 4 of the 40 hours (10%) of the total capture time.

## Train-Test-Crossval Sets Analyses

After running the scripts (`create_datasets.py`), the dataset is split into train, test, and cross-validation sets. Be aware that you need to change all pandas `append` calls to `pd.concat`, as `DataFrame.append` has been deprecated since pandas 1.4.0 and was removed in pandas 2.0.

In [22]:
# Directory containing the train / test / cross-validation parquet files.
# NOTE(review): "FODER" looks like a typo of "FOLDER"; the name is left unchanged here.
LYCOS_SPLIT_FODER = '/data/surv/lycos_ids2017/datasets/lycos-ids2017'

# Full paths of the three split files
TRAIN_SET = os.path.join(LYCOS_SPLIT_FODER, 'train_set.parquet')
TEST_SET = os.path.join(LYCOS_SPLIT_FODER, 'test_set.parquet')
CROSSVAL_SET = os.path.join(LYCOS_SPLIT_FODER, 'crossval_set.parquet')

In [23]:
# Load the three splits; the cross-validation file is bound to `valid`
train = pd.read_parquet(TRAIN_SET)
test = pd.read_parquet(TEST_SET)
valid = pd.read_parquet(CROSSVAL_SET)

In [24]:
# Column overview (names, dtypes, non-null counts) of the training split
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440632 entries, 11009 to 156543
Data columns (total 83 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   flow_id                 440632 non-null  object 
 1   src_addr                440632 non-null  object 
 2   src_port                440632 non-null  int64  
 3   dst_addr                440632 non-null  object 
 4   dst_port                440632 non-null  int64  
 5   ip_prot                 440632 non-null  int64  
 6   timestamp               440632 non-null  int64  
 7   flow_duration           440632 non-null  int64  
 8   down_up_ratio           440632 non-null  float64
 9   pkt_len_max             440632 non-null  float64
 10  pkt_len_min             440632 non-null  float64
 11  pkt_len_mean            440632 non-null  float64
 12  pkt_len_var             440632 non-null  float64
 13  pkt_len_std             440632 non-null  float64
 14  bytes_per_s          

In [28]:
# Row counts of the train, cross-validation and test splits, in that order
display(len(train), len(valid), len(test))

440632

220312

220312

In [29]:
# For the purposes of the paper, training and validation set are considered as one:
# show the combined train + validation size first, then the test size
display(len(train) + len(valid), len(test))

660944

220312

Since the data were taken from the CIC-IDS2017 data, capture continuity analysis is unnecessary.