# UKM-IDS20 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_columns', None)

In [2]:
DATAPATH = '/home/goldy/Documents/phd/papers/datasurv/data/ukm_ids20'

PATH_TRAIN = os.path.join(DATAPATH, 'UKM-IDS20 Training set.csv')
PATH_TEST  = os.path.join(DATAPATH, 'UKM-IDS20 Testing set.csv')

In [3]:
train = pd.read_csv(PATH_TRAIN)
test = pd.read_csv(PATH_TEST)

## Train Subset

In [4]:
train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10308 entries, 0 to 10307
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   dur                    10308 non-null  float64
 1   trnspt                 10308 non-null  int64  
 2   srvs                   10308 non-null  int64  
 3   flag_n                 10308 non-null  int64  
 4   flag_arst              10308 non-null  int64  
 5   flag_uc                10308 non-null  int64  
 6   flag_sign              10308 non-null  int64  
 7   flag_synrst            10308 non-null  int64  
 8   flag_a                 10308 non-null  int64  
 9   flag_othr              10308 non-null  int64  
 10  src_pkts               10308 non-null  int64  
 11  dst_pkts               10308 non-null  float64
 12  urg_bits               10308 non-null  int64  
 13  push_pkts              10308 non-null  int64  
 14  no_lnkd                10308 non-null  int64  
 15  ar

In [5]:
len(train)

10308

In [6]:
train.head()

Unnamed: 0,dur,trnspt,srvs,flag_n,flag_arst,flag_uc,flag_sign,flag_synrst,flag_a,flag_othr,src_pkts,dst_pkts,urg_bits,push_pkts,no_lnkd,arp,src_ttl,dst_ttl,pkts_dirctn,src_byts,dst_byts,src_avg_byts,dst_avg_byts,strt_t,end_t,dst_host_count,host_dst _count,rtt_first_ack,rtt_avg,avg_t_sent,avg_t_got,repeated,fst_src_sqc,fst_dst_sqc,src_re,dst_re,src_fast_re,dst_fast_re,ovrlp_count,long_frag_count,dns_ratio,avg_rr,http_rqsts_count,http_redirct_count,http_clnt_error_count,http_srv_error_count,Class name,Class binary
0,0.0,17,53,0,0,0,0,0,0,0,1,0.0,0,0,0,0,128,0,1,84,0.0,84.0,0.0,1073.051227,1073.051227,1665,0,0.0,0.0,1073.051227,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,Normal,1
1,89.814777,6,80,0,0,0,0,1,0,0,3,2.0,0,0,5005,0,128,127,2,174,120.0,58.0,60.0,2743.0,2832.814777,0,5006,0.000569,0.00398,1410.387432,1425.363018,0,3023308442,312736223,0,0,0,0,0,0,0.0,0.0,0,0,0,0,TCP flood,0
2,12.186268,17,53,0,0,0,0,0,0,0,5,1.0,0,0,91,0,127,128,2,425,85.0,85.0,85.0,71132.04391,71144.23018,0,1616,0.0,0.0,71135.0408,71144.23018,0,0,0,0,0,0,0,0,0,0.2,0.0,0,0,0,0,Normal,1
3,0.0,6,165,0,0,0,1,0,0,0,1,0.0,0,0,0,0,128,0,1,66,0.0,66.0,0.0,18.136116,18.136116,0,0,0.0,0.0,18.136116,0.0,0,2931862410,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,Port scanning,0
4,0.0,6,490,0,0,0,1,0,0,0,1,0.0,0,0,0,0,128,0,1,66,0.0,66.0,0.0,53.601844,53.601844,0,0,0.0,0.0,53.601844,0.0,0,740055462,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,Port scanning,0


In [7]:
train.describe()

Unnamed: 0,dur,trnspt,srvs,flag_n,flag_arst,flag_uc,flag_sign,flag_synrst,flag_a,flag_othr,src_pkts,dst_pkts,urg_bits,push_pkts,no_lnkd,arp,src_ttl,dst_ttl,pkts_dirctn,src_byts,dst_byts,src_avg_byts,dst_avg_byts,strt_t,end_t,dst_host_count,host_dst _count,rtt_first_ack,rtt_avg,avg_t_sent,avg_t_got,repeated,fst_src_sqc,fst_dst_sqc,src_re,dst_re,src_fast_re,dst_fast_re,ovrlp_count,long_frag_count,dns_ratio,avg_rr,http_rqsts_count,http_redirct_count,http_clnt_error_count,http_srv_error_count,Class binary
count,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0,10308.0
mean,4999.503404,33.953143,2995.874951,0.098273,0.005239,0.091967,0.046275,0.173166,0.121071,0.008828,4695.532305,233.185827,0.000679,405.38863,512.090609,0.16492,111.609818,79.236418,1.653473,291004.6,51323.0,240.054913,184.250021,27243.650301,32245.685553,181.816841,1067.102445,0.004089,0.007161,28811.908952,22735.38609,0.001067,1056877000.0,966101000.0,0.003492,0.160943,0.000194,0.001261,9.7e-05,0.000194,0.092876,0.117554,200.647264,131.01979,0.003686,0.000194,0.692666
std,17611.771441,73.250809,10893.250391,0.297698,0.072192,0.288994,0.21009,0.378409,0.326226,0.093547,16945.767663,862.271131,0.026052,1550.342678,1458.40501,0.521734,42.361419,61.503949,0.475887,808257.5,197321.9,377.894844,327.372974,27934.855128,29789.481351,523.378777,1505.490132,0.029562,0.016082,28114.176271,27970.133528,0.032651,1337864000.0,1318393000.0,0.08115,15.404944,0.013929,0.090803,0.009849,0.019699,0.283928,0.628787,775.814488,680.66184,0.060607,0.013929,0.461411
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42.0,0.0,8.800909,0.0,0.0,0.0069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000741,6.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,127.0,0.0,1.0,128.0,0.0,70.8,0.0,1112.145899,3367.374423,0.0,0.0,0.0,0.0,1409.268007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.260258,6.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,68.0,0.0,128.0,127.0,2.0,1522.0,194.0,113.666667,112.4,18228.67451,26457.542355,0.0,239.0,0.0,0.0,21174.624165,3862.082659,0.0,130589600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,93.410212,17.0,443.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,9.0,0.0,14.0,159.0,0.0,128.0,128.0,2.0,4661.0,1683.0,291.855054,197.684751,51061.38764,57857.108512,2.0,1664.0,0.000403,0.008768,54469.341125,50244.789155,0.0,2204841000.0,1919774000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,86373.29347,256.0,62700.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,164106.0,5388.0,1.0,10773.0,10000.0,2.0,128.0,248.0,2.0,6892452.0,7250290.0,3026.966292,2394.666667,86387.85577,86388.07017,1792.0,10001.0,1.287188,0.233336,86387.88241,86387.85976,1.0,4294720000.0,4293574000.0,2.0,1564.0,1.0,9.0,1.0,2.0,1.0,9.0,5388.0,5387.0,1.0,1.0,1.0


In [8]:
train['Class binary'].value_counts()

1    7140
0    3168
Name: Class binary, dtype: int64

In [9]:
train['Class name'].value_counts()

Normal                 7140
ARP poisoning           476
Port scanning           474
TCP flood               467
Mass HTTP requests      461
UDP data flood          445
Metasploit exploits     441
BeEF HTTP exploits      404
Name: Class name, dtype: int64

In [10]:
train_sorted = train.sort_values('strt_t', ascending=False)
train_sorted['strt_t']

5263    86387.85577
3423    86378.69484
4684    86378.69370
2915    86375.13499
8524    86375.13053
           ...     
2689        0.00000
9403        0.00000
2676        0.00000
2655        0.00000
9244        0.00000
Name: strt_t, Length: 10308, dtype: float64

In [11]:
train_sorted['tstamp_start'] = pd.to_datetime(train_sorted['strt_t'], unit='s', origin='unix')
train_sorted['tstamp_start']

5263   1970-01-01 23:59:47.855770
3423   1970-01-01 23:59:38.694840
4684   1970-01-01 23:59:38.693700
2915   1970-01-01 23:59:35.134990
8524   1970-01-01 23:59:35.130530
                  ...            
2689   1970-01-01 00:00:00.000000
9403   1970-01-01 00:00:00.000000
2676   1970-01-01 00:00:00.000000
2655   1970-01-01 00:00:00.000000
9244   1970-01-01 00:00:00.000000
Name: tstamp_start, Length: 10308, dtype: datetime64[ns]

In [12]:
# Compute the span (supposed to be 7 days)
train_span = train_sorted.iloc[0]['strt_t'] - train_sorted.iloc[-1]['strt_t']
pd.Timedelta(seconds=train_span)

Timedelta('0 days 23:59:47.855770')

As stated in the documentation, the capture took place over 7 days but the capture process was restarted after each 24-hour block, efficiently creating only a single 24h capture, as other data between days are indistinguishable. We wonder how would this be treated, since multiple communications and data streams will be running in parallel due to the merge of traffic from different days.

In [13]:
# Are there gaps in the data (was the capture interrupted?)
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> float:
    """Computes total timespan of the capture. Expects iterable containing timestamps objects sorted in a descending manner"""
    total_dur = pd.Timedelta(seconds=0)
    current_dur = pd.Timedelta(seconds=0)
    cont_durations = []
    last_tstamp = data.iloc[0]
    contiguous = True

    # Iterate through the dataframe to find out gaps
    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= pd.Timedelta(seconds=gap_max_secs):
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            cont_durations.append(current_dur)
            current_dur = pd.Timedelta(seconds=0)

            if contiguous:
                contiguous = False

        last_tstamp = cur_tstamp

    # Log the final continuous block and sort them via their length
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [14]:
measure_real_capture_dur(train_sorted['tstamp_start'], int(train_span * 0.01))

(Timedelta('0 days 23:59:47.855770'),
 True,
 [Timedelta('0 days 23:59:47.855770')])

## Test Subset

In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2579 entries, 0 to 2578
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   dur                    2579 non-null   float64
 1   trnspt                 2579 non-null   int64  
 2   srvs                   2579 non-null   int64  
 3   flag_n                 2579 non-null   int64  
 4   flag_arst              2579 non-null   int64  
 5   flag_uc                2579 non-null   int64  
 6   flag_sign              2579 non-null   int64  
 7   flag_synrst            2579 non-null   int64  
 8   flag_a                 2579 non-null   int64  
 9   flag_othr              2579 non-null   int64  
 10  src_pkts               2579 non-null   int64  
 11  dst_pkts               2579 non-null   float64
 12  urg_bits               2579 non-null   int64  
 13  push_pkts              2579 non-null   int64  
 14  no_lnkd                2579 non-null   int64  
 15  arp 

In [16]:
len(test)

2579

In [17]:
test.head()

Unnamed: 0,dur,trnspt,srvs,flag_n,flag_arst,flag_uc,flag_sign,flag_synrst,flag_a,flag_othr,src_pkts,dst_pkts,urg_bits,push_pkts,no_lnkd,arp,src_ttl,dst_ttl,pkts_dirctn,src_byts,dst_byts,src_avg_byts,dst_avg_byts,strt_t,end_t,dst_host_count,host_dst _count,rtt_first_ack,rtt_avg,avg_t_sent,avg_t_got,repeated,fst_src_sqc,fst_dst_sqc,src_re,dst_re,src_fast_re,dst_fast_re,ovrlp_count,long_frag_count,dns_ratio,avg_rr,http_rqsts_count,http_redirct_count,http_clnt_error_count,http_srv_error_count,Class,Binary
0,1.022298,17,53,0,0,0,0,0,0,0,1,1.0,0,0,317,0,128,64,2,75,107.0,75.0,107.0,37.512741,37.537319,10,317,0.0,0.0,52914.00228,52915.02458,0,0,0,0,0,0,0,0,0,1.0,2.0,0,0,0,0,Normal,1
1,5731.939163,17,80,0,0,0,0,0,0,0,60300,0.0,0,0,1085,0,128,0,1,2662245,0.0,527.005768,0.0,3405.0,9136.939163,0,0,0.0,0.0,3405.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,UDP data flood,0
2,9838.0,256,0,0,0,0,0,0,0,0,19668,0.0,0,0,0,2,0,0,1,826056,0.0,42.0,0.0,74486.0,84324.0,1,0,0.0,0.0,1.99923,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,ARP poisining,0
3,97.072445,6,80,0,0,0,0,1,0,0,3,2.0,0,0,2002,0,128,127,2,174,120.0,58.0,60.0,1375.854703,1472.927148,0,2003,0.000751,0.001244,1408.210963,1424.391301,0,3696837187,31869368,0,0,0,0,0,0,0.0,0.0,0,0,0,0,TCP flood,0
4,11.999113,6,135,1,0,0,0,0,0,0,7,5.0,0,4,28,0,128,128,2,718,562.0,102.571429,112.4,81227.79288,81239.792,0,1616,0.000435,0.01759,81231.25153,81232.59329,0,2817478385,16893904,0,0,0,0,0,0,0.0,0.0,0,0,0,0,Normal,1


In [18]:
test.describe()

Unnamed: 0,dur,trnspt,srvs,flag_n,flag_arst,flag_uc,flag_sign,flag_synrst,flag_a,flag_othr,src_pkts,dst_pkts,urg_bits,push_pkts,no_lnkd,arp,src_ttl,dst_ttl,pkts_dirctn,src_byts,dst_byts,src_avg_byts,dst_avg_byts,strt_t,end_t,dst_host_count,host_dst _count,rtt_first_ack,rtt_avg,avg_t_sent,avg_t_got,repeated,fst_src_sqc,fst_dst_sqc,src_re,dst_re,src_fast_re,dst_fast_re,ovrlp_count,long_frag_count,dns_ratio,avg_rr,http_rqsts_count,http_redirct_count,http_clnt_error_count,http_srv_error_count,Binary
count,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0,2579.0
mean,5153.332897,33.259791,3194.594416,0.104692,0.003877,0.08259,0.048081,0.177588,0.115549,0.010081,4919.341993,257.923226,0.002326,453.953083,569.191159,0.158201,112.28577,80.135712,1.659558,310404.7,56444.72,248.742181,198.598833,27649.127505,32797.758109,172.836371,1125.826289,0.005281,0.007734,29441.53497,23245.383905,0.001163,1035162000.0,940377000.0,0.003877,0.008918,0.001551,0.001551,0.001163,0.000775,0.099147,0.099069,225.063591,155.22722,0.003877,0.0,0.685925
std,17871.460583,72.369296,11367.978365,0.306215,0.062161,0.275315,0.213978,0.38224,0.319745,0.099918,17942.213164,904.840162,0.048187,1642.152418,1577.993466,0.509588,41.610063,61.543051,0.47395,853784.6,194435.9,401.455444,350.762259,28442.599429,30172.574058,513.459433,1601.973568,0.03224,0.017206,28611.53933,28525.800877,0.034093,1334357000.0,1317272000.0,0.087994,0.131817,0.03936,0.03936,0.034093,0.027842,0.293859,0.485011,821.736245,731.023148,0.062161,0.0,0.464237
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42.0,0.0,10.548855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000737,6.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,127.0,0.0,1.0,128.0,0.0,70.8,0.0,1171.861929,3233.288973,0.0,0.0,0.0,0.0,1409.723741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.062456,6.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,0.0,0.0,68.0,0.0,128.0,127.0,2.0,1524.0,180.0,114.909981,123.0,18230.48102,27530.65042,0.0,317.0,0.0,0.0,21491.03257,3595.5,0.0,105857200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,95.486427,17.0,443.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,9.0,0.0,14.0,168.0,0.0,128.0,128.0,2.0,4661.0,1690.0,291.656008,199.4,52801.43935,59497.5,2.0,1664.0,0.000402,0.008706,55301.236715,51913.919655,0.0,2155427000.0,1837236000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,86373.29334,256.0,65275.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,170046.0,5356.5,1.0,10710.0,9871.0,2.0,128.0,248.0,2.0,7141932.0,1660502.0,3009.269663,2380.666667,86375.13356,86387.85616,1792.0,9872.0,0.302257,0.100642,86375.13356,86375.13398,1.0,4292711000.0,4288634000.0,2.0,4.0,1.0,1.0,1.0,1.0,1.0,8.0,5356.0,5355.0,1.0,0.0,1.0


In [19]:
test['Class'].value_counts()

Normal                 1769
Mass HTTP requests      140
Port scanning           123
TCP flood               121
ARP poisining           116
UDP data flood          108
Metasploit exploits     106
BeEF HTTP exploits       96
Name: Class, dtype: int64

In [20]:
test['Binary'].value_counts()

1    1769
0     810
Name: Binary, dtype: int64

### Test Time Continuity Analysis

Repeat the same process as for the train data.

In [21]:
test_sorted = test.sort_values(by='strt_t', ascending=False)

In [22]:
test_sorted['tstamp_start'] = pd.to_datetime(test_sorted['strt_t'], unit='s', origin='unix')
test_sorted['tstamp_start']

1929   1970-01-01 23:59:35.133560
841    1970-01-01 23:59:35.055170
526    1970-01-01 23:58:05.952370
245    1970-01-01 23:57:33.662060
158    1970-01-01 23:57:30.051010
                  ...            
877    1970-01-01 00:00:00.000000
270    1970-01-01 00:00:00.000000
102    1970-01-01 00:00:00.000000
100    1970-01-01 00:00:00.000000
391    1970-01-01 00:00:00.000000
Name: tstamp_start, Length: 2579, dtype: datetime64[ns]

In [23]:
test_span = test_sorted['strt_t'].iloc[0] - test_sorted['strt_t'].iloc[-1]
pd.Timedelta(seconds=test_span)

Timedelta('0 days 23:59:35.133560')

In [24]:
measure_real_capture_dur(train_sorted['tstamp_start'], int(test_span * 0.01))

(Timedelta('0 days 23:59:47.855770'),
 True,
 [Timedelta('0 days 23:59:47.855770')])

As we can see, both train and test and be considered continous, but given the timing statistics, their range can be considered only as 1 day instead of 7 as stated in the documentation.