# UWF-ZeekData22 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

Data source: [https://datasets.uwf.edu/](https://datasets.uwf.edu/)

In late 2024, ZeekDataFall22 and TestZeekData24 were also added to the repository. Nevertheless, this analysis focuses only on the original `ZeekData22` files. The other datasets share the same features, but may differ in the captured traffic patterns and attacks.

In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None)

In [2]:
# We work with the parquet distribution of the dataset, because the other folders
# (ZeekDataFall22, TestZeekData24) provide only parquet files.
# The location can be overridden through the UWF_ZEEKDATA22_DIR environment variable,
# so the notebook can be re-run on another machine without editing this cell.
# When the variable is unset, the original path is used.
DATA_FOLDER = os.environ.get(
    'UWF_ZEEKDATA22_DIR',
    '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet',
)

In [3]:
# Recursively collect the full path of every *.parquet file below DATA_FOLDER,
# in lexicographic order (the weekly sub-folders are named by date range)
parquet_files = sorted(
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(DATA_FOLDER)
    for fname in fnames
    if fname.endswith('.parquet')
)
parquet_files

['/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2021-12-12 - 2021-12-19/part-00000-7c2e9adb-5430-4792-a42b-10ff5bbd46e8-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2021-12-19 - 2021-12-26/part-00000-3f86626a-1225-47f9-a5a2-0170b737e404-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2021-12-26 - 2022-01-02/part-00000-b1a9fc13-8068-4a5d-91b2-871438709e81-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2022-01-02 - 2022-01-09/part-00000-26e9208e-7819-451b-b23f-2e47f6d1e834-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2022-01-09 - 2022-01-16/part-00000-36240b61-b84f-4164-a873-d7973e652780-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2022-01-16 - 2022-01-23/part-00000-cbf26680-106d-40e7-8278-60520afdbb0e-c000.snappy.parquet',
 '/data/data_surv/uwf-zeekdata22/UWF-ZeekData22/parquet/2022-02-06 - 2022-02-13/part-00000-df678a79-

In [4]:
# Read all the parquet files into a single DataFrame.
# Every file is loaded once and all frames are concatenated in ONE call: calling
# pd.concat inside the loop would re-copy all previously accumulated rows on each iteration.
# The per-file index is intentionally kept (no ignore_index), as in the original analysis.
parquet_frames = [pd.read_parquet(parq_file) for parq_file in parquet_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame when no files were found
data_all = pd.concat(parquet_frames) if parquet_frames else pd.DataFrame()

In [5]:
data_all.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 18562468 entries, 0 to 62
Data columns (total 23 columns):
 #   Column          Non-Null Count     Dtype         
---  ------          --------------     -----         
 0   resp_pkts       18562468 non-null  int32         
 1   service         9425321 non-null   object        
 2   orig_ip_bytes   18562468 non-null  int32         
 3   local_resp      18562468 non-null  bool          
 4   missed_bytes    18562468 non-null  int32         
 5   proto           18562468 non-null  object        
 6   duration        18052993 non-null  float64       
 7   conn_state      18562468 non-null  object        
 8   dest_ip_zeek    18562468 non-null  object        
 9   orig_pkts       18562468 non-null  int32         
 10  community_id    18562468 non-null  object        
 11  resp_ip_bytes   18562468 non-null  int32         
 12  dest_port_zeek  18562468 non-null  int32         
 13  orig_bytes      18052993 non-null  float64       
 14  local_orig 

In [6]:
len(data_all)

18562468

In [7]:
data_all.head()

Unnamed: 0,resp_pkts,service,orig_ip_bytes,local_resp,missed_bytes,proto,duration,conn_state,dest_ip_zeek,orig_pkts,community_id,resp_ip_bytes,dest_port_zeek,orig_bytes,local_orig,datetime,history,resp_bytes,uid,src_port_zeek,ts,src_ip_zeek,label_tactic
0,1689365,,141906660,False,0,icmp,1729902.0,OTH,143.88.0.1,1689365,1:L8d7Q/9CNOBVbhfRe7yv3s8A24A=,141906660,0,94604440.0,False,2021-12-17 07:00:45.896,,94604440.0,Caxllt2aEzXOEZnpii,8,1639746000.0,143.88.255.50,none
1,0,,462247452,False,0,icmp,1730391.0,OTH,143.88.255.10,4576716,1:IrLgQEMN3Jxld9DJo9XTcVvDsHU=,0,1,334099404.0,False,2021-12-17 07:00:47.630,,0.0,CEx0rX1Yic8axI4sw1,3,1639746000.0,143.88.255.1,none
2,0,,22008480,False,0,icmp,1730410.0,OTH,ff02::1,229255,1:yBZ3xWqbIFuQFB44sXmyC45EtSs=,0,133,11004240.0,False,2021-12-17 07:00:45.879,,0.0,Cpn5bT1QW9MPFXikn4,134,1639746000.0,fe80::250:56ff:fe9e:da15,none
3,0,,34970488,False,0,icmp,1730411.0,OTH,ff02::16,460138,1:ogw3gR/knTC3m00NBC6RR+vqjXc=,0,0,9202760.0,False,2021-12-17 07:00:45.888,,0.0,Ceo6B73UmSZVe8vsD4,143,1639746000.0,fe80::250:56ff:fe9e:ef90,none
4,0,dhcp,66459795,False,0,udp,1731504.0,S0,255.255.255.255,202621,1:t9O1j0qj71O4wJM7gnaHtgmfev8=,0,67,60786407.0,False,2021-12-17 07:01:10.260,D,0.0,CTCPKc2faq2rLYJA4f,68,1639746000.0,0.0.0.0,none


In [8]:
# Label distribution
data_all['label_tactic'].value_counts()

label_tactic
none                    9281599
Reconnaissance          9278722
Discovery                  2086
Credential Access            31
Privilege Escalation         13
Exfiltration                  7
Lateral Movement              4
Resource Development          3
Defense Evasion               1
Initial Access                1
Persistence                   1
Name: count, dtype: int64

In [9]:
# Protocol distribution
data_all['proto'].value_counts()

proto
udp     9547112
tcp     8962641
icmp      52715
Name: count, dtype: int64

In [10]:
# Services breakdown
data_all['service'].value_counts()

service
dns                9377942
dhcp                 22269
http                 10000
ntp                   7466
ssl                   5542
gssapi,ntlm,smb        769
gssapi                 512
krb_tcp                256
smb                    256
smb,ntlm,gssapi        256
radius                  27
ssh                     14
ftp                      7
smb,gssapi,ntlm          3
gssapi,smb,ntlm          2
Name: count, dtype: int64

In [11]:
# Are there any columns with only 1 unique value?
data_all.columns[data_all.nunique() <= 1]

Index([], dtype='object')

In [12]:
# Source IP addresses
data_all['src_ip_zeek'].value_counts()

src_ip_zeek
143.88.2.10        8250880
143.88.255.10      6725201
143.88.11.10       1170061
143.88.7.10         523557
143.88.5.12         502528
                    ...   
143.88.9.18              2
143.88.255.51            2
255.255.255.255          2
143.88.2.12              1
143.88.8.12              1
Name: count, Length: 62, dtype: int64

In [13]:
# Number of unique source IPs
data_all['src_ip_zeek'].nunique()

62

## Timespan Computation

In [14]:
data_all['datetime']

0    2021-12-17 07:00:45.896
1    2021-12-17 07:00:47.630
2    2021-12-17 07:00:45.879
3    2021-12-17 07:00:45.888
4    2021-12-17 07:01:10.260
               ...          
58   2022-02-18 09:45:03.308
59   2022-02-18 11:28:45.529
60   2022-02-18 11:28:45.531
61   2022-02-18 10:02:58.996
62   2022-02-18 10:44:44.092
Name: datetime, Length: 18562468, dtype: datetime64[ns]

In [15]:
# Timestamps sorted newest-first (descending): iloc[0] is the latest record, iloc[-1] the earliest
timestamps = data_all['datetime'].sort_values(ascending=False)
timestamps

11       2022-02-19 19:16:36.163
12       2022-02-19 19:16:36.163
43       2022-02-19 13:25:11.369
44       2022-02-19 13:25:11.369
47       2022-02-19 13:23:56.794
                   ...          
2        2021-12-17 07:00:45.879
423577   2021-12-17 07:00:45.251
209052   2021-12-17 07:00:45.251
423576   2021-12-17 07:00:45.213
209051   2021-12-17 07:00:45.213
Name: datetime, Length: 18562468, dtype: datetime64[ns]

In [16]:
# Compute capture span
capture_span = timestamps.iloc[0] - timestamps.iloc[-1]
capture_span

Timedelta('64 days 12:15:50.950000')

In [17]:
# Measure capture duration and continuity
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> "tuple[pd.Timedelta, bool, list[pd.Timedelta]]":
    """Measure the active duration of a capture, ignoring silent gaps longer than a threshold.

    Walks the timestamps from first to last and looks at the time difference between
    each pair of neighbouring records. Differences of at most ``gap_max_secs`` seconds
    count towards the capture duration; larger differences are treated as interruptions
    that split the capture into separate contiguous blocks and are not counted.

    Args:
        data: Series of timestamps sorted in DESCENDING order (newest first). The order
            is not validated; unsorted input produces negative differences, which are
            silently added to the totals.
        gap_max_secs: Largest gap (in seconds) between two neighbouring records that is
            still considered part of a continuous capture.

    Returns:
        A tuple ``(total_dur, contiguous, cont_durations)`` where

        * ``total_dur`` is the sum of all gaps not exceeding the threshold,
        * ``contiguous`` is True when no gap exceeded the threshold,
        * ``cont_durations`` lists the duration of every contiguous block, longest first.

        For empty input the result is ``(Timedelta(0), True, [])``.
    """
    zero_dur = pd.Timedelta(seconds=0)

    # Nothing captured: no duration and no blocks (data.iloc[0] below would raise IndexError)
    if len(data) == 0:
        return zero_dur, True, []

    # Loop-invariant threshold, built once instead of on every iteration
    gap_max = pd.Timedelta(seconds=gap_max_secs)

    total_dur = zero_dur
    current_dur = zero_dur
    cont_durations = []
    last_tstamp = data.iloc[0]
    contiguous = True

    # Iterate through the timestamps to find out gaps
    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= gap_max:
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            # Gap too large: close the current block and start a new one
            cont_durations.append(current_dur)
            current_dur = zero_dur
            contiguous = False

        last_tstamp = cur_tstamp

    # Log the final continuous block and sort the blocks by their length (longest first)
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [18]:
# Tolerate gaps of up to 1 % of the total capture span (capture_span, converted to whole seconds)
measure_real_capture_dur(timestamps, int(capture_span.total_seconds() * 0.01))

(Timedelta('33 days 00:29:21.604000'),
 False,
 [Timedelta('30 days 15:59:13.562000'),
  Timedelta('1 days 06:38:25.054000'),
  Timedelta('0 days 13:52:02.154000'),
  Timedelta('0 days 06:58:07.706000'),
  Timedelta('0 days 05:01:33.106000'),
  Timedelta('0 days 00:00:00.022000')])

In [19]:
# Lowering the maximum gap to 1 hour
measure_real_capture_dur(timestamps, 3600)

(Timedelta('30 days 21:37:27.338000'),
 False,
 [Timedelta('28 days 01:59:04.615000'),
  Timedelta('2 days 12:57:14.937000'),
  Timedelta('0 days 01:47:36.364000'),
  Timedelta('0 days 01:43:06.911000'),
  Timedelta('0 days 01:24:28.670000'),
  Timedelta('0 days 01:06:42.912000'),
  Timedelta('0 days 00:14:57.704000'),
  Timedelta('0 days 00:12:47.315000'),
  Timedelta('0 days 00:05:17.559000'),
  Timedelta('0 days 00:03:21.345000'),
  Timedelta('0 days 00:01:47.691000'),
  Timedelta('0 days 00:00:51.256000'),
  Timedelta('0 days 00:00:10.037000'),
  Timedelta('0 days 00:00:00.022000'),
  Timedelta('0 days 00:00:00')])