# WUSTL-IIoT 2021 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

Dataset source: [https://www.cse.wustl.edu/~jain/iiot2/index.html](https://www.cse.wustl.edu/~jain/iiot2/index.html)

In [1]:
import pandas as pd
import numpy as np
import os

# Do not truncate columns when displaying wide DataFrames
pd.set_option('display.max_columns', None)

In [2]:
# Location of the WUSTL-IIoT 2021 CSV file. The WUSTL_IIOT_DATAPATH environment
# variable overrides the default so the notebook can run on other machines
# without editing this cell.
DATAPATH = os.environ.get(
    'WUSTL_IIOT_DATAPATH',
    '/data/surv/wustl_iiot_2021/wustl_iiot_2021.csv',
)

In [3]:
# Load the whole CSV into a DataFrame and list every column with its
# non-null count and dtype
data = pd.read_csv(DATAPATH)
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194464 entries, 0 to 1194463
Data columns (total 49 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   StartTime   1194464 non-null  object 
 1   LastTime    1194464 non-null  object 
 2   SrcAddr     1194464 non-null  object 
 3   DstAddr     1194464 non-null  object 
 4   Mean        1194464 non-null  int64  
 5   Sport       1194464 non-null  int64  
 6   Dport       1194464 non-null  int64  
 7   SrcPkts     1194464 non-null  int64  
 8   DstPkts     1194464 non-null  int64  
 9   TotPkts     1194464 non-null  int64  
 10  DstBytes    1194464 non-null  int64  
 11  SrcBytes    1194464 non-null  int64  
 12  TotBytes    1194464 non-null  int64  
 13  SrcLoad     1194464 non-null  float64
 14  DstLoad     1194464 non-null  float64
 15  Load        1194464 non-null  float64
 16  SrcRate     1194464 non-null  float64
 17  DstRate     1194464 non-null  float64
 18  Rate        1194464 no

In [4]:
# Total number of rows (records) in the dataset
len(data)

1194464

In [5]:
# Preview the first few rows
data.head()

Unnamed: 0,StartTime,LastTime,SrcAddr,DstAddr,Mean,Sport,Dport,SrcPkts,DstPkts,TotPkts,DstBytes,SrcBytes,TotBytes,SrcLoad,DstLoad,Load,SrcRate,DstRate,Rate,SrcLoss,DstLoss,Loss,pLoss,SrcJitter,DstJitter,SIntPkt,DIntPkt,Proto,Dur,TcpRtt,IdleTime,Sum,Min,Max,sDSb,sTtl,dTtl,sIpId,dIpId,SAppBytes,DAppBytes,TotAppByte,SynAck,RunTime,sTos,SrcJitAct,DstJitAct,Traffic,Target
0,2019-08-19 12:23:28,2019-08-19 12:23:28,192.168.0.20,192.168.0.2,0,59034,502,10,8,18,508,644,1152,87486.09375,67122.953125,154609.046875,169.692856,131.983337,320.530945,2,2,4,18.181818,527.431726,11.523097,5.893,7.406429,6,0.053037,0.001266,1548786176,0.053037,0.053037,0.053037,0,128,64,53331,64402,24,20,44,0.001176,0.053037,0,0.0,0.0,normal,0
1,2019-08-19 15:13:24,2019-08-19 15:13:24,192.168.0.20,192.168.0.2,0,55841,502,10,8,18,508,644,1152,88077.296875,67576.546875,155653.84375,170.839584,132.875229,322.696991,2,2,4,18.181818,17.234379,13.246678,7.525857,7.338714,6,0.052681,0.00131,1548882816,0.052681,0.052681,0.052681,0,128,64,37167,31590,24,20,44,0.001308,0.052681,0,0.0,0.0,normal,0
2,2019-08-19 13:41:31,2019-08-19 13:41:31,192.168.0.20,192.168.0.2,0,63774,502,10,8,18,508,644,1152,89587.390625,68735.15625,158322.546875,173.768646,135.153397,328.229675,2,2,4,18.181818,522.98724,12.307223,5.754778,7.299143,6,0.051793,0.000766,1548877312,0.051793,0.051793,0.051793,0,128,64,58712,22717,24,20,44,0.00069,0.051793,0,0.0,0.0,normal,0
3,2019-08-19 12:43:19,2019-08-19 12:43:20,209.240.235.92,192.168.0.2,0,61771,80,4,0,4,0,248,248,1672.746582,0.0,1672.746582,3.372473,0.0,3.372473,3,0,3,42.857143,419.338813,0.0,296.518344,0.0,6,0.889555,0.0,1548787456,0.889555,0.889555,0.889555,0,140,0,21629,0,0,0,0,0.0,0.889555,0,419.338813,0.0,DoS,1
4,2019-08-19 14:49:44,2019-08-19 14:49:48,192.168.0.20,192.168.0.1,3,0,0,14,0,14,0,868,868,1842.256714,0.0,1842.256714,3.714227,0.0,3.714227,0,0,0,0.0,525.146562,0.0,321.429844,0.0,2054,3.500055,0.0,1548881408,3.500055,3.500055,3.500055,0,0,0,0,0,476,0,476,0.0,3.500055,0,525.146562,0.0,normal,0


In [6]:
# Distribution of traffic types: number of rows per value of the 'Traffic' column
data['Traffic'].value_counts()

Traffic
normal      1107448
DoS           78305
Reconn         8240
CommInj         259
Backdoor        212
Name: count, dtype: int64

In [7]:
# Malicious ground truth distribution: number of rows per value of the 'Target' column
data['Target'].value_counts()

Target
0    1107448
1      87016
Name: count, dtype: int64

In [8]:
# How big is the network: number of rows per distinct source address
data['SrcAddr'].value_counts()

SrcAddr
192.168.0.20                 1090574
209.240.235.92                 46615
192.168.0.10                   26797
192.168.0.44                    8317
20.1.249.77                     7220
49.48.134.64                    6320
192.168.0.2                     4397
01:80:c2:00:00:0e               2529
0.0.0.0                          573
0                                423
fe80::e9ed:931f:c2e0:1333        354
fe80::9bc:3b2b:78d3:855c         321
fe80::dacb:8aff:fe08:ed2a         16
192.168.0.4                        8
Name: count, dtype: int64

## Duration Analysis

In [9]:
# Parse the 'LastTime' strings into timestamps, order them newest-first and
# renumber the index so position 0 holds the latest timestamp
tstamps = (
    pd.to_datetime(data['LastTime'], format='%Y-%m-%d %H:%M:%S')
    .sort_values(ascending=False)
    .reset_index(drop=True)
)
tstamps.head()

0   2019-08-19 16:48:15
1   2019-08-19 16:48:11
2   2019-08-19 16:48:11
3   2019-08-19 16:48:11
4   2019-08-19 16:48:11
Name: LastTime, dtype: datetime64[ns]

In [10]:
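# The largest gap between consecutive timestamps in this ~7h capture is 4.2 minutes,
# so the default 5-minute threshold treats the whole capture as contiguous.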
def measure_real_capture_dur(data: pd.Series, gap_minutes: float = 5) -> "tuple[pd.Timedelta, bool]":
    """Compute the effective timespan of a capture, skipping long silent gaps.

    The gap between each pair of consecutive timestamps is measured. Gaps
    strictly shorter than ``gap_minutes`` are summed into the total duration;
    longer (or equal) gaps are excluded and mark the capture as non-contiguous.

    Args:
        data: Series of timestamps sorted in descending order (newest first).
            Values are passed through ``pd.to_datetime``, which leaves a
            datetime Series unchanged.
        gap_minutes: Gap threshold in minutes. A gap must be strictly below
            this value to count towards the total duration.

    Returns:
        A ``(total_dur, contiguous)`` tuple: ``total_dur`` is a
        ``pd.Timedelta`` with the sum of all gaps below the threshold, and
        ``contiguous`` is ``True`` when no gap reached the threshold.
        A series with fewer than two timestamps has no gaps, so it yields
        ``(Timedelta(0), True)``.
    """
    gap_max = pd.Timedelta(minutes=gap_minutes)

    # Without at least two timestamps there is no gap to measure. This also
    # prevents an IndexError on empty input.
    if len(data) < 2:
        return pd.Timedelta(0), True

    # Reset the index so the shifted copy lines up positionally with the original.
    stamps = pd.to_datetime(data).reset_index(drop=True)

    # Gap between each timestamp and its predecessor. The first entry has no
    # predecessor (NaT after the shift), so it is dropped.
    gaps = (stamps.shift(1) - stamps).iloc[1:]

    # A missing (NaT) gap compares as False here, so it is treated as a break.
    within_limit = gaps < gap_max
    total_dur = gaps[within_limit].sum()

    return total_dur, bool(within_limit.all())

In [11]:
# Capture duration and contiguity flag, using a 5-minute gap threshold
measure_real_capture_dur(tstamps, 5)

(Timedelta('0 days 07:02:57'), True)