# IoT Host-Based ID Dataset

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

Available on request. Download link: (https://drive.google.com/drive/folders/1hQi8D0FkPi0zhPm9T66X5aDhxgl6X6u0)[https://drive.google.com/drive/folders/1hQi8D0FkPi0zhPm9T66X5aDhxgl6X6u0]

In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', None)

In [2]:
# 3 scenarios are emulated, and their associated folder structure corresponds
!ls

2017-iot_hostbased_id.ipynb  features_netflow.txt  MC  SC  ST


In [3]:
!cat features_netflow.txt

%Label    1: attack; 0: normal
%ts       Start Time - first seen
%te       End Time - last seen
%td       Duration
%sa       Source Address
%da       Destination Address
%sp       Source Port
%dp       Destination Port
%pr       Protocol
%flg      TCP Flags
%fwd      Forwarding Status
%stos     Src Tos
%ipkt     Input Packets
%ibyt     Input Bytes
%opkt     Output Packets
%obyt     Output Bytes
%in       Input Interface num
%out      Output Interface num
%sas      Source AS
%das      Destination AS
%smk      Src mask
%dmk      Dst mask
%dtos     Dst Tos
%dir      Direction: ingress, egress
%nh       Next-hop IP Address
%nhb      BGP Next-hop IP Address
%svln     Src vlan label
%dvln     Dst vlan label
%idmc     Input Dst Mac Addr
%osmc     Output Src Mac Addr
%mpls1    MPLS label 1
%mpls2    MPLS label 2
%mpls3    MPLS label 3
%mpls4    MPLS label 4
%mpls5    MPLS label 5
%mpls6    MPLS label 6
%mpls7    MPLS label 7
%mpls8    MPLS label 8
%mpls9    MPLS label 9
%mpls10   MPLS label 10

In [4]:
# Since netflow features are the same, we will analyze a single scenario
# We chose MT (Multimedial Center)
DATAPATH = 'SC/NetFlow/'

os.listdir(DATAPATH)

['SC_I1.csv', 'SC_I3.csv', 'SC_I2.csv', 'SC_L.csv']

So, there are 3 intrusive and 1 legitimate scenarios. Load them all at once

In [5]:
data = pd.DataFrame()

for file in os.listdir(DATAPATH):
    fullpath = os.path.join(DATAPATH, file)

    print(file)
    data = pd.concat([data, pd.read_csv(fullpath)])

SC_I1.csv
SC_I3.csv
SC_I2.csv
SC_L.csv


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 423739 entries, 0 to 760
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  423739 non-null  int64 
 1   Label       423739 non-null  int64 
 2   ts          423739 non-null  object
 3   te          423739 non-null  object
 4   td          423739 non-null  int64 
 5   sa          423739 non-null  object
 6   da          423739 non-null  object
 7   sp          423739 non-null  int64 
 8   dp          423739 non-null  int64 
 9   pr          423739 non-null  object
 10  flg         423739 non-null  object
 11  fwd         423739 non-null  int64 
 12  stos        423739 non-null  int64 
 13  ipkt        423739 non-null  int64 
 14  ibyt        423739 non-null  int64 
 15  opkt        423739 non-null  int64 
 16  obyt        423739 non-null  int64 
 17  in          423739 non-null  int64 
 18  out         423739 non-null  int64 
 19  sas         423739 non-nul

In [7]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Label,td,sp,dp,fwd,stos,ipkt,ibyt,opkt,obyt,in,out,sas,das,smk,dmk,dtos,dir,svln,dvln,cl,sl,al,exid
count,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0,423739.0
mean,205095.927559,0.99675,1647.802,32370.484043,3280.642577,0.0,0.0,4.751288,2993.857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
std,122160.075225,0.056913,31182.59,19949.117325,11735.122985,0.0,0.0,405.777496,537622.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,99064.5,1.0,0.0,14689.0,23.0,0.0,0.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,204999.0,1.0,0.0,35060.0,23.0,0.0,0.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,310933.5,1.0,0.0,49389.0,81.0,0.0,0.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,416868.0,1.0,3507705.0,65535.0,65527.0,0.0,0.0,91522.0,131970100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Label,ts,te,td,sa,da,sp,dp,pr,flg,fwd,stos,ipkt,ibyt,opkt,obyt,in,out,sas,das,smk,dmk,dtos,dir,nh,nhb,svln,dvln,ismc,odmc,idmc,osmc,mpls1,mpls2,mpls3,mpls4,mpls5,mpls6,mpls7,mpls8,mpls9,mpls10,cl,sl,al,ra,eng,exid,tr
0,0,0,2018-11-08 03:39:11,2018-11-08 03:39:11,39,192.168.1.109,192.168.1.120,59828,80,TCP,.AP.SF,0,0,7,530,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
1,1,0,2018-11-08 03:39:11,2018-11-08 03:39:11,39,192.168.1.120,192.168.1.109,80,59828,TCP,.AP.SF,0,0,5,2288,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
2,2,0,2018-11-08 03:40:11,2018-11-08 03:40:11,57,192.168.1.109,192.168.1.120,59830,80,TCP,.AP.SF,0,0,7,530,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
3,3,0,2018-11-08 03:40:11,2018-11-08 03:40:11,57,192.168.1.120,192.168.1.109,80,59830,TCP,.AP.SF,0,0,5,2288,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
4,4,0,2018-11-08 03:41:11,2018-11-08 03:41:11,299,192.168.1.109,192.168.1.120,59832,80,TCP,.AP.SF,0,0,7,530,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
5,5,0,2018-11-08 03:41:11,2018-11-08 03:41:11,299,192.168.1.120,192.168.1.109,80,59832,TCP,.AP.SF,0,0,5,2288,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
6,6,0,2018-11-08 03:42:11,2018-11-08 03:42:11,242,192.168.1.109,192.168.1.120,59834,80,TCP,.AP.SF,0,0,8,594,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
7,7,0,2018-11-08 03:42:11,2018-11-08 03:42:11,242,192.168.1.120,192.168.1.109,80,59834,TCP,.AP.SF,0,0,6,2912,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
8,8,0,2018-11-08 03:43:11,2018-11-08 03:43:11,63,192.168.1.109,192.168.1.120,59836,80,TCP,.AP.SF,0,0,7,530,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000
9,9,0,2018-11-08 03:43:11,2018-11-08 03:43:11,63,192.168.1.120,192.168.1.109,80,59836,TCP,.A..SF,0,0,4,1664,0,0,0,0,0,0,0,0,0,0,0.0.0.0,0.0.0.0,0,0,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,00:00:00:00:00:00,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0-0-0,0,0,0,0.0.0.0,0/0,1,1969-12-31 21:00:00.000


In [9]:
# IP addresses distribution
data['sa'].value_counts()

192.168.1.109     386076
30154244175         1686
130108234239         873
212.33.93.64         871
192.168.1.120        381
                   ...  
49157136144            1
148.134.191.37         1
51.37.157.193          1
122145121219           1
192.168.20.168         1
Name: sa, Length: 21418, dtype: int64

In [10]:
# Destination ports distribution
data['dp'].value_counts().head(10)

23       178838
81       148874
22        50315
769        2608
2816       2542
48080      2383
778        2214
781        2030
39552      1668
10505       734
Name: dp, dtype: int64

In [11]:
data['sp'].value_counts()

0        8959
23       8454
22       7402
81       5987
39552    3575
         ... 
53567       1
7313        1
35767       1
54445       1
58667       1
Name: sp, Length: 65165, dtype: int64

In [12]:
# Protocols distribution
data['pr'].value_counts()

TCP     408169
ICMP      8799
UDP       6629
IGMP       141
GRE          1
Name: pr, dtype: int64

In [13]:
# Label distribution
data['Label'].value_counts()

1    422362
0      1377
Name: Label, dtype: int64

## Time Continuity Analysis

In [14]:
data['tstamp'] = pd.to_datetime(data['ts'])

In [15]:
data_sorted = data.sort_values(by='tstamp', ascending=False).reset_index(drop=False)

In [16]:
# Compute span
data_span = data_sorted['tstamp'].iloc[0] - data_sorted['tstamp'].iloc[-1]
data_span

Timedelta('4 days 23:35:41')

In [17]:
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> float:
    """Computes total timespan of the capture. Expects iterable containing timestamps objects sorted in a descending manner"""
    total_dur = pd.Timedelta(seconds=0)
    current_dur = pd.Timedelta(seconds=0)
    cont_durations = []
    last_tstamp = data.iloc[0]
    contiguous = True

    # Iterate through the dataframe to find out gaps
    for cur_tstamp in data:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= pd.Timedelta(seconds=gap_max_secs):
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            cont_durations.append(current_dur)
            current_dur = pd.Timedelta(seconds=0)

            if contiguous:
                contiguous = False

        last_tstamp = cur_tstamp

    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [18]:
measure_real_capture_dur(data_sorted['tstamp'], int(data_span.total_seconds() * 0.01))

(Timedelta('0 days 03:59:22'),
 False,
 [Timedelta('0 days 01:00:18'),
  Timedelta('0 days 01:00:14'),
  Timedelta('0 days 00:59:24')])

Although the capture span of a single scenario is almost 5 days, its continous blocks are 3x by one hour. Probably, they correspond to each intrusious/benign scenario, and would be considered continous if considered separate. However, for our purposes (also due to 3 different scenarios), we consider the capture is discontinous.