# IOTID20 Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

Data source: [https://sites.google.com/view/iot-network-intrusion-dataset/](https://sites.google.com/view/iot-network-intrusion-dataset/)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Do not truncate wide DataFrames when displaying them (the dataset has 86 columns)
pd.set_option('display.max_columns', None)

In [2]:
# Location of the dataset CSV file (relative path, resolved against the working directory)
DATA_PATH = 'iotid20/IoT Network Intrusion Dataset.csv'

In [3]:
# Load the whole CSV into memory as a single DataFrame
data = pd.read_csv(DATA_PATH)

In [4]:
# List every column with its non-null count and dtype
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625783 entries, 0 to 625782
Data columns (total 86 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Flow_ID            625783 non-null  object 
 1   Src_IP             625783 non-null  object 
 2   Src_Port           625783 non-null  int64  
 3   Dst_IP             625783 non-null  object 
 4   Dst_Port           625783 non-null  int64  
 5   Protocol           625783 non-null  int64  
 6   Timestamp          625783 non-null  object 
 7   Flow_Duration      625783 non-null  int64  
 8   Tot_Fwd_Pkts       625783 non-null  int64  
 9   Tot_Bwd_Pkts       625783 non-null  int64  
 10  TotLen_Fwd_Pkts    625783 non-null  float64
 11  TotLen_Bwd_Pkts    625783 non-null  float64
 12  Fwd_Pkt_Len_Max    625783 non-null  float64
 13  Fwd_Pkt_Len_Min    625783 non-null  float64
 14  Fwd_Pkt_Len_Mean   625783 non-null  float64
 15  Fwd_Pkt_Len_Std    625783 non-null  float64
 16  Bw

In [5]:
# Number of rows in the dataset
len(data)

625783

In [6]:
# Summary statistics of the numeric columns.
# NOTE: Flow_Byts/s and Flow_Pkts/s contain inf values (see their max),
# which is why their mean is inf and their std is NaN in the output.
data.describe()

Unnamed: 0,Src_Port,Dst_Port,Protocol,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Max,Fwd_Pkt_Len_Min,Fwd_Pkt_Len_Mean,Fwd_Pkt_Len_Std,Bwd_Pkt_Len_Max,Bwd_Pkt_Len_Min,Bwd_Pkt_Len_Mean,Bwd_Pkt_Len_Std,Flow_Byts/s,Flow_Pkts/s,Flow_IAT_Mean,Flow_IAT_Std,Flow_IAT_Max,Flow_IAT_Min,Fwd_IAT_Tot,Fwd_IAT_Mean,Fwd_IAT_Std,Fwd_IAT_Max,Fwd_IAT_Min,Bwd_IAT_Tot,Bwd_IAT_Mean,Bwd_IAT_Std,Bwd_IAT_Max,Bwd_IAT_Min,Fwd_PSH_Flags,Bwd_PSH_Flags,Fwd_URG_Flags,Bwd_URG_Flags,Fwd_Header_Len,Bwd_Header_Len,Fwd_Pkts/s,Bwd_Pkts/s,Pkt_Len_Min,Pkt_Len_Max,Pkt_Len_Mean,Pkt_Len_Std,Pkt_Len_Var,FIN_Flag_Cnt,SYN_Flag_Cnt,RST_Flag_Cnt,PSH_Flag_Cnt,ACK_Flag_Cnt,URG_Flag_Cnt,CWE_Flag_Count,ECE_Flag_Cnt,Down/Up_Ratio,Pkt_Size_Avg,Fwd_Seg_Size_Avg,Bwd_Seg_Size_Avg,Fwd_Byts/b_Avg,Fwd_Pkts/b_Avg,Fwd_Blk_Rate_Avg,Bwd_Byts/b_Avg,Bwd_Pkts/b_Avg,Bwd_Blk_Rate_Avg,Subflow_Fwd_Pkts,Subflow_Fwd_Byts,Subflow_Bwd_Pkts,Subflow_Bwd_Byts,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Act_Data_Pkts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min
count,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0,625783.0
mean,35026.15619,16387.027479,9.971436,635.422865,1.675566,1.46853,570.73898,929.280973,392.489726,348.126571,373.556651,28.161094,681.424563,588.665338,637.107254,62.991166,inf,inf,483.462553,63.136879,565.820169,443.103039,102.157949,52.515505,27.68925,85.024112,34.679327,516.792743,446.964592,28.790664,486.704898,428.857417,0.0,0.02641,0.0,5.1e-05,22.490408,33.719612,50307.27,27818.105271,511.645564,700.089229,633.666814,102.553417,69676.670746,0.000527,0.092946,0.000486,0.02641,0.530475,5.1e-05,0.000206,6.7e-05,0.364652,915.246266,373.556651,637.107254,0.0,0.0,0.0,0.0,0.0,0.0,1.675566,570.73898,1.46853,929.280973,-1.0,5880.924744,1.509913,0.0,3.764405,0.353385,4.248735,3.462159,502.503832,52.403995,561.540512,467.264459
std,24721.047752,17550.363037,5.379857,3496.740723,4.30997,1.21949,1161.873195,1731.760875,619.575865,588.161845,596.552835,144.35619,694.997621,682.938908,669.563565,227.773332,,,1892.861521,1159.713184,2866.325298,1719.332813,2216.30446,1212.002429,959.023292,1935.611451,1000.385742,3147.286789,2242.463582,808.672338,2751.967326,2177.830162,0.0,0.160352,0.0,0.007151,41.530486,40.283654,164411.8,78481.469928,653.951208,696.842817,652.306047,243.227387,179138.678822,0.022958,0.290357,0.022035,0.160352,0.499071,0.007151,0.014356,0.008192,0.499153,948.679333,596.552835,669.563565,0.0,0.0,0.0,0.0,0.0,0.0,4.30997,1161.873195,1.21949,1731.760875,0.0,11529.622909,4.332737,0.0,68.064508,20.72337,88.934148,64.111043,2112.95736,1153.184897,2866.497606,1931.909971
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9020.0,8899.0,6.0,76.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,0.0,18.0,0.0,18.0,0.0,248062.0,12552.3,73.0,0.0,74.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,5154.639175,0.0,20.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.5,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.0,74.0,71.0
50%,51991.0,9020.0,6.0,132.0,1.0,1.0,32.0,104.0,30.0,30.0,30.0,0.0,40.0,32.0,36.0,0.0,10834590.0,17241.38,95.0,0.0,115.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,32.0,7633.588,10989.010989,32.0,167.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,139.333333,30.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,32.0,1.0,104.0,-1.0,252.0,1.0,0.0,0.0,0.0,0.0,0.0,93.5,0.0,114.0,78.0
75%,56361.0,10101.0,17.0,221.0,2.0,2.0,1388.0,1441.0,1388.0,386.0,864.0,0.0,1430.0,1430.0,1430.0,0.0,20689190.0,26666.67,142.5,2.081666,157.0,132.0,10.0,3.0,0.0,5.0,1.0,119.0,83.0,0.0,85.0,78.0,0.0,0.0,0.0,0.0,32.0,44.0,13422.82,19867.549669,1388.0,1430.0,1388.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1951.0,864.0,1430.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1388.0,2.0,1441.0,-1.0,1869.0,1.0,0.0,0.0,0.0,0.0,0.0,141.0,1.527525,154.0,130.0
max,65500.0,65371.0,17.0,99984.0,186.0,560.0,109846.0,773284.0,1464.0,1464.0,1464.0,1032.375901,1464.0,1460.0,1460.0,1032.375901,inf,inf,99973.0,67901.342877,99973.0,99973.0,99676.0,98135.0,70374.095291,99600.0,98135.0,99973.0,99973.0,68998.065495,99973.0,99973.0,0.0,1.0,0.0,1.0,3832.0,17920.0,4000000.0,1000000.0,1460.0,1464.0,1460.0,842.931393,710533.333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.0,2190.0,1464.0,1460.0,0.0,0.0,0.0,0.0,0.0,0.0,186.0,109846.0,560.0,773284.0,-1.0,65535.0,186.0,0.0,9044.625,8598.65825,26785.0,6659.0,99973.0,67071.906623,99973.0,99973.0


In [7]:
# Preview the first five rows, including the Label / Cat / Sub_Cat columns
data.head(5)

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Max,Fwd_Pkt_Len_Min,Fwd_Pkt_Len_Mean,Fwd_Pkt_Len_Std,Bwd_Pkt_Len_Max,Bwd_Pkt_Len_Min,Bwd_Pkt_Len_Mean,Bwd_Pkt_Len_Std,Flow_Byts/s,Flow_Pkts/s,Flow_IAT_Mean,Flow_IAT_Std,Flow_IAT_Max,Flow_IAT_Min,Fwd_IAT_Tot,Fwd_IAT_Mean,Fwd_IAT_Std,Fwd_IAT_Max,Fwd_IAT_Min,Bwd_IAT_Tot,Bwd_IAT_Mean,Bwd_IAT_Std,Bwd_IAT_Max,Bwd_IAT_Min,Fwd_PSH_Flags,Bwd_PSH_Flags,Fwd_URG_Flags,Bwd_URG_Flags,Fwd_Header_Len,Bwd_Header_Len,Fwd_Pkts/s,Bwd_Pkts/s,Pkt_Len_Min,Pkt_Len_Max,Pkt_Len_Mean,Pkt_Len_Std,Pkt_Len_Var,FIN_Flag_Cnt,SYN_Flag_Cnt,RST_Flag_Cnt,PSH_Flag_Cnt,ACK_Flag_Cnt,URG_Flag_Cnt,CWE_Flag_Count,ECE_Flag_Cnt,Down/Up_Ratio,Pkt_Size_Avg,Fwd_Seg_Size_Avg,Bwd_Seg_Size_Avg,Fwd_Byts/b_Avg,Fwd_Pkts/b_Avg,Fwd_Blk_Rate_Avg,Bwd_Byts/b_Avg,Bwd_Pkts/b_Avg,Bwd_Blk_Rate_Avg,Subflow_Fwd_Pkts,Subflow_Fwd_Byts,Subflow_Bwd_Pkts,Subflow_Bwd_Byts,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Act_Data_Pkts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,192.168.0.13-192.168.0.16-10000-10101-17,192.168.0.13,10000,192.168.0.16,10101,17,25/07/2019 03:25:53 AM,75,1,1,982.0,1430.0,982.0,982.0,982.0,0.0,1430.0,1430.0,1430.0,0.0,32160000.0,26666.666667,75.0,0.0,75.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,8,8,13333.333333,13333.333333,982.0,1430.0,1280.666667,258.652921,66901.333333,0,0,0,0,0,0,0,0,1.0,1921.0,982.0,1430.0,0,0,0,0,0,0,1,982,1,1430,-1,-1,1,0,0.0,0.0,0.0,0.0,75.0,0.0,75.0,75.0,Anomaly,Mirai,Mirai-Ackflooding
1,192.168.0.13-222.160.179.132-554-2179-6,222.160.179.132,2179,192.168.0.13,554,6,26/05/2019 10:11:06 PM,5310,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,564.971751,2655.0,2261.327486,4254.0,1056.0,0.0,0.0,0.0,0.0,0.0,5310.0,5310.0,0.0,5310.0,5310.0,0,0,0,0,20,44,188.323917,376.647834,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,2.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,2,0,-1,14600,0,0,0.0,0.0,0.0,0.0,2655.0,2261.327486,4254.0,1056.0,Anomaly,DoS,DoS-Synflooding
2,192.168.0.13-192.168.0.16-9020-52727-6,192.168.0.16,52727,192.168.0.13,9020,6,11/07/2019 01:24:48 AM,141,0,3,0.0,2806.0,0.0,0.0,0.0,0.0,1388.0,30.0,935.333333,784.041666,19900710.0,21276.595745,70.5,0.707107,71.0,70.0,0.0,0.0,0.0,0.0,0.0,141.0,70.5,0.707107,71.0,70.0,0,0,0,0,0,96,0.0,21276.595745,30.0,1388.0,1048.5,679.0,461041.0,0,0,0,0,1,0,0,0,0.0,1398.0,0.0,935.333333,0,0,0,0,0,0,0,0,3,2806,-1,1869,0,0,0.0,0.0,0.0,0.0,70.5,0.707107,71.0,70.0,Anomaly,Scan,Scan Port OS
3,192.168.0.13-192.168.0.16-9020-52964-6,192.168.0.16,52964,192.168.0.13,9020,6,04/09/2019 03:58:17 AM,151,0,2,0.0,2776.0,0.0,0.0,0.0,0.0,1388.0,1388.0,1388.0,0.0,18384110.0,13245.033113,151.0,0.0,151.0,151.0,0.0,0.0,0.0,0.0,0.0,151.0,151.0,0.0,151.0,151.0,0,0,0,0,0,64,0.0,13245.033113,1388.0,1388.0,1388.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,2082.0,0.0,1388.0,0,0,0,0,0,0,0,0,2,2776,-1,1869,0,0,0.0,0.0,0.0,0.0,151.0,0.0,151.0,151.0,Anomaly,Mirai,Mirai-Hostbruteforceg
4,192.168.0.1-239.255.255.250-36763-1900-17,192.168.0.1,36763,239.255.255.250,1900,17,10/09/2019 01:41:18 AM,153,2,1,886.0,420.0,452.0,434.0,443.0,12.727922,420.0,420.0,420.0,0.0,8535948.0,19607.843137,76.5,0.707107,77.0,76.0,76.0,76.0,0.0,76.0,76.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,16,8,13071.895425,6535.947712,420.0,452.0,431.5,15.176737,230.333333,0,0,0,0,0,0,0,0,0.0,575.333333,443.0,420.0,0,0,0,0,0,0,2,886,1,420,-1,-1,2,0,0.0,0.0,0.0,0.0,76.5,0.707107,77.0,76.0,Anomaly,Mirai,Mirai-Hostbruteforceg


In [8]:
# Label distribution (top-level labelling: Anomaly vs. Normal)
data['Label'].value_counts()

Anomaly    585710
Normal      40073
Name: Label, dtype: int64

In [9]:
# Category label distribution (attack family, or Normal for benign records)
data['Cat'].value_counts()

Mirai                415677
Scan                  75265
DoS                   59391
Normal                40073
MITM ARP Spoofing     35377
Name: Cat, dtype: int64

In [10]:
# Sub-category label distribution (the most fine-grained labelling level)
data['Sub_Cat'].value_counts()

Mirai-UDP Flooding       183554
Mirai-Hostbruteforceg    121181
DoS-Synflooding           59391
Mirai-HTTP Flooding       55818
Mirai-Ackflooding         55124
Scan Port OS              53073
Normal                    40073
MITM ARP Spoofing         35377
Scan Hostport             22192
Name: Sub_Cat, dtype: int64

In [11]:
# Source IP address distribution (number of records per source address)
data['Src_IP'].value_counts()

192.168.0.13       222096
192.168.0.16       125890
192.168.0.24       122846
104.118.134.215     46092
104.74.213.186      23308
                    ...  
111.246.29.11           1
222.63.54.185           1
222.136.12.181          1
111.76.36.36            1
222.131.171.244         1
Name: Src_IP, Length: 57985, dtype: int64

In [12]:
# Destination IP address distribution (number of records per destination address)
data['Dst_IP'].value_counts()

192.168.0.13       164532
192.168.0.16       143150
210.89.164.90      131170
192.168.0.24        85813
222.239.240.107     11240
                    ...  
0.40.121.31             1
222.184.51.143          1
222.100.196.201         1
222.166.100.56          1
222.200.112.240         1
Name: Dst_IP, Length: 478, dtype: int64

In [13]:
# Destination port distribution (number of records per destination port)
data['Dst_Port'].value_counts()

9020     114464
10101     94753
8899      89518
443       55358
43238     46092
          ...  
33778         1
56710         1
64538         1
52577         1
4549          1
Name: Dst_Port, Length: 1034, dtype: int64

In [14]:
# Protocol breakdown
# (values are presumably IANA IP protocol numbers, i.e. 6 = TCP, 17 = UDP -- TODO confirm the meaning of 0)
data['Protocol'].value_counts()

6     390091
17    229377
0       6315
Name: Protocol, dtype: int64

## Continuity Analysis

In [15]:
# Inspect the raw timestamp strings to determine their format and ordering
data['Timestamp'].head()

0    25/07/2019 03:25:53 AM
1    26/05/2019 10:11:06 PM
2    11/07/2019 01:24:48 AM
3    04/09/2019 03:58:17 AM
4    10/09/2019 01:41:18 AM
Name: Timestamp, dtype: object

The records are apparently not sorted by time. Convert the timestamps to the datetime format so that they can be sorted.

In [16]:
# Parse the day/month/year, 12-hour-clock (AM/PM) strings into datetime values;
# the explicit format avoids any day/month ambiguity during parsing
data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%d/%m/%Y %I:%M:%S %p')
data['Timestamp'].head()

0   2019-07-25 03:25:53
1   2019-05-26 22:11:06
2   2019-07-11 01:24:48
3   2019-09-04 03:58:17
4   2019-09-10 01:41:18
Name: Timestamp, dtype: datetime64[ns]

In [17]:
# Sort the dataframe by timestamp, newest first
# (descending order is what measure_real_capture_dur expects)
data_sorted = data.sort_values(by='Timestamp', ascending=False)
data_sorted['Timestamp'].head()

600010   2019-09-10 01:54:23
542496   2019-09-10 01:54:23
111061   2019-09-10 01:54:23
188832   2019-09-10 01:54:23
350686   2019-09-10 01:54:23
Name: Timestamp, dtype: datetime64[ns]

In [18]:
# Peek at 20 records near the end of the descending sort, i.e. among the oldest ones
data_sorted['Timestamp'].iloc[-100:-80]

307040   2019-05-20 04:56:15
334843   2019-05-20 04:56:15
597225   2019-05-20 04:56:15
378080   2019-05-20 04:56:15
443939   2019-05-20 04:56:15
299186   2019-05-20 04:56:15
133318   2019-05-20 04:56:15
413810   2019-05-20 04:56:15
87229    2019-05-20 04:56:15
105919   2019-05-20 04:56:15
337534   2019-05-20 04:56:15
591874   2019-05-20 04:56:15
174706   2019-05-20 04:56:15
547925   2019-05-20 04:56:15
243752   2019-05-20 04:56:15
498642   2019-05-20 04:56:15
85249    2019-05-20 04:56:15
272284   2019-05-20 04:56:15
5220     2019-05-20 04:56:15
60906    2019-05-20 04:56:15
Name: Timestamp, dtype: datetime64[ns]

In [19]:
# Compute the span of the capture: newest timestamp (first row) minus
# oldest timestamp (last row) of the descending-sorted data
timegap = data_sorted['Timestamp'].iloc[0] - data_sorted['Timestamp'].iloc[-1]
timegap

Timedelta('112 days 20:58:09')

In [20]:
# Are there gaps in the data (was the capture interrupted?)
def measure_real_capture_dur(data: pd.Series, gap_max_secs: int = 300) -> "tuple[pd.Timedelta, bool, list[pd.Timedelta]]":
    """Measure how much of a capture's time span is actually covered by data.

    Walks through the timestamps in the given order and accumulates the time
    differences between consecutive records. A difference larger than
    ``gap_max_secs`` is treated as an interruption of the capture: it is not
    counted and it closes the current continuous block.

    Args:
        data: Timestamps sorted in descending order (newest first). With any
            other ordering the computed differences are not meaningful.
        gap_max_secs: Largest difference between two consecutive timestamps,
            in seconds, that is still considered part of a continuous capture.

    Returns:
        A tuple ``(total_dur, contiguous, cont_durations)`` where

        * ``total_dur`` is the summed duration of all continuous blocks,
        * ``contiguous`` is ``True`` if no gap exceeded ``gap_max_secs``,
        * ``cont_durations`` lists the durations of the individual continuous
          blocks, longest first.

        For empty input the result is ``(Timedelta(0), True, [])``.
    """
    zero = pd.Timedelta(seconds=0)

    # Nothing was captured: no duration and no continuous blocks to report
    if data.empty:
        return zero, True, []

    # Loop-invariant threshold, built once instead of once per record
    max_gap = pd.Timedelta(seconds=gap_max_secs)

    total_dur = zero
    current_dur = zero
    cont_durations = []
    contiguous = True
    last_tstamp = data.iloc[0]

    # Iterate through the timestamps to find out gaps
    for cur_tstamp in data:
        # Positive for descending input: previous (newer) minus current (older)
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= max_gap:
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            # Gap too large: close the current block and start a new one;
            # the gap itself is not counted towards any duration
            cont_durations.append(current_dur)
            current_dur = zero
            contiguous = False

        last_tstamp = cur_tstamp

    # Log the final continuous block and sort the blocks by their length
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [21]:
# Use 1% of the total time span (in whole seconds) as the maximum tolerated gap
measure_real_capture_dur(data_sorted['Timestamp'], int(timegap.total_seconds() * 0.01))

(Timedelta('0 days 02:58:24'),
 False,
 [Timedelta('0 days 00:53:18'),
  Timedelta('0 days 00:40:56'),
  Timedelta('0 days 00:16:14'),
  Timedelta('0 days 00:16:11'),
  Timedelta('0 days 00:14:31'),
  Timedelta('0 days 00:14:13'),
  Timedelta('0 days 00:07:45'),
  Timedelta('0 days 00:05:11'),
  Timedelta('0 days 00:05:04'),
  Timedelta('0 days 00:05:01')])

As is apparent, the data contain multiple gaps longer than 1% of the total time span, making the capture non-continuous. For the measurement in the paper (2h 13m), we used a significantly lower gap value to make the duration more realistic, given the huge time span.