# OPC-UA Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import pandas as pd
import numpy as np

# Do not truncate the column list when displaying wide DataFrames
pd.set_option('display.max_columns', None)

In [2]:
# Path to the dataset CSV, relative to the notebook's working directory
DATAPATH = 'OPCUA_dataset_public.csv'

In [3]:
# Load the whole CSV into a DataFrame using pandas' default dtype inference
data = pd.read_csv(DATAPATH)

In [4]:
# List every column together with its non-null count and dtype
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107633 entries, 0 to 107632
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   src_ip                       107633 non-null  object 
 1   src_port                     107633 non-null  int64  
 2   dst_ip                       107633 non-null  object 
 3   dst_port                     107633 non-null  int64  
 4   flags                        107633 non-null  int64  
 5   pktTotalCount                107633 non-null  int64  
 6   octetTotalCount              107633 non-null  int64  
 7   avg_ps                       107633 non-null  float64
 8   proto                        107633 non-null  object 
 9   service                      107633 non-null  object 
 10  service_errors               107633 non-null  int64  
 11  status_errors                107633 non-null  int64  
 12  msg_size                     107633 non-null  int64  
 13 

In [5]:
# Number of rows in the dataset
len(data)

107633

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,src_port,dst_port,flags,pktTotalCount,octetTotalCount,avg_ps,service_errors,status_errors,msg_size,min_msg_size,flowStart,flowEnd,flowDuration,avg_flowDuration,flowInterval,count,srv_count,same_srv_rate,dst_host_same_src_port_rate,f_pktTotalCount,f_octetTotalCount,f_flowStart,f_rate,b_pktTotalCount,b_octetTotalCount,b_flowStart,label
count,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0,107633.0
mean,34670.758949,4840.465322,0.311912,1.311884,171.168712,115.207459,0.0,0.0,92.841433,51.927346,1583857000.0,1583857000.0,0.009347,0.001731,0.201213,354.21263,354.219106,99.92064,99.91955,1.000139,120.723152,1583857000.0,1.206957e+19,0.311745,50.44556,493759100.0,0.688144
std,18903.468159,152.660361,0.463276,0.465665,126.993029,43.537471,0.0,0.0,90.847418,29.724366,1820.011,1820.014,2.18794,0.128715,5.457089,271.964182,271.967328,2.811976,2.830037,0.045721,52.345723,1820.011,5.152434e+18,0.463208,75.20872,733656000.0,0.463254
min,1.0,4840.0,0.0,1.0,86.0,86.0,0.0,0.0,32.0,28.0,1583855000.0,1583855000.0,0.0,0.0,-3e-06,0.0,0.0,0.0,0.0,1.0,86.0,1583855000.0,8.6e+18,0.0,0.0,0.0,0.0
25%,18381.0,4840.0,0.0,1.0,86.0,86.0,0.0,0.0,32.0,32.0,1583856000.0,1583856000.0,0.0,0.0,0.003332,17.0,17.0,100.0,100.0,1.0,86.0,1583856000.0,8.6e+18,0.0,0.0,0.0,0.0
50%,37994.0,4840.0,0.0,1.0,86.0,86.0,0.0,0.0,32.0,32.0,1583856000.0,1583856000.0,0.0,0.0,0.003749,561.0,562.0,100.0,100.0,1.0,86.0,1583856000.0,8.6e+18,0.0,0.0,0.0,1.0
75%,48970.0,4840.0,1.0,2.0,358.0,179.0,0.0,0.0,226.0,95.0,1583857000.0,1583857000.0,0.007922,0.003961,0.146403,576.0,576.0,100.0,100.0,1.0,197.0,1583857000.0,1.97e+19,1.0,161.0,1583856000.0,1.0
max,65534.0,54924.0,1.0,17.0,3314.0,490.5,0.0,0.0,2192.0,272.0,1583862000.0,1583862000.0,717.808954,42.224056,1552.136166,1216.0,1216.0,100.0,100.0,16.0,3152.0,1583862000.0,3.38e+19,1.0,643.0,1583862000.0,1.0


In [7]:
# Preview the first ten rows
data.head(10)

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,flags,pktTotalCount,octetTotalCount,avg_ps,proto,service,service_errors,status_errors,msg_size,min_msg_size,flowStart,flowEnd,flowDuration,avg_flowDuration,flowInterval,count,srv_count,same_srv_rate,dst_host_same_src_port_rate,f_pktTotalCount,f_octetTotalCount,f_flowStart,f_rate,b_pktTotalCount,b_octetTotalCount,b_flowStart,label,multi_label
0,192.168.1.16,40656,192.168.1.17,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.009868,0.004934,0.0,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
1,192.168.1.16,44854,192.168.1.18,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.002942,0.001471,0.002473,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
2,192.168.1.16,44854,192.168.1.18,4840,1,2,399,199.5,OPCUA,SecureChannel,0,0,267,132,1583855000.0,1583855000.0,0.017495,0.008748,0.008226,0,0,0.0,0.0,1,198,1583855000.0,1.98e+19,1,201,1583855000.0,0,Normal
3,192.168.1.16,40656,192.168.1.17,4840,1,2,399,199.5,OPCUA,SecureChannel,0,0,267,132,1583855000.0,1583855000.0,0.008005,0.004002,0.003791,0,0,0.0,0.0,1,198,1583855000.0,1.98e+19,1,201,1583855000.0,0,Normal
4,192.168.1.18,45388,192.168.1.16,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.034876,0.017438,0.017547,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
5,192.168.1.19,50892,192.168.1.16,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.039711,0.019855,0.0,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
6,192.168.1.19,58654,192.168.1.17,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.007861,0.00393,0.001325,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
7,192.168.1.18,35708,192.168.1.17,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.003973,0.001987,0.001951,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
8,192.168.1.19,58654,192.168.1.17,4840,1,2,219,109.5,OPCUA,StartRawConnection,0,0,87,28,1583855000.0,1583855000.0,0.00842,0.00421,0.002119,0,0,0.0,0.0,1,125,1583855000.0,1.25e+19,1,94,1583855000.0,0,Normal
9,192.168.1.16,40656,192.168.1.17,4840,1,2,981,490.5,OPCUA,Session,0,0,849,272,1583855000.0,1583855000.0,0.016294,0.008147,0.006017,0,0,0.0,0.0,1,338,1583855000.0,3.38e+19,1,643,1583855000.0,0,Normal


In [8]:
# Row counts per value of the binary `label` column
data['label'].value_counts()

1    74067
0    33566
Name: label, dtype: int64

In [9]:
# Row counts per class of the `multi_label` column
data['multi_label'].value_counts()

DoS              74012
Normal           33566
Impersonation       49
MITM                 6
Name: multi_label, dtype: int64

In [10]:
# Row counts per source IP address, most frequent first
data['src_ip'].value_counts()

217.107.146.21     65534
192.168.1.19        8176
192.168.1.18        7227
192.168.1.16        7134
192.168.1.22        3699
                   ...  
51.142.155.228         1
117.242.52.97          1
30.108.18.221          1
204.118.46.99          1
187.147.201.247        1
Name: src_ip, Length: 8486, dtype: int64

In [11]:
# Row counts per destination IP address, most frequent first
data['dst_ip'].value_counts()

192.168.1.17    76308
192.168.1.22     6087
192.168.1.20     6032
192.168.1.21     6023
192.168.1.16     6005
192.168.1.18     5976
192.168.1.19     1202
Name: dst_ip, dtype: int64

In [12]:
# Row counts per destination port, most frequent first
data['dst_port'].value_counts()

4840     107632
54924         1
Name: dst_port, dtype: int64

In [13]:
# Row counts per value of the `proto` column
data['proto'].value_counts()

OPCUA    107633
Name: proto, dtype: int64

In [14]:
# Row counts per value of the `service` column, most frequent first
data['service'].value_counts()

StartRawConnection    74029
Attribute             33515
SecureChannel            55
Session                  34
Name: service, dtype: int64

## Time and Continuity Analysis

In [15]:
# Order the rows by flow end time, newest first (descending); the original
# index labels are kept, so row 0 of the result is the latest flow
data_end_sort = data.sort_values('flowEnd', ascending=False)

In [16]:
# flowEnd of the last row, i.e. the earliest flow end after the descending sort
data_end_sort.iloc[-1]['flowEnd']

1583854854.0936646

In [17]:
# Convert the floating-point Unix timestamps (seconds) into pandas datetimes.
# `flowEnd` is read from the sorted frame itself so that the new column does
# not depend on implicit index alignment with the unsorted `data` frame.
data_end_sort['tstamp'] = pd.to_datetime(data_end_sort['flowEnd'], unit='s', origin='unix')
data_end_sort['tstamp']

107632   2020-03-10 17:44:11.686178048
107631   2020-03-10 17:44:11.686173440
107629   2020-03-10 17:44:07.659437056
107630   2020-03-10 17:44:07.656013312
107628   2020-03-10 17:44:03.633358592
                      ...             
7        2020-03-10 15:40:54.129051392
2        2020-03-10 15:40:54.116443392
3        2020-03-10 15:40:54.114434048
0        2020-03-10 15:40:54.098117120
1        2020-03-10 15:40:54.093664768
Name: tstamp, Length: 107633, dtype: datetime64[ns]

In [18]:
# Compute capture time span: newest flow end (first row) minus oldest flow end
# (last row). This relies on `data_end_sort` being sorted in descending order.
span = data_end_sort['tstamp'].iloc[0] - data_end_sort['tstamp'].iloc[-1]
span

Timedelta('0 days 02:03:17.592513280')

In [19]:
# Are there gaps in the data (was the capture interrupted?)
def measure_real_capture_dur(
    data: pd.Series, gap_max_secs: int = 300
) -> "tuple[pd.Timedelta, bool, list[pd.Timedelta]]":
    """Measure how much of a capture is actually covered by traffic.

    Walks the timestamps from newest to oldest and treats every step between
    two neighbouring timestamps that is longer than ``gap_max_secs`` as a
    capture gap.

    Args:
        data: Series of timestamps sorted in descending order (newest first).
            The ordering is not verified; unsorted input yields negative
            steps and therefore a meaningless result.
        gap_max_secs: Largest step, in seconds, between two neighbouring
            timestamps that still counts as uninterrupted capture.

    Returns:
        A tuple ``(total_dur, contiguous, cont_durations)``:

        * ``total_dur`` -- sum of all steps within the threshold, i.e. the
          capture span with the gaps excluded.
        * ``contiguous`` -- ``True`` when no gap was found.
        * ``cont_durations`` -- durations of the individual gap-free blocks,
          longest first. Empty when ``data`` is empty.
    """
    zero = pd.Timedelta(seconds=0)

    # Nothing captured: no duration, no gaps, no blocks
    if len(data) == 0:
        return zero, True, []

    # Loop-invariant threshold, built once instead of on every iteration
    gap_max = pd.Timedelta(seconds=gap_max_secs)

    total_dur = zero
    current_dur = zero
    cont_durations = []
    contiguous = True
    last_tstamp = data.iloc[0]

    # Walk the remaining timestamps and look for gaps between neighbours
    for cur_tstamp in data.iloc[1:]:
        dur_gap = last_tstamp - cur_tstamp

        if dur_gap <= gap_max:
            total_dur += dur_gap
            current_dur += dur_gap
        else:
            # Gap found: close the current gap-free block and start a new one
            cont_durations.append(current_dur)
            current_dur = zero
            contiguous = False

        last_tstamp = cur_tstamp

    # Log the final continuous block and sort the blocks by length, longest first
    cont_durations.append(current_dur)
    cont_durations.sort(reverse=True)

    return total_dur, contiguous, cont_durations

In [20]:
# Treat any step between neighbouring timestamps longer than 1 % of the total
# capture span (truncated to whole seconds) as a gap in the capture
measure_real_capture_dur(data_end_sort['tstamp'], int(span.total_seconds() * 0.01))

(Timedelta('0 days 02:03:17.592513280'),
 True,
 [Timedelta('0 days 02:03:17.592513280')])