# Kyoto 2006+ Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

This notebook presents an analysis of a single selected day (2015-01-01) to get a grasp of the data. Due to the size of the full dataset, we did not load all of it into memory; the counting over the whole dataset was instead performed via a command-line script.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Do not truncate columns when a wide frame is displayed.
pd.options.display.max_columns = None

In [2]:
# Sample file to analyse: the first day of 2015. The path is relative, so it
# is resolved against the notebook's working directory.
# NOTE(review): the top-level folder is named 'Kyoto2016' although the dataset
# is Kyoto 2006+ -- presumably just the local folder name; confirm.
KYOTO_PATH = 'Kyoto2016/2015/01/20150101.txt'

In [3]:
# Column names assigned to the data file when it is loaded; the order must
# match the column order of the file.

# 14 conventional features
_KYOTO_CONVENTIONAL_FEATURES = [
    'duration', 'service', 'src_bytes', 'dst_bytes',
    'count', 'same_srv_rate', 'serror_rate', 'srv_serror_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_src_port_rate',
    'dst_host_serror_rate',
    # NOTE(review): the Kyoto 2006+ feature description appears to call this
    # feature "Dst_host_srv_serror_rate"; 'src' here is presumably a typo.
    # Kept unchanged because the name may be referenced elsewhere -- verify.
    'dst_host_src_serror_rate',
    'flag',
]

# 10 additional features
_KYOTO_ADDITIONAL_FEATURES = [
    'ids_detection', 'malware_detection', 'ashula_detection', 'label',
    'src_ip', 'src_port', 'dst_ip', 'dst_port',
    'start_time', 'protocol',
]

KYOTO_FEATURES = _KYOTO_CONVENTIONAL_FEATURES + _KYOTO_ADDITIONAL_FEATURES

In [4]:
# Load the selected day. The file is tab-separated and is read without a
# header row, so the column names are supplied explicitly.
data = pd.read_csv(
    KYOTO_PATH,
    sep='\t',
    header=None,
    names=KYOTO_FEATURES,
)

In [5]:
# Summarise column dtypes and non-null counts to check how the file was parsed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381105 entries, 0 to 381104
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     381105 non-null  float64
 1   service                      381105 non-null  object 
 2   src_bytes                    381105 non-null  int64  
 3   dst_bytes                    381105 non-null  int64  
 4   count                        381105 non-null  int64  
 5   same_srv_rate                381105 non-null  float64
 6   serror_rate                  381105 non-null  float64
 7   srv_serror_rate              381105 non-null  float64
 8   dst_host_count               381105 non-null  int64  
 9   dst_host_srv_count           381105 non-null  int64  
 10  dst_host_same_src_port_rate  381105 non-null  float64
 11  dst_host_serror_rate         381105 non-null  float64
 12  dst_host_src_serror_rate     381105 non-null  float64
 13 

In [6]:
# Preview the first five rows
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_src_serror_rate,flag,ids_detection,malware_detection,ashula_detection,label,src_ip,src_port,dst_ip,dst_port,start_time,protocol
0,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fd95:ec1e:6a61:f55c:1fa3:15ee:2e7a:0044,54992,fd95:ec1e:6a61:9478:7d39:2713:60a1:0514,23,00:00:00,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fd95:ec1e:6a61:f55c:1fa3:15ee:2e7a:0044,47904,fd95:ec1e:6a61:9c93:7df6:27d3:6096:040d,23,00:00:00,tcp
2,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fd95:ec1e:6a61:f55c:1fa3:15ee:2e7a:0044,58974,fd95:ec1e:6a61:6135:7d3a:2712:6027:0328,23,00:00:00,tcp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fd95:ec1e:6a61:f55c:1fa3:15ee:2e7a:0044,37174,fd95:ec1e:6a61:b941:7d6b:27a6:6074:02ab,23,00:00:00,tcp
4,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fd95:ec1e:6a61:5cd4:2a7f:0315:78bc:7c66,40711,fd95:ec1e:6a61:a188:7d54:27f4:60c1:220d,3389,00:00:00,tcp


In [7]:
# Total number of rows loaded for the selected day
len(data)

381105

In [8]:
# Services seen in the traffic and how many rows each accounts for
# (this is the 'service' column; 'protocol' is a separate column)
data['service'].value_counts()

other       195736
dns         122458
ssh          56813
rdp           3586
http           910
snmp           873
smtp           352
sip            323
smtp,ssl        50
ssl              4
Name: service, dtype: int64

In [9]:
# Which values occur in the IDS detection column, and how often?
data['ids_detection'].value_counts()

0                                                     365479
6-128-1(1)                                              5429
1917-1-11(1)                                            2155
384-1-8(1)                                              1471
19559-1-5(1)                                            1109
                                                       ...  
6-128-1(1),17317-1-7(1)                                    1
6-128-1(2),19559-1-5(1),1325-1-10(1)                       1
12946-1-8(1),3-133-1(1),14782-1-17(1),7209-1-16(1)         1
4-138-1(1)                                                 1
22114-1-5(1),4-138-1(1)                                    1
Name: ids_detection, Length: 71, dtype: int64

In [10]:
# Frequency of each value in the malware detection column
data['malware_detection'].value_counts()

0                               380916
Win.Worm.Kido-113(1)                14
Win.Worm.Kido-200(1)                14
Win.Exploit.Fnstenv_mov-1(1)        14
Win.Dropper.Agent-35454(1)          13
Win.Worm.Kido-297(1)                11
Win.Trojan.Agent-128992(1)           9
Win.Worm.Mydoom-90(1)                8
Win.Worm.Kido-197(1)                 8
Win.Worm.Kido-249(1)                 8
Win.Worm.Agent-35431(1)              6
Win.Trojan.Agent-550646(1)           6
Win.Trojan.Agent-129152(1)           5
Win.Worm.Kido-360(1)                 5
Win.Worm.Kido-355(1)                 4
Win.Worm.Conficker-260(1)            3
Win.Worm.Kido-223(1)                 3
Win.Worm.Kido-214(1)                 3
Win.Worm.Downadup-115(1)             3
Win.Worm.Downadup-5(1)               3
Win.Worm.Kido-273(1)                 3
Win.Worm.Kido-29(1)                  3
Win.Worm.Downadup-75(1)              3
Win.Worm.Kido-37(1)                  3
Win.Worm.Kido-266(1)                 3
Win.Worm.Kido-159(1)     

In [11]:
# Frequency of each value in the Ashula detection column
data['ashula_detection'].value_counts()

0         380646
349(1)       437
809(1)        12
349(2)         5
350(1)         3
810(1)         2
Name: ashula_detection, dtype: int64

In [12]:
# Distribution of labels
# Per the Kyoto 2006+ feature description:
#   1 - normal session, -1 - known attack observed, -2 - unknown attack observed
# NOTE(review): an earlier version of this comment had the meanings of 1 and -1
# the other way round; please confirm against the dataset documentation.
data['label'].value_counts()

-1    339593
 1     41495
-2        17
Name: label, dtype: int64

In [13]:
# Number of rows per source IP address, most frequent first
data['src_ip'].value_counts()

fd95:ec1e:6a61:df6b:7de2:27ad:6105:3709    86255
fd95:ec1e:6a61:ccd9:184f:0d6f:03bd:70da    52891
fd95:ec1e:6a61:b804:7dcb:276f:0751:0ff5    26384
fd95:ec1e:6a61:c226:35fe:4156:005d:2d14    25893
fd95:ec1e:6a61:49ee:0191:4307:2301:036e    13225
                                           ...  
fd95:ec1e:6a61:8e4a:0f10:5117:4d61:2f23        1
fd95:ec1e:6a61:d0eb:2bfb:0030:2e81:1792        1
fd95:ec1e:6a61:428d:0758:17f7:05ed:2b8e        1
fd95:ec1e:6a61:28ab:492c:3650:2126:7aad        1
fd95:ec1e:6a61:66bc:0052:2b51:6098:1375        1
Name: src_ip, Length: 16198, dtype: int64

In [14]:
# Number of rows per destination IP address, most frequent first
data['dst_ip'].value_counts()

fd95:ec1e:6a61:435f:7de5:27b0:7d84:3c0d    107513
fd95:ec1e:6a61:05d3:7dd2:270d:61ec:03f4     40557
fd95:ec1e:6a61:b804:7dcb:276f:0751:0ff5     15111
fd95:ec1e:6a61:df6b:7de2:27ad:6105:3709     12470
fd95:ec1e:6a61:c7bb:7d9d:2783:614d:3364     10323
                                            ...  
fd95:ec1e:6a61:a78a:3c90:2397:2719:1e2d         1
fd95:ec1e:6a61:b065:270a:10cd:1136:030c         1
fd95:ec1e:6a61:946c:0f1b:4baa:0301:3be4         1
fd95:ec1e:6a61:f138:549e:0ad3:78f0:4d99         1
fd95:ec1e:6a61:e50f:4f7b:0736:2a8b:003c         1
Name: dst_ip, Length: 1043, dtype: int64

In [15]:
# Number of rows per source port, most frequent first
data['src_port'].value_counts()

48059    25898
6000     16314
7678      3999
12200     2895
8         1558
         ...  
18782        1
17631        1
14955        1
31762        1
15576        1
Name: src_port, Length: 57508, dtype: int64

In [16]:
# Number of rows per destination port, most frequent first
data['dst_port'].value_counts()

53       122367
22        67368
445       45499
23        16721
8080      11621
          ...  
49140         1
6042          1
11833         1
3516          1
9896          1
Name: dst_port, Length: 2777, dtype: int64