# ASNM Datasets Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
PATH_FOLDER = '/data/kinit/asnm/'
FNAME_CDX   = 'ASNM-CDX-2009.csv'
FNAME_NPBO  = 'ASNM-NBPOv2.csv'
FNAME_TUN   = 'ASNM-TUN.csv'

In [14]:
data_cdx  = pd.read_csv(os.path.join(PATH_FOLDER, FNAME_CDX), delimiter=';')
data_npbo = pd.read_csv(os.path.join(PATH_FOLDER, FNAME_NPBO), delimiter=';')
data_tun  = pd.read_csv(os.path.join(PATH_FOLDER, FNAME_TUN), delimiter=';')

## ANSM-CDX-2009 Data

In [7]:
data_cdx.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5771 entries, 0 to 5770
Data columns (total 877 columns):
 #    Column                 Dtype  
---   ------                 -----  
 0    id                     int64  
 1    label_2                bool   
 2    label_poly             object 
 3    SrcIP                  object 
 4    DstIP                  object 
 5    SrcPort                int64  
 6    DstPort                int64  
 7    SrcMAC                 object 
 8    DstMAC                 object 
 9    SrcIPInVlan            bool   
 10   DstIPInVlan            bool   
 11   InPkt1s10i[0]          int64  
 12   InPkt1s10i[1]          int64  
 13   InPkt1s10i[2]          int64  
 14   InPkt1s10i[3]          int64  
 15   InPkt1s10i[4]          int64  
 16   InPkt1s10i[5]          int64  
 17   InPkt1s10i[6]          int64  
 18   InPkt1s10i[7]          int64  
 19   InPkt1s10i[8]          int64  
 20   InPkt1s10i[9]          int64  
 21   InPkt4s10i[0]          int64  
 22 

In [10]:
data_cdx['SrcIP'].value_counts()

10.2.197.241    3148
10.2.195.251    1226
10.1.60.203      280
10.2.199.236     213
10.2.190.254     136
10.1.40.155      102
10.1.60.253       72
10.1.80.9         66
10.2.15.16        61
10.1.30.5         58
10.2.198.239      56
10.1.10.20        44
10.1.40.166       37
10.1.90.5         34
10.1.10.101       34
10.1.10.64        32
10.2.197.240      28
10.1.60.191       27
10.1.10.104       23
10.1.40.70        15
10.2.191.254      14
10.1.10.69        13
10.1.40.80         7
10.2.192.246       7
10.1.10.63         6
10.1.10.100        6
10.1.10.10         5
10.2.195.239       4
10.1.60.251        4
10.1.60.25         4
10.1.100.8         3
10.1.100.6         3
10.2.197.235       2
10.1.70.71         1
Name: SrcIP, dtype: int64

In [11]:
data_cdx['DstIP'].value_counts()

10.1.60.187    5437
10.1.60.25      182
10.1.60.73      117
10.1.20.4        16
10.1.100.4        6
10.1.10.20        5
10.1.10.10        4
10.2.20.60        4
Name: DstIP, dtype: int64

In [8]:
data_cdx['label_2'].value_counts()

False    5727
True       44
Name: label_2, dtype: int64

In [9]:
data_cdx['label_poly'].value_counts()

0_apache     2911
0_other      2637
0_postfix     179
1_apache       37
1_postfix       7
Name: label_poly, dtype: int64

## ASNM-TUN Data

In [15]:
data_tun.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 896 columns):
 #    Column                 Dtype  
---   ------                 -----  
 0    id                     int64  
 1    label_2                bool   
 2    label_3                int64  
 3    label_poly             object 
 4    label_poly_s           object 
 5    SrcIP                  object 
 6    DstIP                  object 
 7    SrcPort                int64  
 8    DstPort                int64  
 9    SrcIPInVlan            bool   
 10   DstIPInVlan            bool   
 11   ConTcpFinCntIn         int64  
 12   ConTcpSynCntIn         int64  
 13   ConTcpRstCntIn         int64  
 14   ConTcpPshCntIn         int64  
 15   ConTcpAckCntIn         int64  
 16   ConTcpUrgCntIn         int64  
 17   ConTcpEceCntIn         int64  
 18   ConTcpCwrCntIn         int64  
 19   ConTcpRstAckIn         int64  
 20   ConTcpFinCntOut        int64  
 21   ConTcpSynCntOut        int64  
 22   

In [17]:
data_tun['label_2'].value_counts()

True     217
False    177
Name: label_2, dtype: int64

In [16]:
data_tun['label_3'].value_counts()

3    177
1    130
2     87
Name: label_3, dtype: int64

In [18]:
data_tun['label_poly'].value_counts()

1_Apache      102
3_BadBlue      95
2_Apache       61
3_Apache       38
3_Other        25
1_Samba        20
3_Samba        15
2_BadBlue      10
2_Samba         8
2_DCOM_RPC      8
1_DCOM_RPC      4
1_BadBlue       4
3_DCOM_RPC      4
Name: label_poly, dtype: int64

In [19]:
data_tun['label_poly_s'].value_counts()

1_Apache_b      51
1_Apache_a      46
2_Apache_c      40
3_BadBlue_a     33
3_BadBlue_b     33
3_BadBlue_d     29
3_Other_a       25
3_Apache_d      20
2_Apache_d      17
3_Apache_c      17
1_Samba_a       10
1_Samba_b       10
3_Samba_d        4
3_Samba_a        4
3_Samba_b        4
2_BadBlue_c      4
3_DCOM_RPC_a     4
1_Apache_c       3
3_Samba_c        3
2_DCOM_RPC_b     2
2_BadBlue_a      2
2_Apache_b       2
2_BadBlue_b      2
1_DCOM_RPC_a     2
1_Apache_d       2
2_Samba_a        2
2_DCOM_RPC_d     2
2_Apache_a       2
2_Samba_c        2
2_BadBlue_d      2
2_DCOM_RPC_c     2
2_Samba_b        2
1_DCOM_RPC_b     2
2_Samba_d        2
1_BadBlue_b      2
2_DCOM_RPC_a     2
1_BadBlue_a      2
3_Apache_b       1
Name: label_poly_s, dtype: int64

## ASNM-NPBO

In [20]:
data_npbo.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11445 entries, 0 to 11444
Data columns (total 904 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    id                             int64  
 1    label_2                        bool   
 2    label_poly                     object 
 3    label_poly_o                   object 
 4    srcIP                          object 
 5    dstIP                          object 
 6    srcPort                        int64  
 7    dstPort                        int64  
 8    srcMAC                         object 
 9    dstMAC                         object 
 10   srcIPInVlan                    bool   
 11   dstIPInVlan                    bool   
 12   finCnt<In>                     int64  
 13   synCnt<In>                     int64  
 14   rstCnt<In>                     int64  
 15   pshCnt<In>                     int64  
 16   ackCnt<In>                     int64  
 17   urgCnt<In>                   

In [23]:
data_npbo['label'].value_counts()

3    10805
2      478
1      162
Name: label, dtype: int64

In [24]:
data_npbo['label_2'].value_counts()

False    10805
True       640
Name: label_2, dtype: int64

In [26]:
data_npbo['label_poly'].value_counts()

3_Samba         4641
3_Server        3339
3_Apache         809
3_PostgreSQL     737
3_Other          647
3_MSSQL          532
2_Apache         163
2_MSSQL          103
2_Server         100
3_DistCC         100
1_Apache          61
2_PostgreSQL      45
2_Samba           44
1_MSSQL           31
1_Server          26
2_DistCC          23
1_Samba           19
1_PostgreSQL      13
1_DistCC          12
Name: label_poly, dtype: int64

In [27]:
data_tun['label_poly_s'].value_counts()

1_Apache_b      51
1_Apache_a      46
2_Apache_c      40
3_BadBlue_a     33
3_BadBlue_b     33
3_BadBlue_d     29
3_Other_a       25
3_Apache_d      20
2_Apache_d      17
3_Apache_c      17
1_Samba_a       10
1_Samba_b       10
3_Samba_d        4
3_Samba_a        4
3_Samba_b        4
2_BadBlue_c      4
3_DCOM_RPC_a     4
1_Apache_c       3
3_Samba_c        3
2_DCOM_RPC_b     2
2_BadBlue_a      2
2_Apache_b       2
2_BadBlue_b      2
1_DCOM_RPC_a     2
1_Apache_d       2
2_Samba_a        2
2_DCOM_RPC_d     2
2_Apache_a       2
2_Samba_c        2
2_BadBlue_d      2
2_DCOM_RPC_c     2
2_Samba_b        2
1_DCOM_RPC_b     2
2_Samba_d        2
1_BadBlue_b      2
2_DCOM_RPC_a     2
1_BadBlue_a      2
3_Apache_b       1
Name: label_poly_s, dtype: int64

In [None]:
len(data_tun.columns) - len(data_cdx.columns)

19

In [29]:
len(data_npbo.columns) - len(data_cdx.columns)

27

In [31]:
len(data_npbo.columns) - len(data_tun.columns)

8

In [32]:
FTR_LIST = ['OutPkt64s20iTr2KB[0]',
            'OutPkt64s20iTr2KB[1]',
            'OutPkt64s20iTr2KB[2]',
            'OutPkt64s20iTr2KB[3]',
            'OutPkt64s20iTr2KB[4]',
            'OutPkt64s20iTr2KB[5]',
            'OutPkt64s20iTr2KB[6]',
            'OutPkt64s20iTr2KB[7]',
            'OutPkt64s20iTr2KB[8]',
            'OutPkt64s20iTr2KB[9]',
            'OutPkt64s20iTr2KB[10]',
            'OutPkt64s20iTr2KB[11]',
            'OutPkt64s20iTr2KB[12]',
            'OutPkt64s20iTr2KB[13]',
            'OutPkt64s20iTr2KB[14]',
            'OutPkt64s20iTr2KB[15]',
            'OutPkt64s20iTr2KB[16]',
            'OutPkt64s20iTr2KB[17]',
            'OutPkt64s20iTr2KB[18]',
            'OutPkt64s20iTr2KB[19]',
]

data_tun[FTR_LIST].head()

Unnamed: 0,OutPkt64s20iTr2KB[0],OutPkt64s20iTr2KB[1],OutPkt64s20iTr2KB[2],OutPkt64s20iTr2KB[3],OutPkt64s20iTr2KB[4],OutPkt64s20iTr2KB[5],OutPkt64s20iTr2KB[6],OutPkt64s20iTr2KB[7],OutPkt64s20iTr2KB[8],OutPkt64s20iTr2KB[9],OutPkt64s20iTr2KB[10],OutPkt64s20iTr2KB[11],OutPkt64s20iTr2KB[12],OutPkt64s20iTr2KB[13],OutPkt64s20iTr2KB[14],OutPkt64s20iTr2KB[15],OutPkt64s20iTr2KB[16],OutPkt64s20iTr2KB[17],OutPkt64s20iTr2KB[18],OutPkt64s20iTr2KB[19]
0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,9,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
