# NSL-KDD Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

## Generic Settings

We use custom files and classes (e.g., `Dataset`) from another project. The data contents are not affected by this, although it adds some clutter to the commands.

In [1]:
import pandas as pd
# Show every column when displaying wide frames (no column truncation).
pd.set_option("display.max_columns", None)

import sys
# Add the parent directory to the import path; presumably this is what lets the
# project-local `config` and `dataset` imports below resolve -- TODO confirm.
sys.path.append('..')

import matplotlib.pyplot as plt

import config.dataset as dsetcfg
import dataset as dset

# Reload imported modules automatically before each cell runs, so edits to the
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Locations of the NSL-KDD train and test files, as declared in the project's dataset config.
DATA_NSL_TRAIN, DATA_NSL_TEST = (
    dsetcfg.DATASET_NSLKDD_PATH_TRAIN,
    dsetcfg.DATASET_NSLKDD_PATH_TEST,
)

## Take a Look

In [3]:
# Build the project's Dataset wrapper for NSL-KDD from the (train, test) path pair.
data = dset.Dataset(
    'nslkdd',
    data_path=(DATA_NSL_TRAIN, DATA_NSL_TEST),
    random_state=42,
)

# Keep direct handles on both splits for the exploration below.
df_train, df_test = data.data_train, data.data_test

In [4]:
# Number of rows in the train split, then in the test split (one per line).
print(len(df_train), len(df_test), sep='\n')

125973
22544


In [20]:
# Column names, non-null counts and dtypes of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   dur                       125973 non-null  int64  
 1   proto                     125973 non-null  object 
 2   service                   125973 non-null  object 
 3   flag                      125973 non-null  object 
 4   sbytes                    125973 non-null  int64  
 5   dbytes                    125973 non-null  int64  
 6   land                      125973 non-null  int64  
 7   wrong_fragment            125973 non-null  int64  
 8   urgent                    125973 non-null  int64  
 9   hot                       125973 non-null  int64  
 10  num_failed_logins         125973 non-null  int64  
 11  logged_in                 125973 non-null  int64  
 12  num_compromised           125973 non-null  int64  
 13  root_shell                125973 non-null  i

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
df_train.describe()

Unnamed: 0,dur,sbytes,dbytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_hot_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,save_srv_rate,diff_srv_rate,srv diffhost rate,dhost_count,dhost_srv_cnt,dhost_same_srv_rate,dhost_diffsrv_rate,dhost_same_src_port_rate,dhost_srv_diffhost_rate,dhost_serror_rate,dhost_srv_serror_rate,dhost_rerror_rate,dhost_srv_rerror_rate,difficulty
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,0.001342,0.001103,0.302192,0.012669,0.000413,0.004096,0.0,8e-06,0.009423,84.107555,27.737888,0.284485,0.282485,0.119958,0.121183,0.660928,0.063053,0.097322,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,19.50406
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,0.036603,0.045154,24.399618,0.483935,0.022181,0.09937,0.0,0.002817,0.096612,114.508607,72.63584,0.446456,0.447022,0.320436,0.323647,0.439623,0.180314,0.25983,99.206213,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459,2.291503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,143.0,18.0,1.0,1.0,0.0,0.0,1.0,0.06,0.0,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,21.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,1.0,2.0,7468.0,43.0,2.0,9.0,0.0,1.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


In [22]:
# Peek at the first rows of the training split.
df_train.head()

Unnamed: 0,dur,proto,service,flag,sbytes,dbytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_hot_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,save_srv_rate,diff_srv_rate,srv diffhost rate,dhost_count,dhost_srv_cnt,dhost_same_srv_rate,dhost_diffsrv_rate,dhost_same_src_port_rate,dhost_srv_diffhost_rate,dhost_serror_rate,dhost_srv_serror_rate,dhost_rerror_rate,dhost_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


Verify that the last column really is `difficulty`, because the NSL-KDD documentation is practically non-existent.

In [23]:
# Count the rows whose `difficulty` equals 21 (the maximum reported by describe()
# on the training split): test split first, then train.
print(
    int((df_test['difficulty'] == 21).sum()),
    int((df_train['difficulty'] == 21).sum()),
    sep='\n',
)

10694
62557


Inspect the target classes present in each split.

In [24]:
# Distinct labels in the training split, in order of first appearance ...
print(df_train['label'].unique())
# ... and how many there are (dropna=False keeps the count equal to len(unique())).
df_train['label'].nunique(dropna=False)

['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl']


23

In [25]:
# Distinct labels in the test split, in order of first appearance ...
print(df_test['label'].unique())
# ... and how many there are (dropna=False keeps the count equal to len(unique())).
df_test['label'].nunique(dropna=False)

['neptune' 'normal' 'saint' 'mscan' 'guess_passwd' 'smurf' 'apache2'
 'satan' 'buffer_overflow' 'back' 'warezmaster' 'snmpgetattack'
 'processtable' 'pod' 'httptunnel' 'nmap' 'ps' 'snmpguess' 'ipsweep'
 'mailbomb' 'portsweep' 'multihop' 'named' 'sendmail' 'loadmodule' 'xterm'
 'worm' 'teardrop' 'rootkit' 'xlock' 'perl' 'land' 'xsnoop' 'sqlattack'
 'ftp_write' 'imap' 'udpstorm' 'phf']


38

There seem to be more attack classes in test dataset than in train. This is not e

In [26]:
# Per-class sample counts in the training split, largest class first.
(
    df_train
    .groupby('label')
    .size()
    .sort_values(ascending=False)
)

label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
dtype: int64

In [27]:
# Per-class sample counts in the test split, largest class first.
(
    df_test
    .groupby('label')
    .size()
    .sort_values(ascending=False)
)

label
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
xterm                13
rootkit              13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
phf                   2
udpstorm              2
sqlattack             2
perl                  2
loadmodule            2
imap                  1
dtype: int64

In [28]:
# Temporarily stack the train and test splits into one frame
# (each split keeps its own row index, so index values repeat).
merged = pd.concat((df_train, df_test))

In [30]:
# Total number of distinct classes across train + test combined.
merged['label'].nunique(dropna=False)

40

In [31]:
# Distinct class count per split: train first, then test (one per line).
print(
    df_train['label'].nunique(dropna=False),
    df_test['label'].nunique(dropna=False),
    sep='\n',
)

23
38
