# NetML Challenge Datasets Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [2]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [3]:
# Set file paths
# Every NetML split sits under one directory, so the prefix is defined once;
# edit NETML_DATA_DIR to point the notebook at a different copy of the dataset.
NETML_DATA_DIR = '/data/kinit/netml_2020/NetML-Competition2020/data/NetML'

NETML_PATH_TEST_STD  = f'{NETML_DATA_DIR}/1_test-std_set/1_test-std_set.json.gz'
NETML_PATH_TEST_CHAL = f'{NETML_DATA_DIR}/0_test-challenge_set/0_test-challenge_set.json.gz'
NETML_PATH_TRAIN     = f'{NETML_DATA_DIR}/2_training_set/2_training_set.json.gz'

In [4]:
# Read the three NetML splits into DataFrames.
# Each file is gzip-compressed with one JSON record per line, so the reader
# options are identical for all of them.
read_opts = dict(compression='gzip', lines=True)

netml_test_std = pd.read_json(NETML_PATH_TEST_STD, **read_opts)
netml_test_chal = pd.read_json(NETML_PATH_TEST_CHAL, **read_opts)
netml_train = pd.read_json(NETML_PATH_TRAIN, **read_opts)

In [8]:
# Row counts of the three NetML splits and their total.
# The standard test split must be counted here: counting the challenge split
# twice would silently leave netml_test_std out of both the total and the
# per-split listing.
netml_total_size = len(netml_test_std) + len(netml_test_chal) + len(netml_train)

display(len(netml_test_std))
display(len(netml_test_chal))
display(len(netml_train))

print(f"Sum: {netml_total_size}")

48394

48394

387268

Sum: 484056


In [9]:
# Let's see the labels
# Location of the training annotation file (`2_training_anno_fine`) that
# accompanies the NetML training split.
NETML_PATH_LABELS = '/data/kinit/netml_2020/NetML-Competition2020/data/NetML/2_training_annotations/2_training_anno_fine.json.gz'

In [14]:
# Load the training annotations as a pandas Series (typ='series') instead of a DataFrame.
# NOTE(review): presumably indexed by flow id with the class label as value — confirm against the file.
netml_train_labels = pd.read_json(NETML_PATH_LABELS, typ='series')

In [18]:
# Number of training samples per label, most frequent label first.
netml_train_labels.value_counts()

benign              75995
Trickster           57796
Emotet              45907
TrickBot            37271
MagicHound          31458
HTBot               30442
Tinba               18627
WebCompanion        15767
Adload              15289
Dridex               9208
Ursnif               8482
Artemis              8238
CCleaner             8139
PUA                  6162
MinerTrojan          4732
BitCoinMiner         4074
TrojanDownloader     4020
Downware             3849
Cobalt               1379
Ramnit                400
Sality                 33
dtype: int64

In [17]:
# How many distinct labels occur in the training annotations.
netml_train_labels.nunique()

21

In [None]:
# Preview the first rows of the training split to see which features each record carries.
netml_train.head()

Unnamed: 0,sa,intervals_ccnt,rev_hdr_distinct,pr,rev_pld_max,rev_pld_mean,pld_mean,rev_pld_ccnt,pld_bin_inf,rev_ack_psh_rst_syn_fin_cnt,rev_intervals_ccnt,hdr_ccnt,rev_pld_distinct,dst_port,pld_median,ack_psh_rst_syn_fin_cnt,bytes_in,rev_hdr_ccnt,hdr_mean,rev_hdr_bin_40,rev_pld_var,pld_distinct,pld_max,num_pkts_in,num_pkts_out,rev_pld_bin_128,time_length,bytes_out,hdr_bin_40,pld_ccnt,src_port,hdr_distinct,da,id,dns_query_type,dns_query_class,dns_query_name_len,dns_query_name,dns_query_cnt,http_method,http_uri,http_host,http_code,http_content_len,http_content_type,tls_len,tls_key_exchange_len,tls_svr_ext_cnt,tls_svr_len,tls_svr_cs_cnt,tls_ext_cnt,tls_cnt,tls_svr_cs,tls_cs_cnt,tls_ext_types,tls_svr_key_exchange_len,tls_svr_ext_types,tls_svr_cnt,tls_cs,dns_answer_ip,dns_answer_ttl,dns_answer_cnt
0,IP_masked,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,17,0,0.0,174.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1900,174,"[0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8.0,0,0.0,1,174,0,4,0,3.041219,696,0,"[0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",61706,1,IP_masked,1898859,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,IP_masked,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,17,0,0.0,521.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1900,521,"[0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8.0,0,0.0,1,521,0,1,0,0.0,521,0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",34229,1,IP_masked,9164771,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,IP_masked,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,17,0,0.0,172.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1900,172,"[0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8.0,0,0.0,1,172,0,4,0,3.002623,688,0,"[0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",33130,1,IP_masked,2098703,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,IP_masked,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,17,0,0.0,174.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1900,174,"[0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8.0,0,0.0,1,174,0,4,0,3.026999,696,0,"[0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",64978,1,IP_masked,2019540,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,IP_masked,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,17,0,0.0,174.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1900,174,"[0, 0, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",8.0,0,0.0,1,174,0,4,0,3.044839,696,0,"[0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",52182,1,IP_masked,2857244,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Fraction of training rows that have no missing value in any column
# (dropna() with default arguments drops every row containing at least one NA).
len(netml_train.dropna()) / len(netml_train)

0.0

In [None]:
# Per-column non-null counts and dtypes of the training split.
netml_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387268 entries, 0 to 387267
Data columns (total 62 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   sa                           387268 non-null  object 
 1   intervals_ccnt               387268 non-null  object 
 2   rev_hdr_distinct             387268 non-null  int64  
 3   pr                           387268 non-null  int64  
 4   rev_pld_max                  387268 non-null  int64  
 5   rev_pld_mean                 387268 non-null  float64
 6   pld_mean                     387268 non-null  float64
 7   rev_pld_ccnt                 387268 non-null  object 
 8   pld_bin_inf                  387268 non-null  int64  
 9   rev_ack_psh_rst_syn_fin_cnt  387268 non-null  object 
 10  rev_intervals_ccnt           387268 non-null  object 
 11  hdr_ccnt                     387268 non-null  object 
 12  rev_pld_distinct             387268 non-null  int64  
 13 

In [None]:
# Check which values the `sa` column takes.
# NOTE(review): `sa` is presumably the source address — confirm against the dataset documentation.
netml_train['sa'].value_counts()

IP_masked    387268
Name: sa, dtype: int64

In [None]:
# Check which values the `da` column takes.
# NOTE(review): `da` is presumably the destination address — confirm against the dataset documentation.
netml_train['da'].value_counts()

IP_masked    387268
Name: da, dtype: int64

## CIC-IDS2017

In [19]:
# File paths of the CIC-IDS2017 part of the NetML competition data.
# The shared directory prefix is defined once; edit CICIDS_DATA_DIR to point
# the notebook at a different copy of the dataset.
CICIDS_DATA_DIR = '/data/kinit/netml_2020/NetML-Competition2020/data/CICIDS2017'

CICIDS_PATH_TEST_STD  = f'{CICIDS_DATA_DIR}/1_test-std_set/1_test-std_set.json.gz'
CICIDS_PATH_TEST_CHAL = f'{CICIDS_DATA_DIR}/0_test-challenge_set/0_test-challenge_set.json.gz'
CICIDS_PATH_TRAIN     = f'{CICIDS_DATA_DIR}/2_training_set/2_training_set.json.gz'
CICIDS_PATH_LABELS    = f'{CICIDS_DATA_DIR}/2_training_annotations/2_training_anno_fine.json.gz'

In [20]:
# Load each dataset part
cicids_test_std   = pd.read_json(CICIDS_PATH_TEST_STD, compression='gzip', lines=True)
cicids_test_chal  = pd.read_json(CICIDS_PATH_TEST_CHAL, compression='gzip', lines=True)
cicids_train = pd.read_json(CICIDS_PATH_TRAIN, compression='gzip', lines=True)
cicids_train_labels = pd.read_json(CICIDS_PATH_LABELS, typ='series')

In [22]:
# Row counts of the three CIC-IDS2017 splits, in the order:
# standard test, challenge test, training.
cicids_split_sizes = [
    len(split) for split in (cicids_test_std, cicids_test_chal, cicids_train)
]
cicids_total_size = sum(cicids_split_sizes)

# Show each split size on its own, then the overall total.
for split_size in cicids_split_sizes:
    display(split_size)

print(f"Sum: {cicids_total_size}")

55128

55128

441116

Sum: 551372


In [26]:
# Number of training samples per label, most frequent label first.
cicids_train_labels.value_counts()

benign          198455
portScan        122430
infiltration     53532
DDoS             36136
DoS              23806
ftp-patator       3168
ssh-patator       1972
webAttack         1617
dtype: int64

In [27]:
# How many distinct labels occur in the training annotations.
cicids_train_labels.nunique()

8