# PWNJUTSU Dataset Analysis

* **Author:** Patrik Goldschmidt (igoldschmidt@fit.vut.cz)
* **Project:** Network Intrusion Datasets: A Survey, Limitations, and Recommendations
* **Date:** 2024

In [1]:
import math
import os

import numpy as np
import pandas as pd

# Lift pandas' display limits so wide frames and long Series are rendered in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# We will focus on a brief analysis of network and system logs via the JSON format.
# Both JSON directories live under the same dataset root, so the root is spelled
# out once; the resulting path strings are unchanged.
DATASET_ROOT = '/data/hypermnt/pwnjutsu'
PATH_NETWORK = f'{DATASET_ROOT}/network/json/'
PATH_SYSTEM = f'{DATASET_ROOT}/system/json'

In [3]:
# IDs of the players whose data are available (kept as strings because they are
# matched against file names)
PLAYER_IDS = [
    "11", "12", "13", "15", "16", "18", "19",
    "21", "22", "23", "24", "26", "27", "28",
    "32", "33", "34", "35", "36", "37", "38", "39",
]

In [4]:
# Read all network JSON files of player 11 as a sample.
# The per-file frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated frame for every file.
p11_frames = []

# The context manager closes the directory iterator as soon as the scan is done
with os.scandir(PATH_NETWORK) as entries:
    for file in entries:
        if f'player_{PLAYER_IDS[0]}_' in file.name:
            p11_frames.append(pd.read_json(file.path, lines=True))

# pd.concat raises on an empty list, so fall back to an empty frame when no file
# matched. The per-file row index is kept as-is, hence index labels repeat.
p11_data = pd.concat(p11_frames) if p11_frames else pd.DataFrame()

In [5]:
# Summarize the sample frame: list every column with its non-null count and dtype
p11_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 794175 entries, 0 to 1
Data columns (total 133 columns):
 #    Column                    Non-Null Count   Dtype  
---   ------                    --------------   -----  
 0    ts                        794175 non-null  float64
 1    uid                       715829 non-null  object 
 2    id.orig_h                 715829 non-null  object 
 3    id.orig_p                 715829 non-null  float64
 4    id.resp_h                 715829 non-null  object 
 5    id.resp_p                 715829 non-null  float64
 6    path                      12 non-null      object 
 7    service                   23373 non-null   object 
 8    share_type                12 non-null      object 
 9    fuid                      78346 non-null   object 
 10   tx_hosts                  78346 non-null   object 
 11   rx_hosts                  78346 non-null   object 
 12   conn_uids                 78346 non-null   object 
 13   source                    78594 non-n

In [6]:
# Preview the first rows of the sample frame
p11_data.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,path,service,share_type,fuid,tx_hosts,rx_hosts,conn_uids,source,depth,analyzers,mime_type,duration,is_orig,seen_bytes,total_bytes,missing_bytes,overflow_bytes,timedout,filename,name,addl,notice,peer,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,AA,TC,RD,RA,Z,rejected,client_major_version,client_minor_version,server_major_version,server_minor_version,authentication_method,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,orig_bytes,resp_bytes,version,cipher,server_name,resumed,established,ssl_history,cert_chain_fps,client_cert_chain_fps,sni_matches_cert,trans_depth,method,host,uri,user_agent,request_body_len,response_body_len,status_code,status_msg,tags,resp_fuids,resp_mime_types,proxied,origin,referrer,username,orig_fuids,orig_filenames,orig_mime_types,rtt,named_pipe,endpoint,operation,analyzer,failure_reason,command,value,user,nick,request_type,client,till,forwardable,renewable,auth_attempts,server,cipher_alg,mac_alg,compression_alg,kex_alg,host_key_alg,host_key,auth_success,rcode,rcode_name,hostname,domainname,server_nb_computer_name,server_dns_computer_name,success,answers,TTLs,mode,stratum,poll,precision,root_delay,root_disp,ref_id,ref_time,org_time,rec_time,xmt_time,num_exts,cookie,result,security_protocol,cert_count
0,1620591000.0,C0xi3k36oBXMXiTvBl,10.11.1.1,43138.0,10.11.1.2,445.0,IPC$,IPC,PIPE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1620591000.0,C24VwD1f6cVtEmtgm3,10.11.1.1,43139.0,10.11.1.2,445.0,IPC$,IPC,PIPE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1620591000.0,ClYNHo1nIh4Dk3udcg,10.11.1.1,43140.0,10.11.1.2,445.0,IPC$,IPC,PIPE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1620635000.0,CSbVAPIX3qaZqxt82,10.11.1.1,43185.0,10.11.1.2,445.0,\\10.11.1.2\IPC$,,PIPE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1620635000.0,CWqKuxigpxkF2sbci,10.11.1.1,43186.0,10.11.1.2,445.0,\\10.11.1.2\IPC$,,PIPE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# Count the lines across all network JSON files (they are read with lines=True, i.e. one record per line)
!cat $PATH_NETWORK/* | wc -l

45120244


In [8]:
# Apparently, we cannot load all data at once due to memory restrictions, let's do it by players.
# NOTE: every player's frame stays referenced from `data`, so the whole dataset
# still accumulates in memory; the recorded output of this cell ends at player 26.
data = {}   # dictionary of player_id : pd.DataFrame

# List the directory once instead of rescanning it for every player
with os.scandir(PATH_NETWORK) as entries:
    network_files = [(entry.name, entry.path) for entry in entries]

for player in PLAYER_IDS:
    print(f"Processing #{player}")

    # Read the player's files first and concatenate them in a single call;
    # concatenating inside the loop re-copies the growing frame for every file
    player_frames = [pd.read_json(path, lines=True)
                     for name, path in network_files
                     if f'player_{player}_' in name]

    # pd.concat raises on an empty list, so keep an empty frame for players without files
    net_data = pd.concat(player_frames) if player_frames else pd.DataFrame()

    data[player] = net_data

Processing #11


Processing #12
Processing #13
Processing #15
Processing #16
Processing #18
Processing #19
Processing #21
Processing #22
Processing #23
Processing #24
Processing #26


: 

In [8]:
# Determine the final set of features and their non-NaN counts over all players.
# Non-null counts are additive, so every file is counted on its own and the
# per-column totals are accumulated; no per-player frame has to be assembled,
# which keeps only one file in memory at a time.
# Player 26 is removed from the feature analysis because it was too big to fit
# into memory (kept excluded so the totals match the recorded outputs).
cols_nonans_tot = {}   # dictionary of col_name : non_nan_count

# List the directory once instead of rescanning it for every player
with os.scandir(PATH_NETWORK) as entries:
    network_files = [(entry.name, entry.path) for entry in entries]

players_to_count = [player_id for player_id in PLAYER_IDS if player_id != '26']

for player in players_to_count:
    print(f"Processing #{player}")

    for name, path in network_files:
        if f'player_{player}_' not in name:
            continue

        # Non-null count per column of this single file
        cols_nonans_cur = pd.read_json(path, lines=True).count().to_dict()

        # Columns missing from a file simply contribute nothing to the total
        for col_name, non_nan_count in cols_nonans_cur.items():
            cols_nonans_tot[col_name] = cols_nonans_tot.get(col_name, 0) + non_nan_count

Processing #11
Processing #12
Processing #13
Processing #15
Processing #16
Processing #18
Processing #19
Processing #21
Processing #22
Processing #23
Processing #24
Processing #27
Processing #28
Processing #32
Processing #33
Processing #34
Processing #35
Processing #36
Processing #37
Processing #38
Processing #39


In [9]:
# Turn the accumulated counts into a Series, then order it by column name
nonans_total = pd.Series(cols_nonans_tot)
nonans_total = nonans_total.sort_index()
nonans_total

AA                            446187
RA                            446187
RD                            446187
TC                            446187
TTLs                              68
Z                             446187
action                           124
addl                             908
analyzer                         632
analyzers                    4022394
answers                           68
arg                               92
auth                              90
auth_attempts                   4260
auth_success                    2068
authentication_method            330
call_id                           24
cert_chain_fps                   110
cert_count                       104
cipher                           110
cipher_alg                      3528
client                          4092
client_cert_chain_fps            110
client_major_version             616
client_minor_version             616
cmd                               92
command                          856
c

In [10]:
# Number of distinct columns (features) seen across the analyzed players
len(nonans_total)

160

So there are 160 features in total (with player 26 excluded), but the data are very sparse...

In [11]:
# According to our analysis, each red-team player's traffic is split across 4 PCAP files
# (probably captured at different sensors). One file per player was selected to analyze
# the capture duration; a single file suffices because all 4 files of a player cover
# approximately the same time span.
SELECTED_PCAPS = '/data/hypermnt/pwnjutsu/network/pcap_dur_selection'

# Print the first (-a) and last (-e) packet time of every selected capture
!capinfos -a -e $SELECTED_PCAPS/*

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_11_503317411.pcap
First packet time:   2021-05-09 11:56:12.148505
Last packet time:    2021-05-10 14:18:34.367668

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_12_536871850.pcap
First packet time:   2021-05-09 12:30:19.322607
Last packet time:    2021-05-10 21:29:12.672149

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_13_570426287.pcap
First packet time:   2021-05-09 13:24:33.129385
Last packet time:    2021-05-10 19:14:28.760362

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_15_637535169.pcap
First packet time:   2021-05-10 07:09:08.484604
Last packet time:    2021-05-12 16:17:43.398603

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_16_671089606.pcap
First packet time:   2021-05-10 21:58:40.004643
Last packet time:    2021-05-13 21:28:54.039336

File name:           /data/hyp

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_36_1375732806.pcap
First packet time:   2021-06-02 16:47:41.962689
Last packet time:    2021-06-09 14:30:55.144759

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_37_1409287248.pcap
First packet time:   2021-06-02 18:14:43.011042
Last packet time:    2021-06-09 09:25:21.651736

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_38_1442841683.pcap
First packet time:   2021-06-05 09:41:22.042158
Last packet time:    2021-06-12 23:57:50.598646

File name:           /data/hypermnt/pwnjutsu/network/pcap_dur_selection/player_39_1476396120.pcap
First packet time:   2021-06-08 10:35:24.717519
Last packet time:    2021-06-20 19:39:16.298602


In [12]:
# Compute the total time of the captures via an external helper script
# (the printed value is treated as seconds in the next cell — TODO confirm against the script)
!/data/AAAA_SCRIPTS/countdur.sh $SELECTED_PCAPS

10058133.778334


In [13]:
# Print the selected value as days and hours
import math

TOTAL_SECS  = 10058133.778334
TOTAL_DAYS  = TOTAL_SECS / 60 / 60 / 24
TOTAL_HOURS = (TOTAL_DAYS % math.floor(TOTAL_DAYS)) * 24

display(TOTAL_DAYS)
display(TOTAL_HOURS)

116.41358539738424

9.92604953722173