In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

In [2]:
df = pd.read_csv('data/Train_Test_datasets/Train_Test_Network_dataset/Train_Test_Network.csv', skipinitialspace=True)

df.head()

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,...,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,1554198358,3.122.49.24,1883,192.168.1.152,52976,tcp,-,80549.53026,1762852,41933215,...,0,0,-,-,-,bad_TCP_checksum,-,F,0,normal
1,1554198358,192.168.1.79,47260,192.168.1.255,15600,udp,-,0.0,0,0,...,0,0,-,-,-,-,-,-,0,normal
2,1554198359,192.168.1.152,1880,192.168.1.152,51782,tcp,-,0.0,0,0,...,0,0,-,-,-,bad_TCP_checksum,-,F,0,normal
3,1554198359,192.168.1.152,34296,192.168.1.152,10502,tcp,-,0.0,0,0,...,0,0,-,-,-,-,-,-,0,normal
4,1554198362,192.168.1.152,46608,192.168.1.190,53,udp,dns,0.000549,0,298,...,0,0,-,-,-,bad_UDP_checksum,-,F,0,normal


In [3]:
df.drop(columns=['ts', 'src_ip', 'src_port', 'dst_ip', 'dst_port'], inplace=True)

In [4]:
df.head()

Unnamed: 0,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,...,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,tcp,-,80549.53026,1762852,41933215,OTH,0,252181,14911156,2,...,0,0,-,-,-,bad_TCP_checksum,-,F,0,normal
1,udp,-,0.0,0,0,S0,0,1,63,0,...,0,0,-,-,-,-,-,-,0,normal
2,tcp,-,0.0,0,0,OTH,0,0,0,0,...,0,0,-,-,-,bad_TCP_checksum,-,F,0,normal
3,tcp,-,0.0,0,0,OTH,0,0,0,0,...,0,0,-,-,-,-,-,-,0,normal
4,udp,dns,0.000549,0,298,SHR,0,0,0,2,...,0,0,-,-,-,bad_UDP_checksum,-,F,0,normal


In [5]:
df.columns

Index(['proto', 'service', 'duration', 'src_bytes', 'dst_bytes', 'conn_state',
       'missed_bytes', 'src_pkts', 'src_ip_bytes', 'dst_pkts', 'dst_ip_bytes',
       'dns_query', 'dns_qclass', 'dns_qtype', 'dns_rcode', 'dns_AA', 'dns_RD',
       'dns_RA', 'dns_rejected', 'ssl_version', 'ssl_cipher', 'ssl_resumed',
       'ssl_established', 'ssl_subject', 'ssl_issuer', 'http_trans_depth',
       'http_method', 'http_uri', 'http_version', 'http_request_body_len',
       'http_response_body_len', 'http_status_code', 'http_user_agent',
       'http_orig_mime_types', 'http_resp_mime_types', 'weird_name',
       'weird_addl', 'weird_notice', 'label', 'type'],
      dtype='object')

In [6]:
df.dtypes

proto                      object
service                    object
duration                  float64
src_bytes                   int64
dst_bytes                   int64
conn_state                 object
missed_bytes                int64
src_pkts                    int64
src_ip_bytes                int64
dst_pkts                    int64
dst_ip_bytes                int64
dns_query                  object
dns_qclass                  int64
dns_qtype                   int64
dns_rcode                   int64
dns_AA                     object
dns_RD                     object
dns_RA                     object
dns_rejected               object
ssl_version                object
ssl_cipher                 object
ssl_resumed                object
ssl_established            object
ssl_subject                object
ssl_issuer                 object
http_trans_depth           object
http_method                object
http_uri                   object
http_version               object
http_request_b

In [7]:
df.weird_name.value_counts()

-                                   459749
DNS_RR_unknown_type                    753
active_connection_reuse                275
data_before_established                109
bad_UDP_checksum                        68
bad_TCP_checksum                        52
connection_originator_SYN_ack           20
above_hole_data_without_any_acks         6
inappropriate_FIN                        3
dnp3_corrupt_header_checksum             3
possible_split_routing                   3
TCP_ack_underflow_or_misorder            2
Name: weird_name, dtype: int64

In [8]:
categorical_columns = [
    'proto',
    'service',
    'conn_state',
    'dns_AA',
    'dns_RD',
    'dns_RA',
    'dns_rejected',
    'dns_query',
    'ssl_version',
    'ssl_cipher',
    'ssl_resumed',
    'ssl_established',
    'ssl_subject',
    'ssl_issuer',
    'http_trans_depth',
    'http_method',
    'http_uri',
    'http_version',
    'http_user_agent',
    'http_orig_mime_types',
    'http_resp_mime_types',
    'weird_name',
    'weird_addl',
    'weird_notice',
]
numerical_columns = set(df.columns) - set(categorical_columns)
numerical_columns

{'dns_qclass',
 'dns_qtype',
 'dns_rcode',
 'dst_bytes',
 'dst_ip_bytes',
 'dst_pkts',
 'duration',
 'http_request_body_len',
 'http_response_body_len',
 'http_status_code',
 'label',
 'missed_bytes',
 'src_bytes',
 'src_ip_bytes',
 'src_pkts',
 'type'}

In [9]:
total_unique_values = 0
for col in categorical_columns:
    num_unique_values = df[col].nunique()
    print(col, '(', num_unique_values, "unique values )", '\n', df[col].value_counts(), "\n")
    total_unique_values += num_unique_values

print(total_unique_values, "unique categorical values")

proto ( 3 unique values ) 
 tcp     282076
udp     173087
icmp      5880
Name: proto, dtype: int64 

service ( 10 unique values ) 
 -             280216
dns           116480
http           60720
ssl             2070
ftp             1065
gssapi           184
dce_rpc          136
smb              108
dhcp              46
smb;gssapi        18
Name: service, dtype: int64 

conn_state ( 13 unique values ) 
 SF        123512
S0        113495
OTH       111842
REJ        45036
SHR        22053
SH         18138
S1         13843
S3          6642
RSTR        2360
RSTRH       1724
RSTO        1518
S2           708
RSTOS0       172
Name: conn_state, dtype: int64 

dns_AA ( 3 unique values ) 
 -    365158
F     81279
T     14606
Name: dns_AA, dtype: int64 

dns_RD ( 3 unique values ) 
 -    365158
F     63106
T     32779
Name: dns_RD, dtype: int64 

dns_RA ( 3 unique values ) 
 -    365158
F     85997
T      9888
Name: dns_RA, dtype: int64 

dns_rejected ( 3 unique values ) 
 -    365158
F     89392

In [10]:
# drop the dns_query column for now - it has 14372 unique values, so we can't really one-hot encode it
df.drop(columns=['dns_query'], inplace=True)

In [11]:
categorical_columns = list(set(categorical_columns) - set(['dns_query']))
print(categorical_columns)

['http_version', 'ssl_issuer', 'http_resp_mime_types', 'dns_RD', 'weird_notice', 'ssl_cipher', 'http_user_agent', 'ssl_resumed', 'proto', 'conn_state', 'ssl_version', 'weird_addl', 'http_trans_depth', 'http_uri', 'ssl_established', 'dns_RA', 'dns_AA', 'dns_rejected', 'http_method', 'weird_name', 'ssl_subject', 'http_orig_mime_types', 'service']


In [12]:
pd.get_dummies(df, columns=categorical_columns)

Unnamed: 0,duration,src_bytes,dst_bytes,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_qclass,dns_qtype,...,service_-,service_dce_rpc,service_dhcp,service_dns,service_ftp,service_gssapi,service_http,service_smb,service_smb;gssapi,service_ssl
0,80549.530260,1762852,41933215,0,252181,14911156,2,236,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0.000000,0,0,0,1,63,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0.000000,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0.000000,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0.000549,0,298,0,0,0,2,354,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461038,0.000000,0,0,0,1,60,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
461039,0.000000,0,0,0,0,0,1,103,0,0,...,1,0,0,0,0,0,0,0,0,0
461040,290.371539,101568,2592,0,108,108064,31,3832,0,0,...,1,0,0,0,0,0,0,0,0,0
461041,23.190902,32,31,0,8,411,7,395,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
df[list(numerical_columns)]

Unnamed: 0,http_request_body_len,dns_qtype,dst_ip_bytes,src_pkts,http_response_body_len,src_bytes,dst_bytes,type,missed_bytes,src_ip_bytes,dns_rcode,http_status_code,dst_pkts,dns_qclass,duration,label
0,0,0,236,252181,0,1762852,41933215,normal,0,14911156,0,0,2,0,80549.530260,0
1,0,0,0,1,0,0,0,normal,0,63,0,0,0,0,0.000000,0
2,0,0,0,0,0,0,0,normal,0,0,0,0,0,0,0.000000,0
3,0,0,0,0,0,0,0,normal,0,0,0,0,0,0,0.000000,0
4,0,0,354,0,0,0,298,normal,0,0,0,0,2,0,0.000549,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461038,0,0,0,1,0,0,0,xss,0,60,0,0,0,0,0.000000,1
461039,0,0,103,0,0,0,0,ransomware,0,0,0,0,1,0,0.000000,1
461040,0,0,3832,108,0,101568,2592,backdoor,0,108064,0,0,31,0,290.371539,1
461041,0,0,395,8,0,32,31,mitm,0,411,0,0,7,0,23.190902,1


In [14]:
pd.concat(
    [
        pd.get_dummies(df, columns=categorical_columns),
        df[list(numerical_columns)]
    ],
    axis=1
)

Unnamed: 0,duration,src_bytes,dst_bytes,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_qclass,dns_qtype,...,dst_bytes.1,type,missed_bytes.1,src_ip_bytes.1,dns_rcode,http_status_code,dst_pkts.1,dns_qclass.1,duration.1,label
0,80549.530260,1762852,41933215,0,252181,14911156,2,236,0,0,...,41933215,normal,0,14911156,0,0,2,0,80549.530260,0
1,0.000000,0,0,0,1,63,0,0,0,0,...,0,normal,0,63,0,0,0,0,0.000000,0
2,0.000000,0,0,0,0,0,0,0,0,0,...,0,normal,0,0,0,0,0,0,0.000000,0
3,0.000000,0,0,0,0,0,0,0,0,0,...,0,normal,0,0,0,0,0,0,0.000000,0
4,0.000549,0,298,0,0,0,2,354,0,0,...,298,normal,0,0,0,0,2,0,0.000549,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461038,0.000000,0,0,0,1,60,0,0,0,0,...,0,xss,0,60,0,0,0,0,0.000000,1
461039,0.000000,0,0,0,0,0,1,103,0,0,...,0,ransomware,0,0,0,0,1,0,0.000000,1
461040,290.371539,101568,2592,0,108,108064,31,3832,0,0,...,2592,backdoor,0,108064,0,0,31,0,290.371539,1
461041,23.190902,32,31,0,8,411,7,395,0,0,...,31,mitm,0,411,0,0,7,0,23.190902,1
