In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Dataset directory. File names are appended to it directly, so the trailing
# slash is required.
path = '/Users/tristangarcia/desktop/Network Traffic Classification/data/'

# Column names are taken from the 'Name' column of the feature-description
# file. Each name is lower-cased and trimmed at both ends only, so a space
# inside a name (e.g. 'ct_src_ ltm') is kept.
df_headers = pd.read_csv(f'{path}NUSW-NB15_features.csv', encoding='ISO-8859-1')
headers = df_headers['Name'].map(lambda name: name.lower().strip()).tolist()

# Loading data
# The string literal below is never executed; it holds a disabled loader that
# would concatenate all four CSV parts. Only part 4 is actually read.
'''dfs = []
for i in range(4):
    df = pd.read_csv(f'{path}UNSW-NB15_{i+1}.csv', names = headers)
    dfs.append(df)
data = pd.concat(dfs, ignore_index = True)'''

data = pd.read_csv(f'{path}UNSW-NB15_4.csv', names=headers)

In [3]:
# Print the (rows, columns) shape, then display the first rows as the cell output
print(data.shape)
data.head()

(440044, 49)


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,,2,2,7,4,1,1,3,,0
1,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
2,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
3,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
4,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0


In [4]:
# Remove the source/destination IP and port columns from the frame
data = data.drop(columns=['srcip', 'sport', 'dstip', 'dsport'])

# Missing Entries

In [5]:
# Number of null entries in each column
data.isnull().sum()

proto                    0
state                    0
dur                      0
sbytes                   0
dbytes                   0
sttl                     0
dttl                     0
sloss                    0
dloss                    0
service                  0
sload                    0
dload                    0
spkts                    0
dpkts                    0
swin                     0
dwin                     0
stcpb                    0
dtcpb                    0
smeansz                  0
dmeansz                  0
trans_depth              0
res_bdy_len              0
sjit                     0
djit                     0
stime                    0
ltime                    0
sintpkt                  0
dintpkt                  0
tcprtt                   0
synack                   0
ackdat                   0
is_sm_ips_ports          0
ct_state_ttl             0
ct_flw_http_mthd    406484
is_ftp_login        433412
ct_ftp_cmd               0
ct_srv_src               0
c

In [6]:
# Frequency of each raw attack_cat label (null entries are not counted).
# Some labels in the output carry leading spaces, e.g. ' Fuzzers'.
data['attack_cat'].value_counts()

attack_cat
Generic             61878
Exploits            11439
 Fuzzers             5390
DoS                  4907
 Reconnaissance      3530
Analysis              670
Backdoor              666
 Shellcode            371
Worms                  43
Name: count, dtype: int64

In [7]:
# Rows without an attack category are labelled 'normal'. Every label is then
# lower-cased and trimmed, and the 'backdoors' spelling is folded into
# 'backdoor' (whole-value match only).
data['attack_cat'] = (
    data['attack_cat']
    .fillna('normal')
    .str.lower()
    .str.strip()
    .replace('backdoors', 'backdoor')
)

In [8]:
# Frequency of each non-null ct_flw_http_mthd value
data['ct_flw_http_mthd'].value_counts()

ct_flw_http_mthd
1.0    32314
4.0     1084
2.0      102
9.0       54
6.0        6
Name: count, dtype: int64

In [9]:
# Null ct_flw_http_mthd entries are replaced by 0; other columns are untouched
data = data.fillna({'ct_flw_http_mthd': 0})

In [10]:
# Frequency of each non-null is_ftp_login value
data['is_ftp_login'].value_counts()

is_ftp_login
1.0    6578
4.0      40
2.0      14
Name: count, dtype: int64

In [11]:
# Null entries become 0 and the column is converted to int.
# is_ftp_login can only take values of [0,1], so every value outside that set
# is reset to 0 -- not just the 2 and 4 observed in this file. This keeps the
# column binary if other values appear when more CSV parts are loaded.
data['is_ftp_login'] = (
    data['is_ftp_login']
    .fillna(0)
    .astype(int)
    .where(lambda flags: flags.isin([0, 1]), 0)
)

In [12]:
# Frequency of each service value; note that '-' appears as a value in the output
data['service'].value_counts()

service
dns         183525
-           178204
http         33317
ftp-data     18472
smtp         12224
ftp           6957
ssh           6865
pop3           363
dhcp            45
ssl             30
snmp            26
radius           9
irc              7
Name: count, dtype: int64

In [13]:
# The '-' placeholder is renamed to 'unknown'. Only values that are exactly
# '-' are affected, so names containing a dash (e.g. 'ftp-data') are kept.
data['service'] = data['service'].replace('-', 'unknown')

# Data Types

In [14]:
# Number of columns per dtype
data.dtypes.value_counts()

int64      29
float64    11
object      5
Name: count, dtype: int64

In [15]:
# Categorical variables: names of the columns whose dtype is not numeric
data.select_dtypes(exclude=np.number).columns

Index(['proto', 'state', 'service', 'ct_ftp_cmd', 'attack_cat'], dtype='object')

In [16]:
# Research paper states that this is a numerical feature, yet the column was
# listed among the non-numeric ones, so inspect its raw values
data['ct_ftp_cmd'].value_counts()

ct_ftp_cmd
     433412
1      6578
4        40
2        14
Name: count, dtype: int64

In [17]:
# Entries that are exactly a single space become 0; the whole column is then
# converted to int
data['ct_ftp_cmd'] = (
    data['ct_ftp_cmd']
    .map(lambda cmd: 0 if cmd == ' ' else cmd)
    .astype(int)
)

In [18]:
# Numerical: names of the columns with a numeric dtype
data.select_dtypes(include=np.number).columns

Index(['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'sload',
       'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object')

In [19]:
# Binarized categorical variables: print the value frequencies of each flag
# column, one table after another
print(
    *(data[flag].value_counts() for flag in ('is_sm_ips_ports', 'is_ftp_login', 'label')),
    sep='\n',
)

is_sm_ips_ports
0    439610
1       434
Name: count, dtype: int64
is_ftp_login
0    433466
1      6578
Name: count, dtype: int64
label
0    351150
1     88894
Name: count, dtype: int64


# Train Test split

In [20]:
# 70/30 split, stratified on attack_cat so both parts keep the same category
# proportions; the fixed random_state makes the split repeatable
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=123,
    stratify=data['attack_cat'],
)

# Outputting Cleaned Data

In [21]:
# Persist both splits as pickle files inside the dataset directory
pd.to_pickle(train, f'{path}train.pkl')
pd.to_pickle(test, f'{path}test.pkl')