In [139]:
import pandas as pd
import numpy as np
import glob
import os

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from fast_ml.model_development import train_valid_test_split
from skimpy import clean_columns

DATA_DIR  = os.path.join(os.path.abspath("."), "data")

In [140]:
data_path='/home/wendyunji/MLAC/data/unsw-data/raw'

filenames = glob.glob(os.path.join(data_path, 'set', '*.csv'))
datasets = [pd.read_csv(filename) for filename in filenames]

# Concatenate the datasets
data = pd.concat(datasets, axis=0, ignore_index=True)
data = clean_columns(data)

In [141]:
len(data)

print(len(data.columns))
raw_col = data.columns.to_list()
print(raw_col)

data['attack_cat'].value_counts()

45
['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']


Normal            93000
Generic           58871
Exploits          44525
Fuzzers           24246
DoS               16353
Reconnaissance    13987
Analysis           2677
Backdoor           2329
Shellcode          1511
Worms               174
Name: attack_cat, dtype: int64

In [142]:
#
data.drop_duplicates(inplace=True, keep=False, ignore_index=True)
#
data.dropna(axis=0, inplace=True, how="any")
#
data.replace([-np.inf, np.inf], np.nan, inplace=True)
data.dropna(axis=0, how='any', inplace=True)
#
data_std =data.std(numeric_only=True)
threshold=0.01
constant_features = [column for column, std in data_std.iteritems() if std < threshold]
data.drop(labels=constant_features, axis=1, inplace=True)
#
threshold=0.98
data_corr = data.corr()
mask = np.triu(np.ones_like(data_corr, dtype=bool))
tri_df = data_corr.mask(mask)
correlated_features = [c for c in tri_df.columns if any(tri_df[c] > threshold)]
data.drop(labels=correlated_features, axis=1, inplace=True)

data.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,rate,sttl,dttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,74.08749,252,254,...,1,1,1,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,78.473372,62,252,...,1,1,2,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,14.170161,62,252,...,1,1,3,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,13.677108,62,252,...,1,1,3,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,33.373826,254,252,...,2,1,40,0,0,2,39,0,Normal,0


In [143]:
len(data)

print(len(data.columns))
pro_col = data.columns.to_list()
print(pro_col)

print(list(set(raw_col) -set(pro_col)))

data['attack_cat'].value_counts()


41
['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']
['sbytes', 'swin', 'is_ftp_login', 'dbytes']


Normal            93000
Generic           58871
Exploits          44525
Fuzzers           24246
DoS               16353
Reconnaissance    13987
Analysis           2677
Backdoor           2329
Shellcode          1511
Worms               174
Name: attack_cat, dtype: int64

In [144]:
print(data)

           id       dur proto service state  spkts  dpkts           rate  \
0           1  0.121478   tcp       -   FIN      6      4      74.087490   
1           2  0.649902   tcp       -   FIN     14     38      78.473372   
2           3  1.623129   tcp       -   FIN      8     16      14.170161   
3           4  1.681642   tcp     ftp   FIN     12     12      13.677108   
4           5  0.449454   tcp       -   FIN     10      6      33.373826   
...       ...       ...   ...     ...   ...    ...    ...            ...   
257668  82328  0.000005   udp       -   INT      2      0  200000.005100   
257669  82329  1.106101   tcp       -   FIN     20      8      24.410067   
257670  82330  0.000000   arp       -   INT      1      0       0.000000   
257671  82331  0.000000   arp       -   INT      1      0       0.000000   
257672  82332  0.000009   udp       -   INT      2      0  111111.107200   

        sttl  dttl  ...  ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm  \
0       

### Scale

In [145]:
x_data = data.drop(['label','attack_cat'], axis=1)

print(len(x_data.columns))
categorical_features = x_data.select_dtypes(exclude=["number"]).columns
print(len(categorical_features))
print(categorical_features)
categorical_features

numeric_features = x_data.select_dtypes(exclude=[object]).columns
print(len(numeric_features))
print(numeric_features)

39
3
Index(['proto', 'service', 'state'], dtype='object')
36
Index(['id', 'dur', 'spkts', 'dpkts', 'rate', 'sttl', 'dttl', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb',
       'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd',
       'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')


In [146]:
y_data = data['attack_cat']
numeric_data = x_data.drop(categorical_features, axis=1)
print(numeric_data.columns)
numeric_data = pd.DataFrame(QuantileTransformer().fit_transform(numeric_data), columns=numeric_features)

Index(['id', 'dur', 'spkts', 'dpkts', 'rate', 'sttl', 'dttl', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb',
       'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd',
       'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')


In [147]:
categorical_data = x_data.drop(numeric_features, axis=1)
print(categorical_data.columns)
no_x_data = pd.concat([numeric_data, categorical_data],axis=1)

Index(['proto', 'service', 'state'], dtype='object')


In [148]:
categorical_data = pd.get_dummies(categorical_data)

In [149]:
categorical_data.columns

Index(['proto_3pc', 'proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus',
       'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc', 'proto_bna',
       ...
       'state_CLO', 'state_CON', 'state_ECO', 'state_FIN', 'state_INT',
       'state_PAR', 'state_REQ', 'state_RST', 'state_URN', 'state_no'],
      dtype='object', length=157)

In [150]:
le = LabelEncoder()
y_data = pd.DataFrame(le.fit_transform(y_data), columns=["attack_cat"])
y_data.value_counts()

attack_cat
6             93000
5             58871
3             44525
4             24246
2             16353
7             13987
0              2677
1              2329
8              1511
9               174
dtype: int64

In [151]:
y_data.head()

Unnamed: 0,attack_cat
0,6
1,6
2,6
3,6
4,6


In [152]:
x_data = pd.concat([numeric_data, categorical_data],axis=1)
print(x_data.columns)

Index(['id', 'dur', 'spkts', 'dpkts', 'rate', 'sttl', 'dttl', 'sload', 'dload',
       'sloss',
       ...
       'state_CLO', 'state_CON', 'state_ECO', 'state_FIN', 'state_INT',
       'state_PAR', 'state_REQ', 'state_RST', 'state_URN', 'state_no'],
      dtype='object', length=193)


In [153]:
print(len(numeric_data.columns))
print(len(categorical_data.columns))

36
157


In [154]:
x_data.to_csv('/home/wendyunji/MLAC/data/unsw-data/new/unsw_xdata.csv', index=False)
no_x_data.to_csv('/home/wendyunji/MLAC/data/unsw-data/new/unsw_noxdata.csv', index=False)
y_data.to_csv('/home/wendyunji/MLAC/data/unsw-data/new/unsw_ydata.csv', index=False)