## Loading, Shaping and Viewing the Training Data

In [1]:
import pandas as pd

train = pd.read_csv('UNSW_NB15_training-set.csv')

train

train.drop(['id', 'attack_cat'], axis = 1, inplace= True)

print(train.columns.values)
train

X = train.drop('label', axis = 1)
y = train['label']
X = pd.get_dummies(X, columns = ['proto', 'service', 'state'], drop_first = True)
X.info
display(X.dtypes)
X.iloc[:,:39]
num_features = X.iloc[:,:39].columns.values
display(num_features)
cat_features = X.iloc[:,39:].columns.values
display(cat_features)

['dur' 'proto' 'service' 'state' 'spkts' 'dpkts' 'sbytes' 'dbytes' 'rate'
 'sttl' 'dttl' 'sload' 'dload' 'sloss' 'dloss' 'sinpkt' 'dinpkt' 'sjit'
 'djit' 'swin' 'stcpb' 'dtcpb' 'dwin' 'tcprtt' 'synack' 'ackdat' 'smean'
 'dmean' 'trans_depth' 'response_body_len' 'ct_srv_src' 'ct_state_ttl'
 'ct_dst_ltm' 'ct_src_dport_ltm' 'ct_dst_sport_ltm' 'ct_dst_src_ltm'
 'is_ftp_login' 'ct_ftp_cmd' 'ct_flw_http_mthd' 'ct_src_ltm' 'ct_srv_dst'
 'is_sm_ips_ports' 'label']


dur          float64
spkts          int64
dpkts          int64
sbytes         int64
dbytes         int64
              ...   
state_PAR      uint8
state_REQ      uint8
state_RST      uint8
state_URN      uint8
state_no       uint8
Length: 191, dtype: object

array(['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt',
       'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

array(['proto_a/n', 'proto_aes-sp3-d', 'proto_any', 'proto_argus',
       'proto_aris', 'proto_arp', 'proto_ax.25', 'proto_bbn-rcc',
       'proto_bna', 'proto_br-sat-mon', 'proto_cbt', 'proto_cftp',
       'proto_chaos', 'proto_compaq-peer', 'proto_cphb', 'proto_cpnx',
       'proto_crtp', 'proto_crudp', 'proto_dcn', 'proto_ddp', 'proto_ddx',
       'proto_dgp', 'proto_egp', 'proto_eigrp', 'proto_emcon',
       'proto_encap', 'proto_etherip', 'proto_fc', 'proto_fire',
       'proto_ggp', 'proto_gmtp', 'proto_gre', 'proto_hmp',
       'proto_i-nlsp', 'proto_iatp', 'proto_ib', 'proto_icmp',
       'proto_idpr', 'proto_idpr-cmtp', 'proto_idrp', 'proto_ifmp',
       'proto_igmp', 'proto_igp', 'proto_il', 'proto_ip', 'proto_ipcomp',
       'proto_ipcv', 'proto_ipip', 'proto_iplt', 'proto_ipnip',
       'proto_ippc', 'proto_ipv6', 'proto_ipv6-frag', 'proto_ipv6-no',
       'proto_ipv6-opts', 'proto_ipv6-route', 'proto_ipx-n-ip',
       'proto_irtp', 'proto_isis', 'proto_iso-ip', 'proto_iso-

In [2]:
train.isnull().sum()

dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat           0
label      

## Setting up and Training the Pipeline (Kitchen Sink Version)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.model_selection import train_test_split, GridSearchCV

#numeric_features = ["dur", "sbytes", "dbytes", "sttl", "dttl", "sloss", "dloss", "sload", "dload", "spkts", "dpkts", "swin", "dwin", "stcpb", "dtcpb", "trans_depth", "sjit", "djit", "tcprtt", "synack", "ackdat", "ct_state_ttl", "ct_flw_http_mthd", "ct_ftp_cmd", "ct_srv_src", "ct_srv_dst", "ct_dst_ltm", "ct_src_dport_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm"]
numeric_features = num_features
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

#categorical_features = ["proto", "state", "service"]
categorical_features = cat_features
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

lr_pl = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter = 2000))] #incresed max_iter, default 100 wasn't enough
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lr_pl.fit(X_train, y_train)
print("Logistic Regression model score: %.3f" % lr_pl.score(X_test, y_test))

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
svc_pl = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC(max_iter = 1000))] #increased max_iter, default 1000 wasn't enough, neither was 2000, nor 4000, nor 10000, and that's as far as I'm going
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

svc_pl.fit(X_train, y_train)
print("SVC model score: %.3f" % svc_pl.score(X_test, y_test))

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
dtc_pl = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DTC())] 
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

dtc_pl.fit(X_train, y_train)
print("Decision Tree model score: %.3f" % dtc_pl.score(X_test, y_test))

Logistic Regression model score: 0.935




SVC model score: 0.935
Decision Tree model score: 0.948


## Feature Selection

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X.shape
X_new = SelectKBest(chi2, k=100).fit(X, y)
#X_new.shape
names = X_new.get_support(1)
display(names)
X_new = X[X.columns[names]]
#X_new.get_feature_names_out()
#display(X_new)
#X_new = pd.DataFrame(X_new)
X_new
print(X_new.columns.values)
num_features = X_new.iloc[:,:27].columns.values
cat_features = X_new.iloc[:,27:].columns.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  35,  36,  37,  38,  41,  44,
        70,  74,  90, 109, 112, 114, 116, 117, 118, 119, 120, 121, 123,
       125, 126, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
       140, 141, 142, 143, 144, 145, 147, 148, 149, 150, 151, 152, 153,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 169, 170,
       172, 174, 175, 177, 181, 184, 185, 187, 188], dtype=int64)

['dur' 'spkts' 'dpkts' 'sbytes' 'dbytes' 'rate' 'sttl' 'dttl' 'sload'
 'dload' 'sloss' 'dloss' 'sinpkt' 'dinpkt' 'sjit' 'djit' 'swin' 'stcpb'
 'dtcpb' 'dwin' 'tcprtt' 'synack' 'ackdat' 'smean' 'dmean' 'trans_depth'
 'response_body_len' 'ct_srv_src' 'ct_state_ttl' 'ct_dst_ltm'
 'ct_src_dport_ltm' 'ct_dst_sport_ltm' 'ct_dst_src_ltm' 'ct_flw_http_mthd'
 'ct_src_ltm' 'ct_srv_dst' 'is_sm_ips_ports' 'proto_any' 'proto_arp'
 'proto_gre' 'proto_ib' 'proto_ipv6' 'proto_mobile' 'proto_narp'
 'proto_nsfnet-igp' 'proto_ospf' 'proto_pgm' 'proto_pim' 'proto_pipe'
 'proto_pnni' 'proto_pri-enc' 'proto_ptp' 'proto_pvp' 'proto_qnx'
 'proto_rsvp' 'proto_rvd' 'proto_sat-expak' 'proto_sat-mon'
 'proto_sccopmce' 'proto_scps' 'proto_sctp' 'proto_sdrp'
 'proto_secure-vmtp' 'proto_sep' 'proto_skip' 'proto_sm' 'proto_smp'
 'proto_snp' 'proto_sprite-rpc' 'proto_sps' 'proto_srp' 'proto_stp'
 'proto_sun-nd' 'proto_swipe' 'proto_tcf' 'proto_tcp' 'proto_tlsp'
 'proto_tp++' 'proto_ttp' 'proto_udp' 'proto_unas' 'proto

## Model Training with Feature Selection

In [11]:
#numeric_features = ["dur", "sbytes", "dbytes", "sttl", "dttl", "sloss", "dloss", "sload", "dload", "spkts", "dpkts", "swin", "dwin", "stcpb", "dtcpb", "trans_depth", "sjit", "djit", "tcprtt", "synack", "ackdat", "ct_state_ttl", "ct_flw_http_mthd", "ct_ftp_cmd", "ct_srv_src", "ct_srv_dst", "ct_dst_ltm", "ct_src_dport_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm"]
numeric_features = num_features
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

#categorical_features = ["proto", "state", "service"]
categorical_features = cat_features
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter = 2000))] #incresed max_iter, default 100 wasn't enough
)

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.941
