In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import MinMaxScaler

In [48]:
df = pd.read_csv('UNSW_NB15_training-set.csv')
df.set_index('id', drop=True, inplace=True)

In [49]:
df.head(10)

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0
6,3e-06,udp,-,INT,2,0,784,0,333333.3215,254,...,1,2,0,0,0,2,2,0,Normal,0
7,6e-06,udp,-,INT,2,0,1960,0,166666.6608,254,...,1,2,0,0,0,2,2,0,Normal,0
8,2.8e-05,udp,-,INT,2,0,1384,0,35714.28522,254,...,1,3,0,0,0,1,3,0,Normal,0
9,0.0,arp,-,INT,1,0,46,0,0.0,0,...,2,2,0,0,0,2,2,1,Normal,0
10,0.0,arp,-,INT,1,0,46,0,0.0,0,...,2,2,0,0,0,2,2,1,Normal,0


In [50]:
str_dtypes = tuple(filter(lambda t: t[1] not in (np.dtype('int64'), np.dtype('float64')), df.dtypes.to_dict().items()))

In [51]:
str_dtypes

(('proto', dtype('O')),
 ('service', dtype('O')),
 ('state', dtype('O')),
 ('attack_cat', dtype('O')))

In [52]:
str_cols = dict(str_dtypes).keys()

In [53]:
df.drop(str_cols, axis=1, inplace=True)

In [54]:
X = MinMaxScaler().fit_transform(df.iloc[:, :-1])
y = df.iloc[:, -1]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

In [56]:
model = LinearSVC(random_state=1, C=0.11, max_iter=1000, dual=False)

In [57]:
model.fit(X_train, y_train)

LinearSVC(C=0.11, dual=False, random_state=1)

In [58]:
round(model.coef_[0, 4], 2)

-1.14

In [59]:
y_pred = model.predict(X_test)

In [62]:
confusion_matrix(y_test, y_pred)

array([[ 8868,  2125],
       [ 1473, 12234]], dtype=int64)

In [63]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.858     0.807     0.831     10993
           1      0.852     0.893     0.872     13707

    accuracy                          0.854     24700
   macro avg      0.855     0.850     0.852     24700
weighted avg      0.854     0.854     0.854     24700



In [65]:
f1_score(y_test, y_pred, average='macro')

0.8515756004587374

In [75]:
X[14001, :]

array([1.66666697e-07, 9.39408173e-05, 0.00000000e+00, 6.26926493e-06,
       0.00000000e+00, 1.00000002e-01, 9.96078431e-01, 0.00000000e+00,
       8.65603603e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.66638916e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.22972973e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.80645161e-01,
       3.33333333e-01, 6.20689655e-01, 6.20689655e-01, 3.78378378e-01,
       5.80645161e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       6.10169492e-01, 5.90163934e-01, 0.00000000e+00])

In [76]:
model.predict([_])

array([1], dtype=int64)

In [77]:
model.predict([X[18512]])

array([1], dtype=int64)