In [1]:
# Load ARFF Dataset
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 
# Train set with binary labels.
# loadarff hands back a pair; only its first element (the records) is used.
train_binary_raw_data = loadarff('KDDTrain+.arff')
df_train_binary_labels = pd.DataFrame(train_binary_raw_data[0])
print(df_train_binary_labels.shape)

# Test set with binary labels, read the same way.
test_binary_raw_data = loadarff('KDDTest+.arff')
df_test_binary_labels = pd.DataFrame(test_binary_raw_data[0])
print(df_test_binary_labels.shape)

# Recode the target (b'normal' -> 0, every other value -> 1) and expose it
# under the column name "label" instead of "class".
df_train_binary_labels = (
    df_train_binary_labels
    .assign(**{"class": lambda frame: np.where(frame["class"].eq(b'normal'), 0, 1)})
    .rename(columns={"class": "label"})
)
df_test_binary_labels = (
    df_test_binary_labels
    .assign(**{"class": lambda frame: np.where(frame["class"].eq(b'normal'), 0, 1)})
    .rename(columns={"class": "label"})
)

# Quick visual sanity check: test rows first, then train rows.
print(df_test_binary_labels.head(5))
print(df_train_binary_labels.head(5))

(125973, 42)
(22544, 42)
   duration protocol_type      service     flag  src_bytes  dst_bytes  land  \
0       0.0        b'tcp'   b'private'   b'REJ'        0.0        0.0  b'0'   
1       0.0        b'tcp'   b'private'   b'REJ'        0.0        0.0  b'0'   
2       2.0        b'tcp'  b'ftp_data'    b'SF'    12983.0        0.0  b'0'   
3       0.0       b'icmp'     b'eco_i'    b'SF'       20.0        0.0  b'0'   
4       1.0        b'tcp'    b'telnet'  b'RSTO'        0.0       15.0  b'0'   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0             0.0     0.0  0.0  ...                10.0   
1             0.0     0.0  0.0  ...                 1.0   
2             0.0     0.0  0.0  ...                86.0   
3             0.0     0.0  0.0  ...                57.0   
4             0.0     0.0  0.0  ...                86.0   

  dst_host_same_srv_rate  dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                   0.04                    0.06                

In [2]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

pd.set_option('display.max_columns', None)

# Stack the training rows on top of the testing rows so that the one-hot
# encoding below produces one shared set of dummy columns for both.
df_binary_labels = pd.concat(
    [df_train_binary_labels, df_test_binary_labels],
    ignore_index=True,
)

# One-hot encode "protocol_type", "service" and "flag".
# The dummy columns are appended after the remaining columns, in the order
# the categorical columns are listed.
categorical_col = ["protocol_type", "service", "flag"]
encoded_parts = [df_binary_labels.drop(columns=categorical_col)]
for col in categorical_col:
    df_dummies = pd.get_dummies(df_binary_labels[col], prefix=col + "_")
    encoded_parts.append(df_dummies)
df_binary_labels = pd.concat(encoded_parts, axis=1)

# The flag-like columns hold byte strings: b'0' becomes 0, anything else 1.
byte_value_col = ["land", "logged_in", "is_host_login", "is_guest_login"]
for col in byte_value_col:
    df_binary_labels[col] = np.where(df_binary_labels[col].ne(b'0'), 1, 0)
print(df_binary_labels.shape)

# Separate views of the rows labelled 0 (normal) and 1 (anomaly).
is_normal = df_binary_labels["label"] == 0
is_anomaly = df_binary_labels["label"] == 1
df_binary_labels_normal = df_binary_labels[is_normal]
df_binary_labels_anomaly = df_binary_labels[is_anomaly]
print(df_binary_labels_normal.shape)
print(df_binary_labels_anomaly.shape)

(148517, 123)
(77054, 123)
(71463, 123)


In [5]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score,f1_score 

from sklearn.preprocessing import StandardScaler,Normalizer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif

from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.tree import DecisionTreeClassifier

def printResults(results):
    """Print the arithmetic mean of each test metric held in ``results``.

    Parameters
    ----------
    results : mapping
        Must contain the keys 'test_f1', 'test_accuracy', 'test_precision',
        'test_recall' and 'test_roc_auc', each mapping to a non-empty
        sequence of numbers (one entry per fold when it comes from
        ``cross_validate``).

    Returns
    -------
    None
        The five means are written to stdout, one per line.
    """
    # (caption, key) pairs in the order they are reported.
    report_rows = (
        ("f1_scores(mean): ", 'test_f1'),
        ("accuracy(mean): ", 'test_accuracy'),
        ("precision(mean): ", 'test_precision'),
        ("recall(mean): ", 'test_recall'),
        ("roc_auc(mean): ", 'test_roc_auc'),
    )
    for caption, key in report_rows:
        fold_scores = results[key]
        print(caption, sum(fold_scores) / len(fold_scores))

def generate_preprocess_pipeline(preprocess):
    """Build the preprocessing step selected by name.

    Parameters
    ----------
    preprocess : str
        "standardize" selects a ``StandardScaler``; "normalize" selects a
        ``Normalizer``.

    Returns
    -------
    StandardScaler, Normalizer or None
        A fresh, unfitted transformer. Any other name yields ``None``
        rather than an error, so a misspelled name is not reported here.
    """
    if preprocess == "standardize":
        return StandardScaler()
    if preprocess == "normalize":
        return Normalizer()
    # Unrecognised name: no transformer is produced.
    return None
    
def generate_feature_selection_pipeline(feature_selection, percentage):
    """Build the feature-selection / reduction step selected by name.

    Parameters
    ----------
    feature_selection : str
        "PCA" selects ``PCA``; "ANOVA" selects ``SelectPercentile`` scored
        with ``f_classif``.
    percentage : float
        For "PCA" it is passed unchanged as ``n_components``; for "ANOVA" it
        is multiplied by 100 and passed as ``percentile``.
        NOTE(review): per the scikit-learn PCA documentation a float
        ``n_components`` between 0 and 1 is a target explained-variance
        ratio, not a share of the columns, so the two methods interpret
        ``percentage`` differently -- confirm this is intended.

    Returns
    -------
    PCA, SelectPercentile or None
        A fresh, unfitted transformer; ``None`` for any other name.
    """
    if feature_selection == "PCA":
        return PCA(n_components=percentage)
    if feature_selection == "ANOVA":
        percentile = percentage * 100
        return SelectPercentile(f_classif, percentile=percentile)
    # Unrecognised name: no transformer is produced.
    return None
                                
def DT(X,y):
    """Cross-validate decision-tree pipelines over a fixed grid and print the scores.

    Every combination of preprocessing method, feature-selection method,
    selection percentage and tree depth listed below is wrapped in a
    three-step ``Pipeline`` and evaluated with ``cross_validate`` (``cv=5``).
    For each combination the settings and the mean test metrics are printed.

    Parameters
    ----------
    X : feature data passed straight to ``cross_validate``.
    y : target data passed straight to ``cross_validate``.

    Returns
    -------
    None
        Results are only printed, not returned.
    """
    preprocess_methods = ["standardize"]
    feature_selection_methods = ["PCA", "ANOVA"]
    percentage_selections = [0.7, 0.8, 0.9]
    DT_depth_selections = [10, 20]

    scoring_metrics = ('f1', 'accuracy', 'precision', 'recall', 'roc_auc')

    for scaler_name in preprocess_methods:
        scaler_step = generate_preprocess_pipeline(scaler_name)
        for selector_name in feature_selection_methods:
            for fraction in percentage_selections:
                selector_step = generate_feature_selection_pipeline(selector_name, fraction)
                for tree_depth in DT_depth_selections:
                    # Fixed random_state keeps the tree construction repeatable.
                    tree = DecisionTreeClassifier(max_depth=tree_depth, random_state=0)
                    candidate = Pipeline([
                        ('preprocess', scaler_step),
                        ('feature_selection', selector_step),
                        ('estimator', tree),
                    ])
                    cv_scores = cross_validate(
                        candidate, X, y, cv=5,
                        scoring=scoring_metrics,
                        return_estimator=True,
                    )
                    print("preprocess: ", scaler_name)
                    print("feature_selection: ", selector_name)
                    print("percentage_selection: ", fraction)
                    print("depth: ", tree_depth)
                    printResults(cv_scores)
                    print("-------------------------------------------------------------------------------------")
                
from sklearn.linear_model import LogisticRegression
def LG(X, y, random_state=0):
    """Cross-validate logistic-regression pipelines over a fixed grid and print the scores.

    Every combination of preprocessing method, feature-selection method,
    selection percentage and solver listed below is wrapped in a three-step
    ``Pipeline`` and evaluated with ``cross_validate`` (``cv=5``). For each
    combination the settings and the mean test metrics are printed.

    Parameters
    ----------
    X : feature data passed straight to ``cross_validate``.
    y : target data passed straight to ``cross_validate``.
    random_state : int or None, default 0
        Seed forwarded to ``LogisticRegression``. scikit-learn documents it
        as used by the 'sag', 'saga' and 'liblinear' solvers to shuffle the
        data; without it the 'sag' runs below differ from one execution to
        the next. The default matches the seed used for the decision tree.

    Returns
    -------
    None
        Results are only printed, not returned.
    """
    preprocess_methods = ["standardize"]

    feature_selection_methods = ["PCA", "ANOVA"]
    percentage_selections = [0.7, 0.8, 0.9]

    LG_solvers = ['newton-cg', 'lbfgs', 'sag']

    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(
                    feature_selection_method, percentage_selection)
                for LG_solver in LG_solvers:
                    complete_pipline = Pipeline([
                        ('preprocess', preprocess_pipeline),
                        ('feature_selection', feature_selection_pipeline),
                        # Seeded so that the stochastic 'sag' solver is repeatable.
                        ('estimator', LogisticRegression(solver=LG_solver, max_iter=1000,
                                                         random_state=random_state)),
                    ])
                    results = cross_validate(
                        complete_pipline, X, y, cv=5,
                        scoring=('f1', 'accuracy', 'precision', 'recall', 'roc_auc'),
                        return_estimator=True)
                    print("preprocess: ", preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("solver: ", LG_solver)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                    
from sklearn.svm import SVC
def SVM(X,y):
    """Cross-validate SVC pipelines over a fixed grid and print the scores.

    Every combination of preprocessing method, feature-selection method,
    selection percentage, ``C`` value and kernel listed below is wrapped in
    a three-step ``Pipeline`` and evaluated with ``cross_validate``
    (``cv=5``). For each combination the settings and the mean test metrics
    are printed.

    Parameters
    ----------
    X : feature data passed straight to ``cross_validate``.
    y : target data passed straight to ``cross_validate``.

    Returns
    -------
    None
        Results are only printed, not returned.
    """
    preprocess_methods = ["standardize"]
    feature_selection_methods = ["PCA", "ANOVA"]
    percentage_selections = [0.7, 0.8, 0.9]
    C_selections = [0.5, 1, 10, 100, 1000]
    kernel_selections = ['poly', 'rbf', 'sigmoid']

    scoring_metrics = ('f1', 'accuracy', 'precision', 'recall', 'roc_auc')

    for scaler_name in preprocess_methods:
        scaler_step = generate_preprocess_pipeline(scaler_name)
        for selector_name in feature_selection_methods:
            for fraction in percentage_selections:
                selector_step = generate_feature_selection_pipeline(selector_name, fraction)
                for penalty in C_selections:
                    for kernel_name in kernel_selections:
                        classifier = SVC(C=penalty, kernel=kernel_name)
                        candidate = Pipeline([
                            ('preprocess', scaler_step),
                            ('feature_selection', selector_step),
                            ('estimator', classifier),
                        ])
                        cv_scores = cross_validate(
                            candidate, X, y, cv=5,
                            scoring=scoring_metrics,
                            return_estimator=True,
                        )
                        print("preprocess: ", scaler_name)
                        print("feature_selection: ", selector_name)
                        print("percentage_selection: ", fraction)
                        print("C_selection: ", penalty)
                        print("kernel_selection: ", kernel_name)
                        printResults(cv_scores)
                        print("-------------------------------------------------------------------------------------")

## Split the dataset into a training set and a testing set
from sklearn.model_selection import train_test_split
def splitDataset(df):
    """Separate features from the 'label' column and make a 70/30 train/test split.

    Parameters
    ----------
    df : DataFrame containing a 'label' column; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.3`` and ``random_state=6``.

    The shapes of the feature and label data, then for each split the
    feature shape and the sum of its labels, are printed along the way.
    """
    features = df.drop(columns=['label'])
    target = df['label']
    for part in (features, target):
        print(part.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=6)

    # Report each split: feature shape first, then the sum of its labels.
    for split_features, split_target in ((X_train, y_train), (X_test, y_test)):
        print(split_features.shape)
        print(sum(split_target))
    return X_train, X_test, y_train, y_test

In [6]:
# Split the combined, encoded frame into train and test parts; splitDataset
# prints the shapes and label sums shown below.
X_train, X_test, y_train, y_test = splitDataset(df_binary_labels)

(148517, 122)
(148517,)
(103961, 122)
50007
(44556, 122)
21456


In [7]:
# Decision-tree grid, cross-validated on the training split only.
DT(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
depth:  10
f1_scores(mean):  0.9875225085160194
accuracy(mean):  0.9880147408245001
precision(mean):  0.9889363900802731
recall(mean):  0.986122019671658
roc_auc(mean):  0.9928450716115609
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
depth:  20
f1_scores(mean):  0.989581186044212
accuracy(mean):  0.9899770141218871
precision(mean):  0.9896028895509523
recall(mean):  0.9895614837636412
roc_auc(mean):  0.9889514670016046
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
depth:  10
f1_scores(mean):  0.987557993014428
accuracy(mean):  0.9880436091799696
precision(mean):  0.9886617401864928
recall(mean):  0.9864619356900542
roc_auc(mean):  0.9921387885490718
---------------------------------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  10
f1_scores(mean):  0.983587456538029
accuracy(mean):  0.9841671367263751
precision(mean):  0.980870839591757
recall(mean):  0.9863218677132487
roc_auc(mean):  0.9956277327962333
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  20
f1_scores(mean):  0.9894289955018397
accuracy(mean):  0.9898327204561529
precision(mean):  0.9896567813474914
recall(mean):  0.9892015077636408
roc_auc(mean):  0.992011757074945
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
depth:  10
f1_scores(mean):  0.9843188778736293
accuracy(mean):  0.9848597038541491
precision(mean):  0.9807842041001571
recall(mean):  0.9878816457486425
roc_auc(mean):  0.9955626831981771
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
depth:  20
f1_scores(mean):  0.9893436676326661
accuracy(mean):  0.9897461560995708
precision(mean):  0.9891461580239224
recall(mean):  0.9895414637700398
roc_auc(mean):  0.9919952060830631
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
depth:  10
f1_scores(mean):  0.9892310771309388
accuracy(mean):  0.9896307326397527
precision(mean):  0.9883429470231769
recall(mean):  0.990121359789636
roc_auc(mean):  0.995241906434444
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
depth:  20
f1_scores(mean):  0.9910398610254004
accuracy(mean):  0.9913813870085729
precision(mean):  0.9911603696616945
recall(mean):  0.9909212877952356
roc_auc(mean):  0.9932609132108035
-------------------------------------------------------------------------------------


In [8]:
# Logistic-regression grid, cross-validated on the training split only.
LG(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
solver:  newton-cg
f1_scores(mean):  0.945401150015597
accuracy(mean):  0.9482017382651658
precision(mean):  0.9588667953613484
recall(mean):  0.9323094485192552
roc_auc(mean):  0.9841907949538365
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
solver:  lbfgs
f1_scores(mean):  0.9453915602431836
accuracy(mean):  0.9481921191809027
precision(mean):  0.9588471138292464
recall(mean):  0.9323094485192552
roc_auc(mean):  0.9841908375772699
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
solver:  sag
f1_scores(mean):  0.9453830599433222
accuracy(mean):  0.9481825005592512
precision(mean):  0.9588084194964104
recall(mean):  0.9323294445200551
roc_auc(mean):  0.9841929335336183
-----------------



preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
solver:  sag
f1_scores(mean):  0.9459722804719235
accuracy(mean):  0.9486923101747531
precision(mean):  0.9584576404554692
recall(mean):  0.9338092525480507
roc_auc(mean):  0.9844660328253031
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
solver:  newton-cg
f1_scores(mean):  0.9464741496294368
accuracy(mean):  0.9491540211306575
precision(mean):  0.958688256721975
recall(mean):  0.9345691445652475
roc_auc(mean):  0.9847253667151069
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
solver:  lbfgs
f1_scores(mean):  0.9464645571076181
accuracy(mean):  0.9491444020463942
precision(mean):  0.9586685960933481
recall(mean):  0.9345691445652475
roc_auc(mean):  0.9847255464799526
-----------------



preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
solver:  sag
f1_scores(mean):  0.9466402543286204
accuracy(mean):  0.9493271651100063
precision(mean):  0.9591546486378579
recall(mean):  0.9344491645616483
roc_auc(mean):  0.9846146920729488
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
solver:  newton-cg
f1_scores(mean):  0.9477476401028365
accuracy(mean):  0.9503659938276131
precision(mean):  0.9600166412773812
recall(mean):  0.9357889545978416
roc_auc(mean):  0.9903092019083495
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
solver:  lbfgs
f1_scores(mean):  0.9477582951843857
accuracy(mean):  0.9503756129118763
precision(mean):  0.9600174657442601
recall(mean):  0.9358089525980416
roc_auc(mean):  0.9903089091041345
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
solver:  sag
f1_scores(mean):  0.9477476401028365
accuracy(mean):  0.9503659938276131
precision(mean):  0.9600166412773812
recall(mean):  0.9357889545978416
roc_auc(mean):  0.9903028919028631
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
solver:  newton-cg
f1_scores(mean):  0.9520522834772065
accuracy(mean):  0.9544155649246135
precision(mean):  0.9635274955822133
recall(mean):  0.9408482347126217
roc_auc(mean):  0.9905323105194783
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
solver:  lbfgs
f1_scores(mean):  0.9520522834772065
accuracy(mean):  0.9544155649246135
precision(mean):  0.9635274955822133
recall(mean):  0.9408482347126217
roc_auc(mean):  0.9905323846467488
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
solver:  sag
f1_scores(mean):  0.9520504488069864
accuracy(mean):  0.9544155639993903
precision(mean):  0.9635657777181172
recall(mean):  0.9408082387122217
roc_auc(mean):  0.9905257630917707
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
solver:  newton-cg
f1_scores(mean):  0.9543947168677456
accuracy(mean):  0.9566279667956594
precision(mean):  0.9655583304838788
recall(mean):  0.9434879347498164
roc_auc(mean):  0.9914008817721791
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
solver:  lbfgs
f1_scores(mean):  0.9544043734352539
accuracy(mean):  0.9566375858799226
precision(mean):  0.9655781388649185
recall(mean):  0.9434879347498164
roc_auc(mean):  0.991400292467779
-------------------------------------------------------------------------------------


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
solver:  sag
f1_scores(mean):  0.9547162285983342
accuracy(mean):  0.9569453910250043
precision(mean):  0.9661532542488228
recall(mean):  0.9435479207528157
roc_auc(mean):  0.9909768307319353
-------------------------------------------------------------------------------------




In [None]:
# SVC grid, cross-validated on the training split only.
SVM(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.5
kernel_selection:  poly
f1_scores(mean):  0.961339103630325
accuracy(mean):  0.9636498543596718
precision(mean):  0.9840670551879702
recall(mean):  0.9396485206508343
roc_auc(mean):  0.9936171928076979
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.5
kernel_selection:  rbf
f1_scores(mean):  0.9766552484372447
accuracy(mean):  0.977635856693048
precision(mean):  0.9807832603319969
recall(mean):  0.9725638893819075
roc_auc(mean):  0.99427811907212
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.5
kernel_selection:  sigmoid
f1_scores(mean):  0.8724492184565784
accuracy(mean):  0.8767614706920618
precision(mean):  0.8687212918272602
recall(m

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  10
kernel_selection:  rbf
f1_scores(mean):  0.982411658222054
accuracy(mean):  0.9831475267476047
precision(mean):  0.9863929086833902
recall(mean):  0.9784630395188836
roc_auc(mean):  0.9964951699358165
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  10
kernel_selection:  sigmoid
f1_scores(mean):  0.8666349176040505
accuracy(mean):  0.8716537947747718
precision(mean):  0.8663576241780226
recall(mean):  0.8669185410992994
roc_auc(mean):  0.9123127614887159
-------------------------------------------------------------------------------------
