In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

df_train_multiple_labels = pd.read_csv("KDDTrain+.txt", header=None, names = col_names)
df_test_multiple_labels = pd.read_csv("KDDTest+.txt", header=None, names = col_names)

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

pd.set_option('display.max_columns', None)

# Combine training dataset with testing dataset to apply One-Hot-Encoder
df_multiple_labels = pd.concat([df_train_multiple_labels, df_test_multiple_labels], ignore_index=True)
# Drop "difficulty_level"
df_multiple_labels = df_multiple_labels.drop("difficulty_level", axis=1)
# One-Hot-Encoder for "protocol_type","service","flag"
categorical_col = ["protocol_type","service","flag"]
for col in categorical_col:
    df_dummies = pd.get_dummies(df_multiple_labels[col], prefix=col+"_")
    df_multiple_labels = df_multiple_labels.drop(col, axis=1)
    df_multiple_labels = pd.concat([df_multiple_labels,df_dummies], axis=1)
# LabelEncoder for "labels"
# Five Labels: normal: 0, DoS: 1, Probe: 2, R2L:3, U2R: 4
df_multiple_labels = df_multiple_labels.replace({'label':{'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4}})
# print(df_multiple_labels.head(1))
df_multiple_labels_normal = df_multiple_labels.loc[df_multiple_labels["label"]==0]
df_multiple_labels_DoS = df_multiple_labels.loc[df_multiple_labels["label"]==1]
df_multiple_labels_Probe = df_multiple_labels.loc[df_multiple_labels["label"]==2]
df_multiple_labels_R2L = df_multiple_labels.loc[df_multiple_labels["label"]==3]
df_multiple_labels_U2R = df_multiple_labels.loc[df_multiple_labels["label"]==4]

## DoS Dataset
df_DoS_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_DoS], ignore_index=True)
df_DoS_normal['label'] = np.where(df_DoS_normal['label'].eq(1), 1, 0)
## Probe Dataset
df_Probe_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_Probe], ignore_index=True)
df_Probe_normal['label'] = np.where(df_Probe_normal['label'].eq(2), 1, 0)
## R2L Dataset
df_R2L_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_R2L], ignore_index=True)
df_R2L_normal['label'] = np.where(df_R2L_normal['label'].eq(3), 1, 0)
## U2R Dataset
df_U2R_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_U2R], ignore_index=True)
df_U2R_normal['label'] = np.where(df_U2R_normal['label'].eq(4), 1, 0)

print(df_DoS_normal.shape)
print(df_Probe_normal)
print(df_R2L_normal.shape)
print(df_U2R_normal.shape)

(130441, 123)
       duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0             0        491          0     0               0       0    0   
1             0        146          0     0               0       0    0   
2             0        232       8153     0               0       0    0   
3             0        199        420     0               0       0    0   
4             0        287       2251     0               0       0    0   
...         ...        ...        ...   ...             ...     ...  ...   
91126         0          0          0     0               0       0    0   
91127         2         24        109     0               0       0    0   
91128         0          0          0     0               0       0    0   
91129         0          1          1     0               0       0    0   
91130         0          0          0     0               0       0    0   

       num_failed_logins  logged_in  num_compromised  root_shell  \
0    

In [2]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score 

from sklearn.preprocessing import StandardScaler,Normalizer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif

from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.tree import DecisionTreeClassifier

def printResults(results):
    #print(results.keys())
    print("f1_scores(mean): ", sum(results['test_f1'])/len(results['test_f1']))
    print("accuracy(mean): ", sum(results['test_accuracy'])/len(results['test_accuracy']))
    print("precision(mean): ", sum(results['test_precision'])/len(results['test_precision']))
    print("recall(mean): ", sum(results['test_recall'])/len(results['test_recall']))
    print("roc_auc(mean): ", sum(results['test_roc_auc'])/len(results['test_roc_auc']))

def generate_preprocess_pipeline(preprocess):
    if preprocess == "standardize":
        return StandardScaler()
    elif preprocess == "normalize":
        return Normalizer()
    
def generate_feature_selection_pipeline(feature_selection, percentage):
    if feature_selection == "PCA":
        return PCA(n_components = percentage)
    elif feature_selection == "ANOVA":
        return SelectPercentile(f_classif, percentile=(percentage*100))
                                
def DT(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    DT_depth_selections= [2,5,10,20,None]
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for DT_depth in DT_depth_selections:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',DecisionTreeClassifier(max_depth=DT_depth,random_state=0))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("depth: ", DT_depth)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                
from sklearn.linear_model import LogisticRegression
def LG(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    LG_solvers= ['newton-cg','lbfgs','sag']
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for LG_solver in LG_solvers:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',LogisticRegression(solver = LG_solver,max_iter=100))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("solver: ", LG_solver)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                    
from sklearn.svm import SVC
def SVM(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    C_selections = [0.2,0.5,1,2,5]
    kernel_selections = ['poly','rbf','sigmoid']
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for  C_selection in  C_selections:
                    for kernel_selection in kernel_selections:
                        complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',SVC(C=C_selection, kernel=kernel_selection))
                    ])
                        results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                        print("preprocess: ",preprocess_method)
                        print("feature_selection: ", feature_selection_method)
                        print("percentage_selection: ", percentage_selection)
                        print("C_selection: ", C_selection)
                        print("kernel_selection: ", kernel_selection)
                        printResults(results)
                        print("-------------------------------------------------------------------------------------")

## Split dataset to training set and testing set
from sklearn.model_selection import train_test_split
def splitDataset(df):
    X = df.drop(columns=['label'])
    y = df['label']
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state=6)
    print(X_train.shape)
    print(y_train)
    print(X_test.shape)
    print(y_test)
    return X_train, X_test, y_train, y_test                   


## R2L

In [3]:
X_train, X_test, y_train, y_test = splitDataset(df_R2L_normal)

(80934, 122)
(80934,)
(56653, 122)
74560    0
3633     0
61193    0
27692    0
48412    0
        ..
41295    0
64877    0
4714     0
41187    0
31626    0
Name: label, Length: 56653, dtype: int32
(24281, 122)
41807    0
55306    0
57489    0
66059    0
39363    0
        ..
27288    0
73303    0
15709    0
38620    0
6933     0
Name: label, Length: 24281, dtype: int32


In [4]:
DT(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  2
f1_scores(mean):  0.0
accuracy(mean):  0.9514588811688529
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.8602870041604305
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  5
f1_scores(mean):  0.2100401355706368
accuracy(mean):  0.9545302356912743
precision(mean):  0.6729827050636109
recall(mean):  0.12472727272727273
roc_auc(mean):  0.930019594163686
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  10
f1_scores(mean):  0.634225062048792
accuracy(mean):  0.9678040442831424
precision(mean):  0.7103222205581885
recall(mean):  0.5752727272727272
roc_auc(mean):  0.9513482132013603
------------------------------------------------------------------------------

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
depth:  None
f1_scores(mean):  0.8969832482375575
accuracy(mean):  0.9899564239758722
precision(mean):  0.8933163571184751
recall(mean):  0.9007272727272728
roc_auc(mean):  0.9501307013868108
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  2
f1_scores(mean):  0.4508386479486801
accuracy(mean):  0.9624909193572873
precision(mean):  0.8115330093840285
recall(mean):  0.32945454545454544
roc_auc(mean):  0.8313034920601197
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  5
f1_scores(mean):  0.8274750173866732
accuracy(mean):  0.9842197057911486
precision(mean):  0.8799126857061956
recall(mean):  0.7818181818181819
roc_auc(mean):  0.9618681808693477
-----------------------------

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  20
f1_scores(mean):  0.9368327747204745
accuracy(mean):  0.9939103209271396
precision(mean):  0.9439838295035198
recall(mean):  0.9298181818181819
roc_auc(mean):  0.9658225846057735
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  None
f1_scores(mean):  0.9380701551296738
accuracy(mean):  0.9940162281996224
precision(mean):  0.9427613306767133
recall(mean):  0.9334545454545454
roc_auc(mean):  0.9674395876027285
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
depth:  2
f1_scores(mean):  0.48709792289805715
accuracy(mean):  0.9664448381187665
precision(mean):  0.9423831880553448
recall(mean):  0.3287272727272727
roc_auc(mean):  0.6644512353061923
----------------------

In [5]:
LG(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  newton-cg
f1_scores(mean):  0.0
accuracy(mean):  0.9514588811688529
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.6918134032375098
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  lbfgs
f1_scores(mean):  0.0
accuracy(mean):  0.9514588811688529
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.6918132683075047
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  sag
f1_scores(mean):  0.0
accuracy(mean):  0.9514588811688529
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.6918136730631022
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selec

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  newton-cg
f1_scores(mean):  0.7541053110586684
accuracy(mean):  0.9783771551118112
precision(mean):  0.8412507653207513
recall(mean):  0.6836363636363636
roc_auc(mean):  0.9886586213785492
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  lbfgs
f1_scores(mean):  0.7515522206723471
accuracy(mean):  0.9782359417801324
precision(mean):  0.8420398879571069
recall(mean):  0.6789090909090909
roc_auc(mean):  0.9886670222481975
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  sag
f1_scores(mean):  0.6690887829160004
accuracy(mean):  0.972975818784559
precision(mean):  0.8248064667496806
recall(mean):  0.5629090909090909
roc_auc(mean):  0.9862473303772783
-----------

In [None]:
SVM(X_train,y_train)