# Multiple labels (separately): one binary attack-vs-normal classifier per attack category
## Load Datasets

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# The data files carry no header row, so the 43 column names are supplied here:
# the feature columns first, then "label" and "difficulty_level".
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty_level",
]

# Both files are parsed with identical options.
read_options = dict(header=None, names=col_names)
df_train_multiple_labels = pd.read_csv("KDDTrain+.txt", **read_options)
df_test_multiple_labels = pd.read_csv("KDDTest+.txt", **read_options)

# Sanity check: row/column counts of each split, then the first training row.
for split_frame in (df_train_multiple_labels, df_test_multiple_labels):
    print(split_frame.shape)
print(df_train_multiple_labels.head(1))

(125973, 43)
(22544, 43)
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.17   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                    0.03                         0.17   

   dst_host_srv_diff_host_rate  dst_host_serror_rate  \
0                          0.0                   0.0   

   dst_host_srv_serror_rate  dst_host_rerror_rate  dst_host_srv_rerror_rate  \
0                       0.0                  0.05                       0.0   

    label  difficulty_level  
0  normal                20  

[1 rows x 43 columns]


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training frame.
df_train_multiple_labels.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,difficulty_level
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,...,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024,19.50406
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,...,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459,2.291503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,21.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


## Data preprocessing

In [4]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Lift the column cap on DataFrame display so wide frames are shown without "..." truncation.
pd.set_option('display.max_columns', None)

In [5]:
# Combine the training and testing frames so the one-hot encoding below is
# computed once over every category value that appears in either file.
df_multiple_labels = pd.concat([df_train_multiple_labels, df_test_multiple_labels], ignore_index=True)
# Drop "difficulty_level"
df_multiple_labels = df_multiple_labels.drop("difficulty_level", axis=1)

# One-hot encode "protocol_type", "service" and "flag". Each original column is
# removed and its dummy columns are appended at the end of the frame.
# Note: the prefix already ends in "_" and get_dummies adds its own separator,
# so the resulting columns look like "protocol_type__tcp" (double underscore).
categorical_col = ["protocol_type","service","flag"]
for col in categorical_col:
    df_dummies = pd.get_dummies(df_multiple_labels[col], prefix=col+"_")
    df_multiple_labels = df_multiple_labels.drop(col, axis=1)
    df_multiple_labels = pd.concat([df_multiple_labels,df_dummies], axis=1)

# Manual label mapping (not sklearn's LabelEncoder): collapse the individual
# attack names into five classes.
# Five Labels: normal: 0, DoS: 1, Probe: 2, R2L: 3, U2R: 4
attacks_by_class = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm', 'worm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
        'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'sqlattack',
        'xterm'],
}
label_to_class = {
    attack_name: class_code
    for class_code, attack_names in attacks_by_class.items()
    for attack_name in attack_names
}

# replace() leaves values it has no mapping for untouched, and such rows would
# then silently fall out of every "label == k" subset below. Fail loudly instead.
unmapped_labels = set(df_multiple_labels['label'].unique()) - set(label_to_class)
if unmapped_labels:
    raise ValueError(
        "Unmapped attack labels: {}".format(sorted(map(str, unmapped_labels)))
    )
df_multiple_labels = df_multiple_labels.replace({'label': label_to_class})

# One frame per class; the per-attack binary datasets are built from these.
df_multiple_labels_normal = df_multiple_labels.loc[df_multiple_labels["label"]==0]
df_multiple_labels_DoS = df_multiple_labels.loc[df_multiple_labels["label"]==1]
df_multiple_labels_Probe = df_multiple_labels.loc[df_multiple_labels["label"]==2]
df_multiple_labels_R2L = df_multiple_labels.loc[df_multiple_labels["label"]==3]
df_multiple_labels_U2R = df_multiple_labels.loc[df_multiple_labels["label"]==4]
print(df_multiple_labels_normal.shape)
print(df_multiple_labels_DoS.shape)
print(df_multiple_labels_Probe.shape)
print(df_multiple_labels_R2L.shape)
print(df_multiple_labels_U2R.shape)

(77054, 123)
(53387, 123)
(14077, 123)
(3880, 123)
(119, 123)


In [6]:
## DoS Dataset
df_DoS_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_DoS], ignore_index=True)
df_DoS_normal['label'] = np.where(df_DoS_normal['label'].eq(1), 1, 0)
## Probe Dataset
df_Probe_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_Probe], ignore_index=True)
df_Probe_normal['label'] = np.where(df_Probe_normal['label'].eq(2), 1, 0)
## R2L Dataset
df_R2L_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_R2L], ignore_index=True)
df_R2L_normal['label'] = np.where(df_R2L_normal['label'].eq(3), 1, 0)
## U2R Dataset
df_U2R_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_U2R], ignore_index=True)
df_U2R_normal['label'] = np.where(df_U2R_normal['label'].eq(4), 1, 0)

print(df_DoS_normal.shape)
print(df_Probe_normal.shape)
print(df_R2L_normal.shape)
print(df_U2R_normal.shape)

(130441, 123)
(91131, 123)
(80934, 123)
(77173, 123)


In [7]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score 

from sklearn.preprocessing import StandardScaler,Normalizer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif

from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.tree import DecisionTreeClassifier

In [8]:
def printResults(results):
    """Print the arithmetic mean of each test metric stored in ``results``.

    ``results`` is indexed by the keys 'test_f1', 'test_accuracy',
    'test_precision', 'test_recall' and 'test_roc_auc'; each value must
    support ``sum`` and ``len`` (e.g. a list or array of per-fold scores).
    """
    labelled_metrics = (
        ("f1_scores(mean): ", 'test_f1'),
        ("accuracy(mean): ", 'test_accuracy'),
        ("precision(mean): ", 'test_precision'),
        ("recall(mean): ", 'test_recall'),
        ("roc_auc(mean): ", 'test_roc_auc'),
    )
    for caption, metric_key in labelled_metrics:
        fold_scores = results[metric_key]
        print(caption, sum(fold_scores)/len(fold_scores))

def generate_preprocess_pipeline(preprocess):
    """Return a new, unfitted preprocessing transformer for ``preprocess``.

    Parameters
    ----------
    preprocess : str
        "standardize" -> ``StandardScaler()``; "normalize" -> ``Normalizer()``.

    Raises
    ------
    ValueError
        If ``preprocess`` is not one of the names above. Previously this fell
        through and returned ``None``, which would then be placed into a
        pipeline as a step without any indication of the typo.
    """
    if preprocess == "standardize":
        return StandardScaler()
    if preprocess == "normalize":
        return Normalizer()
    raise ValueError(
        "Unknown preprocess method {!r}; expected 'standardize' or 'normalize'".format(preprocess)
    )
    
def generate_feature_selection_pipeline(feature_selection, percentage):
    """Return a new, unfitted feature-selection / reduction step.

    Parameters
    ----------
    feature_selection : str
        "PCA" -> ``PCA(n_components=percentage)``;
        "ANOVA" -> ``SelectPercentile(f_classif, percentile=percentage*100)``.
    percentage : float
        The callers in this notebook pass a fraction between 0 and 1.
        NOTE(review): the two methods interpret it differently. Per the
        scikit-learn documentation a float ``n_components`` in (0, 1) is the
        fraction of *variance* PCA must explain, whereas ``percentile`` is the
        percent of *features* kept. The same value is therefore not directly
        comparable across the two methods; confirm this is intended.

    Raises
    ------
    ValueError
        If ``feature_selection`` is neither "PCA" nor "ANOVA". Previously this
        fell through and returned ``None``.
    """
    if feature_selection == "PCA":
        return PCA(n_components = percentage)
    if feature_selection == "ANOVA":
        return SelectPercentile(f_classif, percentile=(percentage*100))
    raise ValueError(
        "Unknown feature selection method {!r}; expected 'PCA' or 'ANOVA'".format(feature_selection)
    )
                                
def DT(X,y):
    """Cross-validate decision trees over a grid of settings and print the scores.

    For every combination of preprocessing method, feature-selection method,
    selection percentage and tree depth, a three-step pipeline
    (preprocess -> feature_selection -> DecisionTreeClassifier) is evaluated
    with ``cross_validate(..., cv=5)`` and the mean of each metric is printed.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_validate``.
    y : target labels passed straight to ``cross_validate``.
        NOTE(review): the 'f1'/'precision'/'recall' scorers presumably require
        a binary target here - confirm against the callers.

    Returns
    -------
    None. Results are only printed.
    """
    preprocess_methods = ["standardize"]
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
    DT_depth_selections= [2,5,10,20,None]
    scoring = ('f1', 'accuracy', 'precision','recall','roc_auc')

    # Same iteration order as four nested loops, without the nesting.
    grid = [
        (preprocess_method, feature_selection_method, percentage_selection, DT_depth)
        for preprocess_method in preprocess_methods
        for feature_selection_method in feature_selection_methods
        for percentage_selection in percentage_selections
        for DT_depth in DT_depth_selections
    ]

    for preprocess_method, feature_selection_method, percentage_selection, DT_depth in grid:
        # Fresh step objects per pipeline, so no transformer instance is
        # shared between different Pipeline objects.
        complete_pipeline = Pipeline([
            ('preprocess', generate_preprocess_pipeline(preprocess_method)),
            ('feature_selection', generate_feature_selection_pipeline(feature_selection_method, percentage_selection)),
            ('estimator', DecisionTreeClassifier(max_depth=DT_depth,random_state=0)),
        ])
        # The fitted estimators are never inspected, so they are not requested
        # (return_estimator would keep five fitted pipelines per setting alive).
        results = cross_validate(complete_pipeline, X, y, cv=5, scoring=scoring)
        print("preprocess: ",preprocess_method)
        print("feature_selection: ", feature_selection_method)
        print("percentage_selection: ", percentage_selection)
        print("depth: ", DT_depth)
        printResults(results)
        print("-------------------------------------------------------------------------------------")


In [9]:
from sklearn.linear_model import LogisticRegression
def LG(X,y):
    """Cross-validate logistic regression over a grid of settings and print the scores.

    For every combination of preprocessing method, feature-selection method,
    selection percentage and solver, a three-step pipeline
    (preprocess -> feature_selection -> LogisticRegression) is evaluated with
    ``cross_validate(..., cv=5)`` and the mean of each metric is printed.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_validate``.
    y : target labels passed straight to ``cross_validate``.
        NOTE(review): the 'f1'/'precision'/'recall' scorers presumably require
        a binary target here - confirm against the callers.

    Returns
    -------
    None. Results are only printed.
    """
    preprocess_methods = ["standardize"]
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
    LG_solvers= ['newton-cg','lbfgs','sag']
    scoring = ('f1', 'accuracy', 'precision','recall','roc_auc')

    # Same iteration order as four nested loops, without the nesting.
    grid = [
        (preprocess_method, feature_selection_method, percentage_selection, LG_solver)
        for preprocess_method in preprocess_methods
        for feature_selection_method in feature_selection_methods
        for percentage_selection in percentage_selections
        for LG_solver in LG_solvers
    ]

    for preprocess_method, feature_selection_method, percentage_selection, LG_solver in grid:
        # Fresh step objects per pipeline, so no transformer instance is
        # shared between different Pipeline objects.
        complete_pipeline = Pipeline([
            ('preprocess', generate_preprocess_pipeline(preprocess_method)),
            ('feature_selection', generate_feature_selection_pipeline(feature_selection_method, percentage_selection)),
            # NOTE(review): with max_iter=100 a solver may stop before
            # converging, and this notebook silences warnings globally, so a
            # convergence warning would not be visible - confirm.
            ('estimator', LogisticRegression(solver = LG_solver,max_iter=100)),
        ])
        # The fitted estimators are never inspected, so they are not requested
        # (return_estimator would keep five fitted pipelines per setting alive).
        results = cross_validate(complete_pipeline, X, y, cv=5, scoring=scoring)
        print("preprocess: ",preprocess_method)
        print("feature_selection: ", feature_selection_method)
        print("percentage_selection: ", percentage_selection)
        print("solver: ", LG_solver)
        printResults(results)
        print("-------------------------------------------------------------------------------------")

In [10]:
from sklearn.svm import SVC
def SVM(X,y):
    """Cross-validate support-vector classifiers over a grid of settings and print the scores.

    For every combination of preprocessing method, feature-selection method,
    selection percentage, ``C`` value and kernel, a three-step pipeline
    (preprocess -> feature_selection -> SVC) is evaluated with
    ``cross_validate(..., cv=5)`` and the mean of each metric is printed.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_validate``.
    y : target labels passed straight to ``cross_validate``.
        NOTE(review): the 'f1'/'precision'/'recall' scorers presumably require
        a binary target here - confirm against the callers.

    Returns
    -------
    None. Results are only printed.
    """
    preprocess_methods = ["standardize"]
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.7,0.8,0.9]
    C_selections = [0.2,0.5,1,2,5]
    kernel_selections = ['poly','rbf','sigmoid']
    scoring = ('f1', 'accuracy', 'precision','recall','roc_auc')

    # Same iteration order as five nested loops, without the nesting.
    grid = [
        (preprocess_method, feature_selection_method, percentage_selection, C_selection, kernel_selection)
        for preprocess_method in preprocess_methods
        for feature_selection_method in feature_selection_methods
        for percentage_selection in percentage_selections
        for C_selection in C_selections
        for kernel_selection in kernel_selections
    ]

    for preprocess_method, feature_selection_method, percentage_selection, C_selection, kernel_selection in grid:
        # Fresh step objects per pipeline, so no transformer instance is
        # shared between different Pipeline objects.
        complete_pipeline = Pipeline([
            ('preprocess', generate_preprocess_pipeline(preprocess_method)),
            ('feature_selection', generate_feature_selection_pipeline(feature_selection_method, percentage_selection)),
            ('estimator', SVC(C=C_selection, kernel=kernel_selection)),
        ])
        # The fitted estimators are never inspected, so they are not requested
        # (return_estimator would keep five fitted pipelines per setting alive).
        results = cross_validate(complete_pipeline, X, y, cv=5, scoring=scoring)
        print("preprocess: ",preprocess_method)
        print("feature_selection: ", feature_selection_method)
        print("percentage_selection: ", percentage_selection)
        print("C_selection: ", C_selection)
        print("kernel_selection: ", kernel_selection)
        printResults(results)
        print("-------------------------------------------------------------------------------------")
                    

In [12]:
## Split dataset to training set and testing set
from sklearn.model_selection import train_test_split
def splitDataset(df, test_size=0.3, random_state=6, stratify=False):
    """Split ``df`` into train/test features and labels, printing their shapes.

    Parameters
    ----------
    df : DataFrame with a 'label' column; every other column is used as a feature.
    test_size : forwarded to ``train_test_split`` (default 0.3, the value that
        was previously hard-coded).
    random_state : forwarded to ``train_test_split`` (default 6, as before).
    stratify : bool, default False
        When True, the split is stratified on the label column so both parts
        keep the same class proportions. The default keeps the original
        non-stratified behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    X = df.drop(columns=['label'])
    y = df['label']
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=y if stratify else None)
    print(X_train.shape)
    print(y_train)
    print(X_test.shape)
    print(y_test)
    return X_train, X_test, y_train, y_test

## DoS
### Split Dataset

In [15]:
# Train/test split of the DoS-vs-normal dataset; the cells below cross-validate on the training part only.
X_train, X_test, y_train, y_test = splitDataset(df_DoS_normal)

(130441, 122)
(130441,)
(91308, 122)
94026     1
92878     1
87480     1
23792     0
19098     0
         ..
4714      0
108500    1
41187     0
117449    1
31626     0
Name: label, Length: 91308, dtype: int32
(39133, 122)
46256     0
34191     0
119153    1
113313    1
103582    1
         ..
109454    1
55651     0
7815      0
77071     1
115381    1
Name: label, Length: 39133, dtype: int32


### Decision Tree

In [16]:
# Run the decision-tree sweep (5-fold cross-validation per setting) on the DoS training split.
DT(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  2
f1_scores(mean):  0.937979980338737
accuracy(mean):  0.9515814973397905
precision(mean):  0.9855290488982484
recall(mean):  0.8948185881736709
roc_auc(mean):  0.9463707340361605
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  5
f1_scores(mean):  0.9455673715663266
accuracy(mean):  0.9571669828123033
precision(mean):  0.9851604321691132
recall(mean):  0.9090837435896996
roc_auc(mean):  0.9830549469560725
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  10
f1_scores(mean):  0.9660142151535901
accuracy(mean):  0.972532550160326
precision(mean):  0.9783012517912753
recall(mean):  0.9540468563344815
roc_auc(mean):  0.9924037233040803
----------------------------------

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
depth:  None
f1_scores(mean):  0.9969636037655544
accuracy(mean):  0.9975139142433017
precision(mean):  0.9966048857850414
recall(mean):  0.9973236413400877
roc_auc(mean):  0.9976006768852443
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  2
f1_scores(mean):  0.9607903789676662
accuracy(mean):  0.9681736878081914
precision(mean):  0.9687956219633971
recall(mean):  0.9530566212526408
roc_auc(mean):  0.9745947269766836
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  5
f1_scores(mean):  0.9865649009747036
accuracy(mean):  0.9890480229327376
precision(mean):  0.9902982977938184
recall(mean):  0.9829248427816226
roc_auc(mean):  0.9959856921206125
------------------------------

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  20
f1_scores(mean):  0.9981935421783475
accuracy(mean):  0.9985214877108559
precision(mean):  0.9981535631026095
recall(mean):  0.9982336017084774
roc_auc(mean):  0.9986293680704922
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  None
f1_scores(mean):  0.9981935421783475
accuracy(mean):  0.9985214877108559
precision(mean):  0.9981535631026095
recall(mean):  0.9982336017084774
roc_auc(mean):  0.9986160064102407
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
depth:  2
f1_scores(mean):  0.9505433044452763
accuracy(mean):  0.9611972341015604
precision(mean):  0.9924706258555069
recall(mean):  0.9121612430015722
roc_auc(mean):  0.9552902905291824
-----------------------

### Logistic Regression

In [17]:
# Run the logistic-regression sweep (5-fold cross-validation per setting) on the DoS training split.
LG(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  newton-cg
f1_scores(mean):  0.9193610678097203
accuracy(mean):  0.9373549071027087
precision(mean):  0.971291144320028
recall(mean):  0.8727117945804037
roc_auc(mean):  0.9660898787827504
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  lbfgs
f1_scores(mean):  0.9193610678097203
accuracy(mean):  0.9373549071027087
precision(mean):  0.971291144320028
recall(mean):  0.8727117945804037
roc_auc(mean):  0.9660898738215838
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  sag
f1_scores(mean):  0.9193610678097203
accuracy(mean):  0.9373549071027087
precision(mean):  0.971291144320028
recall(mean):  0.8727117945804037
roc_auc(mean):  0.9660902161618019
-------------------

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
solver:  sag
f1_scores(mean):  0.9779938278556806
accuracy(mean):  0.9821373897947028
precision(mean):  0.9861554340054927
recall(mean):  0.9699711215909035
roc_auc(mean):  0.9978303075444103
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  newton-cg
f1_scores(mean):  0.979155461482511
accuracy(mean):  0.9830902095451627
precision(mean):  0.9878804763612644
recall(mean):  0.9705866995559459
roc_auc(mean):  0.9979030768201287
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  lbfgs
f1_scores(mean):  0.9791549354012339
accuracy(mean):  0.9830902101448945
precision(mean):  0.987907069573619
recall(mean):  0.9705599365424307
roc_auc(mean):  0.9979051929264685
------------

### SVM

In [None]:
# Run the SVM sweep (5-fold cross-validation per setting) on the DoS training split.
# NOTE(review): this cell has no execution count or output - it was apparently never run to completion.
SVM(X_train,y_train)